Skip to content

Commit a22f188

Browse files
committed
Add false negative check as well
1 parent 07f4254 commit a22f188

File tree

1 file changed

+47
-14
lines changed

1 file changed

+47
-14
lines changed

.github/workflows/ci.yml

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,15 @@ concurrency:
1111

1212
jobs:
1313
check-regressions:
14-
name: Sage agent always passes
14+
name: ${{ matrix.agent == 'sage' && 'Sage agent always passes' || 'None agent always fails' }}
1515
runs-on: ubuntu-latest
16+
strategy:
  # Let both agent legs run to completion. With the default fail-fast
  # behaviour, a failure in one leg cancels the other and hides its result.
  fail-fast: false
  matrix:
    include:
      # sage: the results check expects every task to pass.
      - agent: sage
        expect_success: true
      # none: the results check expects tasks to fail.
      - agent: none
        expect_success: false
1623
steps:
1724
- name: Checkout code
1825
uses: actions/checkout@v4
@@ -32,7 +39,7 @@ jobs:
3239
run: uv run --with gdown gdown --folder https://drive.google.com/drive/folders/1CNS_8mf81to02868HA-celmcPEFu4BPE -O shared/databases/duckdb
3340

3441
- name: Run benchmark
35-
run: uv run ade run airbnb001 --agent none --db duckdb --project-type dbt --no-diffs --n-concurrent-trials 1
42+
run: uv run ade run all --agent ${{ matrix.agent }} --db duckdb --project-type dbt --no-diffs --n-concurrent-trials 1
3643
env:
3744
USE_DYNAMIC_LOGGING: "FALSE"
3845

@@ -54,10 +61,10 @@ jobs:
5461
if: always()
5562
uses: actions/upload-artifact@v4
5663
with:
57-
name: benchmark-results
64+
name: benchmark-results-${{ matrix.agent }}
5865
path: experiments/
5966

60-
- name: Check all tasks passed
67+
- name: Check results match expectations
6168
run: |
6269
# Find the results.json file in the run directory
6370
RESULTS_FILE=$(find experiments -maxdepth 2 -name "results.json" | head -1)
@@ -68,31 +75,57 @@ jobs:
6875
fi
6976
7077
echo "Checking results in: $RESULTS_FILE"
78+
echo "Expecting success: ${{ matrix.expect_success }}"
7179
72-
# Check that all tasks are resolved
80+
# Check results against expectations
7381
python3 << EOF
7482
import json
import sys

# NOTE: this script is fed to python3 through an unquoted shell heredoc, so
# the shell expands variables and command substitutions inside it. Keep
# backticks, backslashes and stray dollar signs out of the code and comments.

# Substituted by GitHub Actions before the shell runs; the matrix boolean is
# rendered as the text "true" or "false".
EXPECT_SUCCESS = "${{ matrix.expect_success }}" == "true"

# Tasks that are allowed to pass even for the "none" agent
ALLOWED_TO_PASS = {"analytics_engineering001"}


def evaluate(task_results, expect_success, allowed_to_pass):
    """Compare benchmark task results against the expected outcome.

    Args:
        task_results: list of task dicts; each has "task_id" and may have
            "is_resolved" and "parser_results".
        expect_success: True when every task must pass; False when every
            task outside allowed_to_pass must fail.
        allowed_to_pass: task ids that may pass when expect_success is False.

    Returns:
        A (ok, lines) tuple. ok is False when the job should fail; lines are
        the report lines to print, in order.
    """
    # An empty run proves nothing in either direction, so never let it pass
    # vacuously.
    if not task_results:
        return False, ["❌ No task results found (expected at least one task)"]

    failed_tasks = []
    passed_tasks = []
    for task in task_results:
        task_id = task["task_id"]
        if task.get("is_resolved") is True:
            passed_tasks.append(task_id)
        else:
            parser_results = task.get("parser_results") or {}
            failed_tests = [k for k, v in parser_results.items() if v != "passed"]
            failed_tasks.append({
                "task_id": task_id,
                "failed_tests": failed_tests
            })

    total = len(task_results)

    if expect_success:
        if not failed_tasks:
            return True, [f"✅ All {total} task(s) passed successfully (as expected)"]
        lines = [f"❌ {len(failed_tasks)} task(s) failed (expected all to pass):"]
        for task in failed_tasks:
            lines.append(f" - {task['task_id']}: {task['failed_tests']}")
        return False, lines

    # For "none" agent: fail if any task passes (except allowed ones)
    unexpected_passes = [t for t in passed_tasks if t not in allowed_to_pass]
    if unexpected_passes:
        lines = [f"❌ {len(unexpected_passes)} task(s) unexpectedly passed:"]
        for task_id in unexpected_passes:
            lines.append(f" - {task_id}")
        return False, lines

    allowed_passed = [t for t in passed_tasks if t in allowed_to_pass]
    if allowed_passed:
        return True, [f"✅ {len(failed_tasks)} task(s) failed as expected, {len(allowed_passed)} allowed task(s) passed: {allowed_passed}"]
    return True, [f"✅ All {len(failed_tasks)} task(s) failed (as expected)"]


# A script read from stdin runs with __name__ set to "__main__", so this guard
# is always taken in CI while keeping evaluate() importable on its own.
if __name__ == "__main__":
    # The shell substitutes the results path into the heredoc before Python runs.
    with open("${RESULTS_FILE}") as f:
        results = json.load(f)

    ok, report = evaluate(results["results"], EXPECT_SUCCESS, ALLOWED_TO_PASS)
    for line in report:
        print(line)
    if not ok:
        sys.exit(1)
98131
EOF

0 commit comments

Comments
 (0)