@@ -11,8 +11,15 @@ concurrency:
1111
1212jobs :
check-regressions:
  # One job instance runs per matrix entry. The display name is derived from
  # the agent so the two legs are distinguishable in the checks list.
  name: "${{ matrix.agent == 'sage' && 'Sage agent always passes' || 'None agent always fails' }}"
  runs-on: ubuntu-latest
  strategy:
    # The two legs have opposite expectations and are independent of each
    # other. Without this, a failure in one leg cancels the other one and
    # hides its result.
    fail-fast: false
    matrix:
      include:
        # `expect_success` is read by the result-check step to decide whether
        # resolved tasks or failed tasks are the expected outcome.
        - agent: sage
          expect_success: true
        - agent: none
          expect_success: false
1623 steps :
1724 - name : Checkout code
1825 uses : actions/checkout@v4
3239 run : uv run --with gdown gdown --folder https://drive.google.com/drive/folders/1CNS_8mf81to02868HA-celmcPEFu4BPE -O shared/databases/duckdb
3340
# Run the benchmark with the agent selected by the current matrix entry
# (`--agent` is filled from `matrix.agent`). The task selector changed from the
# single task `airbnb001` to `all` -- presumably the full task suite; confirm
# against the `ade run` CLI.
3441 - name : Run benchmark
35- run : uv run ade run airbnb001 --agent none --db duckdb --project-type dbt --no-diffs --n-concurrent-trials 1
42+ run : uv run ade run all --agent ${{ matrix.agent }} --db duckdb --project-type dbt --no-diffs --n-concurrent-trials 1
3643 env :
3744 USE_DYNAMIC_LOGGING : " FALSE"
3845
@@ -54,10 +61,10 @@ jobs:
5461 if : always()
5562 uses : actions/upload-artifact@v4
5663 with :
57- name : benchmark-results
64+ name : benchmark-results-${{ matrix.agent }}
5865 path : experiments/
5966
60- - name : Check all tasks passed
67+ - name : Check results match expectations
6168 run : |
6269 # Find the results.json file in the run directory
6370 RESULTS_FILE=$(find experiments -maxdepth 2 -name "results.json" | head -1)
@@ -68,31 +75,57 @@ jobs:
6875 fi
6976
7077 echo "Checking results in: $RESULTS_FILE"
78+ echo "Expecting success: ${{ matrix.expect_success }}"
7179
72- # Check that all tasks are resolved
80+ # Check results against expectations
# Validate the benchmark results against the expectation of this matrix entry.
# The heredoc delimiter is quoted so the shell performs no expansion inside the
# Python source; the results path is handed over through the environment
# instead of being spliced into a Python string literal.
RESULTS_FILE="$RESULTS_FILE" python3 << 'EOF'
import json
import os
import sys

# The workflow expression below is substituted by GitHub Actions before the
# script runs, so Python sees the literal text "true" or "false".
EXPECT_SUCCESS = "${{ matrix.expect_success }}" == "true"

# Tasks that are allowed to pass even for the "none" agent
ALLOWED_TO_PASS = {"analytics_engineering001"}

with open(os.environ["RESULTS_FILE"]) as f:
    results = json.load(f)

tasks = results["results"]
total = len(tasks)

# An empty result list would satisfy either expectation vacuously, so it is
# reported as a failure rather than a success.
if total == 0:
    print("❌ No task results found in results file")
    sys.exit(1)

# Split tasks into resolved ones and unresolved ones; for the latter, record
# which parser results were anything other than "passed".
passed_tasks = []
failed_tasks = []
for task in tasks:
    task_id = task["task_id"]
    if task.get("is_resolved") is True:
        passed_tasks.append(task_id)
    else:
        parser_results = task.get("parser_results") or {}
        failed_tests = [k for k, v in parser_results.items() if v != "passed"]
        failed_tasks.append({
            "task_id": task_id,
            "failed_tests": failed_tests
        })

if EXPECT_SUCCESS:
    # Every task must be resolved.
    if failed_tasks:
        print(f"❌ {len(failed_tasks)} task(s) failed (expected all to pass):")
        for task in failed_tasks:
            print(f"  - {task['task_id']}: {task['failed_tests']}")
        sys.exit(1)
    print(f"✅ All {total} task(s) passed successfully (as expected)")
else:
    # For "none" agent: fail if any task passes (except allowed ones)
    unexpected_passes = [t for t in passed_tasks if t not in ALLOWED_TO_PASS]
    if unexpected_passes:
        print(f"❌ {len(unexpected_passes)} task(s) unexpectedly passed:")
        for task_id in unexpected_passes:
            print(f"  - {task_id}")
        sys.exit(1)

    allowed_passed = [t for t in passed_tasks if t in ALLOWED_TO_PASS]
    if allowed_passed:
        print(f"✅ {len(failed_tasks)} task(s) failed as expected, {len(allowed_passed)} allowed task(s) passed: {allowed_passed}")
    else:
        print(f"✅ All {len(failed_tasks)} task(s) failed (as expected)")
EOF
0 commit comments