# GHA to run sage and check things work (#9)
---
# CI workflow: run the benchmark with two agents and assert the expected
# outcome for each matrix leg — the "sage" agent must resolve every task,
# while the "none" agent must resolve none of them (apart from an explicit
# allow-list of tasks that pass trivially).
name: Check Regressions

on:
  pull_request:
    branches:
      - main

# Cancel any in-flight run for the same ref when a newer push arrives.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  check-regressions:
    name: ${{ matrix.agent == 'sage' && 'Sage agent always passes' || 'None agent always fails' }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          # The real agent: every benchmark task is expected to be resolved.
          - agent: sage
            expect_success: true
          # The no-op agent: no task should be resolved (see ALLOWED_TO_PASS
          # in the final step for the tolerated exceptions).
          - agent: none
            expect_success: false
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Install dependencies
        run: uv sync

      - name: Download DuckDB databases
        run: uv run --with gdown gdown --folder https://drive.google.com/drive/folders/1CNS_8mf81to02868HA-celmcPEFu4BPE -O shared/databases/duckdb

      - name: Run benchmark
        run: uv run ade run all --agent ${{ matrix.agent }} --db duckdb --project-type dbt --no-diffs --n-concurrent-trials 8
        env:
          USE_DYNAMIC_LOGGING: "FALSE"

      # Produce the HTML report even when the benchmark step itself failed,
      # so partial results are still inspectable from the artifact.
      - name: Generate HTML report
        if: always()
        run: |
          RUN_DIR=$(find experiments -mindepth 1 -maxdepth 1 -type d | head -1)
          if [ -n "$RUN_DIR" ]; then
            echo "Generating HTML report for: $RUN_DIR"
            uv run python -c "
          from scripts_python.generate_results_html import ResultsHTMLGenerator
          from pathlib import Path
          generator = ResultsHTMLGenerator(Path('${RUN_DIR}'))
          generator.generate_all()
          "
          fi

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ matrix.agent }}
          path: experiments/

      - name: Check results match expectations
        run: |
          # Find the results.json file in the run directory
          RESULTS_FILE=$(find experiments -maxdepth 2 -name "results.json" | head -1)
          if [ -z "$RESULTS_FILE" ]; then
            echo "Error: No results.json file found"
            exit 1
          fi
          echo "Checking results in: $RESULTS_FILE"
          echo "Expecting success: ${{ matrix.expect_success }}"
          # Check results against expectations. The heredoc delimiter is
          # deliberately unquoted so the shell expands ${RESULTS_FILE} and the
          # ${{ matrix.expect_success }} expression before Python runs.
          python3 << EOF
          import json
          import sys

          EXPECT_SUCCESS = "${{ matrix.expect_success }}" == "true"

          with open("${RESULTS_FILE}") as f:
              results = json.load(f)

          # Tasks that are allowed to pass even for the "none" agent
          ALLOWED_TO_PASS = {"analytics_engineering001"}

          failed_tasks = []
          passed_tasks = []
          for task in results["results"]:
              task_id = task["task_id"]
              if task.get("is_resolved") is True:
                  passed_tasks.append(task_id)
              else:
                  parser_results = task.get("parser_results") or {}
                  failed_tests = [k for k, v in parser_results.items() if v != "passed"]
                  failed_tasks.append({
                      "task_id": task_id,
                      "failed_tests": failed_tests
                  })

          total = len(results["results"])
          if EXPECT_SUCCESS:
              if not failed_tasks:
                  print(f"✅ All {total} task(s) passed successfully (as expected)")
              else:
                  print(f"❌ {len(failed_tasks)} task(s) failed (expected all to pass):")
                  for task in failed_tasks:
                      print(f"  - {task['task_id']}: {task['failed_tests']}")
                  sys.exit(1)
          else:
              # For "none" agent: fail if any task passes (except allowed ones)
              unexpected_passes = [t for t in passed_tasks if t not in ALLOWED_TO_PASS]
              if unexpected_passes:
                  print(f"❌ {len(unexpected_passes)} task(s) unexpectedly passed:")
                  for task_id in unexpected_passes:
                      print(f"  - {task_id}")
                  sys.exit(1)
              else:
                  allowed_passed = [t for t in passed_tasks if t in ALLOWED_TO_PASS]
                  if allowed_passed:
                      print(f"✅ {len(failed_tasks)} task(s) failed as expected, {len(allowed_passed)} allowed task(s) passed: {allowed_passed}")
                  else:
                      print(f"✅ All {len(failed_tasks)} task(s) failed (as expected)")
          EOF