# GHA to run sage and check things work (#9)
---
# CI workflow: run the benchmark with two agents and assert the expected
# outcome for each matrix leg — the "sage" agent must resolve every task,
# while the "none" agent must resolve none of them (apart from an explicit
# allow-list of tasks that pass trivially).
name: Check Regressions

on:
  pull_request:
    branches:
      - main

# Cancel any in-flight run for the same ref when a newer push arrives.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  check-regressions:
    name: ${{ matrix.agent == 'sage' && 'Sage agent always passes' || 'None agent always fails' }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          # The real agent: every benchmark task is expected to be resolved.
          - agent: sage
            expect_success: true
          # The no-op agent: no task should be resolved (see ALLOWED_TO_PASS
          # in the final step for the tolerated exceptions).
          - agent: none
            expect_success: false
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Install dependencies
        run: uv sync

      - name: Download DuckDB databases
        run: uv run --with gdown gdown --folder https://drive.google.com/drive/folders/1CNS_8mf81to02868HA-celmcPEFu4BPE -O shared/databases/duckdb

      - name: Run benchmark
        run: uv run ade run all --agent ${{ matrix.agent }} --db duckdb --project-type dbt --no-diffs --n-concurrent-trials 8
        env:
          USE_DYNAMIC_LOGGING: "FALSE"

      # Produce the HTML report even when the benchmark step itself failed,
      # so partial results are still inspectable from the artifact.
      - name: Generate HTML report
        if: always()
        run: |
          RUN_DIR=$(find experiments -mindepth 1 -maxdepth 1 -type d | head -1)
          if [ -n "$RUN_DIR" ]; then
            echo "Generating HTML report for: $RUN_DIR"
            uv run python -c "
          from scripts_python.generate_results_html import ResultsHTMLGenerator
          from pathlib import Path
          generator = ResultsHTMLGenerator(Path('${RUN_DIR}'))
          generator.generate_all()
          "
          fi

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-${{ matrix.agent }}
          path: experiments/

      - name: Check results match expectations
        run: |
          # Find the results.json file in the run directory
          RESULTS_FILE=$(find experiments -maxdepth 2 -name "results.json" | head -1)
          if [ -z "$RESULTS_FILE" ]; then
            echo "Error: No results.json file found"
            exit 1
          fi
          echo "Checking results in: $RESULTS_FILE"
          echo "Expecting success: ${{ matrix.expect_success }}"
          # Check results against expectations. The heredoc delimiter is
          # deliberately unquoted so the shell expands ${RESULTS_FILE} and the
          # ${{ matrix.expect_success }} expression before Python runs.
          python3 << EOF
          import json
          import sys

          EXPECT_SUCCESS = "${{ matrix.expect_success }}" == "true"

          with open("${RESULTS_FILE}") as f:
              results = json.load(f)

          # Tasks that are allowed to pass even for the "none" agent
          ALLOWED_TO_PASS = {"analytics_engineering001"}

          failed_tasks = []
          passed_tasks = []
          for task in results["results"]:
              task_id = task["task_id"]
              if task.get("is_resolved") is True:
                  passed_tasks.append(task_id)
              else:
                  parser_results = task.get("parser_results") or {}
                  failed_tests = [k for k, v in parser_results.items() if v != "passed"]
                  failed_tasks.append({
                      "task_id": task_id,
                      "failed_tests": failed_tests
                  })

          total = len(results["results"])
          if EXPECT_SUCCESS:
              if not failed_tasks:
                  print(f"✅ All {total} task(s) passed successfully (as expected)")
              else:
                  print(f"❌ {len(failed_tasks)} task(s) failed (expected all to pass):")
                  for task in failed_tasks:
                      print(f"  - {task['task_id']}: {task['failed_tests']}")
                  sys.exit(1)
          else:
              # For "none" agent: fail if any task passes (except allowed ones)
              unexpected_passes = [t for t in passed_tasks if t not in ALLOWED_TO_PASS]
              if unexpected_passes:
                  print(f"❌ {len(unexpected_passes)} task(s) unexpectedly passed:")
                  for task_id in unexpected_passes:
                      print(f"  - {task_id}")
                  sys.exit(1)
              else:
                  allowed_passed = [t for t in passed_tasks if t in ALLOWED_TO_PASS]
                  if allowed_passed:
                      print(f"✅ {len(failed_tasks)} task(s) failed as expected, {len(allowed_passed)} allowed task(s) passed: {allowed_passed}")
                  else:
                      print(f"✅ All {len(failed_tasks)} task(s) failed (as expected)")
          EOF