added model quality gha #1329
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python CI: lint, type check, and test the project on pushes to main, on pull requests,
# and on manual dispatch.
name: Python CI

# Only one run per workflow + ref is kept alive; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# The jobs in this workflow only need to read the repository contents, so restrict the
# default GITHUB_TOKEN scopes instead of inheriting the repository-wide defaults.
permissions:
  contents: read

on:
  push:
    branches: [main]
    # Documentation-only changes do not trigger CI.
    paths-ignore:
      - "docs/**"
      - "*.md"
  pull_request:
    paths-ignore:
      - "docs/**"
      - "*.md"
  # Allows the workflow to be started manually.
  workflow_dispatch:
jobs:
  # Gatekeeper job: every test job below declares `needs: lint-and-type-check`.
  lint-and-type-check:
    name: Lint & Type Check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all tags and branches

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # NOTE(review): installs from the moving `main` branch, so CI results can change
      # without any change in this repository; consider pinning to a tag or commit SHA.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      - name: Ruff lint
        run: uv run ruff check .

      # Type-check failures are currently tolerated so CI can pass while type issues are
      # fixed iteratively. `continue-on-error` keeps the job green but still surfaces the
      # failed step in the UI (unlike `|| true`, which hides it). Once all type errors are
      # resolved, remove `continue-on-error` to enforce strict type checking.
      - name: Type check with basedpyright
        continue-on-error: true
        run: uv run basedpyright

  # Main test suite, run in parallel with pytest-xdist across the supported Python versions.
  test-core:
    name: Core Tests (Python ${{ matrix.python-version }})
    runs-on: ubuntu-latest
    needs: lint-and-type-check
    strategy:
      # Let every Python version finish even if one of them fails.
      fail-fast: false
      matrix:
        python-version: ["3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all tags and branches

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # NOTE(review): installs from the moving `main` branch; consider pinning.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      - name: Run Core Tests with pytest-xdist
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          SUPABASE_PASSWORD: ${{ secrets.SUPABASE_PASSWORD }}
          SUPABASE_HOST: ${{ secrets.SUPABASE_HOST }}
          SUPABASE_PORT: ${{ secrets.SUPABASE_PORT }}
          SUPABASE_DATABASE: ${{ secrets.SUPABASE_DATABASE }}
          SUPABASE_USER: ${{ secrets.SUPABASE_USER }}
          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
        run: |
          # Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow
          uv run pytest \
            -n auto \
            --ignore=tests/test_batch_evaluation.py \
            --ignore=tests/pytest/test_frozen_lake.py \
            --ignore=tests/pytest/test_lunar_lander.py \
            --ignore=tests/pytest/test_tau_bench_airline.py \
            --ignore=tests/pytest/test_apps_coding.py \
            --ignore=tests/test_tau_bench_airline_smoke.py \
            --ignore=tests/pytest/test_svgbench.py \
            --ignore=tests/pytest/test_livesvgbench.py \
            --ignore=tests/remote_server/test_remote_fireworks.py \
            --ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \
            --ignore=tests/logging/test_elasticsearch_direct_http_handler.py \
            --ignore=eval_protocol/benchmarks/ \
            --ignore=eval_protocol/quickstart/ \
            --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10

      # One artifact per matrix entry; collected later by the upload-coverage job.
      - name: Store coverage file
        uses: actions/upload-artifact@v4
        with:
          name: coverage-core-${{ matrix.python-version }}
          path: coverage.xml
          retention-days: 1

  # Runs tests/test_batch_evaluation.py on its own, serially (it is ignored by test-core).
  test-batch-evaluation:
    name: Batch Evaluation Tests
    runs-on: ubuntu-latest
    needs: lint-and-type-check
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all tags and branches

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # NOTE(review): installs from the moving `main` branch; consider pinning.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      - name: Run Batch Evaluation Tests
        env:
          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
        run: |
          # Run only this specific test file, WITHOUT xdist
          uv run pytest tests/test_batch_evaluation.py --cov=eval_protocol --cov-append --cov-report=xml -v --durations=10

      - name: Store coverage file
        uses: actions/upload-artifact@v4
        with:
          name: coverage-batch-eval
          path: coverage.xml
          retention-days: 1

  test-mcp-e2e:
    name: MCP End-to-End Tests
    runs-on: ubuntu-latest
    needs: lint-and-type-check
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all tags and branches

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # NOTE(review): installs from the moving `main` branch; consider pinning.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      # NOTE(review): no step in this job runs tests or writes coverage.xml, so this upload
      # has nothing to collect. `if-no-files-found: warn` makes the existing behavior
      # explicit (warn and continue). Restore the MCP test step or drop this upload - confirm intent.
      - name: Store coverage file
        uses: actions/upload-artifact@v4
        with:
          name: coverage-mcp-e2e
          path: coverage.xml
          if-no-files-found: warn
          retention-days: 1

  # Collects the coverage artifacts from all test jobs and sends them to Codecov.
  upload-coverage:
    name: Upload Coverage
    runs-on: ubuntu-latest
    needs: [test-core, test-batch-evaluation, test-mcp-e2e]
    steps:
      # Without a `name`, every artifact of the run is downloaded into its own
      # subdirectory under `path`.
      - name: Download all coverage artifacts
        uses: actions/download-artifact@v4
        with:
          path: coverage-artifacts

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          directory: ./coverage-artifacts/
          # A Codecov outage or upload error must not fail CI.
          fail_ci_if_error: false
          verbose: true