E2E Smoke Test #350
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: display name plus the two triggers (a cron schedule and a
# manual dispatch with one optional boolean input).
name: E2E Smoke Test

on:
  # Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
  schedule:
    - cron: '0 */6 * * *'
  # Allow manual triggering
  workflow_dispatch:
    inputs:
      # NOTE(review): no step in this workflow's job reads inputs.debug_mode;
      # confirm whether it should be wired into the test command or removed.
      debug_mode:
        description: 'Enable debug output'
        required: false
        default: false
        type: boolean
# Single job: install the project, run the tau-bench airline smoke test,
# derive a success rate from the test's terminal output, fail when that rate
# is outside the 36%-60% band, and notify Slack on failure.
jobs:
  e2e-smoke-test:
    name: E2E Smoke Test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # 0 = fetch full history rather than a shallow clone
          fetch-depth: 0

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is a string, not a float
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # Installed from the moving `main` branch, i.e. not pinned to a commit.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      # Runs pytest, records its exit code, and parses the "EP Summary" section
      # of the captured output. Step outputs written to $GITHUB_OUTPUT:
      #   test_exit_code  - pytest's own exit status
      #   success_rate    - agg_score parsed from the summary (0 if not found)
      #   lower_bound_met - 1 if success_rate >= 0.36, else 0
      #   upper_bound_met - 1 if success_rate <= 0.6, else 0
      #   threshold_met   - 1 if both bounds are met, else 0
      - name: Run E2E Smoke Test
        id: run_test
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
        run: |
          echo "Running e2e smoke test..."

          # Run the test and capture both stdout and exit code
          set +e  # Don't exit on failure
          uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
            -v --tb=short --durations=10 \
            --ep-print-summary \
            --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log
          # `$?` after a pipeline is the status of the last command (tee), so
          # read pytest's status from PIPESTATUS instead.
          TEST_EXIT_CODE=${PIPESTATUS[0]}
          echo "test_exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT"

          # List generated files for debugging
          echo "📁 Generated files:"
          ls -la *.json 2>/dev/null || echo "No JSON files found"
          ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"

          # Parse EP summary from terminal output (more reliable than JSON files)
          if [ -f test_output.log ]; then
            echo "🔍 Parsing EP summary from terminal output..."

            # Show the terminal output for debugging
            echo "Terminal output:"
            cat test_output.log
            echo ""

            # Extract the EP Summary section from the terminal output (now multi-line)
            # First get the header line, then get the following lines with agg_score
            EP_SUMMARY_HEADER=$(grep "EP Summary |" test_output.log 2>/dev/null || echo "")
            EP_SUMMARY_SECTION=$(grep -A 5 "EP Summary |" test_output.log 2>/dev/null || echo "")

            if [ -n "$EP_SUMMARY_SECTION" ]; then
              echo "Found EP Summary section:"
              echo "$EP_SUMMARY_SECTION"

              # Parse the agg_score from the multi-line format: " agg_score=0.420 (valid scores only)"
              # `head -n 1` keeps each value single-line so it cannot corrupt
              # $GITHUB_OUTPUT; the `:-0` defaults cover "no match", which a
              # trailing `|| echo 0` cannot (the pipeline's status is cut's).
              SUCCESS_RATE=$(echo "$EP_SUMMARY_SECTION" | grep -o "agg_score=[0-9.]*" | head -n 1 | cut -d= -f2)
              SUCCESS_RATE=${SUCCESS_RATE:-0}

              # Extract other info from the header line
              NUM_RUNS=$(echo "$EP_SUMMARY_HEADER" | grep -o "runs=[0-9]*" | head -n 1 | cut -d= -f2)
              NUM_RUNS=${NUM_RUNS:-0}
              NUM_ROWS=$(echo "$EP_SUMMARY_HEADER" | grep -o "rows=[0-9]*" | head -n 1 | cut -d= -f2)
              NUM_ROWS=${NUM_ROWS:-0}

              echo "success_rate=$SUCCESS_RATE" >> "$GITHUB_OUTPUT"

              # Check if success rate meets thresholds (36% - 60% acceptable range)
              LOWER_BOUND=0.36  # 36%
              UPPER_BOUND=0.6   # 60%

              # bc prints 1 for a true comparison and 0 for a false one; an
              # unparsable rate yields empty output, which is treated as 0.
              LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
              LOWER_BOUND_MET=${LOWER_BOUND_MET:-0}
              UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")
              UPPER_BOUND_MET=${UPPER_BOUND_MET:-0}

              # Combine the two flags in the shell rather than via bc's `&&`.
              if [ "$LOWER_BOUND_MET" = "1" ] && [ "$UPPER_BOUND_MET" = "1" ]; then
                THRESHOLD_MET=1
              else
                THRESHOLD_MET=0
              fi

              echo "lower_bound_met=$LOWER_BOUND_MET" >> "$GITHUB_OUTPUT"
              echo "upper_bound_met=$UPPER_BOUND_MET" >> "$GITHUB_OUTPUT"
              echo "threshold_met=$THRESHOLD_MET" >> "$GITHUB_OUTPUT"

              echo "📊 Evaluation Summary (from terminal output):"
              echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
              echo " - Dataset rows evaluated: $NUM_ROWS"
              echo " - Number of runs: $NUM_RUNS"
              echo " - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
              echo " - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
              echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
            else
              echo "❌ No EP Summary section found in terminal output"
              echo "threshold_met=0" >> "$GITHUB_OUTPUT"
              echo "success_rate=0" >> "$GITHUB_OUTPUT"
            fi
          else
            echo "❌ No terminal output file found"
            echo "threshold_met=0" >> "$GITHUB_OUTPUT"
            echo "success_rate=0" >> "$GITHUB_OUTPUT"
          fi

      # always(): upload logs even when an earlier step failed.
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-smoke-test-results-${{ github.run_number }}
          path: |
            test_output.log
            ep_summary*.json
            *.log
          retention-days: 7

      # Turns the outputs of the run_test step into the job's pass/fail result.
      # The job fails whenever threshold_met is not 1; a non-zero pytest exit
      # code alone only produces a warning.
      - name: Validate test results
        if: always()
        env:
          # Step outputs are passed through the environment rather than being
          # expanded inline into the script text.
          TEST_EXIT_CODE: ${{ steps.run_test.outputs.test_exit_code }}
          THRESHOLD_MET: ${{ steps.run_test.outputs.threshold_met }}
          LOWER_BOUND_MET: ${{ steps.run_test.outputs.lower_bound_met }}
          UPPER_BOUND_MET: ${{ steps.run_test.outputs.upper_bound_met }}
          SUCCESS_RATE: ${{ steps.run_test.outputs.success_rate }}
        run: |
          echo "Validating test results against thresholds..."

          echo "Test exit code: $TEST_EXIT_CODE"
          echo "Threshold met (36%-60%): $THRESHOLD_MET"
          echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
          echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
          echo "Success rate: $SUCCESS_RATE"

          # Fail when the thresholds were not met; when they were met, a
          # non-zero test exit code is reported but does not fail the job.
          if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
            echo "❌ E2E smoke test FAILED"
            echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
            echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
            exit 1
          elif [ "$TEST_EXIT_CODE" != "0" ]; then
            # Reaching this branch implies THRESHOLD_MET is 1 (the branch
            # above handled every other non-zero-exit case).
            echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
            echo " - Test exit code: $TEST_EXIT_CODE"
            echo " - Thresholds met: $THRESHOLD_MET"
            echo "✅ Thresholds met despite execution issues - considering this a pass"
          elif [ "$THRESHOLD_MET" != "1" ]; then
            # Determine which bound was violated
            if [ "$LOWER_BOUND_MET" != "1" ]; then
              echo "❌ E2E smoke test FAILED - success rate too low"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Required: ≥36%"
            elif [ "$UPPER_BOUND_MET" != "1" ]; then
              echo "❌ E2E smoke test FAILED - success rate suspiciously high"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Maximum expected: ≤60%"
              echo " - This may indicate test issues or unrealistic performance"
            else
              echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Required range: 36%-60%"
            fi
            exit 1
          else
            echo "✅ E2E smoke test PASSED"
            echo " - Success rate: ${SUCCESS_RATE:-unknown}"
            echo " - Within acceptable range: 36%-60%"
          fi

      # failure(): runs only when a previous step in this job has failed.
      - name: Send failure notification to Slack
        if: failure()
        uses: act10ns/slack@v1
        with:
          status: failure
          message: |
            E2E Smoke Test failed
            Success Rate: ${{ steps.run_test.outputs.success_rate || 'Unknown' }}
            Expected: 36%-60% to pass
            Test Exit Code: ${{ steps.run_test.outputs.test_exit_code || 'Unknown' }}
            Job: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}