# Skip to content
#
# E2E Smoke Test
#
# E2E Smoke Test #350

name: E2E Smoke Test

# Scheduled end-to-end smoke test.
#
# Runs one pytest target, parses the "EP Summary" block that the test prints
# to the terminal, and passes only when the reported agg_score lies in the
# expected 0.36-0.60 window. The log and any ep_summary*.json files are
# uploaded as artifacts, and a Slack notification is sent when the job fails.

# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
on:
  schedule:
    - cron: '0 */6 * * *'
  workflow_dispatch:  # Allow manual triggering
    inputs:
      # NOTE(review): no step in this workflow reads `inputs.debug_mode`;
      # it is kept so existing manual dispatches that pass it keep working.
      debug_mode:
        description: 'Enable debug output'
        required: false
        default: false
        type: boolean

# Least privilege: the job only needs to read the repository contents.
permissions:
  contents: read

jobs:
  e2e-smoke-test:
    name: E2E Smoke Test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --locked --all-extras --dev

      # NOTE(review): this installs from the moving `main` branch, so two runs
      # of the same commit can test against different tau2 code. Consider
      # pinning to a tag or commit SHA.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

      # Runs the smoke test and publishes these step outputs:
      #   test_exit_code   - pytest's own exit status
      #   success_rate     - agg_score parsed from the EP Summary (0 if absent)
      #   lower_bound_met  - 1 when success_rate >= 0.36, else 0
      #   upper_bound_met  - 1 when success_rate <= 0.6, else 0
      #   threshold_met    - 1 when both bounds are met, else 0
      # The step itself never fails on a bad score; the "Validate test
      # results" step turns these outputs into the job result.
      - name: Run E2E Smoke Test
        id: run_test
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
        run: |
          echo "Running e2e smoke test..."
          # The default shell is `bash -e`; disable errexit so a failing test
          # (or a grep with no match) does not abort before results are
          # recorded and parsed.
          set +e
          uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
            -v --tb=short --durations=10 \
            --ep-print-summary \
            --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log
          # `$?` after a pipeline is the status of its last command (tee).
          # PIPESTATUS[0] is the status of pytest itself.
          TEST_EXIT_CODE=${PIPESTATUS[0]}
          echo "test_exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT"

          # List generated files for debugging
          echo "📁 Generated files:"
          ls -la *.json 2>/dev/null || echo "No JSON files found"
          ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"

          # Parse EP summary from terminal output (more reliable than JSON files)
          if [ -f test_output.log ]; then
            echo "📋 Parsing EP summary from terminal output..."
            # Show the terminal output for debugging
            echo "Terminal output:"
            cat test_output.log
            echo ""

            # Extract the EP Summary section from the terminal output (multi-line):
            # the header line carries runs=/rows=, the following lines carry agg_score.
            EP_SUMMARY_HEADER=$(grep "EP Summary |" test_output.log 2>/dev/null || true)
            EP_SUMMARY_SECTION=$(grep -A 5 "EP Summary |" test_output.log 2>/dev/null || true)

            if [ -n "$EP_SUMMARY_SECTION" ]; then
              echo "Found EP Summary section:"
              echo "$EP_SUMMARY_SECTION"

              # Parse the agg_score from the multi-line format:
              #   " agg_score=0.420 (valid scores only)"
              # `head -n 1` keeps each value single-line even if the summary
              # is printed more than once; multi-line values would corrupt
              # the key=value format of $GITHUB_OUTPUT.
              SUCCESS_RATE=$(echo "$EP_SUMMARY_SECTION" | grep -o "agg_score=[0-9.]*" | head -n 1 | cut -d= -f2)
              # Extract other info from the header line
              NUM_RUNS=$(echo "$EP_SUMMARY_HEADER" | grep -o "runs=[0-9]*" | head -n 1 | cut -d= -f2)
              NUM_ROWS=$(echo "$EP_SUMMARY_HEADER" | grep -o "rows=[0-9]*" | head -n 1 | cut -d= -f2)

              # A missing or malformed score counts as 0 so that it fails the
              # lower bound instead of feeding an empty operand to bc.
              if ! [[ "$SUCCESS_RATE" =~ ^([0-9]+\.?[0-9]*|\.[0-9]+)$ ]]; then
                SUCCESS_RATE=0
              fi
              NUM_RUNS="${NUM_RUNS:-0}"
              NUM_ROWS="${NUM_ROWS:-0}"
              echo "success_rate=$SUCCESS_RATE" >> "$GITHUB_OUTPUT"

              # Check if success rate meets thresholds (36% - 60% acceptable range)
              LOWER_BOUND=0.36  # 36%
              UPPER_BOUND=0.6   # 60%
              LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
              UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")
              LOWER_BOUND_MET="${LOWER_BOUND_MET:-0}"
              UPPER_BOUND_MET="${UPPER_BOUND_MET:-0}"
              if [ "$LOWER_BOUND_MET" = "1" ] && [ "$UPPER_BOUND_MET" = "1" ]; then
                THRESHOLD_MET=1
              else
                THRESHOLD_MET=0
              fi
              echo "lower_bound_met=$LOWER_BOUND_MET" >> "$GITHUB_OUTPUT"
              echo "upper_bound_met=$UPPER_BOUND_MET" >> "$GITHUB_OUTPUT"
              echo "threshold_met=$THRESHOLD_MET" >> "$GITHUB_OUTPUT"

              echo "📊 Evaluation Summary (from terminal output):"
              echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
              echo " - Dataset rows evaluated: $NUM_ROWS"
              echo " - Number of runs: $NUM_RUNS"
              echo " - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
              echo " - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
              echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
            else
              echo "❌ No EP Summary section found in terminal output"
              echo "threshold_met=0" >> "$GITHUB_OUTPUT"
              echo "success_rate=0" >> "$GITHUB_OUTPUT"
            fi
          else
            echo "❌ No terminal output file found"
            echo "threshold_met=0" >> "$GITHUB_OUTPUT"
            echo "success_rate=0" >> "$GITHUB_OUTPUT"
          fi

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-smoke-test-results-${{ github.run_number }}
          path: |
            test_output.log
            ep_summary*.json
            *.log
          retention-days: 7

      # Decides the job result from the outputs of the run_test step.
      # The outputs are passed through `env:` rather than interpolated into
      # the script text, so their contents are never parsed as shell code.
      - name: Validate test results
        if: always()
        env:
          TEST_EXIT_CODE: ${{ steps.run_test.outputs.test_exit_code }}
          THRESHOLD_MET: ${{ steps.run_test.outputs.threshold_met }}
          LOWER_BOUND_MET: ${{ steps.run_test.outputs.lower_bound_met }}
          UPPER_BOUND_MET: ${{ steps.run_test.outputs.upper_bound_met }}
          SUCCESS_RATE: ${{ steps.run_test.outputs.success_rate }}
        run: |
          echo "Validating test results against thresholds..."
          echo "Test exit code: $TEST_EXIT_CODE"
          echo "Threshold met (36%-60%): $THRESHOLD_MET"
          echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
          echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
          echo "Success rate: $SUCCESS_RATE"

          # The success-rate window is the pass/fail criterion. A non-zero
          # pytest exit code alone does not fail the job: if the thresholds
          # were met it is reported as a warning. Empty outputs (for example
          # when the test step never ran) count as "not met" and fail.
          if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
            echo "❌ E2E smoke test FAILED"
            echo " - Test execution failed (exit code: $TEST_EXIT_CODE)"
            echo " - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
            exit 1
          elif [ "$TEST_EXIT_CODE" != "0" ]; then
            # Reaching this branch implies THRESHOLD_MET is 1.
            echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
            echo " - Test exit code: $TEST_EXIT_CODE"
            echo " - Thresholds met: $THRESHOLD_MET"
            echo "✅ Thresholds met despite execution issues - considering this a pass"
          elif [ "$THRESHOLD_MET" != "1" ]; then
            # Determine which bound was violated
            if [ "$LOWER_BOUND_MET" != "1" ]; then
              echo "❌ E2E smoke test FAILED - success rate too low"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Required: ≥36%"
            elif [ "$UPPER_BOUND_MET" != "1" ]; then
              echo "❌ E2E smoke test FAILED - success rate suspiciously high"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Maximum expected: ≤60%"
              echo " - This may indicate test issues or unrealistic performance"
            else
              echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
              echo " - Success rate: ${SUCCESS_RATE:-unknown}"
              echo " - Required range: 36%-60%"
            fi
            exit 1
          else
            echo "✅ E2E smoke test PASSED"
            echo " - Success rate: ${SUCCESS_RATE:-unknown}"
            echo " - Within acceptable range: 36%-60%"
          fi

      - name: Send failure notification to Slack
        uses: act10ns/slack@v1
        if: failure()
        with:
          status: failure
          message: |
            E2E Smoke Test failed
            Success Rate: ${{ steps.run_test.outputs.success_rate || 'Unknown' }}
            Expected: 36%-60% to pass
            Test Exit Code: ${{ steps.run_test.outputs.test_exit_code || 'Unknown' }}
            Job: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}