# evalview.yml — EvalView agent-test workflow for GitHub Actions
# (original file: 95 lines / 83 loc, 2.65 KB)
---
# CI workflow: install EvalView with uv, run the agent test suite,
# publish an HTML report + JSON results as artifacts, optionally comment
# on the PR, and fail the job if any test case failed.
name: EvalView Agent Tests

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  # Allow manual triggering
  workflow_dispatch:
    inputs:
      test_filter:
        description: 'Filter tests by name pattern'
        required: false
        default: ''

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install uv
        run: pip install uv

      - name: Install EvalView
        run: uv sync --all-extras

      - name: Verify installation
        run: uv run evalview --help

      - name: Run agent tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Configure your agent endpoint
          # AGENT_ENDPOINT: ${{ vars.AGENT_ENDPOINT }}
        run: |
          # Run tests with parallel execution (4 workers for CI).
          # --filter is only appended when a manual-dispatch pattern was given.
          uv run evalview run \
            --max-workers 4 \
            --max-retries 2 \
            ${{ github.event.inputs.test_filter && format('--filter "{0}"', github.event.inputs.test_filter) || '' }}

      - name: Generate HTML report
        if: always()
        run: |
          # Find the latest results file and generate HTML report
          RESULTS_FILE=$(ls -t .evalview/results/*.json 2>/dev/null | head -1)
          if [ -n "$RESULTS_FILE" ]; then
            uv run evalview report "$RESULTS_FILE" --html report.html
          fi

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: evalview-results
          path: |
            .evalview/results/*.json
            report.html
          retention-days: 30

      # Optional: Post results as PR comment
      - name: Post PR comment
        if: github.event_name == 'pull_request'
        continue-on-error: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: uv run evalview ci comment

      - name: Check for failures
        if: always()
        run: |
          # Parse the latest results and fail the job if any tests failed.
          RESULTS_FILE=$(ls -t .evalview/results/*.json 2>/dev/null | head -1)
          if [ -n "$RESULTS_FILE" ]; then
            # Count entries whose 'passed' flag is not true. Kept on one line:
            # a multi-line python3 -c string inside this block scalar would
            # carry the YAML indentation into the snippet and raise
            # IndentationError.
            FAILED=$(python3 -c "import json; results = json.load(open('$RESULTS_FILE')); print(sum(1 for r in results if not r.get('passed', False)))")
            if [ "$FAILED" -gt 0 ]; then
              echo "::error::$FAILED test(s) failed"
              exit 1
            fi
          fi