-
Notifications
You must be signed in to change notification settings - Fork 0
76 lines (64 loc) · 2.29 KB
/
eval.yml
File metadata and controls
76 lines (64 loc) · 2.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Runs the eval matrix weekly (or on demand), regenerates the scorecard,
# gates on regressions, and commits the results back to the repository.
name: Eval Matrix

on:
  schedule:
    - cron: "0 6 * * 1" # Weekly Monday 6am UTC
  workflow_dispatch:
    inputs:
      models:
        description: "Comma-separated model list (e.g. claude-sonnet-4-6,gpt-4o-mini)"
        required: false
        default: "claude-sonnet-4-6"
      providers:
        description: "Comma-separated provider per model (same order)"
        required: false
        default: "anthropic"

# Write access is required because the last step pushes result files.
permissions:
  contents: write

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 22
          cache: pnpm
      - run: pnpm install --frozen-lockfile
      - name: Build
        run: pnpm build

      # Clone spec repo for eval corpus
      - name: Clone spec repo
        run: |
          git clone --depth 1 \
            "https://github.com/${{ github.repository_owner }}/graphrefly.git" \
            "$HOME/src/graphrefly"

      - name: Run eval matrix
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          # Fall back to the defaults when no dispatch inputs are provided.
          EVAL_MODELS: ${{ github.event.inputs.models || 'claude-sonnet-4-6' }}
          EVAL_PROVIDERS: ${{ github.event.inputs.providers || 'anthropic' }}
        run: |
          # Values under `env:` are passed verbatim, so a leading `~` there is
          # never expanded. Build the path from $HOME in the shell so it points
          # at the directory the clone step populated.
          export SPEC_EVALS_PATH="$HOME/src/graphrefly/evals"
          pnpm eval:matrix

      - name: Generate scorecard
        run: pnpm eval:scorecard

      # Skipped entirely when no result files exist.
      - name: Regression gate
        if: hashFiles('evals/results/*.json') != ''
        run: |
          # Compare the two most recently modified result files.
          # NOTE(review): ordering is by mtime only; after a fresh checkout the
          # mtimes of previously committed files may not reflect the order in
          # which results were produced, and no per-layer filtering happens
          # here - confirm PREV is the intended baseline.
          LATEST=$(ls -t evals/results/*.json | head -1)
          PREV=$(ls -t evals/results/*.json | head -2 | tail -1)
          # With a single result file both names are equal; nothing to compare.
          if [ "$LATEST" != "$PREV" ]; then
            pnpm eval:compare "$PREV" "$LATEST"
          fi

      - name: Commit results
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add evals/results/ evals/scorecard/
          # Only commit and push when staging actually changed something.
          if git diff --cached --quiet; then
            echo "No new results to commit"
          else
            git commit -m "chore(evals): automated eval run $(date -u +%Y-%m-%d)"
            git push
          fi