Skip to content

Commit 4f0cc50

Browse files
feat: add eval job that runs on CI (#1167)
1 parent c196a24 commit 4f0cc50

File tree

3 files changed

+297
-0
lines changed

3 files changed

+297
-0
lines changed
.github/run-eval/allowed-model-stubs.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[
2+
"claude-sonnet-4-5-20250929",
3+
"claude-haiku-4-5-20251001",
4+
"gpt-5-mini-2025-08-07",
5+
"deepseek-chat",
6+
"kimi-k2-thinking"
7+
]
.github/run-eval/authorized-labelers.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
mamoodi
2+
neubig
3+
rbren
4+
xingyaoww
5+
amanape
6+
enyst
7+
tofarr
8+
frankxu2004
9+
huybery
10+
li-boxuan
11+
malhotra5
12+
ryanhoangt
13+
csmith49
14+
simonrosenberg

.github/workflows/run-eval.yml

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
---
# Builds SWE-bench agent images for a chosen SDK ref and then dispatches the
# evaluation workflow that lives in a separate repository.
name: Run Eval

on:
  # A label was added to a PR; the job-level `if` decides which labels
  # actually start a run.
  pull_request_target:
    types:
      - labeled
  # A release was published.
  release:
    types:
      - published
  # Manual run.
  workflow_dispatch:
    inputs:
      branch:
        description: Branch or tag to evaluate
        required: true
        default: main
      eval_limit:
        description: Number of SWE-bench instances to run
        required: true
        default: '1'
        type: choice
        options:
          - '1'
          - '2'
          - '10'
          - '50'
          - '100'
      model_stubs:
        description: Comma-separated model stubs to evaluate (must be allowlisted)
        required: false
        default: ''
        type: string
      reason:
        description: Reason for manual trigger
        required: false
        default: ''
35+
36+
# Workflow-level environment, available to every step below.
env:
  # Repository (and ref) checked out into ./benchmarks.
  BENCHMARKS_REPO: OpenHands/benchmarks
  BENCHMARKS_REF: main

  # Repository and workflow file targeted by the dispatch step.
  EVAL_REPO: OpenHands/evaluation
  EVAL_WORKFLOW: eval-job.yml

  # Forwarded to build_images.py as --image / --target.
  EVAL_AGENT_IMAGE: ghcr.io/openhands/eval-agent-server
  EVAL_AGENT_TARGET: source-minimal

  # Forwarded to build_images.py as --dataset / --split / --max-workers.
  DATASET: princeton-nlp/SWE-bench_Verified
  SPLIT: test
  MAX_BUILD_WORKERS: '32'
46+
47+
jobs:
  # Single job: validates the trigger, builds and pushes the images, dispatches
  # the evaluation workflow and comments on the originating PR.
  build-and-evaluate:
    # Releases and manual dispatches always run. A labeled PR runs only when
    # the label that was just applied is one of the supported run-eval-N labels.
    if: >-
      github.event_name == 'release' ||
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'pull_request_target' &&
      contains(fromJSON('["run-eval-1", "run-eval-2", "run-eval-50", "run-eval-100"]'),
      github.event.label.name))
    runs-on: blacksmith-32vcpu-ubuntu-2204
    permissions:
      contents: read
      # The image build pushes to ghcr.io using the workflow token.
      packages: write
      actions: write
      # The final step posts a comment through the issues API.
      issues: write
      pull-requests: write

    steps:
66+
- name: Checkout sdk code (base for validation)
  uses: actions/checkout@v4
  with:
    # Full history instead of the default shallow clone.
    fetch-depth: 0
    # Ref selection by trigger:
    #   pull_request_target -> the PR's base commit, so the allowlist files
    #                          read by the next steps come from the base branch
    #   workflow_dispatch   -> the `branch` input
    #   anything else       -> github.ref
    # All lines share one indent so the folded scalar collapses to one line.
    ref: >-
      ${{ github.event_name == 'pull_request_target'
      && github.event.pull_request.base.sha
      || (github.event_name == 'workflow_dispatch' && github.event.inputs.branch)
      || github.ref }}
72+
73+
- name: Load allowlists
  id: allowlists
  run: |
    set -euo pipefail

    # Compact the allowlist onto one line so it fits a single key=value output.
    ALLOWED_MODELS_JSON=$(jq -c '.' .github/run-eval/allowed-model-stubs.json)

    # The first entry is the default model. `// empty` turns a missing or null
    # first element into an empty string; a plain `jq -r '.[0]'` would print
    # the literal text "null" and slip past the emptiness check below.
    DEFAULT_MODEL=$(jq -r '.[0] // empty' <<<"$ALLOWED_MODELS_JSON")
    if [ -z "$DEFAULT_MODEL" ]; then
      echo "No default model stub configured" >&2
      exit 1
    fi

    # Outputs consumed by the "Resolve parameters" step.
    {
      echo "allowed_models=$ALLOWED_MODELS_JSON"
      echo "default_model=$DEFAULT_MODEL"
    } >> "$GITHUB_OUTPUT"
84+
85+
- name: Validate labeler
  if: github.event_name == 'pull_request_target'
  env:
    # Handed over through the environment instead of being templated into the
    # script text, so the shell always treats the value as data.
    LABELER: ${{ github.actor }}
  run: |
    set -euo pipefail
    # Fixed-string, whole-line match against the authorized-labelers list.
    # An empty name is rejected up front: `grep -Fx ""` would otherwise match
    # any blank line in the file.
    if [ -z "$LABELER" ] ||
      ! grep -Fxq -- "$LABELER" .github/run-eval/authorized-labelers.txt; then
      echo "User $LABELER is not authorized to trigger eval." >&2
      exit 1
    fi
93+
94+
- name: Resolve parameters
  id: params
  env:
    DEFAULT_MODEL: ${{ steps.allowlists.outputs.default_model }}
    ALLOWED_MODELS_JSON: ${{ steps.allowlists.outputs.allowed_models }}
    # Every event-derived value is passed through the environment rather than
    # templated into the script. A PR branch name, label, tag or free-text
    # input containing quotes or $(...) would otherwise be executed as shell
    # code, and this workflow runs under pull_request_target.
    EVENT_NAME: ${{ github.event_name }}
    LABEL_NAME: ${{ github.event.label.name }}
    PR_HEAD_REF: ${{ github.event.pull_request.head.ref }}
    PR_EVENT_NUMBER: ${{ github.event.pull_request.number }}
    RELEASE_TAG: ${{ github.event.release.tag_name }}
    DISPATCH_EVAL_LIMIT: ${{ github.event.inputs.eval_limit }}
    DISPATCH_BRANCH: ${{ github.event.inputs.branch }}
    DISPATCH_REASON: ${{ github.event.inputs.reason }}
    DISPATCH_MODEL_STUBS: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.model_stubs || '' }}
  run: |
    set -euo pipefail

    # Determine eval limit, SDK ref and trigger description based on trigger
    if [ "$EVENT_NAME" = "pull_request_target" ]; then
      case "$LABEL_NAME" in
        run-eval-1) EVAL_LIMIT=1 ;;
        run-eval-2) EVAL_LIMIT=2 ;;
        run-eval-50) EVAL_LIMIT=50 ;;
        run-eval-100) EVAL_LIMIT=100 ;;
        *) echo "Unsupported label $LABEL_NAME" >&2; exit 1 ;;
      esac
      # NOTE(review): head.ref is only a branch name. For a PR opened from a
      # fork that branch may not exist on origin, where later steps fetch it;
      # consider head.sha instead. TODO confirm.
      SDK_REF="$PR_HEAD_REF"
      PR_NUMBER="$PR_EVENT_NUMBER"
      TRIGGER_DESCRIPTION="Label '${LABEL_NAME}' on PR #${PR_NUMBER}"
    elif [ "$EVENT_NAME" = "release" ]; then
      EVAL_LIMIT=50
      SDK_REF="$RELEASE_TAG"
      PR_NUMBER=""
      TRIGGER_DESCRIPTION="Release ${RELEASE_TAG}"
    else
      EVAL_LIMIT="$DISPATCH_EVAL_LIMIT"
      SDK_REF="$DISPATCH_BRANCH"
      PR_NUMBER=""
      # An empty reason falls back to "manual".
      REASON="${DISPATCH_REASON:-manual}"
      TRIGGER_DESCRIPTION="Manual trigger: ${REASON}"
    fi

    # Normalize and validate models: commas and spaces both separate entries,
    # empty entries are dropped, and the result is re-joined with commas.
    MODELS_INPUT="${DISPATCH_MODEL_STUBS:-$DEFAULT_MODEL}"
    MODELS=$(printf '%s' "$MODELS_INPUT" | tr ', ' '\n\n' | sed '/^$/d' | paste -sd, -)
    if [ -z "$MODELS" ]; then
      # e.g. an input of just "," would otherwise dispatch with no models.
      echo "No model stubs left after normalizing '$MODELS_INPUT'" >&2
      exit 1
    fi

    ALLOWED_LIST=$(jq -r '.[]' <<<"$ALLOWED_MODELS_JSON")
    # Split on commas into an array so entries are never glob-expanded.
    IFS=, read -ra MODEL_LIST <<<"$MODELS"
    for MODEL in "${MODEL_LIST[@]}"; do
      if ! grep -Fxq -- "$MODEL" <<<"$ALLOWED_LIST"; then
        echo "Model stub '$MODEL' is not allowlisted" >&2
        exit 1
      fi
    done

    # GITHUB_OUTPUT is line-oriented key=value. A value with an embedded
    # newline could smuggle in extra outputs, so refuse to write one.
    for VALUE in "$EVAL_LIMIT" "$SDK_REF" "$MODELS" "$PR_NUMBER" "$TRIGGER_DESCRIPTION"; do
      case "$VALUE" in
        *$'\n'*)
          echo "Refusing to write a multi-line value to GITHUB_OUTPUT" >&2
          exit 1
          ;;
      esac
    done

    {
      echo "eval_limit=$EVAL_LIMIT"
      echo "sdk_ref=$SDK_REF"
      echo "models=$MODELS"
      echo "pr_number=$PR_NUMBER"
      echo "trigger_desc=$TRIGGER_DESCRIPTION"
    } >> "$GITHUB_OUTPUT"
150+
151+
- name: Checkout evaluated ref for PRs
  if: github.event_name == 'pull_request_target'
  env:
    # sdk_ref comes from the PR's branch name, which the PR author controls.
    # It is passed through the environment so it is never spliced into the
    # script text.
    REF: ${{ steps.params.outputs.sdk_ref }}
  run: |
    set -euo pipefail
    # Switch to the PR head for image build and SDK pinning.
    git fetch --force origin "$REF"
    git checkout FETCH_HEAD
159+
160+
- name: Checkout benchmarks repo
  uses: actions/checkout@v4
  with:
    # Repository and ref both come from the workflow-level env block.
    repository: ${{ env.BENCHMARKS_REPO }}
    ref: ${{ env.BENCHMARKS_REF }}
    # Placed in ./benchmarks inside the workspace.
    path: benchmarks
    # Presumably this is what provides vendor/software-agent-sdk, which the
    # SDK pin step operates on - confirm against the benchmarks repo.
    submodules: recursive
167+
168+
- name: Pin benchmarks SDK to evaluated ref
  id: sdk-pin
  env:
    # May originate from a PR branch name. Passed through the environment
    # instead of being templated into the script.
    SDK_REF: ${{ steps.params.outputs.sdk_ref }}
  run: |
    set -euo pipefail
    SDK_DIR=benchmarks/vendor/software-agent-sdk

    git -C "$SDK_DIR" fetch origin "$SDK_REF"
    # Check out exactly what was just fetched. Resolving "$SDK_REF" by name
    # again depends on a local branch or tag of that name existing and being
    # up to date, which a plain `fetch origin <ref>` does not guarantee.
    git -C "$SDK_DIR" checkout FETCH_HEAD

    SDK_SHA=$(git -C "$SDK_DIR" rev-parse HEAD)
    echo "Using SDK ref $SDK_REF ($SDK_SHA) for image build"
    echo "sdk_sha=$SDK_SHA" >> "$GITHUB_OUTPUT"
178+
179+
# Installs uv; the image build step below is launched with `uv run`.
- name: Set up uv
  uses: astral-sh/setup-uv@v7
  with:
    enable-cache: true
183+
184+
# Registry login ahead of the image build, which runs with --push.
- name: Log in to GitHub Container Registry
  uses: docker/login-action@v3
  with:
    registry: ghcr.io
    # Credentials: the triggering actor plus the workflow's own token.
    username: ${{ github.actor }}
    password: ${{ secrets.GITHUB_TOKEN }}
190+
191+
- name: Build and push SWE-Bench images
  working-directory: benchmarks
  env:
    # Resolved by earlier steps.
    SDK_SHA: ${{ steps.sdk-pin.outputs.sdk_sha }}
    SDK_REF: ${{ steps.params.outputs.sdk_ref }}
    EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }}
    # Re-exported from the workflow-level env block.
    DATASET: ${{ env.DATASET }}
    SPLIT: ${{ env.SPLIT }}
    MAX_BUILD_WORKERS: ${{ env.MAX_BUILD_WORKERS }}
    EVAL_AGENT_IMAGE: ${{ env.EVAL_AGENT_IMAGE }}
    EVAL_AGENT_TARGET: ${{ env.EVAL_AGENT_TARGET }}
  run: |
    set -euo pipefail
    echo "Building images for SDK $SDK_SHA (ref: $SDK_REF)"

    # Arguments are collected in an array so each one stays a separate,
    # correctly quoted word without backslash continuations.
    build_args=(
      --dataset "${DATASET}"
      --split "${SPLIT}"
      --image "${EVAL_AGENT_IMAGE}"
      --target "${EVAL_AGENT_TARGET}"
      --push
      --max-workers "${MAX_BUILD_WORKERS}"
      --n-limit "${EVAL_LIMIT}"
    )
    uv run benchmarks/swe_bench/build_images.py "${build_args[@]}"
213+
214+
- name: Dispatch evaluation workflow
  env:
    # First configured token wins; the workflow token is the last resort.
    PAT_TOKEN: ${{ secrets.ALLHANDS_BOT_GITHUB_PAT || secrets.PAT_TOKEN || secrets.GITHUB_TOKEN }}
    SDK_SHA: ${{ steps.sdk-pin.outputs.sdk_sha }}
    EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }}
    MODELS: ${{ steps.params.outputs.models }}
    EVAL_REPO: ${{ env.EVAL_REPO }}
    EVAL_WORKFLOW: ${{ env.EVAL_WORKFLOW }}
  run: |
    # Same strict mode as the other scripted steps in this job.
    set -euo pipefail

    if [ -z "$PAT_TOKEN" ]; then
      echo "Missing PAT_TOKEN for dispatching evaluation workflow" >&2
      exit 1
    fi

    # Build the request body with jq so every value is JSON-escaped.
    PAYLOAD=$(jq -n \
      --arg sdk "$SDK_SHA" \
      --arg eval_limit "$EVAL_LIMIT" \
      --arg models "$MODELS" \
      '{ref: "main", inputs: {sdk_commit: $sdk, eval_limit: $eval_limit, models: $models}}')

    # Private temp file rather than a fixed /tmp path; removed on exit.
    RESPONSE_BODY=$(mktemp)
    trap 'rm -f "$RESPONSE_BODY"' EXIT

    # curl prints only the HTTP status; the response body goes to the temp
    # file so it can be shown when the request is rejected.
    STATUS=$(curl -sS -o "$RESPONSE_BODY" -w "%{http_code}" -X POST \
      -H "Authorization: Bearer $PAT_TOKEN" \
      -H "Accept: application/vnd.github+json" \
      -d "$PAYLOAD" \
      "https://api.github.com/repos/${EVAL_REPO}/actions/workflows/${EVAL_WORKFLOW}/dispatches")

    # Anything other than 204 is treated as a failed dispatch.
    if [ "$STATUS" != "204" ]; then
      echo "Dispatch failed (status $STATUS):" >&2
      cat "$RESPONSE_BODY" >&2
      exit 1
    fi
242+
243+
- name: Comment on PR
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Passed through the environment rather than templated into the script.
    REPOSITORY: ${{ github.repository }}
    SDK_SHA: ${{ steps.sdk-pin.outputs.sdk_sha }}
    EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }}
    MODELS: ${{ steps.params.outputs.models }}
    TRIGGER_DESC: ${{ steps.params.outputs.trigger_desc }}
    EVENT_NAME: ${{ github.event_name }}
    PR_NUMBER_INPUT: ${{ steps.params.outputs.pr_number }}
  run: |
    set -euo pipefail
    API_ROOT="https://api.github.com/repos/${REPOSITORY}"

    PR_NUMBER="$PR_NUMBER_INPUT"
    if [ "$EVENT_NAME" = "release" ] && [ -z "$PR_NUMBER" ]; then
      # Attempt to find the merged PR for this commit. This is best effort:
      # an API error returns a JSON object instead of an array, which would
      # make `.[0]` fail and, under pipefail, fail the job after the
      # evaluation was already dispatched. Any failure here therefore yields
      # an empty PR number and the comment is skipped.
      PR_NUMBER=$(curl -sS \
        -H "Authorization: Bearer $GITHUB_TOKEN" \
        -H "Accept: application/vnd.github+json" \
        "${API_ROOT}/commits/${SDK_SHA}/pulls" \
        | jq -r 'if type == "array" then (.[0].number // "") else "" end' \
        || true)
    fi

    if [ -z "$PR_NUMBER" ]; then
      echo "No PR found to comment on; skipping comment"
      exit 0
    fi

    COMMENT_BODY=$(printf '**Evaluation Triggered**\n\n- Trigger: %s\n- SDK: %s\n- Eval limit: %s\n- Models: %s\n' \
      "$TRIGGER_DESC" "$SDK_SHA" "$EVAL_LIMIT" "$MODELS")

    # Capture the HTTP status so a rejected comment shows up in the run.
    # It stays a warning: the evaluation itself has already been dispatched.
    STATUS=$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
      -H "Accept: application/vnd.github+json" \
      -H "Authorization: Bearer $GITHUB_TOKEN" \
      "${API_ROOT}/issues/${PR_NUMBER}/comments" \
      -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')")
    if [ "$STATUS" != "201" ]; then
      echo "::warning::Failed to post evaluation comment on PR #${PR_NUMBER} (status $STATUS)"
    fi

0 commit comments

Comments
 (0)