diff --git a/.github/run-eval/allowed-model-stubs.json b/.github/run-eval/allowed-model-stubs.json
new file mode 100644
index 000000000..f5e5c2857
--- /dev/null
+++ b/.github/run-eval/allowed-model-stubs.json
@@ -0,0 +1,7 @@
+[
+  "claude-sonnet-4-5-20250929",
+  "claude-haiku-4-5-20251001",
+  "gpt-5-mini-2025-08-07",
+  "deepseek-chat",
+  "kimi-k2-thinking"
+]
diff --git a/.github/run-eval/authorized-labelers.txt b/.github/run-eval/authorized-labelers.txt
new file mode 100644
index 000000000..4a645b2e0
--- /dev/null
+++ b/.github/run-eval/authorized-labelers.txt
@@ -0,0 +1,14 @@
+mamoodi
+neubig
+rbren
+xingyaoww
+amanape
+enyst
+tofarr
+frankxu2004
+huybery
+li-boxuan
+malhotra5
+ryanhoangt
+csmith49
+simonrosenberg
diff --git a/.github/workflows/run-eval.yml b/.github/workflows/run-eval.yml
new file mode 100644
index 000000000..55d673561
--- /dev/null
+++ b/.github/workflows/run-eval.yml
@@ -0,0 +1,293 @@
+---
+# Builds SWE-Bench eval images for an SDK ref and dispatches the evaluation
+# workflow. Triggered by PR labels (authorized labelers only), releases, or
+# manual dispatch; model stubs are validated against an in-repo allowlist.
+name: Run Eval
+
+on:
+  pull_request_target:
+    types: [labeled]
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: Branch or tag to evaluate
+        required: true
+        default: main
+      eval_limit:
+        description: Number of SWE-bench instances to run
+        required: true
+        default: '1'
+        type: choice
+        options:
+          - '1'
+          - '2'
+          - '10'
+          - '50'
+          - '100'
+      model_stubs:
+        description: Comma-separated model stubs to evaluate (must be allowlisted)
+        required: false
+        default: ''
+        type: string
+      reason:
+        description: Reason for manual trigger
+        required: false
+        default: ''
+
+env:
+  BENCHMARKS_REPO: OpenHands/benchmarks
+  BENCHMARKS_REF: main
+  EVAL_REPO: OpenHands/evaluation
+  EVAL_WORKFLOW: eval-job.yml
+  DATASET: princeton-nlp/SWE-bench_Verified
+  SPLIT: test
+  MAX_BUILD_WORKERS: '32'
+  EVAL_AGENT_IMAGE: ghcr.io/openhands/eval-agent-server
+  EVAL_AGENT_TARGET: source-minimal
+
+jobs:
+  build-and-evaluate:
+    if: >
+      github.event_name == 'release' ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request_target' &&
+      (github.event.label.name == 'run-eval-1' ||
+      github.event.label.name == 'run-eval-2' ||
+      github.event.label.name == 'run-eval-50' ||
+      github.event.label.name == 'run-eval-100'))
+    runs-on: blacksmith-32vcpu-ubuntu-2204
+    permissions:
+      contents: read
+      packages: write
+      actions: write
+      issues: write
+      pull-requests: write
+
+    steps:
+      - name: Checkout sdk code (base for validation)
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.base.sha || (github.event_name ==
+            'workflow_dispatch' && github.event.inputs.branch) || github.ref }}
+          fetch-depth: 0
+
+      - name: Load allowlists
+        id: allowlists
+        run: |
+          set -euo pipefail
+          ALLOWED_MODELS_JSON=$(jq -c '.' .github/run-eval/allowed-model-stubs.json)
+          DEFAULT_MODEL=$(echo "$ALLOWED_MODELS_JSON" | jq -r '.[0]')
+          if [ -z "$DEFAULT_MODEL" ]; then
+            echo "No default model stub configured" >&2
+            exit 1
+          fi
+          echo "allowed_models=$ALLOWED_MODELS_JSON" >> "$GITHUB_OUTPUT"
+          echo "default_model=$DEFAULT_MODEL" >> "$GITHUB_OUTPUT"
+
+      - name: Validate labeler
+        if: github.event_name == 'pull_request_target'
+        run: |
+          set -euo pipefail
+          LABELER="${{ github.actor }}"
+          if ! grep -Fx "$LABELER" .github/run-eval/authorized-labelers.txt >/dev/null; then
+            echo "User $LABELER is not authorized to trigger eval." >&2
+            exit 1
+          fi
+
+      - name: Resolve parameters
+        id: params
+        env:
+          DEFAULT_MODEL: ${{ steps.allowlists.outputs.default_model }}
+          ALLOWED_MODELS_JSON: ${{ steps.allowlists.outputs.allowed_models }}
+          # Untrusted values (ref and tag names, label names, free-text inputs)
+          # are passed through env so expanded values cannot inject shell code.
+          LABEL_NAME: ${{ github.event.label.name }}
+          HEAD_REF: ${{ github.event.pull_request.head.ref }}
+          RELEASE_TAG: ${{ github.event.release.tag_name }}
+          INPUT_BRANCH: ${{ github.event.inputs.branch }}
+          INPUT_REASON: ${{ github.event.inputs.reason }}
+          INPUT_MODELS: ${{ github.event.inputs.model_stubs }}
+        run: |
+          set -euo pipefail
+
+          # Determine eval limit based on trigger
+          if [ "${{ github.event_name }}" = "pull_request_target" ]; then
+            LABEL="$LABEL_NAME"
+            case "$LABEL" in
+              run-eval-1) EVAL_LIMIT=1 ;;
+              run-eval-2) EVAL_LIMIT=2 ;;
+              run-eval-50) EVAL_LIMIT=50 ;;
+              run-eval-100) EVAL_LIMIT=100 ;;
+              *) echo "Unsupported label $LABEL" >&2; exit 1 ;;
+            esac
+            SDK_REF="$HEAD_REF"
+            PR_NUMBER="${{ github.event.pull_request.number }}"
+            TRIGGER_DESCRIPTION="Label '${LABEL}' on PR #${PR_NUMBER}"
+          elif [ "${{ github.event_name }}" = "release" ]; then
+            EVAL_LIMIT=50
+            SDK_REF="$RELEASE_TAG"
+            PR_NUMBER=""
+            TRIGGER_DESCRIPTION="Release ${RELEASE_TAG}"
+          else
+            EVAL_LIMIT="${{ github.event.inputs.eval_limit }}"
+            SDK_REF="$INPUT_BRANCH"
+            PR_NUMBER=""
+            REASON="$INPUT_REASON"
+            if [ -z "$REASON" ]; then
+              REASON="manual"
+            fi
+            TRIGGER_DESCRIPTION="Manual trigger: ${REASON}"
+          fi
+
+          # Normalize and validate models
+          MODELS_INPUT="$INPUT_MODELS"  # empty unless workflow_dispatch
+          if [ -z "$MODELS_INPUT" ]; then
+            MODELS_INPUT="$DEFAULT_MODEL"
+          fi
+          MODELS=$(printf '%s' "$MODELS_INPUT" | tr ', ' '\n' | sed '/^$/d' | paste -sd, -)
+          ALLOWED_LIST=$(echo "$ALLOWED_MODELS_JSON" | jq -r '.[]')
+          for MODEL in ${MODELS//,/ }; do
+            if ! echo "$ALLOWED_LIST" | grep -Fx "$MODEL" >/dev/null; then
+              echo "Model stub '$MODEL' is not allowlisted" >&2
+              exit 1
+            fi
+          done
+
+          echo "eval_limit=$EVAL_LIMIT" >> "$GITHUB_OUTPUT"
+          echo "sdk_ref=$SDK_REF" >> "$GITHUB_OUTPUT"
+          echo "models=$MODELS" >> "$GITHUB_OUTPUT"
+          echo "pr_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"
+          echo "trigger_desc=$TRIGGER_DESCRIPTION" >> "$GITHUB_OUTPUT"
+
+      - name: Checkout evaluated ref for PRs
+        if: github.event_name == 'pull_request_target'
+        env:
+          SDK_REF: ${{ steps.params.outputs.sdk_ref }}
+        run: |
+          set -euo pipefail
+          # Switch to the PR head for image build and SDK pinning.
+          REF="$SDK_REF"
+          git fetch origin "$REF" --force
+          git checkout FETCH_HEAD
+
+      - name: Checkout benchmarks repo
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ env.BENCHMARKS_REPO }}
+          ref: ${{ env.BENCHMARKS_REF }}
+          path: benchmarks
+          submodules: recursive
+
+      - name: Pin benchmarks SDK to evaluated ref
+        id: sdk-pin
+        env:
+          SDK_REF: ${{ steps.params.outputs.sdk_ref }}
+        run: |
+          set -euo pipefail
+          git -C benchmarks/vendor/software-agent-sdk fetch origin "$SDK_REF"
+          git -C benchmarks/vendor/software-agent-sdk checkout "$SDK_REF"
+          SDK_SHA=$(git -C benchmarks/vendor/software-agent-sdk rev-parse HEAD)
+          echo "Using SDK ref $SDK_REF ($SDK_SHA) for image build"
+          echo "sdk_sha=$SDK_SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Set up uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push SWE-Bench images
+        env:
+          SDK_SHA: ${{ steps.sdk-pin.outputs.sdk_sha }}
+          SDK_REF: ${{ steps.params.outputs.sdk_ref }}
+          EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }}
+          DATASET: ${{ env.DATASET }}
+          SPLIT: ${{ env.SPLIT }}
+          MAX_BUILD_WORKERS: ${{ env.MAX_BUILD_WORKERS }}
+          EVAL_AGENT_IMAGE: ${{ env.EVAL_AGENT_IMAGE }}
+          EVAL_AGENT_TARGET: ${{ env.EVAL_AGENT_TARGET }}
+        working-directory: benchmarks
+        run: |
+          set -euo pipefail
+          echo "Building images for SDK $SDK_SHA (ref: $SDK_REF)"
+          uv run benchmarks/swe_bench/build_images.py \
+            --dataset "${DATASET}" \
+            --split "${SPLIT}" \
+            --image "${EVAL_AGENT_IMAGE}" \
+            --target "${EVAL_AGENT_TARGET}" \
+            --push \
+            --max-workers "${MAX_BUILD_WORKERS}" \
+            --n-limit "${EVAL_LIMIT}"
+
+      - name: Dispatch evaluation workflow
+        env:
+          PAT_TOKEN: ${{ secrets.ALLHANDS_BOT_GITHUB_PAT || secrets.PAT_TOKEN || secrets.GITHUB_TOKEN }}
+          SDK_SHA: ${{ steps.sdk-pin.outputs.sdk_sha }}
+          EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }}
+          MODELS: ${{ steps.params.outputs.models }}
+          EVAL_REPO: ${{ env.EVAL_REPO }}
+          EVAL_WORKFLOW: ${{ env.EVAL_WORKFLOW }}
+        run: |
+          set -euo pipefail
+          if [ -z "$PAT_TOKEN" ]; then
+            echo "Missing PAT_TOKEN for dispatching evaluation workflow" >&2
+            exit 1
+          fi
+          PAYLOAD=$(jq -n \
+            --arg sdk "$SDK_SHA" \
+            --arg eval_limit "$EVAL_LIMIT" \
+            --arg models "$MODELS" \
+            '{ref: "main", inputs: {sdk_commit: $sdk, eval_limit: $eval_limit, models: $models}}')
+          RESPONSE=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" -X POST \
+            -H "Authorization: Bearer $PAT_TOKEN" \
+            -H "Accept: application/vnd.github+json" \
+            -d "$PAYLOAD" \
+            "https://api.github.com/repos/${EVAL_REPO}/actions/workflows/${EVAL_WORKFLOW}/dispatches")
+          if [ "$RESPONSE" != "204" ]; then
+            echo "Dispatch failed (status $RESPONSE):" >&2
+            cat /tmp/dispatch.out >&2
+            exit 1
+          fi
+
+      - name: Comment on PR
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          SDK_SHA: ${{ steps.sdk-pin.outputs.sdk_sha }}
+          EVAL_LIMIT: ${{ steps.params.outputs.eval_limit }}
+          MODELS: ${{ steps.params.outputs.models }}
+          TRIGGER_DESC: ${{ steps.params.outputs.trigger_desc }}
+          EVENT_NAME: ${{ github.event_name }}
+          PR_NUMBER_INPUT: ${{ steps.params.outputs.pr_number }}
+        run: |
+          set -euo pipefail
+          PR_NUMBER="$PR_NUMBER_INPUT"
+          if [ "$EVENT_NAME" = "release" ] && [ -z "$PR_NUMBER" ]; then
+            # Attempt to find the merged PR for this commit
+            PR_NUMBER=$(curl -sS \
+              -H "Authorization: Bearer $GITHUB_TOKEN" \
+              -H "Accept: application/vnd.github+json" \
+              "https://api.github.com/repos/${{ github.repository }}/commits/${SDK_SHA}/pulls" \
+              | jq -r '.[0].number // ""')
+          fi
+
+          if [ -z "$PR_NUMBER" ]; then
+            echo "No PR found to comment on; skipping comment"
+            exit 0
+          fi
+
+          COMMENT_BODY=$(printf '**Evaluation Triggered**\n\n- Trigger: %s\n- SDK: %s\n- Eval limit: %s\n- Models: %s\n' \
+            "$TRIGGER_DESC" "$SDK_SHA" "$EVAL_LIMIT" "$MODELS")
+
+          curl -sS -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer $GITHUB_TOKEN" \
+            "https://api.github.com/repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \
+            -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')"