OpenHands · simonrosenberg · Nov 14, 2025 · Nov 14, 2025 · Nov 14, 2025 · Nov 14, 2025
diff --git a/.github/scripts/build_eval_targets.py b/.github/scripts/build_eval_targets.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""Emit the model targets JSON for the run-eval workflow.
+
+The workflow needs to loop over models differently depending on trigger type:
+release triggers run every configured model, workflow_dispatch accepts a
+comma-separated selection, and PR labels fall back to a single default model.
+This helper centralizes that logic, validates model IDs, and writes the
+`targets` / `models_text` outputs for later steps.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+
+
+def emit(key: str, value: str) -> None:
+    output_path = os.environ.get("GITHUB_OUTPUT")
+    if not output_path:
+        raise SystemExit("GITHUB_OUTPUT is not set")
+    with Path(output_path).open("a", encoding="utf-8") as handle:
+        handle.write(f"{key}={value}\n")
+
+
+def parse_model_ids(
+    event_name: str,
+    raw_models: str,
+    all_models: list[dict],
+    default_model: str,
+) -> list[str]:
+    if event_name == "release":
+        return [entry["id"] for entry in all_models]
+    if event_name == "workflow_dispatch":
+        raw = raw_models or default_model
+        values = [value.strip() for value in raw.split(",")]
+        result = [value for value in values if value]
+        if not result:
+            raise SystemExit("No valid models provided in 'models' input")
+        return result
+    return [default_model]
+
+
+def main() -> None:
+    models = json.loads(os.environ["MODELS_JSON"])
+    event_name = os.environ["GITHUB_EVENT_NAME"]
+    instances = os.environ.get("EVAL_INSTANCES")
+    if not instances:
+        raise SystemExit("EVAL_INSTANCES is not set")
+    default_model = os.environ.get("DEFAULT_MODEL_ID")
+    if not default_model:
+        raise SystemExit("DEFAULT_MODEL_ID is not set")
+
+    model_map = {entry["id"]: entry for entry in models}
+    model_ids = parse_model_ids(
+        event_name,
+        os.environ.get("MODELS_INPUT", "").strip(),
+        models,
+        default_model,
+    )
+
+    invalid = [model_id for model_id in model_ids if model_id not in model_map]
+    if invalid:
+        raise SystemExit(f"Unsupported model(s): {', '.join(invalid)}")
+
+    targets = [
+        {
+            "model_id": model_id,
+            "display_name": model_map[model_id]["display_name"],
+            "llm_config": model_map[model_id]["llm_config"],
+            "eval_instances": instances,
+        }
+        for model_id in model_ids
+    ]
+
+    models_text = ", ".join(
+        f"{target['display_name']} ({target['eval_instances']})" for target in targets
+    )
+
+    emit("targets", json.dumps(targets))
+    emit("models_text", models_text)
+
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/run-eval.yml b/.github/workflows/run-eval.yml
@@ -0,0 +1,196 @@
+---
+# Run evaluation on a PR, after releases, or manually
+name: Run Eval
+
+# Runs when a PR is labeled with one of the "run-eval-" labels, after releases, or manually triggered
+on:
+    # Every label addition fires this event; the job-level `if` keeps only the
+    # run-eval-1 / run-eval-2 / run-eval-50 / run-eval-100 labels.
+    pull_request:
+        types: [labeled]
+    release:
+        types: [published]
+    workflow_dispatch:
+        inputs:
+            branch:
+                description: Branch to evaluate
+                required: true
+                default: main
+            # NOTE(review): these sizes (1/5/50/200) differ from the sizes reachable
+            # through PR labels (1/2/50/100) - confirm the mismatch is intentional.
+            eval_instances:
+                description: Number of evaluation instances
+                required: true
+                default: '50'
+                type: choice
+                options:
+                    - '1'
+                    - '5'
+                    - '50'
+                    - '200'
+            # Each ID is validated against the "id" fields of MODELS_JSON by
+            # .github/scripts/build_eval_targets.py; unknown IDs fail the run.
+            models:
+                description: Comma-separated model configurations to evaluate (e.g. "claude-sonnet-4-5-20250929,claude-haiku-4-5-20251001")
+                required: true
+                default: claude-sonnet-4-5-20250929
+                type: string
+            reason:
+                description: Reason for manual trigger
+                required: false
+                default: ''
+
+env:
+    # Issue number used as the target for release and manual runs (PR runs use the
+    # PR number instead). Taken from the MASTER_EVAL_ISSUE_NUMBER repository
+    # variable; falls back to '0' when that variable is not set.
+    MASTER_EVAL_ISSUE_NUMBER: ${{ vars.MASTER_EVAL_ISSUE_NUMBER || '0' }}
+    # Model evaluated for PR-label triggers, and for manual triggers whose
+    # `models` input is empty. Must match one "id" in MODELS_JSON.
+    DEFAULT_MODEL_ID: claude-sonnet-4-5-20250929
+    # Catalogue of selectable models. build_eval_targets.py reads the "id",
+    # "display_name" and "llm_config" keys of every entry; release triggers
+    # evaluate all entries.
+    MODELS_JSON: >-
+        [
+          {"id":"claude-sonnet-4-5-20250929","display_name":"Claude Sonnet 4.5","llm_config":{"model":"litellm_proxy/claude-sonnet-4-5-20250929","temperature":0.0}},
+          {"id":"claude-haiku-4-5-20251001","display_name":"Claude Haiku 4.5","llm_config":{"model":"litellm_proxy/claude-haiku-4-5-20251001","temperature":0.0}},
+          {"id":"gpt-5-mini-2025-08-07","display_name":"GPT-5 Mini","llm_config":{"model":"litellm_proxy/gpt-5-mini-2025-08-07","temperature":1.0}},
+          {"id":"deepseek-chat","display_name":"DeepSeek Chat","llm_config":{"model":"litellm_proxy/deepseek/deepseek-chat"}},
+          {"id":"kimi-k2-thinking","display_name":"Kimi K2 Thinking","llm_config":{"model":"litellm_proxy/moonshot/kimi-k2-thinking"}}
+        ]
+
+jobs:
+    # Single job: resolve the evaluation parameters, dispatch one remote workflow
+    # per selected model, then announce the run.
+    trigger-eval:
+        name: Trigger remote eval
+        # Release and manual triggers always run; PR label events run only when the
+        # added label is run-eval-1, run-eval-2, run-eval-50 or run-eval-100.
+        if: ${{ (github.event_name == 'pull_request' && (github.event.label.name == 'run-eval-1' || github.event.label.name == 'run-eval-2' || 
+            github.event.label.name == 'run-eval-50' || github.event.label.name == 'run-eval-100')) || github.event_name == 'release' || 
+            github.event_name == 'workflow_dispatch' }}
+        runs-on: blacksmith-4vcpu-ubuntu-2204
+
+        steps:
+            - name: Checkout branch
+              uses: actions/checkout@v4
+              with:
+                  # PR events check out the PR head branch, manual runs check out the
+                  # `branch` input, and every other event checks out the triggering ref.
+                  ref: ${{ github.event_name == 'pull_request' && github.head_ref || (github.event_name == 'workflow_dispatch' && 
+                      github.event.inputs.branch) || github.ref }}
+
+            # Resolves the branch and instance count for this trigger, then runs
+            # build_eval_targets.py. Step outputs: repo_url, eval_branch,
+            # eval_instances (written here) plus targets and models_text (written by
+            # the Python script).
+            - name: Set evaluation parameters
+              id: eval_params
+              env:
+                  MODELS_JSON: ${{ env.MODELS_JSON }}
+                  MODELS_INPUT: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.models || '' }}
+                  DEFAULT_MODEL_ID: ${{ env.DEFAULT_MODEL_ID }}
+                  # User-controlled event data reaches the script through the
+                  # environment instead of being pasted into the script text, so a
+                  # label, branch or input containing quotes or $(...) is treated as
+                  # data and never executed by bash.
+                  LABEL_NAME: ${{ github.event.label.name }}
+                  INPUT_BRANCH: ${{ github.event.inputs.branch }}
+                  INPUT_EVAL_INSTANCES: ${{ github.event.inputs.eval_instances }}
+              run: |
+                  # GITHUB_REPOSITORY, GITHUB_EVENT_NAME, GITHUB_HEAD_REF and
+                  # GITHUB_REF_NAME are default variables provided by the runner.
+                  REPO_URL="https://github.com/$GITHUB_REPOSITORY"
+                  echo "Repository URL: $REPO_URL"
+
+                  case "$GITHUB_EVENT_NAME" in
+                    pull_request)
+                      EVAL_BRANCH="$GITHUB_HEAD_REF"
+                      echo "PR Branch: $EVAL_BRANCH"
+                      case "$LABEL_NAME" in
+                        run-eval-1)   EVAL_INSTANCES="1" ;;
+                        run-eval-2)   EVAL_INSTANCES="2" ;;
+                        run-eval-50)  EVAL_INSTANCES="50" ;;
+                        run-eval-100) EVAL_INSTANCES="100" ;;
+                        *)
+                          # The job-level `if` should make this unreachable; fail
+                          # loudly rather than continue with an empty count.
+                          echo "Unsupported label: $LABEL_NAME" >&2
+                          exit 1
+                          ;;
+                      esac
+                      ;;
+                    workflow_dispatch)
+                      EVAL_BRANCH="$INPUT_BRANCH"
+                      echo "Manual Branch: $EVAL_BRANCH"
+                      EVAL_INSTANCES="$INPUT_EVAL_INSTANCES"
+                      ;;
+                    *)
+                      EVAL_BRANCH="$GITHUB_REF_NAME"
+                      echo "Release Branch/Tag: $EVAL_BRANCH"
+                      EVAL_INSTANCES="50"
+                      ;;
+                  esac
+
+                  echo "Evaluation instances: $EVAL_INSTANCES"
+                  echo "repo_url=$REPO_URL" >> "$GITHUB_OUTPUT"
+                  echo "eval_branch=$EVAL_BRANCH" >> "$GITHUB_OUTPUT"
+                  echo "eval_instances=$EVAL_INSTANCES" >> "$GITHUB_OUTPUT"
+
+                  # The Python helper reads EVAL_INSTANCES from the environment.
+                  export EVAL_INSTANCES
+                  python .github/scripts/build_eval_targets.py
+
+            # Dispatches the create-branch-v1.yml workflow in OpenHands/evaluation once
+            # per target, then posts an optional Slack notification.
+            - name: Trigger remote job
+              env:
+                  PAT_TOKEN: ${{ secrets.ALLHANDS_BOT_GITHUB_PAT }}
+                  # Secrets and user-supplied text are passed through the environment
+                  # so they are never interpolated into the script source.
+                  SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+                  REASON: ${{ github.event.inputs.reason || 'No reason provided' }}
+                  EVENT_PR_NUMBER: ${{ github.event.pull_request.number }}
+                  TARGETS: ${{ steps.eval_params.outputs.targets }}
+                  EVAL_BRANCH: ${{ steps.eval_params.outputs.eval_branch }}
+                  MODELS_TEXT: ${{ steps.eval_params.outputs.models_text }}
+                  EVAL_INSTANCES: ${{ steps.eval_params.outputs.eval_instances }}
+                  REPO_URL: ${{ steps.eval_params.outputs.repo_url }}
+              run: |
+                  if [ -z "$PAT_TOKEN" ]; then
+                    echo "PAT_TOKEN is required to dispatch remote workflow"
+                    exit 1
+                  fi
+
+                  # MASTER_EVAL_ISSUE_NUMBER comes from the workflow-level env block.
+                  if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then
+                    PR_NUMBER="$EVENT_PR_NUMBER"
+                  else
+                    PR_NUMBER="$MASTER_EVAL_ISSUE_NUMBER"
+                  fi
+
+                  # Split the targets up front: a failing command substitution aborts
+                  # the step when TARGETS is not valid JSON, and the loop below then
+                  # runs in this shell instead of a pipeline subshell.
+                  target_lines=$(jq -c '.[]' <<< "$TARGETS")
+
+                  while read -r target; do
+                    # An empty target list still yields one blank line; skip it.
+                    [ -n "$target" ] || continue
+
+                    model_id=$(jq -r '.model_id' <<< "$target")
+                    llm_config=$(jq -c '.llm_config' <<< "$target")
+                    eval_instances=$(jq -r '.eval_instances' <<< "$target")
+
+                    # llm-config is sent as a JSON-encoded string (--arg), matching
+                    # the other inputs, which are all strings.
+                    payload=$(jq -n \
+                      --arg repo "$REPO_URL" \
+                      --arg sdk_ref "$EVAL_BRANCH" \
+                      --arg pr "$PR_NUMBER" \
+                      --arg instances "$eval_instances" \
+                      --arg model_id "$model_id" \
+                      --arg llm_config "$llm_config" \
+                      '{
+                        "ref": "main",
+                        "inputs": {
+                          "sdk-repo": $repo,
+                          "sdk-ref": $sdk_ref,
+                          "benchmarks-repo": "https://github.com/OpenHands/benchmarks.git",
+                          "benchmarks-ref": "main",
+                          "pr-number": $pr,
+                          "eval-instances": $instances,
+                          "llm-model-id": $model_id,
+                          "llm-config": $llm_config,
+                          "trigger-description": "",
+                          "trigger_type": "manual"
+                        }
+                      }')
+
+                    # --fail-with-body turns an HTTP 4xx/5xx answer into a non-zero
+                    # exit (while still printing the API error), so a rejected
+                    # dispatch fails the step instead of passing silently.
+                    curl --fail-with-body -sS -X POST \
+                      -H "Authorization: Bearer $PAT_TOKEN" \
+                      -H "Accept: application/vnd.github+json" \
+                      -d "$payload" \
+                      https://api.github.com/repos/OpenHands/evaluation/actions/workflows/create-branch-v1.yml/dispatches
+                  done <<< "$target_lines"
+
+                  if [ -n "$SLACK_TOKEN" ]; then
+                    if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then
+                      TRIGGER_URL="https://github.com/$GITHUB_REPOSITORY/pull/$EVENT_PR_NUMBER"
+                      slack_text="PR $TRIGGER_URL has triggered evaluation on $EVAL_INSTANCES instances with models: $MODELS_TEXT (branch $EVAL_BRANCH)."
+                    elif [[ "$GITHUB_EVENT_NAME" == "release" ]]; then
+                      TRIGGER_URL="https://github.com/$GITHUB_REPOSITORY/releases/tag/$GITHUB_REF_NAME"
+                      slack_text="Release $TRIGGER_URL has triggered evaluation on $EVAL_INSTANCES instances with models: $MODELS_TEXT (branch $EVAL_BRANCH)."
+                    else
+                      TRIGGER_URL="https://github.com/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID"
+                      slack_text="Manual trigger ($REASON) has triggered evaluation on $EVAL_INSTANCES instances with models: $MODELS_TEXT for branch $EVAL_BRANCH."
+                    fi
+                    # Let jq do the JSON escaping so quotes or backslashes in the
+                    # reason, branch or model names cannot produce an invalid payload.
+                    slack_payload=$(jq -n --arg text "$slack_text" '{text: $text}')
+                    # The notification stays best-effort: no --fail here, so an HTTP
+                    # error from Slack does not fail an otherwise successful dispatch.
+                    curl -X POST -H 'Content-type: application/json' --data "$slack_payload" \
+                      "https://hooks.slack.com/services/$SLACK_TOKEN"
+                  fi
+
+            # Posts the "Evaluation Triggered" summary on the PR for label triggers, or
+            # on the issue numbered MASTER_EVAL_ISSUE_NUMBER for release/manual triggers.
+            # NOTE(review): MASTER_EVAL_ISSUE_NUMBER falls back to '0' when the
+            # repository variable is unset - confirm how this step should behave then.
+            - name: Comment on issue/PR
+              uses: KeisukeYamashita/create-comment@v1
+              with:
+                  number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || env.MASTER_EVAL_ISSUE_NUMBER }}
+                  unique: false
+                  comment: |
+                      **Evaluation Triggered**
+
+                      **Trigger:** ${{ github.event_name == 'pull_request' && format('Pull Request #{0}', github.event.pull_request.number) || (github.event_name == 'release' && 'Release') || format('Manual Trigger: {0}', github.event.inputs.reason || 'No reason provided') }}
+                      **Branch:** ${{ steps.eval_params.outputs.eval_branch }}
+                      **Instances:** ${{ steps.eval_params.outputs.eval_instances }}
+                      **Models:** ${{ steps.eval_params.outputs.models_text }}
+                      **Commit:** ${{ github.sha }}
+
+                      Running evaluation on the specified branch. Once eval is done, the results will be posted here.