Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions .github/scripts/build_eval_targets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""Emit the model targets JSON for the run-eval workflow.

The workflow needs to loop over models differently depending on trigger type:
release triggers run every configured model, workflow_dispatch accepts a
comma-separated selection, and PR labels fall back to a single default model.
This helper centralizes that logic, validates model IDs, and writes the
`targets` / `models_text` outputs for later steps.
"""

from __future__ import annotations

import json
import os
from pathlib import Path


def emit(key: str, value: str) -> None:
    """Append ``key=value`` as one line to the GitHub Actions output file.

    Raises:
        SystemExit: If the ``GITHUB_OUTPUT`` environment variable is unset
            or empty (i.e. we are not running inside a workflow step).
    """
    destination = os.environ.get("GITHUB_OUTPUT")
    if not destination:
        raise SystemExit("GITHUB_OUTPUT is not set")
    # Append (never truncate): earlier steps may already have written outputs.
    with open(destination, "a", encoding="utf-8") as sink:
        sink.write(f"{key}={value}\n")


def parse_model_ids(
    event_name: str,
    raw_models: str,
    all_models: list[dict],
    default_model: str,
) -> list[str]:
    """Decide which model IDs to evaluate for the given trigger.

    Args:
        event_name: The GitHub event that triggered the workflow
            (``release``, ``workflow_dispatch``, or anything else such as
            a PR label event).
        raw_models: Comma-separated model IDs from the ``models`` input;
            only consulted for ``workflow_dispatch`` triggers.
        all_models: Every configured model entry (each with an ``"id"`` key).
        default_model: Fallback model ID used when no explicit selection
            applies.

    Returns:
        The ordered list of model IDs to run, without duplicates.

    Raises:
        SystemExit: If a ``workflow_dispatch`` selection contains no
            usable model IDs.
    """
    if event_name == "release":
        # Releases evaluate the full configured matrix.
        return [entry["id"] for entry in all_models]
    if event_name == "workflow_dispatch":
        raw = raw_models or default_model
        values = [value.strip() for value in raw.split(",")]
        # Drop empties left by stray commas/whitespace, then dedupe while
        # preserving order so a repeated ID does not dispatch the same
        # remote evaluation twice.
        result = list(dict.fromkeys(value for value in values if value))
        if not result:
            raise SystemExit("No valid models provided in 'models' input")
        return result
    # PR label (or any other) trigger: evaluate only the default model.
    return [default_model]


def main() -> None:
    """Build the eval target list from the environment and emit outputs.

    Reads MODELS_JSON, GITHUB_EVENT_NAME, EVAL_INSTANCES, DEFAULT_MODEL_ID,
    and MODELS_INPUT, validates the selected model IDs against the
    configured catalog, and writes the ``targets`` and ``models_text``
    step outputs.
    """
    configured = json.loads(os.environ["MODELS_JSON"])
    trigger = os.environ["GITHUB_EVENT_NAME"]

    instances = os.environ.get("EVAL_INSTANCES")
    if not instances:
        raise SystemExit("EVAL_INSTANCES is not set")

    fallback = os.environ.get("DEFAULT_MODEL_ID")
    if not fallback:
        raise SystemExit("DEFAULT_MODEL_ID is not set")

    by_id = {entry["id"]: entry for entry in configured}
    selected = parse_model_ids(
        trigger,
        os.environ.get("MODELS_INPUT", "").strip(),
        configured,
        fallback,
    )

    # Reject any ID that is not in the configured catalog before dispatching.
    unknown = [model_id for model_id in selected if model_id not in by_id]
    if unknown:
        raise SystemExit(f"Unsupported model(s): {', '.join(unknown)}")

    targets = []
    for model_id in selected:
        entry = by_id[model_id]
        targets.append(
            {
                "model_id": model_id,
                "display_name": entry["display_name"],
                "llm_config": entry["llm_config"],
                "eval_instances": instances,
            }
        )

    # Human-readable summary for Slack messages and issue comments.
    summary = ", ".join(
        f"{target['display_name']} ({target['eval_instances']})" for target in targets
    )

    emit("targets", json.dumps(targets))
    emit("models_text", summary)


if __name__ == "__main__":
    main()
196 changes: 196 additions & 0 deletions .github/workflows/run-eval.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
---
# Run evaluation on a PR, after releases, or manually
name: Run Eval

# Runs when a PR is labeled with one of the "run-eval-" labels, after releases, or manually triggered
on:
  pull_request:
    types: [labeled]
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      branch:
        description: Branch to evaluate
        required: true
        default: main
      eval_instances:
        description: Number of evaluation instances
        required: true
        default: '50'
        type: choice
        options:
          - '1'
          - '5'
          - '50'
          - '200'
      models:
        description: Comma-separated model configurations to evaluate (e.g. "claude-sonnet-4-5-20250929,claude-haiku-4-5-20251001")
        required: true
        default: claude-sonnet-4-5-20250929
        type: string
      reason:
        description: Reason for manual trigger
        required: false
        default: ''

env:
  # Environment variable for the master GitHub issue number where all evaluation results will be commented
  # This should be set to the issue number where you want all evaluation results to be posted
  MASTER_EVAL_ISSUE_NUMBER: ${{ vars.MASTER_EVAL_ISSUE_NUMBER || '0' }}
  DEFAULT_MODEL_ID: claude-sonnet-4-5-20250929
  # Catalog of all supported models; consumed by .github/scripts/build_eval_targets.py
  MODELS_JSON: >-
    [
    {"id":"claude-sonnet-4-5-20250929","display_name":"Claude Sonnet 4.5","llm_config":{"model":"litellm_proxy/claude-sonnet-4-5-20250929","temperature":0.0}},
    {"id":"claude-haiku-4-5-20251001","display_name":"Claude Haiku 4.5","llm_config":{"model":"litellm_proxy/claude-haiku-4-5-20251001","temperature":0.0}},
    {"id":"gpt-5-mini-2025-08-07","display_name":"GPT-5 Mini","llm_config":{"model":"litellm_proxy/gpt-5-mini-2025-08-07","temperature":1.0}},
    {"id":"deepseek-chat","display_name":"DeepSeek Chat","llm_config":{"model":"litellm_proxy/deepseek/deepseek-chat"}},
    {"id":"kimi-k2-thinking","display_name":"Kimi K2 Thinking","llm_config":{"model":"litellm_proxy/moonshot/kimi-k2-thinking"}}
    ]

jobs:
  trigger-eval:
    name: Trigger remote eval
    # Gate on the recognized run-eval-* labels, releases, or manual dispatch
    if: ${{ (github.event_name == 'pull_request' && (github.event.label.name == 'run-eval-1' || github.event.label.name == 'run-eval-2' ||
      github.event.label.name == 'run-eval-50' || github.event.label.name == 'run-eval-100')) || github.event_name == 'release' ||
      github.event_name == 'workflow_dispatch' }}
    runs-on: blacksmith-4vcpu-ubuntu-2204

    steps:
      - name: Checkout branch
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'pull_request' && github.head_ref || (github.event_name == 'workflow_dispatch' &&
            github.event.inputs.branch) || github.ref }}

      - name: Set evaluation parameters
        id: eval_params
        env:
          MODELS_JSON: ${{ env.MODELS_JSON }}
          MODELS_INPUT: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.models || '' }}
          DEFAULT_MODEL_ID: ${{ env.DEFAULT_MODEL_ID }}
        run: |
          REPO_URL="https://github.com/${{ github.repository }}"
          echo "Repository URL: $REPO_URL"

          # Resolve the branch (or tag) under evaluation based on the trigger type
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            EVAL_BRANCH="${{ github.head_ref }}"
            echo "PR Branch: $EVAL_BRANCH"
          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
            EVAL_BRANCH="${{ github.event.inputs.branch }}"
            echo "Manual Branch: $EVAL_BRANCH"
          else
            EVAL_BRANCH="${{ github.ref_name }}"
            echo "Release Branch/Tag: $EVAL_BRANCH"
          fi

          # Map run-eval-* labels to instance counts; releases fall back to 50.
          # NOTE(review): the label counts (1/2/50/100) differ from the
          # workflow_dispatch choices (1/5/50/200) — confirm this is intentional.
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            if [[ "${{ github.event.label.name }}" == "run-eval-1" ]]; then
              EVAL_INSTANCES="1"
            elif [[ "${{ github.event.label.name }}" == "run-eval-2" ]]; then
              EVAL_INSTANCES="2"
            elif [[ "${{ github.event.label.name }}" == "run-eval-50" ]]; then
              EVAL_INSTANCES="50"
            elif [[ "${{ github.event.label.name }}" == "run-eval-100" ]]; then
              EVAL_INSTANCES="100"
            fi
          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
            EVAL_INSTANCES="${{ github.event.inputs.eval_instances }}"
          else
            EVAL_INSTANCES="50"
          fi

          echo "Evaluation instances: $EVAL_INSTANCES"
          echo "repo_url=$REPO_URL" >> "$GITHUB_OUTPUT"
          echo "eval_branch=$EVAL_BRANCH" >> "$GITHUB_OUTPUT"
          echo "eval_instances=$EVAL_INSTANCES" >> "$GITHUB_OUTPUT"

          # The helper script reads EVAL_INSTANCES (plus the step env above)
          # and writes the 'targets' / 'models_text' outputs.
          export EVAL_INSTANCES
          python .github/scripts/build_eval_targets.py

      - name: Trigger remote job
        env:
          PAT_TOKEN: ${{ secrets.ALLHANDS_BOT_GITHUB_PAT }}
          TARGETS: ${{ steps.eval_params.outputs.targets }}
          EVAL_BRANCH: ${{ steps.eval_params.outputs.eval_branch }}
          MODELS_TEXT: ${{ steps.eval_params.outputs.models_text }}
          EVAL_INSTANCES: ${{ steps.eval_params.outputs.eval_instances }}
          REPO_URL: ${{ steps.eval_params.outputs.repo_url }}
        run: |
          if [ -z "$PAT_TOKEN" ]; then
            echo "PAT_TOKEN is required to dispatch remote workflow"
            exit 1
          fi

          # Results go to the PR itself, or to the master eval issue otherwise
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            PR_NUMBER="${{ github.event.pull_request.number }}"
          else
            PR_NUMBER="${{ env.MASTER_EVAL_ISSUE_NUMBER }}"
          fi

          # Dispatch one remote evaluation run per selected model target
          echo "$TARGETS" | jq -c '.[]' | while read -r target; do
            model_id=$(echo "$target" | jq -r '.model_id')
            llm_config=$(echo "$target" | jq -c '.llm_config')
            eval_instances=$(echo "$target" | jq -r '.eval_instances')

            payload=$(jq -n \
              --arg repo "$REPO_URL" \
              --arg sdk_ref "$EVAL_BRANCH" \
              --arg pr "$PR_NUMBER" \
              --arg instances "$eval_instances" \
              --arg model_id "$model_id" \
              --arg llm_config "$llm_config" \
              '{
                "ref": "main",
                "inputs": {
                  "sdk-repo": $repo,
                  "sdk-ref": $sdk_ref,
                  "benchmarks-repo": "https://github.com/OpenHands/benchmarks.git",
                  "benchmarks-ref": "main",
                  "pr-number": $pr,
                  "eval-instances": $instances,
                  "llm-model-id": $model_id,
                  "llm-config": $llm_config,
                  "trigger-description": "",
                  "trigger_type": "manual"
                }
              }')

            curl -X POST \
              -H "Authorization: Bearer $PAT_TOKEN" \
              -H "Accept: application/vnd.github+json" \
              -d "$payload" \
              https://api.github.com/repos/OpenHands/evaluation/actions/workflows/create-branch-v1.yml/dispatches
          done

          # Best-effort Slack notification; skipped when no token is configured
          if [ -n "${{ secrets.SLACK_TOKEN }}" ]; then
            if [[ "${{ github.event_name }}" == "pull_request" ]]; then
              TRIGGER_URL="https://github.com/${{ github.repository }}/pull/${{ github.event.pull_request.number }}"
              slack_text="PR $TRIGGER_URL has triggered evaluation on $EVAL_INSTANCES instances with models: $MODELS_TEXT (branch $EVAL_BRANCH)."
              # NOTE(review): ensure SLACK_TOKEN points at a public channel, since these notifications (and results) are shared there.
            elif [[ "${{ github.event_name }}" == "release" ]]; then
              TRIGGER_URL="https://github.com/${{ github.repository }}/releases/tag/${{ github.ref_name }}"
              slack_text="Release $TRIGGER_URL has triggered evaluation on $EVAL_INSTANCES instances with models: $MODELS_TEXT (branch $EVAL_BRANCH)."
            else
              reason="${{ github.event.inputs.reason || 'No reason provided' }}"
              TRIGGER_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
              slack_text="Manual trigger ($reason) has triggered evaluation on $EVAL_INSTANCES instances with models: $MODELS_TEXT for branch $EVAL_BRANCH."
            fi
            curl -X POST -H 'Content-type: application/json' --data '{"text":"'"$slack_text"'"}' \
              https://hooks.slack.com/services/${{ secrets.SLACK_TOKEN }}
          fi

      # Post a summary comment on the PR (or master issue) linking the triggered evaluation
      - name: Comment on issue/PR
        uses: KeisukeYamashita/create-comment@v1
        with:
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || env.MASTER_EVAL_ISSUE_NUMBER }}
          unique: false
          comment: |
            **Evaluation Triggered**

            **Trigger:** ${{ github.event_name == 'pull_request' && format('Pull Request #{0}', github.event.pull_request.number) || (github.event_name == 'release' && 'Release') || format('Manual Trigger: {0}', github.event.inputs.reason || 'No reason provided') }}
            **Branch:** ${{ steps.eval_params.outputs.eval_branch }}
            **Instances:** ${{ steps.eval_params.outputs.eval_instances }}
            **Models:** ${{ steps.eval_params.outputs.models_text }}
            **Commit:** ${{ github.sha }}

            Running evaluation on the specified branch. Once eval is done, the results will be posted here.
Loading