2 changes: 1 addition & 1 deletion .github/workflows/ci-lint.yml
@@ -22,4 +22,4 @@ jobs:

- name: Run flake8
run: |
flake8 astabench/ tests/
flake8 agent_baselines/ tests/
23 changes: 18 additions & 5 deletions .github/workflows/ci-test.yml
@@ -61,17 +61,30 @@ jobs:
with:
context: .
file: ./docker/Dockerfile
tags: astabench:latest
target: astabench-base
cache-from: type=gha,scope=astabench # Use GitHub Actions cache
cache-to: type=gha,mode=max,scope=astabench
tags: agent-baselines:latest
target: agent-baselines-base
cache-from: type=gha,scope=agent-baselines # Use GitHub Actions cache
cache-to: type=gha,mode=max,scope=agent-baselines
load: true # Make the image available for later steps

# Install uv for Python package management
- name: Install uv
uses: astral-sh/setup-uv@v3

# Copy sandbox files from installed astabench package
- name: Copy astabench sandbox files
run: |
# Find the astabench installation and copy sandbox files
ASTABENCH_PATH=$(uv run python -c "import astabench; import os; print(os.path.dirname(astabench.__file__))")
mkdir -p ./temp-sandbox/
cp -r "$ASTABENCH_PATH/util/sandbox/"* ./temp-sandbox/
ls -la ./temp-sandbox/

# Build the image that will run the sandbox (needed for some tests)
- name: Build sandbox Docker image
uses: docker/build-push-action@v5
with:
context: ./astabench/util/sandbox/
context: ./temp-sandbox/
tags: astabench-sandbox:latest
cache-from: type=gha,scope=astabench-sandbox
cache-to: type=gha,mode=max,scope=astabench-sandbox
61 changes: 24 additions & 37 deletions Makefile
@@ -3,20 +3,20 @@
# allow passing extra pytest args, e.g. make test-expensive PYTEST_ARGS="-k EVAL_NAME"
PYTEST_ARGS ?=

ASTABENCH_TAG := astabench
CONTAINER_NAME := astabench-container
AGENT_BASELINES_TAG := agent-baselines
CONTAINER_NAME := agent-baselines-container
DOCKER_SOCKET_PATH ?= $(if $(XDG_RUNTIME_DIR),$(XDG_RUNTIME_DIR)/docker.sock,/var/run/docker.sock)

ENV_ARGS :=

# Name of solver to build Docker container for
SOLVER :=
# Docker image tag for the solver
TARGET := --target astabench-base
TARGET := --target agent-baselines-base

ifdef SOLVER
TARGET := --target $(SOLVER)
ASTABENCH_TAG := $(ASTABENCH_TAG)-$(SOLVER)
AGENT_BASELINES_TAG := $(AGENT_BASELINES_TAG)-$(SOLVER)
ENV_ARGS += --env-file solvers/$(SOLVER)/env
endif

@@ -33,17 +33,9 @@ ifdef HF_TOKEN
ENV_ARGS += -e HF_TOKEN
endif

ifdef GITHUB_ACCESS_TOKEN
ENV_ARGS += -e GITHUB_ACCESS_TOKEN
endif

# Also support .env file if it exists
ifneq ("$(wildcard .env)","")
ENV_ARGS += --env-file .env
# Load GITHUB_ACCESS_TOKEN from .env if not already set
ifndef GITHUB_ACCESS_TOKEN
GITHUB_ACCESS_TOKEN := $(shell grep '^GITHUB_ACCESS_TOKEN=' .env 2>/dev/null | cut -d'=' -f2)
endif
endif

# -----------------------------------------------------------------------------
@@ -52,45 +44,40 @@ endif
ifeq ($(IS_CI),true)
LOCAL_MOUNTS :=
ENV_ARGS += -e IS_CI
TEST_RUN := docker run --rm $(ENV_ARGS) -v /var/run/docker.sock:/var/run/docker.sock $(ASTABENCH_TAG)
TEST_RUN := docker run --rm $(ENV_ARGS) -v /var/run/docker.sock:/var/run/docker.sock $(AGENT_BASELINES_TAG)
BUILD_QUIET := --quiet
else
LOCAL_MOUNTS := \
-v $(DOCKER_SOCKET_PATH):/var/run/docker.sock \
-v $$(pwd)/pyproject.toml:/astabench/pyproject.toml:ro \
-v $$(pwd)/astabench:/astabench/astabench \
-v $$(pwd)/tests:/astabench/tests \
-v $$(pwd)/logs:/astabench/logs \
-v astabench-cache:/root/.cache
TEST_RUN := docker run --rm $(ENV_ARGS) $(LOCAL_MOUNTS) $(ASTABENCH_TAG)
-v $$(pwd)/pyproject.toml:/agent-baselines/pyproject.toml:ro \
-v $$(pwd)/agent_baselines:/agent-baselines/agent_baselines \
-v $$(pwd)/tests:/agent-baselines/tests \
-v $$(pwd)/logs:/agent-baselines/logs \
-v agent-baselines-cache:/root/.cache
TEST_RUN := docker run --rm $(ENV_ARGS) $(LOCAL_MOUNTS) $(AGENT_BASELINES_TAG)
BUILD_QUIET ?=
endif

# -----------------------------------------------------------------------------
# Build the Docker image (primary target)
# -----------------------------------------------------------------------------
# Build args for GitHub authentication
BUILD_ARGS :=
ifdef GITHUB_ACCESS_TOKEN
BUILD_ARGS := --build-arg GITHUB_ACCESS_TOKEN=$(GITHUB_ACCESS_TOKEN)
endif

build-image:
@if [ -z "$(GITHUB_ACCESS_TOKEN)" ]; then \
echo "Warning: GITHUB_ACCESS_TOKEN not set. This may cause issues with private GitHub repositories."; \
echo "To set it, export GITHUB_ACCESS_TOKEN=<your_token> or add it to .env file"; \
fi
docker build $(BUILD_QUIET) $(BUILD_ARGS) $(TARGET) . --tag $(ASTABENCH_TAG) -f ./docker/Dockerfile
docker build $(BUILD_QUIET) $(TARGET) . --tag $(AGENT_BASELINES_TAG) -f ./docker/Dockerfile

# -----------------------------------------------------------------------------
# Interactive shell in container
# -----------------------------------------------------------------------------
shell: build-image
@docker run --rm -it --name $(CONTAINER_NAME) \
$(LOCAL_MOUNTS) \
-v astabench-home:/root/.astabench \
-v agent-baselines-home:/root/.agent-baselines \
$(ENV_ARGS) -p 7575:7575 \
$(ASTABENCH_TAG) \
$(AGENT_BASELINES_TAG) \
/bin/bash

# -----------------------------------------------------------------------------
@@ -109,8 +96,8 @@ endif

format:
docker run --rm \
-v $$(pwd):/astabench \
$(ASTABENCH_TAG) \
-v $$(pwd):/agent-baselines \
$(AGENT_BASELINES_TAG) \
sh -c "pip install --no-cache-dir black && black ."

ifneq ($(IS_CI),true)
@@ -119,31 +106,31 @@ endif

mypy:
docker run --rm \
-v $$(pwd):/astabench \
$(ASTABENCH_TAG) \
uv run mypy astabench/ tests/
-v $$(pwd):/agent-baselines \
$(AGENT_BASELINES_TAG) \
uv run mypy agent_baselines/ tests/

ifneq ($(IS_CI),true)
flake: build-image
endif

flake:
docker run --rm \
$(ASTABENCH_TAG) \
uv run flake8 astabench/ tests/
$(AGENT_BASELINES_TAG) \
uv run flake8 agent_baselines/ tests/

ifneq ($(IS_CI),true)
test: build-image
endif

test:
@$(TEST_RUN) uv run --no-sync --extra dev --extra inspect_evals --extra smolagents \
-m pytest $(PYTEST_ARGS) -vv /astabench/tests
@$(TEST_RUN) uv run --no-sync --extra dev --extra smolagents \
-m pytest $(PYTEST_ARGS) -vv /agent-baselines/tests

ifneq ($(IS_CI),true)
test-expensive: build-image
endif

test-expensive:
@$(TEST_RUN) uv run --no-sync --extra dev --extra inspect_evals --extra smolagents \
-m pytest $(PYTEST_ARGS) -vv -o addopts= -m expensive /astabench/tests
-m pytest $(PYTEST_ARGS) -vv -o addopts= -m expensive /agent-baselines/tests
91 changes: 90 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
# agent-baselines

The repo contains baseline implementations of a variety of agents, as reported in [AstaBench]().
The repo contains baseline implementations of a variety of agents, as reported in [AstaBench](https://github.com/allenai/asta-bench).

These agents are implemented as Inspect AI solvers and can be run on the AstaBench suite with the `astabench eval` command.

@@ -21,3 +21,92 @@ root:/astabench# ./solvers/<solver_name>/demo.sh
```

See documentation in [asta-bench](https://github.com/allenai/asta-bench) for details on how to run the suite.


## Available Agents

AstaBench includes several built-in solvers. Look in the `solvers` directory for setup and demo scripts.

### ReAct Agent
The [basic react agent](/astabench/solvers/react/basic_agent.py) is a simple ReAct implementation that uses LLM tool-calling in a loop. It supports all [AstaBench tool options](/astabench/tools/__init__.py) and configurable `max_steps`.
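
A minimal invocation might look like the sketch below; the task path and model are placeholders, `instantiated_basic_agent` is the entry point imported elsewhere in this repo, and `max_steps` is passed as a solver argument:

``` bash
# Illustrative sketch: substitute a real task path for <path/to/task.py>.
inspect eval \
  --model openai/gpt-4.1 \
  --solver agent_baselines/solvers/react/basic_agent.py@instantiated_basic_agent \
  <path/to/task.py> \
  -S max_steps=15
```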

### Code Agent
The [code agent](/astabench/solvers/code_agent/) is a ReAct-style agent optimized for coding tasks with sandbox execution. It supports multiple agent types (`react`, `llm`) and a configurable number of retries.

### Smolagents
The [smolagents coder](/astabench/solvers/smolagents/agent.py) integrates HuggingFace's CodeAgent, which takes actions by writing Python code rather than using JSON tool-calling. It requires a sandbox and uses `SandboxToolManager` to make tools accessible from sandboxed code.
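
Because it executes code in a sandbox, an invocation also passes a sandbox spec. A rough sketch follows; the `smolagents_coder` entry-point name and the task path are hypothetical placeholders, and the sandbox compose file is the one referenced by the other coding solvers:

``` bash
# Sketch only: the entry-point name and task path are placeholders.
inspect eval \
  --model openai/gpt-4.1 \
  --solver agent_baselines/solvers/smolagents/agent.py@smolagents_coder \
  <path/to/task.py> \
  --sandbox docker:astabench/util/sandbox/sandbox_compose.yaml
```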

### Task-Specific Solvers
- **SQA**: Long-form question answering system with retrieval and reranking, designed for the SQA task
- **DataVoyager**: Agent for data analysis and exploration tasks
- **STORM**: Knowledge curation system for comprehensive report generation
- **FutureHouse**: Literature review and scientific writing agent
- **SUPER**: Agent designed specifically for SUPER benchmark tasks

Each solver directory contains `setup.sh` for installation and `demo.sh` with example commands.
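
For example, a typical workflow from the repository root looks roughly like this (replace `<solver_name>` with one of the directories under `solvers/`):

``` bash
# <solver_name> is any directory under solvers/, e.g. one of the solvers listed above.
./solvers/<solver_name>/setup.sh   # install the solver's extra dependencies
./solvers/<solver_name>/demo.sh    # run the solver's example evaluation command
```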

## Running ScholarQA evaluations with dvc
[`dvc`](https://dvc.org) is used to perform "release" runs of the SQA evaluation. If you are just debugging, there is no need to use `dvc`. The evaluation suite is tracked in `dvc.yaml`, where the stages of the pipeline are defined along with the inputs and outputs of each stage. Stage outputs are cached on S3 and shared across systems.

### Getting started

First, install `dvc`:

``` bash
pip install 'dvc[s3]'
```

The main command for dvc is `dvc repro`. It checks which stages in your pipeline need to be run and runs them, using a cache to avoid rerunning stages unnecessarily. See the dvc documentation for more details.

To force the pipeline to run (ignoring the cache):

``` bash
dvc repro --force
```

If you just want to run a single stage (the "@" specifies the loop value to run):

``` bash
dvc repro [email protected]
```

### When to force a stage to run
`dvc repro` only executes a stage when:
* Any of its `dependencies` changed (determined by comparing each dependency's hash with the one stored in `dvc.lock`).
* The definition of the stage changed (i.e. the `dependencies`, `outputs`, or `cmd` as defined in `dvc.yaml`).

Sometimes we need to manually force a stage to run since the code for a stage can change (for example a bugfix or improvement to a solver or scorer) without the `cmd` for that stage changing. For example, I might change the prompt used in the scorer for SQA, in which case I should manually force the scoring stage (and all its descendants) to rerun:

``` bash
dvc repro score_sqa --force --downstream
```

I could also just rerun the `score_sqa` stage if I knew that the downstream stages shouldn't be affected by this:
``` bash
dvc repro score_sqa --force --single-item
```

You can check if the `dependencies` of any stages have changed by running `dvc status`:

``` bash
summarize_scores:
changed deps:
modified: dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval
```
This tells me that the scores for the Claude 3.7 solver have changed and I need to rerun the stage that aggregates the results into a table (`summarize_scores`). You can do this by running `dvc repro`.

### Collaborating with others on the same project
When someone pushes changes to the lockfile `dvc.lock`, you can check out those changes by pulling from the cache:

``` bash
dvc pull
```

This updates your local files with the versions pushed by your collaborator.

You can push your local changes using:

``` bash
dvc push
```
Then your collaborators can pull your changes by running `dvc pull`.
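
A typical end-to-end loop looks roughly like the sketch below; it assumes the S3 remote is already configured, and the commit message is illustrative:

``` bash
dvc repro                                      # rerun stages whose dependencies changed
git add dvc.lock
git commit -m "Update SQA evaluation results"
dvc push                                       # upload stage outputs to the shared S3 cache
git push                                       # share dvc.lock; collaborators then run `dvc pull`
```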
4 changes: 4 additions & 0 deletions agent_baselines/solvers/__init__.py
@@ -0,0 +1,4 @@
from astabench.util.model import normalize_model_name, record_model_usage_with_inspect

from .futurehouse import futurehouse_solver
from .llm import llm_with_prompt
@@ -4,6 +4,8 @@
import uuid
from typing import List

from astabench.evals.arxivdigestables.task import Cell, Table
from astabench.util.model import record_model_usage_with_inspect
from inspect_ai.model import ChatMessageAssistant, ModelUsage
from inspect_ai.solver import Generate, Solver, TaskState, solver
from scholarqa.config.config_setup import LogsConfig
@@ -14,9 +16,6 @@
from scholarqa.table_generation.table_generator import TableGenerator
from scholarqa.table_generation.table_model import TableWidget

from astabench.evals.arxivdigestables.task import Cell, Table
from astabench.util.model import record_model_usage_with_inspect

logger = logging.getLogger(__name__)


@@ -8,6 +8,7 @@
from pathlib import Path
from typing import Callable

from astabench.constants import ASTA_SOLVER_DATA_REPO, ASTA_SOLVER_DATA_REVISION
from huggingface_hub import hf_hub_download
from inspect_ai.model import (
ChatMessage,
@@ -20,8 +21,6 @@
from inspect_ai.tool import Tool, ToolCall, ToolDef, ToolError, tool
from pydantic import BaseModel

from astabench.constants import ASTA_SOLVER_DATA_REPO, ASTA_SOLVER_DATA_REVISION

logger = getLogger(__name__)


@@ -190,16 +189,19 @@ def fewshot_textsim_router() -> Solver:

async def solve(state: TaskState, generate: Generate) -> TaskState:
# Import solvers here to avoid dependency issues when just importing the module
import astabench.solvers.datavoyager.agent
from astabench.evals.e2e_discovery.solvers.autoasta.autoasta_cached import (
autoasta_cached_solver,
)
from astabench.solvers.arxivdigestables.asta_table_agent import tables_solver
from astabench.solvers.code_agent.agent import code_agent
from astabench.solvers.datavoyager.agent import datavoyager_solver
from astabench.solvers.react.basic_agent import instantiated_basic_agent
from astabench.solvers.search.paper_finder import ai2i_paper_finder
from astabench.solvers.sqa.sqa import sqa_solver

import agent_baselines.solvers.datavoyager.agent
from agent_baselines.solvers.arxivdigestables.asta_table_agent import (
tables_solver,
)
from agent_baselines.solvers.code_agent.agent import code_agent
from agent_baselines.solvers.datavoyager.agent import datavoyager_solver
from agent_baselines.solvers.react.basic_agent import instantiated_basic_agent
from agent_baselines.solvers.search.paper_finder import ai2i_paper_finder
from agent_baselines.solvers.sqa.sqa import sqa_solver

logger.info("Starting sample %s", state.sample_id)

@@ -241,7 +243,7 @@ def mkhandoff(name: str, inner_tool: Solver):
"discoverybench",
datavoyager_solver(
config_file=(
Path(astabench.solvers.datavoyager.agent.__file__).parent
Path(agent_baselines.solvers.datavoyager.agent.__file__).parent
/ "dv_core"
/ "config"
/ "datavoyager_modal_deployment_magentic_one_config_20250617_o3.yaml"
@@ -12,7 +12,7 @@ Using the code execution demo, one can run the following:
```sh
inspect eval \
--model openai/gpt-4.1 \
--solver astabench/solvers/code_agent/agent.py@code_agent \
--solver agent_baselines/solvers/code_agent/agent.py@code_agent \
astabench/evals/demo/code_execution/task.py \
--sandbox docker:astabench/util/sandbox/sandbox_compose.yaml
-S agent_type=react -S max_tries=10
@@ -36,7 +36,7 @@ def code_agent(

"""
try:
from astabench.solvers.code_agent.code_agent import InteractiveCodeAgent
from agent_baselines.solvers.code_agent.code_agent import InteractiveCodeAgent
except ImportError:
logger.exception(
(