2 changes: 1 addition & 1 deletion .github/workflows/ci-lint.yml
@@ -22,4 +22,4 @@ jobs:

- name: Run flake8
run: |
flake8 astabench/ tests/
flake8 agent_baselines/ tests/
23 changes: 18 additions & 5 deletions .github/workflows/ci-test.yml
@@ -61,17 +61,30 @@ jobs:
with:
context: .
file: ./docker/Dockerfile
tags: astabench:latest
target: astabench-base
cache-from: type=gha,scope=astabench # Use GitHub Actions cache
cache-to: type=gha,mode=max,scope=astabench
tags: agent-baselines:latest
target: agent-baselines-base
cache-from: type=gha,scope=agent-baselines # Use GitHub Actions cache
cache-to: type=gha,mode=max,scope=agent-baselines
load: true # Make the image available for later steps

# Install uv for Python package management
- name: Install uv
uses: astral-sh/setup-uv@v3

# Copy sandbox files from installed astabench package
- name: Copy astabench sandbox files
run: |
# Find the astabench installation and copy sandbox files
ASTABENCH_PATH=$(uv run python -c "import astabench; import os; print(os.path.dirname(astabench.__file__))")
mkdir -p ./temp-sandbox/
cp -r "$ASTABENCH_PATH/util/sandbox/"* ./temp-sandbox/
ls -la ./temp-sandbox/

# Build the image that will run the sandbox (needed for some tests)
- name: Build sandbox Docker image
uses: docker/build-push-action@v5
with:
context: ./astabench/util/sandbox/
context: ./temp-sandbox/
tags: astabench-sandbox:latest
cache-from: type=gha,scope=astabench-sandbox
cache-to: type=gha,mode=max,scope=astabench-sandbox
61 changes: 24 additions & 37 deletions Makefile
@@ -3,20 +3,20 @@
# allow passing extra pytest args, e.g. make test-expensive PYTEST_ARGS="-k EVAL_NAME"
PYTEST_ARGS ?=

ASTABENCH_TAG := astabench
CONTAINER_NAME := astabench-container
AGENT_BASELINES_TAG := agent-baselines
CONTAINER_NAME := agent-baselines-container
DOCKER_SOCKET_PATH ?= $(if $(XDG_RUNTIME_DIR),$(XDG_RUNTIME_DIR)/docker.sock,/var/run/docker.sock)

ENV_ARGS :=

# Name of solver to build Docker container for
SOLVER :=
# Docker image tag for the solver
TARGET := --target astabench-base
TARGET := --target agent-baselines-base

ifdef SOLVER
TARGET := --target $(SOLVER)
ASTABENCH_TAG := $(ASTABENCH_TAG)-$(SOLVER)
AGENT_BASELINES_TAG := $(AGENT_BASELINES_TAG)-$(SOLVER)
ENV_ARGS += --env-file solvers/$(SOLVER)/env
endif

@@ -33,17 +33,9 @@ ifdef HF_TOKEN
ENV_ARGS += -e HF_TOKEN
endif

ifdef GITHUB_ACCESS_TOKEN
ENV_ARGS += -e GITHUB_ACCESS_TOKEN
endif

# Also support .env file if it exists
ifneq ("$(wildcard .env)","")
ENV_ARGS += --env-file .env
# Load GITHUB_ACCESS_TOKEN from .env if not already set
ifndef GITHUB_ACCESS_TOKEN
GITHUB_ACCESS_TOKEN := $(shell grep '^GITHUB_ACCESS_TOKEN=' .env 2>/dev/null | cut -d'=' -f2)
endif
endif

# -----------------------------------------------------------------------------
@@ -52,45 +44,40 @@ endif
ifeq ($(IS_CI),true)
LOCAL_MOUNTS :=
ENV_ARGS += -e IS_CI
TEST_RUN := docker run --rm $(ENV_ARGS) -v /var/run/docker.sock:/var/run/docker.sock $(ASTABENCH_TAG)
TEST_RUN := docker run --rm $(ENV_ARGS) -v /var/run/docker.sock:/var/run/docker.sock $(AGENT_BASELINES_TAG)
BUILD_QUIET := --quiet
else
LOCAL_MOUNTS := \
-v $(DOCKER_SOCKET_PATH):/var/run/docker.sock \
-v $$(pwd)/pyproject.toml:/astabench/pyproject.toml:ro \
-v $$(pwd)/astabench:/astabench/astabench \
-v $$(pwd)/tests:/astabench/tests \
-v $$(pwd)/logs:/astabench/logs \
-v astabench-cache:/root/.cache
TEST_RUN := docker run --rm $(ENV_ARGS) $(LOCAL_MOUNTS) $(ASTABENCH_TAG)
-v $$(pwd)/pyproject.toml:/agent-baselines/pyproject.toml:ro \
-v $$(pwd)/agent_baselines:/agent-baselines/agent_baselines \
-v $$(pwd)/tests:/agent-baselines/tests \
-v $$(pwd)/logs:/agent-baselines/logs \
-v agent-baselines-cache:/root/.cache
TEST_RUN := docker run --rm $(ENV_ARGS) $(LOCAL_MOUNTS) $(AGENT_BASELINES_TAG)
BUILD_QUIET ?=
endif

# -----------------------------------------------------------------------------
# Build the Docker image (primary target)
# -----------------------------------------------------------------------------
# Build args for GitHub authentication
BUILD_ARGS :=
ifdef GITHUB_ACCESS_TOKEN
BUILD_ARGS := --build-arg GITHUB_ACCESS_TOKEN=$(GITHUB_ACCESS_TOKEN)
endif

build-image:
@if [ -z "$(GITHUB_ACCESS_TOKEN)" ]; then \
echo "Warning: GITHUB_ACCESS_TOKEN not set. This may cause issues with private GitHub repositories."; \
echo "To set it, export GITHUB_ACCESS_TOKEN=<your_token> or add it to .env file"; \
fi
docker build $(BUILD_QUIET) $(BUILD_ARGS) $(TARGET) . --tag $(ASTABENCH_TAG) -f ./docker/Dockerfile
docker build $(BUILD_QUIET) $(TARGET) . --tag $(AGENT_BASELINES_TAG) -f ./docker/Dockerfile

# -----------------------------------------------------------------------------
# Interactive shell in container
# -----------------------------------------------------------------------------
shell: build-image
@docker run --rm -it --name $(CONTAINER_NAME) \
$(LOCAL_MOUNTS) \
-v astabench-home:/root/.astabench \
-v agent-baselines-home:/root/.agent-baselines \
$(ENV_ARGS) -p 7575:7575 \
$(ASTABENCH_TAG) \
$(AGENT_BASELINES_TAG) \
/bin/bash

# -----------------------------------------------------------------------------
@@ -109,8 +96,8 @@ endif

format:
docker run --rm \
-v $$(pwd):/astabench \
$(ASTABENCH_TAG) \
-v $$(pwd):/agent-baselines \
$(AGENT_BASELINES_TAG) \
sh -c "pip install --no-cache-dir black && black ."

ifneq ($(IS_CI),true)
@@ -119,31 +106,31 @@ endif

mypy:
docker run --rm \
-v $$(pwd):/astabench \
$(ASTABENCH_TAG) \
uv run mypy astabench/ tests/
-v $$(pwd):/agent-baselines \
$(AGENT_BASELINES_TAG) \
uv run mypy agent_baselines/ tests/

ifneq ($(IS_CI),true)
flake: build-image
endif

flake:
docker run --rm \
$(ASTABENCH_TAG) \
uv run flake8 astabench/ tests/
$(AGENT_BASELINES_TAG) \
uv run flake8 agent_baselines/ tests/

ifneq ($(IS_CI),true)
test: build-image
endif

test:
@$(TEST_RUN) uv run --no-sync --extra dev --extra inspect_evals --extra smolagents \
-m pytest $(PYTEST_ARGS) -vv /astabench/tests
@$(TEST_RUN) uv run --no-sync --extra dev --extra smolagents \
-m pytest $(PYTEST_ARGS) -vv /agent-baselines/tests

ifneq ($(IS_CI),true)
test-expensive: build-image
endif

test-expensive:
@$(TEST_RUN) uv run --no-sync --extra dev --extra inspect_evals --extra smolagents \
-m pytest $(PYTEST_ARGS) -vv -o addopts= -m expensive /astabench/tests
-m pytest $(PYTEST_ARGS) -vv -o addopts= -m expensive /agent-baselines/tests
91 changes: 90 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
# agent-baselines

The repo contains baseline implementations of a variety of agents, as reported in [AstaBench]().
The repo contains baseline implementations of a variety of agents, as reported in [AstaBench](https://github.com/allenai/asta-bench).

These agents are implemented as Inspect AI solvers and can be run on the AstaBench suite with the `astabench eval` command.

@@ -21,3 +21,92 @@ root:/astabench# ./solvers/<solver_name>/demo.sh
```

See documentation in [asta-bench](https://github.com/allenai/asta-bench) for details on how to run the suite.


## Available Agents

AstaBench includes several built-in solvers. Look in the `solvers` directory for setup and demo scripts.

### ReAct Agent
The [basic react agent](/astabench/solvers/react/basic_agent.py) is a simple ReAct implementation that uses LLM tool-calling in a loop. It supports all [AstaBench tool options](/astabench/tools/__init__.py) and configurable `max_steps`.
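
A minimal invocation might look like the sketch below; the task path and model are placeholders, `instantiated_basic_agent` is the entry point imported elsewhere in this repo, and `max_steps` is passed as a solver argument:

``` bash
# Illustrative sketch: substitute a real task path for <path/to/task.py>.
inspect eval \
  --model openai/gpt-4.1 \
  --solver agent_baselines/solvers/react/basic_agent.py@instantiated_basic_agent \
  <path/to/task.py> \
  -S max_steps=15
```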

### Code Agent
The [code agent](/astabench/solvers/code_agent/) is a ReAct-style agent optimized for coding tasks with sandbox execution. It supports multiple agent types (`react`, `llm`) and a configurable number of retries.

### Smolagents
The [smolagents coder](/astabench/solvers/smolagents/agent.py) integrates HuggingFace's CodeAgent, which takes actions by writing Python code rather than using JSON tool-calling. It requires a sandbox and uses `SandboxToolManager` to make tools accessible from sandboxed code.
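
Because it executes code in a sandbox, an invocation also passes a sandbox spec. A rough sketch follows; the `smolagents_coder` entry-point name and the task path are hypothetical placeholders, and the sandbox compose file is the one referenced by the other coding solvers:

``` bash
# Sketch only: the entry-point name and task path are placeholders.
inspect eval \
  --model openai/gpt-4.1 \
  --solver agent_baselines/solvers/smolagents/agent.py@smolagents_coder \
  <path/to/task.py> \
  --sandbox docker:astabench/util/sandbox/sandbox_compose.yaml
```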

### Task-Specific Solvers
- **SQA**: Long-form question answering system with retrieval and reranking, designed for the SQA task
- **DataVoyager**: Agent for data analysis and exploration tasks
- **STORM**: Knowledge curation system for comprehensive report generation
- **FutureHouse**: Literature review and scientific writing agent
- **SUPER**: Agent designed specifically for SUPER benchmark tasks

Each solver directory contains `setup.sh` for installation and `demo.sh` with example commands.
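
For example, a typical workflow from the repository root looks roughly like this (replace `<solver_name>` with one of the directories under `solvers/`):

``` bash
# <solver_name> is any directory under solvers/, e.g. one of the solvers listed above.
./solvers/<solver_name>/setup.sh   # install the solver's extra dependencies
./solvers/<solver_name>/demo.sh    # run the solver's example evaluation command
```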

## Running ScholarQA evaluations with dvc
[`dvc`](https://dvc.org) is used to perform "release" runs of the SQA evaluation. If you are just debugging, there is no need to use `dvc`. The evaluation suite is tracked in `dvc.yaml`, where the stages of the pipeline are defined along with the inputs and outputs of each stage. Stage outputs are cached on S3 and shared across systems.

### Getting started

First, install `dvc`:

``` bash
pip install 'dvc[s3]'
```

The main command for dvc is `dvc repro`. It checks which stages in your pipeline need to be run and runs them, using a cache to avoid rerunning stages unnecessarily. See the dvc documentation for more details.

To force the pipeline to run (ignoring the cache):

``` bash
dvc repro --force
```

If you just want to run a single stage (the "@" specifies the loop value to run):

``` bash
dvc repro [email protected]
```

### When to force a stage to run
`dvc repro` only executes a stage when:
* Any of its `dependencies` changed (determined by comparing each dependency's hash with the one stored in `dvc.lock`).
* The definition of the stage changed (i.e. the `dependencies`, `outputs`, or `cmd` as defined in `dvc.yaml`).

Sometimes we need to manually force a stage to run since the code for a stage can change (for example a bugfix or improvement to a solver or scorer) without the `cmd` for that stage changing. For example, I might change the prompt used in the scorer for SQA, in which case I should manually force the scoring stage (and all its descendants) to rerun:

``` bash
dvc repro score_sqa --force --downstream
```

I could also just rerun the `score_sqa` stage if I knew that the downstream stages shouldn't be affected by this:
``` bash
dvc repro score_sqa --force --single-item
```

You can check if the `dependencies` of any stages have changed by running `dvc status`:

``` bash
summarize_scores:
changed deps:
modified: dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval
```
This tells me that the scores for the Claude 3.7 solver have changed and I need to rerun the stage that aggregates the results into a table (`summarize_scores`). You can do this by running `dvc repro`.

### Collaborating with others on the same project
When someone pushes changes to the lockfile `dvc.lock`, you can check out those changes by pulling from the cache:

``` bash
dvc pull
```

This updates your local files with the versions pushed by your collaborator.

You can push your local changes using:

``` bash
dvc push
```
Then your collaborators can pull your changes by running `dvc pull`.
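
A typical end-to-end loop looks roughly like the sketch below; it assumes the S3 remote is already configured, and the commit message is illustrative:

``` bash
dvc repro                                      # rerun stages whose dependencies changed
git add dvc.lock
git commit -m "Update SQA evaluation results"
dvc push                                       # upload stage outputs to the shared S3 cache
git push                                       # share dvc.lock; collaborators then run `dvc pull`
```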
4 changes: 4 additions & 0 deletions agent_baselines/solvers/__init__.py
@@ -0,0 +1,4 @@
from astabench.util.model import normalize_model_name, record_model_usage_with_inspect

from .futurehouse import futurehouse_solver
from .llm import llm_with_prompt
@@ -4,6 +4,8 @@
import uuid
from typing import List

from astabench.evals.arxivdigestables.task import Cell, Table
from astabench.util.model import record_model_usage_with_inspect
from inspect_ai.model import ChatMessageAssistant, ModelUsage
from inspect_ai.solver import Generate, Solver, TaskState, solver
from scholarqa.config.config_setup import LogsConfig
@@ -14,9 +16,6 @@
from scholarqa.table_generation.table_generator import TableGenerator
from scholarqa.table_generation.table_model import TableWidget

from astabench.evals.arxivdigestables.task import Cell, Table
from astabench.util.model import record_model_usage_with_inspect

logger = logging.getLogger(__name__)


@@ -8,6 +8,7 @@
from pathlib import Path
from typing import Callable

from astabench.constants import ASTA_SOLVER_DATA_REPO, ASTA_SOLVER_DATA_REVISION
from huggingface_hub import hf_hub_download
from inspect_ai.model import (
ChatMessage,
@@ -20,8 +21,6 @@
from inspect_ai.tool import Tool, ToolCall, ToolDef, ToolError, tool
from pydantic import BaseModel

from astabench.constants import ASTA_SOLVER_DATA_REPO, ASTA_SOLVER_DATA_REVISION

logger = getLogger(__name__)


@@ -190,16 +189,19 @@ def fewshot_textsim_router() -> Solver:

async def solve(state: TaskState, generate: Generate) -> TaskState:
# Import solvers here to avoid dependency issues when just importing the module
import astabench.solvers.datavoyager.agent
from astabench.evals.e2e_discovery.solvers.autoasta.autoasta_cached import (
autoasta_cached_solver,
)
from astabench.solvers.arxivdigestables.asta_table_agent import tables_solver
from astabench.solvers.code_agent.agent import code_agent
from astabench.solvers.datavoyager.agent import datavoyager_solver
from astabench.solvers.react.basic_agent import instantiated_basic_agent
from astabench.solvers.search.paper_finder import ai2i_paper_finder
from astabench.solvers.sqa.sqa import sqa_solver

import agent_baselines.solvers.datavoyager.agent
from agent_baselines.solvers.arxivdigestables.asta_table_agent import (
tables_solver,
)
from agent_baselines.solvers.code_agent.agent import code_agent
from agent_baselines.solvers.datavoyager.agent import datavoyager_solver
from agent_baselines.solvers.react.basic_agent import instantiated_basic_agent
from agent_baselines.solvers.search.paper_finder import ai2i_paper_finder
from agent_baselines.solvers.sqa.sqa import sqa_solver

logger.info("Starting sample %s", state.sample_id)

@@ -241,7 +243,7 @@ def mkhandoff(name: str, inner_tool: Solver):
"discoverybench",
datavoyager_solver(
config_file=(
Path(astabench.solvers.datavoyager.agent.__file__).parent
Path(agent_baselines.solvers.datavoyager.agent.__file__).parent
/ "dv_core"
/ "config"
/ "datavoyager_modal_deployment_magentic_one_config_20250617_o3.yaml"
@@ -12,7 +12,7 @@ Using the code execution demo, one can run the following:
```sh
inspect eval \
--model openai/gpt-4.1 \
--solver astabench/solvers/code_agent/agent.py@code_agent \
--solver agent_baselines/solvers/code_agent/agent.py@code_agent \
astabench/evals/demo/code_execution/task.py \
--sandbox docker:astabench/util/sandbox/sandbox_compose.yaml
-S agent_type=react -S max_tries=10
@@ -36,7 +36,7 @@ def code_agent(

"""
try:
from astabench.solvers.code_agent.code_agent import InteractiveCodeAgent
from agent_baselines.solvers.code_agent.code_agent import InteractiveCodeAgent
except ImportError:
logger.exception(
(