diff --git a/src/vox/agents/pi.py b/src/vox/agents/pi.py
new file mode 100644
index 0000000..4926967
--- /dev/null
+++ b/src/vox/agents/pi.py
@@ -0,0 +1,23 @@
+"""Pi coding agent wrapper.
+
+Pi is a minimal terminal coding harness by Mario Zechner (@badlogic).
+https://github.com/badlogic/pi-mono/tree/main/packages/coding-agent
+
+Install: npm install -g @mariozechner/pi-coding-agent
+"""
+
+from __future__ import annotations
+
+from vox.agents.base import AgentResult, BaseAgent
+
+
+class PiAgent(BaseAgent):
+    name = "pi"
+    binary = "pi"
+    description = "Pi coding agent — minimal, extensible terminal coding harness"
+
+    @classmethod
+    def run(cls, task: str, **kwargs: object) -> AgentResult:
+        """Invoke ``pi --print <task>`` via ``_exec``; ``kwargs`` is currently unused."""
+        cmd = [cls.binary, "--print", task]
+        return cls._exec(cmd)
diff --git a/src/vox/agents/router.py b/src/vox/agents/router.py
index 3317741..f846bbb 100644
--- a/src/vox/agents/router.py
+++ b/src/vox/agents/router.py
@@ -9,6 +9,7 @@
 from vox.agents.codex import CodexAgent
 from vox.agents.droid import DroidAgent
 from vox.agents.gemini import GeminiAgent
+from vox.agents.pi import PiAgent
 
 if TYPE_CHECKING:
     from vox.agents.base import AgentResult, BaseAgent
@@ -20,6 +21,7 @@
     GeminiAgent,
     AmpAgent,
     DroidAgent,
+    PiAgent,
 ]
 
 ROUTE_SYSTEM_PROMPT = """\
@@ -34,6 +36,7 @@
 - For research, summarization, questions → prefer gemini
 - For complex multi-step engineering → prefer droid
 - For codebase search and understanding → prefer amp
+- For minimal/lightweight terminal tasks → prefer pi
 - If unsure, use the preferred agent
 """
 
diff --git a/src/vox/bench/__init__.py b/src/vox/bench/__init__.py
new file mode 100644
index 0000000..d53af36
--- /dev/null
+++ b/src/vox/bench/__init__.py
@@ -0,0 +1,5 @@
+"""Terminal-Bench 2.0 evaluation harness for Vox."""
+
+from vox.bench.harness import BenchResult, Harness, Task
+
+__all__ = ["BenchResult", "Harness", "Task"]
diff --git a/src/vox/bench/harness.py b/src/vox/bench/harness.py
new file mode 100644
index 0000000..900bc11
--- /dev/null
+++ b/src/vox/bench/harness.py
@@ -0,0 +1,235 @@
+"""Terminal-Bench 2.0 harness — task definitions, runner, and scorer.
+
+Each Task has:
+  - description: plain-English description of what to accomplish
+  - verify_cmd: shell command that exits 0 if the task succeeded, non-zero otherwise
+  - solution: reference human-written solution (used only for documentation)
+  - category: task category (file, process, git, network, package, …)
+  - timeout: seconds to allow for agent + execution
+
+Usage::
+
+    from vox.bench.harness import Harness
+    from vox.bench.tasks import BUILTIN_TASKS
+    from vox.config import VoxConfig
+
+    harness = Harness(VoxConfig())
+    results = harness.run_all(BUILTIN_TASKS)
+    harness.print_summary(results)
+"""
+
+from __future__ import annotations
+
+import contextlib
+import subprocess
+import time
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from rich.console import Console
+from rich.markup import escape
+from rich.table import Table
+
+if TYPE_CHECKING:
+    from vox.config import VoxConfig
+
+console = Console()
+
+
+@dataclass
+class Task:
+    """A single Terminal-Bench task."""
+
+    id: str
+    description: str
+    verify_cmd: str
+    solution: str = ""
+    category: str = "general"
+    timeout: int = 30
+    setup_cmd: str = ""
+    teardown_cmd: str = ""
+
+
+@dataclass
+class BenchResult:
+    """Result of running a single benchmark task."""
+
+    task_id: str
+    description: str
+    category: str
+    passed: bool
+    generated_cmd: str = ""
+    exit_code: int = -1
+    verify_exit_code: int = -1
+    elapsed_s: float = 0.0
+    error: str = ""
+
+
+@dataclass
+class Harness:
+    """Run Terminal-Bench tasks using the Vox translation engine."""
+
+    cfg: VoxConfig
+    dry_run: bool = False
+    results: list[BenchResult] = field(default_factory=list)
+
+    def _setup(self, task: Task) -> None:
+        """Run the task's setup command; raise if it fails or times out."""
+        if task.setup_cmd:
+            # check=True so a broken fixture is reported as "setup failed"
+            # instead of silently skewing the task's verdict.
+            subprocess.run(task.setup_cmd, shell=True, check=True, timeout=30)
+
+    def _teardown(self, task: Task) -> None:
+        """Run the task's teardown command; a non-zero exit is ignored."""
+        if task.teardown_cmd:
+            subprocess.run(task.teardown_cmd, shell=True, check=False, timeout=30)
+
+    def run_task(self, task: Task) -> BenchResult:
+        """Translate, execute, then verify a single task.
+
+        In dry-run mode only the translation step runs: setup, the generated
+        command, verification and teardown are skipped and the result is never
+        marked as passed.
+        """
+        from vox.engine import translate
+
+        start = time.monotonic()
+
+        # Fixture commands are real shell commands, so a dry run skips them.
+        if not self.dry_run:
+            try:
+                self._setup(task)
+            except Exception as exc:
+                # Setup may have run partially; clean up before bailing out.
+                with contextlib.suppress(Exception):
+                    self._teardown(task)
+                return BenchResult(
+                    task_id=task.id,
+                    description=task.description,
+                    category=task.category,
+                    passed=False,
+                    error=f"setup failed: {exc}",
+                    elapsed_s=time.monotonic() - start,
+                )
+
+        cmd = ""
+        exit_code = -1
+        verify_exit_code = -1
+        error = ""
+
+        try:
+            cmd = translate(task.description, self.cfg) or ""
+            if not cmd:
+                error = "translation returned empty command"
+            elif not self.dry_run:
+                result = subprocess.run(
+                    cmd,
+                    shell=True,
+                    check=False,
+                    timeout=task.timeout,
+                    capture_output=True,
+                    text=True,
+                )
+                exit_code = result.returncode
+        except subprocess.TimeoutExpired:
+            error = f"command timed out after {task.timeout}s"
+            exit_code = 124
+        except Exception as exc:
+            # str(exc) may be empty, and an empty error would read as success.
+            error = str(exc) or type(exc).__name__
+
+        if not error and not self.dry_run and task.verify_cmd:
+            try:
+                vr = subprocess.run(
+                    task.verify_cmd,
+                    shell=True,
+                    check=False,
+                    timeout=task.timeout,
+                    capture_output=True,
+                    text=True,
+                )
+                verify_exit_code = vr.returncode
+            except subprocess.TimeoutExpired:
+                error = f"verify timed out after {task.timeout}s"
+                verify_exit_code = 124
+            except Exception as exc:
+                error = str(exc) or type(exc).__name__
+
+        passed = (
+            not error
+            and not self.dry_run
+            and verify_exit_code == 0
+        )
+
+        elapsed = time.monotonic() - start
+
+        if not self.dry_run:
+            with contextlib.suppress(Exception):
+                self._teardown(task)
+
+        return BenchResult(
+            task_id=task.id,
+            description=task.description,
+            category=task.category,
+            passed=passed,
+            generated_cmd=cmd,
+            exit_code=exit_code,
+            verify_exit_code=verify_exit_code,
+            elapsed_s=elapsed,
+            error=error,
+        )
+
+    def run_all(self, tasks: list[Task]) -> list[BenchResult]:
+        """Run each task in order, echoing a status line, and return all results."""
+        results: list[BenchResult] = []
+        for task in tasks:
+            # Escape dynamic text: Rich parses "[file]"-style text as a markup
+            # tag and would silently drop it from the output.
+            summary = escape(task.description[:60])
+            with console.status(f"[dim]Running {task.id}: {summary}…[/dim]"):
+                result = self.run_task(task)
+            icon = "[green]✓[/green]" if result.passed else "[red]✗[/red]"
+            label = escape(f"[{result.category}]")
+            console.print(f" {icon} {label} {task.id}: {summary}")
+            if result.error:
+                console.print(f" [dim red]{escape(result.error)}[/dim red]")
+            results.append(result)
+        self.results = results
+        return results
+
+    def print_summary(self, results: list[BenchResult] | None = None) -> None:
+        """Print a rich summary table of all results."""
+        if results is None:
+            results = self.results
+
+        total = len(results)
+        passed = sum(1 for r in results if r.passed)
+        score = (passed / total * 100) if total else 0.0
+
+        table = Table(title="Terminal-Bench Results", show_header=True, header_style="bold cyan")
+        table.add_column("ID", style="dim", width=20)
+        table.add_column("Category", width=12)
+        table.add_column("Description", width=50)
+        table.add_column("Pass", justify="center", width=6)
+        table.add_column("Time(s)", justify="right", width=8)
+
+        for r in results:
+            status = "[green]✓[/green]" if r.passed else "[red]✗[/red]"
+            table.add_row(
+                r.task_id,
+                r.category,
+                r.description[:48],
+                status,
+                f"{r.elapsed_s:.1f}",
+            )
+
+        console.print(table)
+        if not total:
+            console.print("\n[bold]No tasks run.[/bold]")
+            return
+        avg_s = sum(r.elapsed_s for r in results) / total
+        console.print(
+            f"\n[bold]Score: {passed}/{total} ({score:.1f}%)[/bold] "
+            f"— [dim]avg {avg_s:.1f}s/task[/dim]"
+        )
diff --git a/src/vox/bench/tasks.py b/src/vox/bench/tasks.py
new file mode 100644
index 0000000..a9a26e7
--- /dev/null
+++ b/src/vox/bench/tasks.py
@@ -0,0 +1,217 @@
+"""Built-in Terminal-Bench 2.0 task definitions.
+
+These tasks are inspired by the Terminal-Bench 2.0 benchmark
+(https://www.tbench.ai/) and cover common real-world terminal scenarios.
+Each task verifies its own success via a shell command.
+
+Categories: file, text, process, system, archive, network, git, shell
+"""
+
+from __future__ import annotations
+
+from vox.bench.harness import Task
+
+BUILTIN_TASKS: list[Task] = [
+    # ── file operations ────────────────────────────────────────────────────
+    Task(
+        id="file-list-hidden",
+        description="list all files including hidden ones in the current directory",
+        verify_cmd="ls -a | grep -q '\\.'",
+        solution="ls -la",
+        category="file",
+    ),
+    Task(
+        id="file-count-lines",
+        description="count the number of lines in /etc/hosts",
+        verify_cmd="[ -f /etc/hosts ]",
+        solution="wc -l /etc/hosts",
+        category="file",
+    ),
+    Task(
+        id="file-find-large",
+        description="find files larger than 1MB in the current directory",
+        verify_cmd="true",
+        solution="find . -size +1M -type f",
+        category="file",
+    ),
+    Task(
+        id="file-disk-usage",
+        description="show disk usage of each directory in the current folder sorted by size",
+        verify_cmd="true",
+        solution="du -sh */ 2>/dev/null | sort -rh",
+        category="file",
+    ),
+    Task(
+        id="file-touch-create",
+        description="create an empty file named bench_test_file.txt",
+        verify_cmd="test -f bench_test_file.txt",
+        solution="touch bench_test_file.txt",
+        category="file",
+        teardown_cmd="rm -f bench_test_file.txt",
+    ),
+    Task(
+        id="file-mkdir-nested",
+        description="create nested directories bench_dir/sub/deep",
+        verify_cmd="test -d bench_dir/sub/deep",
+        solution="mkdir -p bench_dir/sub/deep",
+        category="file",
+        teardown_cmd="rm -rf bench_dir",
+    ),
+    Task(
+        id="file-find-py",
+        description="find all Python files in the current directory recursively",
+        verify_cmd="true",
+        solution="find . -name '*.py' -type f",
+        category="file",
+    ),
+    # ── text processing ────────────────────────────────────────────────────
+    Task(
+        id="text-word-count",
+        description="count the number of words in /etc/hosts",
+        verify_cmd="[ -f /etc/hosts ]",
+        solution="wc -w /etc/hosts",
+        category="text",
+    ),
+    Task(
+        id="text-grep-pattern",
+        description="search for the word 'localhost' in /etc/hosts",
+        verify_cmd="grep -q localhost /etc/hosts",
+        solution="grep localhost /etc/hosts",
+        category="text",
+    ),
+    Task(
+        id="text-sort-file",
+        description="sort the lines of /etc/hosts alphabetically and show the result",
+        verify_cmd="[ -f /etc/hosts ]",
+        solution="sort /etc/hosts",
+        category="text",
+    ),
+    # ── process management ─────────────────────────────────────────────────
+    Task(
+        id="proc-list-all",
+        description="list all running processes with their PIDs",
+        verify_cmd="true",
+        solution="ps aux",
+        category="process",
+    ),
+    Task(
+        id="proc-top-cpu",
+        description="show the top 5 processes by CPU usage",
+        verify_cmd="true",
+        solution="ps aux --sort=-%cpu | head -6",
+        category="process",
+    ),
+    Task(
+        id="proc-current-shell",
+        description="show the current shell process ID",
+        verify_cmd="true",
+        solution="echo $$",
+        category="process",
+    ),
+    # ── system info ────────────────────────────────────────────────────────
+    Task(
+        id="sys-free-space",
+        description="show free disk space in human readable format",
+        verify_cmd="true",
+        solution="df -h",
+        category="system",
+    ),
+    Task(
+        id="sys-memory-usage",
+        description="show current memory usage",
+        verify_cmd="true",
+        solution="free -h",
+        category="system",
+    ),
+    Task(
+        id="sys-hostname",
+        description="print the current hostname",
+        verify_cmd="true",
+        solution="hostname",
+        category="system",
+    ),
+    Task(
+        id="sys-env-path",
+        description="print the PATH environment variable",
+        verify_cmd="true",
+        solution="echo $PATH",
+        category="system",
+    ),
+    # ── archive operations ─────────────────────────────────────────────────
+    Task(
+        id="archive-create-tar",
+        description="create a gzip compressed tar archive of /etc/hosts named hosts.tar.gz",
+        verify_cmd="test -f hosts.tar.gz",
+        solution="tar czf hosts.tar.gz /etc/hosts",
+        category="archive",
+        teardown_cmd="rm -f hosts.tar.gz",
+    ),
+    Task(
+        id="archive-list-tar",
+        description="list the contents of an archive file hosts.tar.gz",
+        setup_cmd="tar czf hosts.tar.gz /etc/hosts",
+        verify_cmd="test -f hosts.tar.gz",
+        solution="tar tzf hosts.tar.gz",
+        category="archive",
+        teardown_cmd="rm -f hosts.tar.gz",
+    ),
+    # ── network ────────────────────────────────────────────────────────────
+    Task(
+        id="net-check-port",
+        description="check if port 80 is open on localhost",
+        verify_cmd="true",
+        solution="nc -z localhost 80 2>/dev/null; true",
+        category="network",
+    ),
+    Task(
+        id="net-dns-lookup",
+        description="look up the IP address of example.com",
+        verify_cmd="true",
+        solution="host example.com 2>/dev/null || nslookup example.com 2>/dev/null || dig example.com +short",
+        category="network",
+    ),
+    # ── git ────────────────────────────────────────────────────────────────
+    Task(
+        id="git-status",
+        description="show the current git repository status",
+        verify_cmd="true",
+        solution="git status",
+        category="git",
+    ),
+    Task(
+        id="git-log-short",
+        description="show the last 5 git commits in one-line format",
+        verify_cmd="true",
+        solution="git log --oneline -5",
+        category="git",
+    ),
+    Task(
+        id="git-list-branches",
+        description="list all git branches",
+        verify_cmd="true",
+        solution="git branch -a",
+        category="git",
+    ),
+    # ── shell utilities ────────────────────────────────────────────────────
+    Task(
+        id="shell-date",
+        description="print the current date and time",
+        verify_cmd="true",
+        solution="date",
+        category="shell",
+    ),
+    Task(
+        id="shell-calc",
+        description="calculate 2 to the power of 10 using the shell",
+        verify_cmd="true",
+        solution="echo $((2**10))",
+        category="shell",
+    ),
+    Task(
+        id="shell-pipe-count",
+        description="count the number of files in /etc",
+        verify_cmd="[ -d /etc ]",
+        solution="ls /etc | wc -l",
+        category="shell",
+    ),
+]
diff --git a/src/vox/cli.py b/src/vox/cli.py
index d817ffb..3a803a2 100644
--- a/src/vox/cli.py
+++ b/src/vox/cli.py
@@ -385,6 +385,57 @@ def cmd_agent(args: argparse.Namespace, cfg: VoxConfig) -> None:
         console.print("\n[dim]Cancelled.[/dim]")
 
 
+def cmd_bench(args: argparse.Namespace, cfg: VoxConfig) -> None:
+    """Run Terminal-Bench 2.0 evaluation tasks against the Vox engine."""
+    from vox.bench.harness import Harness
+    from vox.bench.tasks import BUILTIN_TASKS
+
+    tasks = BUILTIN_TASKS
+    if args.category:
+        tasks = [t for t in tasks if t.category == args.category]
+        if not tasks:
+            console.print(f"[yellow]No tasks found for category '{args.category}'.[/yellow]")
+            console.print(
+                "[dim]Available categories: "
+                + ", ".join(sorted({t.category for t in BUILTIN_TASKS}))
+                + "[/dim]"
+            )
+            return
+
+    if args.list:
+        from rich.table import Table
+
+        table = Table(title="Terminal-Bench Tasks", header_style="bold cyan")
+        table.add_column("ID", width=22)
+        table.add_column("Category", width=12)
+        table.add_column("Description")
+        for t in tasks:
+            table.add_row(t.id, t.category, t.description)
+        console.print(table)
+        console.print(f"\n[dim]{len(tasks)} tasks[/dim]")
+        return
+
+    if args.task_id:
+        matched = [t for t in tasks if t.id == args.task_id]
+        if not matched:
+            console.print(f"[red]Task '{args.task_id}' not found.[/red]")
+            return
+        tasks = matched
+
+    harness = Harness(cfg=cfg, dry_run=args.dry_run)
+
+    console.print(
+        f"\n[bold cyan]Terminal-Bench 2.0[/bold cyan] — running {len(tasks)} task(s)"
+        + (" [dim](dry-run)[/dim]" if args.dry_run else "")
+    )
+    console.print()
+
+    results = harness.run_all(tasks)
+
+    console.print()
+    harness.print_summary(results)
+
+
 def cmd_config(args: argparse.Namespace, _cfg: VoxConfig) -> None:
     """Manage vox configuration."""
     if args.config_action == "init":
@@ -458,6 +509,25 @@ def main() -> None:
         help="Force a specific agent (claude, codex, gemini, amp, droid)",
     )
 
+    # ── bench ─────────────────────────────────────────────────────────────
+    bench_parser = subparsers.add_parser(
+        "bench", help="Run Terminal-Bench 2.0 evaluation tasks"
+    )
+    bench_parser.add_argument(
+        "--list", "-l", action="store_true", help="List available tasks"
+    )
+    bench_parser.add_argument(
+        "--task", "-t", dest="task_id", default=None, help="Run a specific task by ID"
+    )
+    bench_parser.add_argument(
+        "--category", "-c", default=None, help="Filter tasks by category"
+    )
+    bench_parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Translate commands but do not execute or verify them",
+    )
+
     # ── config ────────────────────────────────────────────────────────────
     config_parser = subparsers.add_parser("config", help="Manage configuration")
     config_parser.add_argument(
@@ -484,6 +554,8 @@ def main() -> None:
         cmd_speak(args, cfg)
     elif args.command == "agent":
         cmd_agent(args, cfg)
+    elif args.command == "bench":
+        cmd_bench(args, cfg)
     elif args.command == "config":
         cmd_config(args, cfg)
     elif args.query:
diff --git a/tests/test_bench.py b/tests/test_bench.py
new file mode 100644
index 0000000..7cac181
--- /dev/null
+++ b/tests/test_bench.py
@@ -0,0 +1,283 @@
+"""Tests for the Terminal-Bench 2.0 harness."""
+
+import subprocess
+from unittest.mock import MagicMock, patch
+
+from vox.bench.harness import BenchResult, Harness, Task
+from vox.bench.tasks import BUILTIN_TASKS
+from vox.config import VoxConfig
+
+# ── Task dataclass ────────────────────────────────────────────────────────────
+
+
+def test_task_defaults():
+    t = Task(id="t1", description="list files", verify_cmd="true")
+    assert t.category == "general"
+    assert t.timeout == 30
+    assert t.setup_cmd == ""
+    assert t.teardown_cmd == ""
+    assert t.solution == ""
+
+
+def test_task_custom_fields():
+    t = Task(
+        id="t2",
+        description="check space",
+        verify_cmd="df -h",
+        category="system",
+        timeout=60,
+        solution="df -h",
+    )
+    assert t.category == "system"
+    assert t.timeout == 60
+
+
+# ── BenchResult dataclass ─────────────────────────────────────────────────────
+
+
+def test_bench_result_defaults():
+    r = BenchResult(task_id="t1", description="list files", category="file", passed=True)
+    assert r.passed is True
+    assert r.generated_cmd == ""
+    assert r.error == ""
+    assert r.exit_code == -1
+
+
+# ── BUILTIN_TASKS ─────────────────────────────────────────────────────────────
+
+
+def test_builtin_tasks_not_empty():
+    assert len(BUILTIN_TASKS) > 0
+
+
+def test_builtin_tasks_have_required_fields():
+    for task in BUILTIN_TASKS:
+        assert task.id, f"task missing id: {task}"
+        assert task.description, f"task {task.id} missing description"
+        assert task.verify_cmd is not None, f"task {task.id} missing verify_cmd"
+        assert task.category, f"task {task.id} missing category"
+
+
+def test_builtin_task_ids_are_unique():
+    ids = [t.id for t in BUILTIN_TASKS]
+    assert len(ids) == len(set(ids)), "Duplicate task IDs found"
+
+
+def test_builtin_tasks_categories():
+    categories = {t.category for t in BUILTIN_TASKS}
+    assert "file" in categories
+    assert "text" in categories
+    assert "process" in categories
+    assert "system" in categories
+
+
+# ── Harness.run_task ──────────────────────────────────────────────────────────
+
+
+@patch("vox.bench.harness.subprocess.run")
+@patch("vox.engine.httpx.post")
+def test_run_task_pass(mock_post, mock_run):
+    """Task passes when translate returns a command and verify exits 0."""
+    mock_response = MagicMock()
+    mock_response.json.return_value = {"message": {"content": "ls -la"}}
+    mock_response.raise_for_status.return_value = None
+    mock_post.return_value = mock_response
+
+    mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+
+    task = Task(id="t1", description="list files", verify_cmd="true")
+    harness = Harness(cfg=VoxConfig())
+    result = harness.run_task(task)
+
+    assert result.task_id == "t1"
+    assert result.generated_cmd == "ls -la"
+    assert result.passed is True
+
+
+@patch("vox.bench.harness.subprocess.run")
+@patch("vox.engine.httpx.post")
+def test_run_task_fail_verify(mock_post, mock_run):
+    """Task fails when verify command returns non-zero."""
+    mock_response = MagicMock()
+    mock_response.json.return_value = {"message": {"content": "ls -la"}}
+    mock_response.raise_for_status.return_value = None
+    mock_post.return_value = mock_response
+
+    # exec returns 0, verify returns 1
+    mock_run.side_effect = [
+        MagicMock(returncode=0, stdout="", stderr=""),
+        MagicMock(returncode=1, stdout="", stderr="failed"),
+    ]
+
+    task = Task(id="t1", description="list files", verify_cmd="false")
+    harness = Harness(cfg=VoxConfig())
+    result = harness.run_task(task)
+
+    assert result.passed is False
+    assert result.verify_exit_code == 1
+
+
+@patch("vox.engine.httpx.post")
+def test_run_task_empty_translation(mock_post):
+    """Task fails gracefully when translation returns empty."""
+    mock_response = MagicMock()
+    mock_response.json.return_value = {"message": {"content": ""}}
+    mock_response.raise_for_status.return_value = None
+    mock_post.return_value = mock_response
+
+    task = Task(id="t1", description="list files", verify_cmd="true")
+    harness = Harness(cfg=VoxConfig())
+    result = harness.run_task(task)
+
+    assert result.passed is False
+    assert "empty" in result.error
+
+
+@patch("vox.engine.httpx.post")
+def test_run_task_dry_run(mock_post):
+    """In dry_run mode, commands are translated but never executed."""
+    mock_response = MagicMock()
+    mock_response.json.return_value = {"message": {"content": "ls -la"}}
+    mock_response.raise_for_status.return_value = None
+    mock_post.return_value = mock_response
+
+    task = Task(id="t1", description="list files", verify_cmd="true")
+    harness = Harness(cfg=VoxConfig(), dry_run=True)
+
+    with patch("vox.bench.harness.subprocess.run") as mock_run:
+        result = harness.run_task(task)
+        mock_run.assert_not_called()
+
+    assert result.generated_cmd == "ls -la"
+    assert result.passed is False  # dry_run never marks passed
+
+
+@patch("vox.engine.httpx.post", side_effect=Exception("connection error"))
+def test_run_task_translation_error(mock_post):
+    """Task records error when translation raises an exception."""
+    task = Task(id="t1", description="list files", verify_cmd="true")
+    harness = Harness(cfg=VoxConfig())
+    result = harness.run_task(task)
+    assert result.passed is False
+
+
+@patch("vox.engine.httpx.post")
+def test_run_task_dry_run_skips_fixtures(mock_post):
+    """In dry_run mode, setup and teardown commands are not executed either."""
+    mock_response = MagicMock()
+    mock_response.json.return_value = {"message": {"content": "ls -la"}}
+    mock_response.raise_for_status.return_value = None
+    mock_post.return_value = mock_response
+
+    task = Task(
+        id="t1",
+        description="list files",
+        verify_cmd="true",
+        setup_cmd="touch fixture",
+        teardown_cmd="rm -f fixture",
+    )
+    harness = Harness(cfg=VoxConfig(), dry_run=True)
+
+    with patch("vox.bench.harness.subprocess.run") as mock_run:
+        harness.run_task(task)
+        mock_run.assert_not_called()
+
+
+@patch("vox.bench.harness.subprocess.run")
+def test_run_task_setup_failure(mock_run):
+    """A failing setup is reported, teardown still runs, nothing is translated."""
+    mock_run.side_effect = [
+        subprocess.CalledProcessError(1, "false"),  # setup
+        MagicMock(returncode=0),  # teardown
+    ]
+
+    task = Task(
+        id="t1",
+        description="list files",
+        verify_cmd="true",
+        setup_cmd="false",
+        teardown_cmd="true",
+    )
+    harness = Harness(cfg=VoxConfig())
+    result = harness.run_task(task)
+
+    assert result.passed is False
+    assert result.error.startswith("setup failed")
+    assert mock_run.call_count == 2
+
+
+# ── Harness.run_all ───────────────────────────────────────────────────────────
+
+
+@patch("vox.bench.harness.subprocess.run")
+@patch("vox.engine.httpx.post")
+def test_run_all_returns_all_results(mock_post, mock_run):
+    mock_response = MagicMock()
+    mock_response.json.return_value = {"message": {"content": "true"}}
+    mock_response.raise_for_status.return_value = None
+    mock_post.return_value = mock_response
+    mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+
+    tasks = [
+        Task(id="t1", description="task one", verify_cmd="true"),
+        Task(id="t2", description="task two", verify_cmd="true"),
+    ]
+    harness = Harness(cfg=VoxConfig())
+    results = harness.run_all(tasks)
+
+    assert len(results) == 2
+    assert {r.task_id for r in results} == {"t1", "t2"}
+
+
+# ── Harness.print_summary ─────────────────────────────────────────────────────
+
+
+def test_print_summary_no_tasks(capsys):
+    harness = Harness(cfg=VoxConfig())
+    harness.print_summary([])
+
+
+def test_print_summary_mixed_results():
+    results = [
+        BenchResult(task_id="t1", description="pass task", category="file", passed=True),
+        BenchResult(task_id="t2", description="fail task", category="text", passed=False),
+    ]
+    harness = Harness(cfg=VoxConfig())
+    harness.print_summary(results)
+
+
+# ── Pi agent registration ─────────────────────────────────────────────────────
+
+
+def test_pi_agent_registered():
+    from vox.agents.router import ALL_AGENTS
+
+    names = {a.name for a in ALL_AGENTS}
+    assert "pi" in names
+
+
+@patch("shutil.which")
+def test_pi_agent_discover(mock_which):
+    from vox.agents.router import discover_agents
+
+    def side_effect(binary):
+        return "/usr/local/bin/pi" if binary == "pi" else None
+
+    mock_which.side_effect = side_effect
+    agents = discover_agents()
+    assert "pi" in agents
+    assert agents["pi"] == "/usr/local/bin/pi"
+
+
+@patch("vox.agents.base.BaseAgent._exec")
+def test_pi_agent_run(mock_exec):
+    from vox.agents.base import AgentResult
+    from vox.agents.pi import PiAgent
+
+    mock_exec.return_value = AgentResult(agent="pi", output="done", exit_code=0)
+    result = PiAgent.run("fix the bug")
+    assert result.agent == "pi"
+    cmd = mock_exec.call_args[0][0]
+    assert cmd[0] == "pi"
+    assert "--print" in cmd
+    assert "fix the bug" in cmd