diff --git a/src/vox/agents/pi.py b/src/vox/agents/pi.py
new file mode 100644
index 0000000..4926967
--- /dev/null
+++ b/src/vox/agents/pi.py
@@ -0,0 +1,22 @@
+"""Pi coding agent wrapper.
+
+Pi is a minimal terminal coding harness by Mario Zechner (@badlogic).
+https://github.com/badlogic/pi-mono/tree/main/packages/coding-agent
+
+Install:  npm install -g @mariozechner/pi-coding-agent
+"""
+
+from __future__ import annotations
+
+from vox.agents.base import AgentResult, BaseAgent
+
+
+class PiAgent(BaseAgent):
+    name = "pi"
+    binary = "pi"
+    description = "Pi coding agent — minimal, extensible terminal coding harness"
+
+    @classmethod
+    def run(cls, task: str, **kwargs: object) -> AgentResult:
+        """Run ``<binary> --print <task>`` through ``_exec``; extra ``kwargs`` are ignored."""
+        return cls._exec([cls.binary, "--print", task])
diff --git a/src/vox/agents/router.py b/src/vox/agents/router.py
index 3317741..f846bbb 100644
--- a/src/vox/agents/router.py
+++ b/src/vox/agents/router.py
@@ -9,6 +9,7 @@
 from vox.agents.codex import CodexAgent
 from vox.agents.droid import DroidAgent
 from vox.agents.gemini import GeminiAgent
+from vox.agents.pi import PiAgent
 
 if TYPE_CHECKING:
     from vox.agents.base import AgentResult, BaseAgent
@@ -20,6 +21,7 @@
     GeminiAgent,
     AmpAgent,
     DroidAgent,
+    PiAgent,
 ]
 
 ROUTE_SYSTEM_PROMPT = """\
@@ -34,6 +36,7 @@
 - For research, summarization, questions → prefer gemini
 - For complex multi-step engineering → prefer droid
 - For codebase search and understanding → prefer amp
+- For minimal/lightweight terminal tasks → prefer pi
 - If unsure, use the preferred agent
 """
 
diff --git a/src/vox/bench/__init__.py b/src/vox/bench/__init__.py
new file mode 100644
index 0000000..d53af36
--- /dev/null
+++ b/src/vox/bench/__init__.py
@@ -0,0 +1,5 @@
+"""Terminal-Bench 2.0 evaluation harness for Vox."""
+
+from vox.bench.harness import BenchResult, Harness, Task
+
+__all__ = ["BenchResult", "Harness", "Task"]
diff --git a/src/vox/bench/harness.py b/src/vox/bench/harness.py
new file mode 100644
index 0000000..900bc11
--- /dev/null
+++ b/src/vox/bench/harness.py
@@ -0,0 +1,212 @@
+"""Terminal-Bench 2.0 harness — task definitions, runner, and scorer.
+
+Each Task has:
+  - description: plain-English description of what to accomplish
+  - verify_cmd: shell command that exits 0 if the task succeeded, non-zero otherwise
+  - solution: reference human-written solution (used only for documentation)
+  - category: task category (file, text, process, system, archive, network, git, shell, …)
+  - timeout: seconds allowed for the generated command, and again for verify_cmd
+
+Usage::
+
+    from vox.bench.harness import Harness
+    from vox.bench.tasks import BUILTIN_TASKS
+    from vox.config import VoxConfig
+
+    harness = Harness(VoxConfig())
+    results = harness.run_all(BUILTIN_TASKS)
+    harness.print_summary(results)
+"""
+
+from __future__ import annotations
+
+import contextlib
+import subprocess
+import time
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+from rich.console import Console
+from rich.table import Table
+
+if TYPE_CHECKING:
+    from vox.config import VoxConfig
+
+console = Console()
+
+
+@dataclass
+class Task:
+    """A single Terminal-Bench task: a prompt plus the shell commands that frame it."""
+
+    id: str  # unique identifier, used for --task selection and in reports
+    description: str  # plain-English request handed to the translation engine
+    verify_cmd: str  # shell command run after execution; exit 0 means the task passed
+    solution: str = ""  # reference solution, documentation only
+    category: str = "general"  # grouping label used for filtering and display
+    timeout: int = 30  # seconds allowed for the generated command, and again for verify_cmd
+    setup_cmd: str = ""  # optional shell command run before translation
+    teardown_cmd: str = ""  # optional shell command run after verification
+
+
+@dataclass
+class BenchResult:
+    """Result of running a single benchmark task."""
+
+    task_id: str  # Task.id of the task that produced this result
+    description: str  # copied from the task
+    category: str  # copied from the task
+    passed: bool  # true only if verify_cmd exited 0 and no error was recorded
+    generated_cmd: str = ""  # command returned by translate(); "" if there was none
+    exit_code: int = -1  # generated command's exit status; -1 if not run, 124 on timeout
+    verify_exit_code: int = -1  # verify_cmd's exit status; -1 if not run, 124 on timeout
+    elapsed_s: float = 0.0  # seconds from the start of setup to the end of verification
+    error: str = ""  # failure message; "" when nothing went wrong
+
+
+@dataclass
+class Harness:
+    """Run Terminal-Bench tasks using the Vox translation engine.
+
+    Attributes:
+        cfg: configuration handed to ``vox.engine.translate`` for every task.
+        dry_run: translate only; no setup, command, verify or teardown is executed.
+        results: results of the most recent ``run_all`` call.
+    """
+
+    cfg: VoxConfig
+    dry_run: bool = False
+    results: list[BenchResult] = field(default_factory=list)
+
+    def _setup(self, task: Task) -> None:
+        """Run ``task.setup_cmd`` if set; a no-op in dry-run mode."""
+        if task.setup_cmd and not self.dry_run:
+            subprocess.run(task.setup_cmd, shell=True, check=False, timeout=30)
+
+    def _teardown(self, task: Task) -> None:
+        """Run ``task.teardown_cmd`` if set; a no-op in dry-run mode."""
+        if task.teardown_cmd and not self.dry_run:
+            subprocess.run(task.teardown_cmd, shell=True, check=False, timeout=30)
+
+    @staticmethod
+    def _shell(command: str, timeout: int) -> int:
+        """Run *command* via the shell with output captured; return its exit code."""
+        proc = subprocess.run(
+            command, shell=True, check=False, timeout=timeout, capture_output=True, text=True
+        )
+        return proc.returncode
+
+    def run_task(self, task: Task) -> BenchResult:
+        """Translate, execute, then verify a single task.
+
+        Failures are reported through the returned result rather than raised: ``error``
+        holds the message, and exit code 124 marks a command or verify timeout. Teardown
+        always runs (outside dry-run), even when setup itself failed part-way.
+
+        Returns:
+            A BenchResult whose ``passed`` is true only when no error was recorded, the
+            run is not a dry run, and ``verify_cmd`` exited with status 0.
+        """
+        from vox.engine import translate
+
+        start = time.monotonic()
+        cmd = ""
+        exit_code = -1
+        verify_exit_code = -1
+        error = ""
+
+        try:
+            self._setup(task)
+        except Exception as exc:
+            error = f"setup failed: {exc}"
+
+        if not error:
+            try:
+                cmd = translate(task.description, self.cfg) or ""
+                if not cmd:
+                    error = "translation returned empty command"
+                elif not self.dry_run:
+                    exit_code = self._shell(cmd, task.timeout)
+            except subprocess.TimeoutExpired:
+                error = f"command timed out after {task.timeout}s"
+                exit_code = 124
+            except Exception as exc:
+                error = str(exc) or repr(exc)  # never leave error empty on failure
+
+        if not error and not self.dry_run and task.verify_cmd:
+            try:
+                verify_exit_code = self._shell(task.verify_cmd, task.timeout)
+            except subprocess.TimeoutExpired:
+                error = f"verify timed out after {task.timeout}s"
+                verify_exit_code = 124
+            except Exception as exc:
+                error = str(exc) or repr(exc)
+
+        passed = not error and not self.dry_run and verify_exit_code == 0
+        elapsed = time.monotonic() - start
+
+        # Best-effort cleanup: a failing teardown must not mask the task's own result.
+        with contextlib.suppress(Exception):
+            self._teardown(task)
+
+        return BenchResult(
+            task_id=task.id,
+            description=task.description,
+            category=task.category,
+            passed=passed,
+            generated_cmd=cmd,
+            exit_code=exit_code,
+            verify_exit_code=verify_exit_code,
+            elapsed_s=elapsed,
+            error=error,
+        )
+
+    def run_all(self, tasks: list[Task]) -> list[BenchResult]:
+        """Run every task in order, print one status line each, and return the results.
+
+        The returned list is also stored on ``self.results``.
+        """
+        from rich.markup import escape
+
+        results: list[BenchResult] = []
+        for task in tasks:
+            label = escape(f"{task.id}: {task.description[:60]}")
+            with console.status(f"[dim]Running {label}…[/dim]"):
+                result = self.run_task(task)
+            icon = "[green]✓[/green]" if result.passed else "[red]✗[/red]"
+            # "\[" is Rich's escape: a bare "[file]" would be consumed as a markup tag.
+            console.print(f"  {icon} \\[{escape(result.category)}] {label}")
+            if result.error:
+                console.print(f"      [dim red]{escape(result.error)}[/dim red]")
+            results.append(result)
+        self.results = results
+        return results
+
+    def print_summary(self, results: list[BenchResult] | None = None) -> None:
+        """Print a summary table and score line; defaults to the stored ``self.results``."""
+        from rich.markup import escape
+
+        if results is None:
+            results = self.results
+        total = len(results)
+        passed = sum(1 for r in results if r.passed)
+
+        table = Table(title="Terminal-Bench Results", show_header=True, header_style="bold cyan")
+        table.add_column("ID", style="dim", width=20)
+        table.add_column("Category", width=12)
+        table.add_column("Description", width=50)
+        table.add_column("Pass", justify="center", width=6)
+        table.add_column("Time(s)", justify="right", width=8)
+        for r in results:
+            status = "[green]✓[/green]" if r.passed else "[red]✗[/red]"
+            cells = (escape(r.task_id), escape(r.category), escape(r.description[:48]))
+            table.add_row(*cells, status, f"{r.elapsed_s:.1f}")
+        console.print(table)
+        if not total:
+            console.print("\n[bold]No tasks run.[/bold]")
+            return
+        average = sum(r.elapsed_s for r in results) / total
+        console.print(
+            f"\n[bold]Score: {passed}/{total} ({passed / total * 100:.1f}%)[/bold]  "
+            f"— [dim]avg {average:.1f}s/task[/dim]"
+        )
diff --git a/src/vox/bench/tasks.py b/src/vox/bench/tasks.py
new file mode 100644
index 0000000..a9a26e7
--- /dev/null
+++ b/src/vox/bench/tasks.py
@@ -0,0 +1,217 @@
+"""Built-in Terminal-Bench 2.0 task definitions.
+
+These tasks are inspired by the Terminal-Bench 2.0 benchmark
+(https://www.tbench.ai/) and cover common real-world terminal scenarios.
+Each task verifies its own success via a shell command.
+
+Categories: file, text, process, system, archive, network, git, shell
+"""
+
+from __future__ import annotations
+
+from vox.bench.harness import Task
+
+BUILTIN_TASKS: list[Task] = [
+    # ── file operations (tasks that create files remove them via teardown_cmd) ──
+    Task(
+        id="file-list-hidden",
+        description="list all files including hidden ones in the current directory",
+        verify_cmd="ls -a | grep -q '\\.'",
+        solution="ls -la",
+        category="file",
+    ),
+    Task(
+        id="file-count-lines",
+        description="count the number of lines in /etc/hosts",
+        verify_cmd="[ -f /etc/hosts ]",
+        solution="wc -l /etc/hosts",
+        category="file",
+    ),
+    Task(
+        id="file-find-large",
+        description="find files larger than 1MB in the current directory",
+        verify_cmd="true",
+        solution="find . -size +1M -type f",
+        category="file",
+    ),
+    Task(
+        id="file-disk-usage",
+        description="show disk usage of each directory in the current folder sorted by size",
+        verify_cmd="true",
+        solution="du -sh */ 2>/dev/null | sort -rh",
+        category="file",
+    ),
+    Task(
+        id="file-touch-create",
+        description="create an empty file named bench_test_file.txt",
+        verify_cmd="test -f bench_test_file.txt",
+        solution="touch bench_test_file.txt",
+        category="file",
+        teardown_cmd="rm -f bench_test_file.txt",
+    ),
+    Task(
+        id="file-mkdir-nested",
+        description="create nested directories bench_dir/sub/deep",
+        verify_cmd="test -d bench_dir/sub/deep",
+        solution="mkdir -p bench_dir/sub/deep",
+        category="file",
+        teardown_cmd="rm -rf bench_dir",
+    ),
+    Task(
+        id="file-find-py",
+        description="find all Python files in the current directory recursively",
+        verify_cmd="true",
+        solution="find . -name '*.py' -type f",
+        category="file",
+    ),
+    # ── text processing (all three tasks read /etc/hosts) ──────────────────
+    Task(
+        id="text-word-count",
+        description="count the number of words in /etc/hosts",
+        verify_cmd="[ -f /etc/hosts ]",
+        solution="wc -w /etc/hosts",
+        category="text",
+    ),
+    Task(
+        id="text-grep-pattern",
+        description="search for the word 'localhost' in /etc/hosts",
+        verify_cmd="grep -q localhost /etc/hosts",
+        solution="grep localhost /etc/hosts",
+        category="text",
+    ),
+    Task(
+        id="text-sort-file",
+        description="sort the lines of /etc/hosts alphabetically and show the result",
+        verify_cmd="[ -f /etc/hosts ]",
+        solution="sort /etc/hosts",
+        category="text",
+    ),
+    # ── process management (verify_cmd is "true": output is not inspected) ──
+    Task(
+        id="proc-list-all",
+        description="list all running processes with their PIDs",
+        verify_cmd="true",
+        solution="ps aux",
+        category="process",
+    ),
+    Task(
+        id="proc-top-cpu",
+        description="show the top 5 processes by CPU usage",
+        verify_cmd="true",
+        solution="ps aux --sort=-%cpu | head -6",
+        category="process",
+    ),
+    Task(
+        id="proc-current-shell",
+        description="show the current shell process ID",
+        verify_cmd="true",
+        solution="echo $$",
+        category="process",
+    ),
+    # ── system info (verify_cmd is "true": output is not inspected) ────────
+    Task(
+        id="sys-free-space",
+        description="show free disk space in human readable format",
+        verify_cmd="true",
+        solution="df -h",
+        category="system",
+    ),
+    Task(
+        id="sys-memory-usage",
+        description="show current memory usage",
+        verify_cmd="true",
+        solution="free -h",
+        category="system",
+    ),
+    Task(
+        id="sys-hostname",
+        description="print the current hostname",
+        verify_cmd="true",
+        solution="hostname",
+        category="system",
+    ),
+    Task(
+        id="sys-env-path",
+        description="print the PATH environment variable",
+        verify_cmd="true",
+        solution="echo $PATH",
+        category="system",
+    ),
+    # ── archive operations (both use hosts.tar.gz and delete it in teardown) ──
+    Task(
+        id="archive-create-tar",
+        description="create a gzip compressed tar archive of /etc/hosts named hosts.tar.gz",
+        verify_cmd="test -f hosts.tar.gz",
+        solution="tar czf hosts.tar.gz /etc/hosts",
+        category="archive",
+        teardown_cmd="rm -f hosts.tar.gz",
+    ),
+    Task(
+        id="archive-list-tar",
+        description="list the contents of an archive file hosts.tar.gz",
+        setup_cmd="tar czf hosts.tar.gz /etc/hosts",
+        verify_cmd="test -f hosts.tar.gz",
+        solution="tar tzf hosts.tar.gz",
+        category="archive",
+        teardown_cmd="rm -f hosts.tar.gz",
+    ),
+    # ── network (verify_cmd is "true": output is not inspected) ────────────
+    Task(
+        id="net-check-port",
+        description="check if port 80 is open on localhost",
+        verify_cmd="true",
+        solution="nc -z localhost 80 2>/dev/null; true",
+        category="network",
+    ),
+    Task(
+        id="net-dns-lookup",
+        description="look up the IP address of example.com",
+        verify_cmd="true",
+        solution="host example.com 2>/dev/null || nslookup example.com 2>/dev/null || dig example.com +short",
+        category="network",
+    ),
+    # ── git (verify_cmd is "true": output is not inspected) ────────────────
+    Task(
+        id="git-status",
+        description="show the current git repository status",
+        verify_cmd="true",
+        solution="git status",
+        category="git",
+    ),
+    Task(
+        id="git-log-short",
+        description="show the last 5 git commits in one-line format",
+        verify_cmd="true",
+        solution="git log --oneline -5",
+        category="git",
+    ),
+    Task(
+        id="git-list-branches",
+        description="list all git branches",
+        verify_cmd="true",
+        solution="git branch -a",
+        category="git",
+    ),
+    # ── shell utilities ────────────────────────────────────────────────────
+    Task(
+        id="shell-date",
+        description="print the current date and time",
+        verify_cmd="true",
+        solution="date",
+        category="shell",
+    ),
+    Task(
+        id="shell-calc",
+        description="calculate 2 to the power of 10 using the shell",
+        verify_cmd="true",
+        solution="echo $((2**10))",
+        category="shell",
+    ),
+    Task(
+        id="shell-pipe-count",
+        description="count the number of files in /etc",
+        verify_cmd="[ -d /etc ]",
+        solution="ls /etc | wc -l",
+        category="shell",
+    ),
+]
diff --git a/src/vox/cli.py b/src/vox/cli.py
index d817ffb..3a803a2 100644
--- a/src/vox/cli.py
+++ b/src/vox/cli.py
@@ -385,6 +385,57 @@ def cmd_agent(args: argparse.Namespace, cfg: VoxConfig) -> None:
         console.print("\n[dim]Cancelled.[/dim]")
 
 
+def cmd_bench(args: argparse.Namespace, cfg: VoxConfig) -> None:
+    """Run Terminal-Bench 2.0 evaluation tasks against the Vox engine.
+
+    ``--category`` and ``--task`` narrow the task list; ``--list`` prints it instead of running.
+    """
+    from rich.markup import escape
+
+    from vox.bench.harness import Harness
+    from vox.bench.tasks import BUILTIN_TASKS
+
+    tasks = BUILTIN_TASKS
+    if args.category:
+        tasks = [t for t in tasks if t.category == args.category]
+        if not tasks:
+            known = ", ".join(sorted({t.category for t in BUILTIN_TASKS}))
+            wanted = escape(args.category)  # user text must not be parsed as Rich markup
+            console.print(f"[yellow]No tasks found for category '{wanted}'.[/yellow]")
+            console.print(f"[dim]Available categories: {known}[/dim]")
+            return
+
+    # Apply --task before --list so that listing honours both filters.
+    if args.task_id:
+        tasks = [t for t in tasks if t.id == args.task_id]
+        if not tasks:
+            console.print(f"[red]Task '{escape(args.task_id)}' not found.[/red]")
+            return
+
+    if args.list:
+        from rich.table import Table
+
+        table = Table(title="Terminal-Bench Tasks", header_style="bold cyan")
+        table.add_column("ID", width=22)
+        table.add_column("Category", width=12)
+        table.add_column("Description")
+        for t in tasks:
+            table.add_row(t.id, t.category, t.description)
+        console.print(table)
+        console.print(f"\n[dim]{len(tasks)} tasks[/dim]")
+        return
+
+    harness = Harness(cfg=cfg, dry_run=args.dry_run)
+    mode = " [dim](dry-run)[/dim]" if args.dry_run else ""
+    console.print(
+        f"\n[bold cyan]Terminal-Bench 2.0[/bold cyan] — running {len(tasks)} task(s){mode}"
+    )
+    console.print()
+    results = harness.run_all(tasks)
+    console.print()
+    harness.print_summary(results)
+
+
 def cmd_config(args: argparse.Namespace, _cfg: VoxConfig) -> None:
     """Manage vox configuration."""
     if args.config_action == "init":
@@ -458,6 +509,25 @@ def main() -> None:
         help="Force a specific agent (claude, codex, gemini, amp, droid)",
     )
 
+    # ── bench ─────────────────────────────────────────────────────────────
+    bench_parser = subparsers.add_parser(
+        "bench", help="Run Terminal-Bench 2.0 evaluation tasks"
+    )
+    bench_parser.add_argument(
+        "--list", "-l", action="store_true", help="List available tasks"
+    )
+    bench_parser.add_argument(
+        "--task", "-t", dest="task_id", default=None, help="Run a specific task by ID"
+    )
+    bench_parser.add_argument(
+        "--category", "-c", default=None, help="Filter tasks by category"
+    )
+    bench_parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Translate commands but do not execute or verify them",
+    )
+
     # ── config ────────────────────────────────────────────────────────────
     config_parser = subparsers.add_parser("config", help="Manage configuration")
     config_parser.add_argument(
@@ -484,6 +554,8 @@ def main() -> None:
         cmd_speak(args, cfg)
     elif args.command == "agent":
         cmd_agent(args, cfg)
+    elif args.command == "bench":
+        cmd_bench(args, cfg)
     elif args.command == "config":
         cmd_config(args, cfg)
     elif args.query:
diff --git a/tests/test_bench.py b/tests/test_bench.py
new file mode 100644
index 0000000..7cac181
--- /dev/null
+++ b/tests/test_bench.py
@@ -0,0 +1,237 @@
+"""Tests for the Terminal-Bench 2.0 harness."""
+
+from unittest.mock import MagicMock, patch
+
+from vox.bench.harness import BenchResult, Harness, Task
+from vox.bench.tasks import BUILTIN_TASKS
+from vox.config import VoxConfig
+
+# ── Task dataclass ────────────────────────────────────────────────────────────
+
+
+def test_task_defaults():
+    """Optional Task fields fall back to their declared defaults."""
+    task = Task(id="t1", description="list files", verify_cmd="true")
+    assert (task.category, task.timeout) == ("general", 30)
+    # Every optional command/solution string defaults to empty.
+    optional = (task.setup_cmd, task.teardown_cmd, task.solution)
+    assert optional == ("", "", "")
+
+
+def test_task_custom_fields():
+    """Explicit keyword values override the Task defaults."""
+    custom = Task(
+        id="t2",
+        description="check space",
+        verify_cmd="df -h",
+        category="system",
+        timeout=60,
+        solution="df -h",
+    )
+    assert (custom.category, custom.timeout) == ("system", 60)
+
+
+# ── BenchResult dataclass ─────────────────────────────────────────────────────
+
+
+def test_bench_result_defaults():
+    """Unset BenchResult fields keep their defaults; ``passed`` is stored as given."""
+    outcome = BenchResult(task_id="t1", description="list files", category="file", passed=True)
+    assert outcome.passed is True
+    assert (outcome.generated_cmd, outcome.error) == ("", "")
+    assert outcome.exit_code == -1
+
+
+# ── BUILTIN_TASKS ─────────────────────────────────────────────────────────────
+
+
+def test_builtin_tasks_not_empty():
+    assert BUILTIN_TASKS, "BUILTIN_TASKS must define at least one task"
+
+
+def test_builtin_tasks_have_required_fields():
+    """Every built-in task sets the fields the harness relies on to run and score it."""
+    for task in BUILTIN_TASKS:
+        # verify_cmd must be non-empty: run_task never marks a task passed without one.
+        for attr in ("id", "description", "verify_cmd", "category"):
+            assert getattr(task, attr), f"task {task.id!r} missing {attr}"
+
+
+def test_builtin_task_ids_are_unique():
+    seen = {task.id for task in BUILTIN_TASKS}  # a set collapses any repeated IDs
+    assert len(seen) == len(BUILTIN_TASKS), "Duplicate task IDs found"
+
+
+def test_builtin_tasks_categories():
+    """The core categories are all represented among the built-in tasks."""
+    present = {task.category for task in BUILTIN_TASKS}
+    expected = {"file", "text", "process", "system"}
+    # Subset check: additional categories are allowed.
+    assert expected <= present
+
+
+# ── Harness.run_task ──────────────────────────────────────────────────────────
+
+
+@patch("vox.bench.harness.subprocess.run")
+@patch("vox.engine.httpx.post")
+def test_run_task_pass(mock_post, mock_run):
+    """A translated command plus a zero-exit verify yields a passing result."""
+    # Fake HTTP reply whose content the test expects back as the generated command.
+    reply = mock_post.return_value
+    reply.json.return_value = {"message": {"content": "ls -la"}}
+    reply.raise_for_status.return_value = None
+
+    # Both the generated command and verify_cmd report success.
+    mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+
+    outcome = Harness(cfg=VoxConfig()).run_task(
+        Task(id="t1", description="list files", verify_cmd="true")
+    )
+
+    assert outcome.passed is True
+    assert (outcome.task_id, outcome.generated_cmd) == ("t1", "ls -la")
+
+
+@patch("vox.bench.harness.subprocess.run")
+@patch("vox.engine.httpx.post")
+def test_run_task_fail_verify(mock_post, mock_run):
+    """A non-zero exit from verify_cmd marks the task as failed.
+
+    The generated command itself succeeds; only verification reports failure.
+    """
+    reply = mock_post.return_value
+    reply.json.return_value = {"message": {"content": "ls -la"}}
+    reply.raise_for_status.return_value = None
+
+    ran_ok = MagicMock(returncode=0, stdout="", stderr="")
+    verify_failed = MagicMock(returncode=1, stdout="", stderr="failed")
+    mock_run.side_effect = [ran_ok, verify_failed]
+
+    outcome = Harness(cfg=VoxConfig()).run_task(
+        Task(id="t1", description="list files", verify_cmd="false")
+    )
+
+    assert outcome.passed is False
+    assert outcome.verify_exit_code == 1
+
+
+@patch("vox.engine.httpx.post")
+def test_run_task_empty_translation(mock_post):
+    """An empty translation is reported as an error instead of being executed."""
+    reply = mock_post.return_value
+    reply.json.return_value = {"message": {"content": ""}}
+    reply.raise_for_status.return_value = None
+
+    # No subprocess patch here: nothing should be run for an empty command.
+    outcome = Harness(cfg=VoxConfig()).run_task(
+        Task(id="t1", description="list files", verify_cmd="true")
+    )
+
+    assert outcome.passed is False
+    assert "empty" in outcome.error
+
+
+@patch("vox.engine.httpx.post")
+def test_run_task_dry_run(mock_post):
+    """Dry-run mode translates the request but never shells out."""
+    reply = mock_post.return_value
+    reply.json.return_value = {"message": {"content": "ls -la"}}
+    reply.raise_for_status.return_value = None
+
+    dry_harness = Harness(cfg=VoxConfig(), dry_run=True)
+    request = Task(id="t1", description="list files", verify_cmd="true")
+
+    with patch("vox.bench.harness.subprocess.run") as mock_run:
+        outcome = dry_harness.run_task(request)
+    # Checked after the patch is undone; the mock still remembers its calls.
+    mock_run.assert_not_called()
+
+    assert outcome.generated_cmd == "ls -la"
+    assert outcome.passed is False  # dry_run never marks passed
+
+
+@patch("vox.engine.httpx.post", side_effect=Exception("connection error"))
+def test_run_task_translation_error(mock_post):
+    """A failing HTTP call during translation yields a non-passing result."""
+    outcome = Harness(cfg=VoxConfig()).run_task(
+        Task(id="t1", description="list files", verify_cmd="true")
+    )
+    assert outcome.passed is False
+
+
+# ── Harness.run_all ───────────────────────────────────────────────────────────
+
+
+@patch("vox.bench.harness.subprocess.run")
+@patch("vox.engine.httpx.post")
+def test_run_all_returns_all_results(mock_post, mock_run):
+    """run_all yields exactly one result per task it was given."""
+    reply = mock_post.return_value
+    reply.json.return_value = {"message": {"content": "true"}}
+    reply.raise_for_status.return_value = None
+    mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+
+    wanted = {"t1": "task one", "t2": "task two"}
+    batch = [
+        Task(id=task_id, description=text, verify_cmd="true")
+        for task_id, text in wanted.items()
+    ]
+    outcomes = Harness(cfg=VoxConfig()).run_all(batch)
+
+    assert len(outcomes) == 2
+    assert {r.task_id for r in outcomes} == {"t1", "t2"}
+
+
+# ── Harness.print_summary ─────────────────────────────────────────────────────
+
+
+def test_print_summary_no_tasks(capsys):
+    Harness(cfg=VoxConfig()).print_summary([])  # empty input must not divide by zero
+    assert "No tasks run." in capsys.readouterr().out
+
+
+def test_print_summary_mixed_results():
+    """Smoke test: summarising one pass and one failure must not raise."""
+    mixed = [
+        BenchResult(task_id="t1", description="pass task", category="file", passed=True),
+        BenchResult(task_id="t2", description="fail task", category="text", passed=False),
+    ]
+    Harness(cfg=VoxConfig()).print_summary(mixed)
+
+
+# ── Pi agent registration ─────────────────────────────────────────────────────
+
+
+def test_pi_agent_registered():
+    """The router's agent list includes an agent named "pi"."""
+    from vox.agents.router import ALL_AGENTS
+
+    assert any(agent.name == "pi" for agent in ALL_AGENTS)
+
+
+@patch("shutil.which")
+def test_pi_agent_discover(mock_which):
+    """discover_agents reports pi at the path that shutil.which resolves for it."""
+    from vox.agents.router import discover_agents
+
+    # Only the "pi" binary is "installed"; every other lookup returns None.
+    installed = {"pi": "/usr/local/bin/pi"}
+    mock_which.side_effect = lambda binary: installed.get(binary)
+    found = discover_agents()
+    assert "pi" in found
+    assert found["pi"] == "/usr/local/bin/pi"
+
+
+@patch("vox.agents.base.BaseAgent._exec")
+def test_pi_agent_run(mock_exec):
+    """PiAgent.run passes a ``pi --print <task>`` style argv to ``_exec``."""
+    from vox.agents.base import AgentResult
+    from vox.agents.pi import PiAgent
+
+    mock_exec.return_value = AgentResult(agent="pi", output="done", exit_code=0)
+    assert PiAgent.run("fix the bug").agent == "pi"
+    argv = mock_exec.call_args.args[0]  # first positional argument of the _exec call
+    assert argv[0] == "pi"
+    assert "--print" in argv
+    assert "fix the bug" in argv