diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json
new file mode 100644
index 0000000..1b088b0
--- /dev/null
+++ b/.agents/plugins/marketplace.json
@@ -0,0 +1,22 @@
+{
+  "name": "skill-optimizer",
+  "interface": {
+    "displayName": "Skill Optimizer",
+    "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs"
+  },
+  "plugins": [
+    {
+      "name": "skill-optimizer",
+      "source": {
+        "source": "local",
+        "path": "./"
+      },
+      "policy": {
+        "installation": "AVAILABLE",
+        "authentication": "ON_INSTALL"
+      },
+      "category": "Coding"
+    }
+  ]
+}
diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
new file mode 100644
index 0000000..5bf84bf
--- /dev/null
+++ b/.claude-plugin/marketplace.json
@@ -0,0 +1,21 @@
+{
+  "name": "skill-optimizer",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "owner": {
+    "name": "Fast"
+  },
+  "plugins": [
+    {
+      "name": "skill-optimizer",
+      "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+      "version": "2.0.0",
+      "source": "./",
+      "author": {
+        "name": "Fast"
+      },
+      "skills": [
+        "./skills/skill-optimizer"
+      ]
+    }
+  ]
+}
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
new file mode 100644
index 0000000..7aada51
--- /dev/null
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,19 @@
+{
+  "name": "skill-optimizer",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "version": "2.0.0",
+  "author": {
+    "name": "Fast"
+  },
+  "homepage": "https://github.com/fastxyz/skill-optimizer#readme",
+  "repository": "https://github.com/fastxyz/skill-optimizer",
+  "license": "MIT",
+  "keywords": [
+    "agent-skills",
+    "evals",
+    "skill-testing",
+    "model-evaluation",
+    "optimization"
+  ],
+  "skills": "./skills/"
+}
diff --git a/.codex b/.codex
deleted file mode 100644
index e69de29..0000000
diff --git a/.codex-plugin/plugin.json b/.codex-plugin/plugin.json
new file mode 100644
index 0000000..71fbfc5
--- /dev/null
+++ b/.codex-plugin/plugin.json
@@ -0,0 +1,35 @@
+{
+  "name": "skill-optimizer",
+  "version": "2.0.0",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "author": {
+    "name": "Fast"
+  },
+  "homepage": "https://github.com/fastxyz/skill-optimizer#readme",
+  "repository": "https://github.com/fastxyz/skill-optimizer",
+  "license": "MIT",
+  "keywords": [
+    "agent-skills",
+    "evals",
+    "skill-testing",
+    "model-evaluation",
+    "optimization"
+  ],
+  "skills": "./skills/",
+  "interface": {
+    "displayName": "Skill Optimizer",
+    "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "developerName": "Fast",
+    "category": "Coding",
+    "capabilities": [
+      "Read",
+      "Write"
+    ],
+    "defaultPrompt": [
+      "Use Skill Optimizer to design and run an eval suite for this agent skill.",
+      "Use Skill Optimizer to inspect failing traces and improve the skill."
+    ],
+    "brandColor": "#3B82F6"
+  }
+}
diff --git a/.codex/INSTALL.md b/.codex/INSTALL.md
new file mode 100644
index 0000000..489e8ee
--- /dev/null
+++ b/.codex/INSTALL.md
@@ -0,0 +1,31 @@
+# Installing skill-optimizer for Codex
+
+Use `skill-optimizer` in Codex as either a plugin or a native skill.
+
+## Plugin Install
+
+Register this repository as a plugin marketplace:
+
+```bash
+codex plugin marketplace add fastxyz/skill-optimizer
+```
+
+Open `/plugins`, select the `skill-optimizer` marketplace, and install the `skill-optimizer` plugin.
+
+The marketplace file is `.agents/plugins/marketplace.json`. It exposes the repository root as the plugin source so Codex can load `.codex-plugin/plugin.json` and the bundled `skills/` directory.
+
+To install the marketplace from a specific Git ref (for example, the `main` branch), pass `--ref`:
+
+```bash
+codex plugin marketplace add fastxyz/skill-optimizer --ref main
+```
+
+## Skill-Only Install
+
+Install the canonical skill with the open skills CLI:
+
+```bash
+npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a codex -y
+```
+
+Restart Codex if the skill does not appear immediately.
diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json
new file mode 100644
index 0000000..c65a94a
--- /dev/null
+++ b/.cursor-plugin/plugin.json
@@ -0,0 +1,36 @@
+{
+  "name": "skill-optimizer",
+  "displayName": "Skill Optimizer",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "version": "2.0.0",
+  "author": {
+    "name": "Fast"
+  },
+  "homepage": "https://github.com/fastxyz/skill-optimizer#readme",
+  "repository": "https://github.com/fastxyz/skill-optimizer",
+  "license": "MIT",
+  "keywords": [
+    "agent-skills",
+    "evals",
+    "skill-testing",
+    "model-evaluation",
+    "optimization"
+  ],
+  "skills": "./skills/",
+  "interface": {
+    "displayName": "Skill Optimizer",
+    "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "developerName": "Fast",
+    "category": "Coding",
+    "capabilities": [
+      "Read",
+      "Write"
+    ],
+    "defaultPrompt": [
+      "Use Skill Optimizer to design and run an eval suite for this agent skill.",
+      "Use Skill Optimizer to inspect failing traces and improve the skill."
+    ],
+    "brandColor": "#3B82F6"
+  }
+}
diff --git a/.cursor/INSTALL.md b/.cursor/INSTALL.md
new file mode 100644
index 0000000..c6d8de6
--- /dev/null
+++ b/.cursor/INSTALL.md
@@ -0,0 +1,15 @@
+# Installing skill-optimizer for Cursor
+
+## Skill install
+
+Install the skill into Cursor's project or global skill directory through the open skills CLI:
+
+```bash
+npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a cursor -y
+```
+
+Cursor can also import remote skills from GitHub in Settings -> Rules -> Project Rules -> Add Rule -> Remote Rule (Github).
+
+## Plugin metadata
+
+This repository includes `.cursor-plugin/plugin.json` for Cursor-compatible plugin metadata. The canonical skill remains `skills/skill-optimizer/SKILL.md`.
diff --git a/.gitignore b/.gitignore
index 1a93e1c..955a362 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,7 +7,10 @@ dist/
 
 # Cache directories
 .cache/
-.opencode/
+.opencode/*
+!.opencode/INSTALL.md
+!.opencode/plugins/
+!.opencode/plugins/skill-optimizer.js
 
 # Test results and benchmarks
 results/
@@ -55,6 +58,8 @@ docs/specs/
 
 # Skill-optimizer generated artifacts
 .skill-optimizer/
+.skill-eval/
+.results/
 
 # Local user config (personal paths, model choices — not repo artifacts)
 skill-optimizer.json
diff --git a/.opencode/INSTALL.md b/.opencode/INSTALL.md
new file mode 100644
index 0000000..c93409a
--- /dev/null
+++ b/.opencode/INSTALL.md
@@ -0,0 +1,21 @@
+# Installing skill-optimizer for OpenCode
+
+Add the plugin to `opencode.json` at user or project scope:
+
+```json
+{
+  "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git"]
+}
+```
+
+Restart OpenCode. The plugin registers the repository `skills/` directory so the native `skill` tool can load `skill-optimizer`.
+
+Verify with the skill tool by listing skills or loading `skill-optimizer`.
+
+To pin a version, append a tag or commit ref:
+
+```json
+{
+  "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git#v2.0.0"]
+}
+```
diff --git a/.opencode/plugins/skill-optimizer.js b/.opencode/plugins/skill-optimizer.js
new file mode 100644
index 0000000..f733c67
--- /dev/null
+++ b/.opencode/plugins/skill-optimizer.js
@@ -0,0 +1,25 @@
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const pluginDir = path.dirname(fileURLToPath(import.meta.url)); // absolute directory that contains this plugin module
+const skillsDir = path.resolve(pluginDir, "..", "..", "skills"); // "skills" directory two levels above pluginDir (the repository root when this file lives at .opencode/plugins/)
+
+function registerSkillsDir(config) { // Adds skillsDir to config.skills.paths, mutating the given config object in place; returns nothing.
+  config.skills = config.skills || {}; // create the skills section when it is missing or falsy
+  config.skills.paths = config.skills.paths || []; // create the paths list when it is missing or falsy
+
+  if (!config.skills.paths.includes(skillsDir)) { // only append when not already listed, so repeated calls do not add duplicates
+    config.skills.paths.push(skillsDir);
+  }
+}
+
+export const SkillOptimizerPlugin = async () => ({ // async factory: resolves to an object exposing a single `config` hook
+  config: async (config) => { // receives a config object and registers skillsDir on it in place; resolves to undefined
+    registerSkillsDir(config);
+  },
+});
+
+export default { // default export: bundles the plugin id with the SkillOptimizerPlugin factory
+  id: "skill-optimizer",
+  server: SkillOptimizerPlugin, // NOTE(review): presumably the key the OpenCode plugin loader reads for the plugin factory — confirm against the OpenCode plugin docs
+};
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..0d09351
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,61 @@
+# AGENTS.md
+
+## Project Overview
+
+`skill-optimizer` is a Docker workbench for running and grading agent skill eval cases. The current public CLI centers on `run-case` and `run-suite`.
+
+## Key Commands
+
+```bash
+npm run build
+npm run typecheck
+npm test
+npx tsx src/cli.ts --help
+npx tsx src/cli.ts run-case --help
+npx tsx src/cli.ts run-suite --help
+```
+
+## Important Files
+
+- `src/cli.ts`: public CLI entrypoint
+- `src/workbench/`: workbench case loading, suite loading, Docker runner, Pi agent, graders, and traces
+- `docker/workbench-runner.Dockerfile`: generic non-root container image for setup, agent, grade, and cleanup phases
+- `skills/skill-optimizer/SKILL.md`: canonical distributable Agent Skill
+- `.claude-plugin/`, `.codex-plugin/`, `.cursor-plugin/`, `.opencode/`: cross-agent plugin manifests and install support
+- `.agents/plugins/marketplace.json`: Codex repo marketplace entry for the root plugin
+- `gemini-extension.json`, `GEMINI.md`: Gemini extension metadata and context file
+- `examples/workbench/`: tracked example eval suites
+- `README.md`: provider-specific installation instructions for Claude Code, Codex, Cursor, OpenCode, Gemini CLI, and skill-only installs
+- `CONTRIBUTING.md`: contributor workflow and current workbench invariants
+
+## Installation Docs
+
+Keep the README installation section aligned with packaged plugin metadata:
+
+- Claude Code: `.claude-plugin/plugin.json` and `.claude-plugin/marketplace.json`
+- Codex: `.agents/plugins/marketplace.json` and `.codex-plugin/plugin.json`
+- Cursor: `.cursor-plugin/plugin.json` and `.cursor/INSTALL.md`
+- OpenCode: `.opencode/plugins/skill-optimizer.js` and `.opencode/INSTALL.md`
+- Gemini CLI: `gemini-extension.json` and `GEMINI.md`
+- Skill-only installs: `npx skills add fastxyz/skill-optimizer --skill skill-optimizer ...`
+
+## Invariants
+
+- Keep evaluation static: extraction and matching are allowed; do not execute model-produced code outside the Docker workbench as part of evaluation.
+- `run-suite` uses models from `suite.yml`; do not add a `run-suite --models` override.
+- Keep OpenRouter model refs as `openrouter/...`; real model runs require `OPENROUTER_API_KEY`.
+- Cases use `graders: [{ name, command }]`; legacy `check:` and `artifacts:` are invalid.
+- Graders are the acceptance contract; evaluate outputs from `/work`, generated artifacts, `answer.json`, `trace.jsonl`, and result state.
+- The agent phase sees only `/work`, not `/case` or `/results`.
+- Keep plugin metadata pointed at the canonical `skills/skill-optimizer/SKILL.md`; do not create divergent skill copies.
+- Codex plugin metadata lives in `.codex-plugin/plugin.json`; the repo marketplace lives in `.agents/plugins/marketplace.json` and points at `./`.
+- Provider install docs should link to the same canonical skill/plugin metadata, not separate skill copies.
+- Do not commit `.skill-eval/`, `.results/`, `.env`, or credentials.
+
+## Testing Guidance
+
+- Run `npm run typecheck` after TypeScript changes.
+- Run `npm test` before finishing behavior changes.
+- For Docker runner or image changes, also run `docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .`.
+- For CLI/docs changes, verify `npx tsx src/cli.ts --help` if touched docs mention CLI behavior.
+- For plugin/package metadata changes, run `npx tsx tests/smoke-skill-distribution.ts` and verify `npm pack --dry-run --json` includes required plugin files without result/cache directories.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d52754a..5dc20ab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,22 @@
 # Changelog
 
+## 2.0.0 — 2026-05-01
+
+### Changed
+
+- Rebuilt Skill Optimizer around the eval workbench: realistic skill cases, model matrices, isolated agent workspaces, trace inspection, deterministic grader evidence, and iterative skill improvement.
+- Repositioned package and plugin metadata around the skill eval lab workflow instead of implementation mechanics.
+
+### Breaking Changes
+
+- Removed the legacy reference-solution preflight flow and `verify-suite`; graders are now the sole acceptance contract.
+- Removed reference-solution SDK exports and packaged example solution scripts.
+
+### Added
+
+- Hidden MCP services for eval cases, exposed to agents through the workbench `mcp` command.
+- Post-run optimization guidance for inspecting failures, updating skills or supporting code, and re-running evals.
+
 ## 1.1.0 — 2026-04-16
 
 ### Breaking Changes
@@ -25,7 +42,7 @@ The config file `skill-benchmark.json` is no longer auto-detected. Rename it to
 ### Added
 - **prompt surface type** — benchmark and optimize prompt templates, Claude Code skills, and agent instructions. Discovers phases and capabilities from markdown, evaluates output quality with content-based criteria.
 - **Codex auth** — direct OpenAI model runs can use browser-login tokens stored by Codex (`~/.codex/auth.json`) instead of requiring `OPENAI_API_KEY`. Set `benchmark.authMode: "codex"` and use `openai/<model>` IDs.
-- **SKILL folder** — bundled AI-agent guidance (`SKILL/SKILL.md`) so agents can use skill-optimizer reliably without extra setup.
+- **skills folder** — bundled AI-agent guidance (`skills/skill-optimizer/SKILL.md`) so agents can use skill-optimizer reliably without extra setup.
 - **Optimizer loop diagram** — README now includes a visual workflow diagram of the optimizer loop.
 - **Stable task IDs** — task IDs are now derived from a SHA-1 hash of the action names (SDK/CLI/MCP surfaces) or prompt text (prompt surface). For SDK/CLI/MCP surfaces, where action names come from discovered code rather than LLM output, IDs are stable across regenerations and the `--task <id>` filter works reliably. For the prompt surface, IDs are stable when the LLM produces identical wording; if it rephrases a task the ID changes (fixes [#17](https://github.com/fastxyz/skill-optimizer/issues/17)).
 
diff --git a/CLAUDE.md b/CLAUDE.md
index f5fd890..e91b35e 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -2,17 +2,9 @@
 
 ## Project Overview
 
-`skill-optimizer` measures whether LLMs pick the right SDK methods, CLI commands, or MCP tools from docs and task prompts, and can run a benchmark-driven optimization loop over an allowed target repo.
+`skill-optimizer` is a Docker workbench for running and grading agent skill eval cases. The current public CLI centers on `run-case` and `run-suite`.
 
-The repo has five layers:
-
-- `src/project/`: unified `skill-optimizer.json` config loading, validation, and path resolution
-- `src/runtime/pi/`: shared Pi auth/model/runtime helpers
-- `src/tasks/`: shared task generation, grounding, and artifact freezing from discovered surfaces
-- `src/benchmark/`: loads tasks and surface definitions, builds prompts, calls models, extracts actions, evaluates them, and writes reports
-- `src/optimizer/`: runs a benchmark-driven optimization loop against a constrained target repo
-
-The benchmark is static. Do not change behavior in ways that execute model-produced code or shell commands as part of evaluation.
+The workbench gives an agent an isolated Docker `/work` directory, captures traces, and grades deterministic local outcomes from files, command logs, generated artifacts, or other workspace state.
 
 ## Key Commands
 
@@ -21,101 +13,52 @@ npm run build
 npm run typecheck
 npm test
 npx tsx src/cli.ts --help
-npx tsx src/cli.ts generate-tasks --help
-npx tsx src/cli.ts optimize --help
-```
-
-Typical benchmark run:
-
-```bash
-export OPENROUTER_API_KEY=...
-npx tsx src/cli.ts run --config ./skill-optimizer.json
-```
-
-Generate tasks only:
-
-```bash
-npx tsx src/cli.ts generate-tasks --config ./skill-optimizer.json
-```
-
-Typical optimizer run:
-
-```bash
-tsx src/optimizer/materialize-mock-repo.ts mcp-tracker-demo ./.tmp/mock-repos
-npx tsx src/cli.ts optimize --config ./.tmp/mock-repos/mcp-tracker-demo/skill-optimizer.json
+npx tsx src/cli.ts run-case --help
+npx tsx src/cli.ts run-suite --help
 ```
 
 ## Important Files
 
-- `src/cli.ts`: public CLI entrypoint (`init`, `run`, `optimize`, `compare`)
-- `src/project/types.ts`: unified public project config types
-- `src/project/load.ts`: unified `skill-optimizer.json` loader
-- `src/runtime/pi/models.ts`: shared Pi model/auth resolution
-- `src/tasks/index.ts`: shared task generation entrypoint over discovered surfaces
-- `src/benchmark/runner.ts`: orchestration for benchmark execution
-- `src/benchmark/types.ts`: benchmark report, metric, and extraction types
-- `src/benchmark/init.ts`: scaffolded starter `skill-optimizer.json`
-- `src/optimizer/loop.ts`: accept/reject iteration loop
-- `src/optimizer/manifest.ts`: adapter from unified project config into the current optimizer loop
-- `src/optimizer/mock-repos.ts`: tracked template materialization and isolated git init
-- `mock-repos/mcp-tracker-demo/`: current richer demo target for optimizer testing
+- `src/cli.ts`: public CLI entrypoint
+- `src/workbench/`: workbench case loading, suite loading, Docker runner, Pi agent, graders, and traces
+- `docker/workbench-runner.Dockerfile`: generic non-root container image for setup, agent, grade, and cleanup phases
+- `skills/skill-optimizer/SKILL.md`: canonical distributable Agent Skill
+- `skills/skill-optimizer/references/workbench.md`: detailed workbench schema and usage reference
+- `.claude-plugin/`, `.codex-plugin/`, `.cursor-plugin/`, `.opencode/`: cross-agent plugin manifests and install support
+- `.agents/plugins/marketplace.json`: Codex repo marketplace entry for the root plugin
+- `gemini-extension.json`, `GEMINI.md`: Gemini extension metadata and context file
+- `examples/workbench/`: tracked example eval suites
+- `README.md`: provider-specific installation instructions for Claude Code, Codex, Cursor, OpenCode, Gemini CLI, and skill-only installs
+- `CONTRIBUTING.md`: contributor workflow and current workbench invariants
+
+## Installation Docs
+
+Keep the README installation section aligned with packaged plugin metadata:
+
+- Claude Code: `.claude-plugin/plugin.json` and `.claude-plugin/marketplace.json`
+- Codex: `.agents/plugins/marketplace.json` and `.codex-plugin/plugin.json`
+- Cursor: `.cursor-plugin/plugin.json` and `.cursor/INSTALL.md`
+- OpenCode: `.opencode/plugins/skill-optimizer.js` and `.opencode/INSTALL.md`
+- Gemini CLI: `gemini-extension.json` and `GEMINI.md`
+- Skill-only installs: `npx skills add fastxyz/skill-optimizer --skill skill-optimizer ...`
 
 ## Invariants
 
-- Keep benchmark evaluation static. Extraction and matching are allowed; executing generated code is not.
-- Keep path resolution relative to the unified config file being loaded.
-- `targetRepo.allowedPaths` is the optimizer safety boundary. Do not widen edits outside it during mutation.
-- `requireCleanGit` must remain effectively enforced for optimizer targets.
-- Optimizer-owned artifacts under the configured task-generation output dir must not be treated as target-repo mutations.
-- **The target repo's skill file is never modified.** The optimizer copies it to `.skill-optimizer/skill-v0.md` on start and creates versioned copies per accepted iteration. The mutation agent writes to these local copies; `skillOverride` makes the benchmark read from them.
-- Stable-surface optimize runs assume the callable surface is frozen for the duration of the run. If a change renames commands/tools/APIs, the surface must be rediscovered and the benchmark snapshot regenerated before further comparisons are meaningful.
-- Materialized mock repos must stay isolated from tracked templates.
-- Documentation examples should match the current CLI and config schema.
-
-## Editing Guidance
-
-- Prefer small changes in the existing architecture over broad refactors.
-- When updating config or project types, also update the README examples and any scaffolding in `src/benchmark/init.ts` if needed.
-- When changing optimizer behavior, verify both the loop and the unified project defaults still agree.
-- Code-first surface discovery is the preferred mode for `sdk`, `cli`, and `mcp` via `target.discovery.sources`. Explicit manifest files (`target.cli.commands`, `target.mcp.tools`, `target.discovery.fallbackManifest`) are supported for projects that cannot use code-first discovery.
-- Be careful around mock repo references: code may support template names that are not currently present in the working tree.
+- Keep evaluation static: extraction and matching are allowed; do not execute model-produced code outside the Docker workbench as part of evaluation.
+- `run-suite` uses models from `suite.yml`; do not add a `run-suite --models` override.
+- Keep OpenRouter model refs as `openrouter/...`; real model runs require `OPENROUTER_API_KEY`.
+- Cases use `graders: [{ name, command }]`; legacy `check:` and `artifacts:` are invalid.
+- Graders are the acceptance contract; evaluate outputs from `/work`, generated artifacts, `answer.json`, `trace.jsonl`, and result state.
+- The agent phase sees only `/work`, not `/case` or `/results`.
+- Keep plugin metadata pointed at the canonical `skills/skill-optimizer/SKILL.md`; do not create divergent skill copies.
+- Codex plugin metadata lives in `.codex-plugin/plugin.json`; the repo marketplace lives in `.agents/plugins/marketplace.json` and points at `./`.
+- Provider install docs should link to the same canonical skill/plugin metadata, not separate skill copies.
+- Do not commit `.skill-eval/`, `.results/`, `.env`, or credentials.
 
 ## Testing Guidance
 
 - Run `npm run typecheck` after TypeScript changes.
-- Run `npm test` before finishing when behavior changes may affect extraction, evaluation, reporting, or optimizer flow.
-- For CLI-only or docs-only changes, at minimum verify `npx tsx src/cli.ts --help` still works if the touched docs reference CLI behavior.
-
-## Model ID Convention
-
-Model IDs use a provider-prefixed format. The prefix determines how the request is routed:
-
-```
-openrouter/<provider>/<model-slug>   — routed through OpenRouter
-anthropic/<model-slug>               — direct Anthropic API
-openai/<model-slug>                  — direct OpenAI API
-```
-
-**For `openrouter/` model IDs, preserve the exact slug from OpenRouter's catalog** — these are passed verbatim to OpenRouter's API and must match exactly, including dots in version numbers:
-- `openrouter/anthropic/claude-sonnet-4.6` ✓ (dots — OpenRouter's catalog format)
-- `openrouter/openai/gpt-5.4` ✓ (dots)
-- `openrouter/deepseek/deepseek-v3.2` ✓ (dots)
-- `openrouter/google/gemini-2.5-flash` ✓
-
-**For `anthropic/` direct-API model IDs, use hyphens** — Anthropic's own API slugs use hyphens:
-- `anthropic/claude-sonnet-4-6` ✓ (hyphens)
-- `anthropic/claude-opus-4-6` ✓ (hyphens)
-
-**For `openai/` direct-API model IDs, use dots in version segments** — OpenAI's API slugs use dots:
-- `openai/gpt-5.4` ✓ (dot)
-- `openai/gpt-4.1` ✓ (dot)
-
-`src/project/validate.ts` warns on dot-notation for `anthropic/` model IDs only (`model-id-bad-format`) and `src/project/fix.ts` auto-corrects them. Both `openai/` and `openrouter/` are fully exempt from any dot→hyphen rewriting. When adding new model presets to `src/init/scaffold.ts`, `src/init/wizard.ts`, or `src/benchmark/init.ts`, copy the slug exactly from the OpenRouter catalog for `openrouter/` models.
-
-Display names (`name:` / `label:` fields) are human-readable and should keep dots (e.g. `'Claude Sonnet 4.6'`, `'Gemini 2.5 Flash'`).
-
-## Environment Notes
-
-- Do not commit `.env` or secrets.
-- Pi-based examples use `benchmark.format: "pi"` and typically expect `OPENROUTER_API_KEY`.
-- The current unified config also allows the optimizer model to use `OPENROUTER_API_KEY`.
+- Run `npm test` before finishing behavior changes.
+- For Docker runner or image changes, also run `docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .`.
+- For CLI/docs changes, verify `npx tsx src/cli.ts --help` if touched docs mention CLI behavior.
+- For plugin/package metadata changes, run `npx tsx tests/smoke-skill-distribution.ts` and verify `npm pack --dry-run --json` includes required plugin files without result/cache directories.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2a4cd71..870e01f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,10 @@
 # Contributing to skill-optimizer
 
-Thanks for contributing! This project is a small, opinionated tool — changes should preserve its core invariants (static evaluation, `allowedPaths` safety boundary, per-model universality).
+Thanks for contributing! This project is a small, opinionated Docker workbench for evaluating agent skills. Changes should preserve deterministic grading, isolated agent workspaces, and the canonical `skills/skill-optimizer/SKILL.md` distribution path.
+
+## Installing The Skill
+
+See `README.md#installation` for provider-specific install instructions for Claude Code, OpenAI Codex CLI/App, Cursor, OpenCode, Gemini CLI, and skill-only installs.
 
 ## Local workflow
 
@@ -13,42 +17,48 @@ npm test
 npm run build
 ```
 
-All three commands must pass before opening a PR.
+All three commands must pass before opening a PR when code changes are involved.
 
 ## Project layout
 
-- `src/cli.ts` — CLI entry point (single source of truth; all `npm run <script>` aliases go through it).
-- `src/project/` — config load / validate / resolve.
-- `src/tasks/` — scope filtering, coverage-guaranteed task generation.
-- `src/benchmark/` — runner, evaluator, reporter, scoring.
-- `src/optimizer/` — mutation loop, feedback pipeline, ledger.
-- `src/verdict/` — recommendations critic + rendering.
+- `src/cli.ts` — public CLI entry point for `run-case` and `run-suite`.
+- `src/workbench/` — case/suite loading, Docker runner, Pi agent wiring, graders, traces, metrics, MCP support, and trial aggregation.
+- `docker/workbench-runner.Dockerfile` — non-root container image for setup, agent, grade, and cleanup phases.
+- `skills/skill-optimizer/SKILL.md` — canonical distributable Agent Skill.
+- `skills/skill-optimizer/references/workbench.md` — detailed workbench schema and authoring reference.
+- `examples/workbench/` — packaged example suites.
+- `.claude-plugin/`, `.codex-plugin/`, `.cursor-plugin/`, `.opencode/`, `.agents/plugins/marketplace.json`, `gemini-extension.json`, `GEMINI.md` — cross-agent plugin and extension metadata.
 - `tests/` — hand-rolled smoke tests (`tsx tests/smoke-*.ts`).
 
 ## Pre-submit expectations
 
 - One feature per PR.
 - TDD: write the failing test first, implement, confirm green, commit.
-- Update `CHANGELOG.md` under the next release section.
+- Update docs and examples when behavior or install flow changes.
 - No new npm dependencies without discussion.
 - Error messages name the next step.
+- Do not commit `.skill-eval/`, `.results/`, `.env`, or credentials.
 
-## Adding a surface type
+## Workbench invariants
 
-A surface discoverer returns `ActionDefinition[]`. To add one:
+- Keep evaluation static: extraction and matching are allowed; do not execute model-produced code outside the Docker workbench as part of evaluation.
+- Use only `openrouter/...` model refs; real model runs require `OPENROUTER_API_KEY`.
+- `run-suite` uses models from `suite.yml`; do not add a `run-suite --models` override.
+- Cases use `graders: [{ name, command }]`; legacy `check:` and `artifacts:` are invalid.
+- The agent phase sees only `/work`, not `/case`, `/results`, graders, hidden answers, or hidden metadata.
+- Keep plugin metadata pointed at the canonical `skills/skill-optimizer/SKILL.md`; do not create divergent skill copies.
 
-1. Extend `BenchmarkSurface` in `src/benchmark/types.ts`.
-2. Add a branch to `src/project/validate.ts` and `src/project/resolve.ts`.
-3. Implement the new code-first discoverer in `src/actions/discover.ts`, then register it in `src/project/snapshot.ts` (the dispatcher that routes surfaces to discoverers).
-4. Add a discovery smoke test.
+## Testing guidance
 
-## Adding an LLM provider
+- Run `npm run typecheck` after TypeScript changes.
+- Run `npm test` before finishing behavior changes.
+- For Docker runner or image changes, also run `docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .`.
+- For CLI/docs changes, verify `npx tsx src/cli.ts --help` if touched docs mention CLI behavior.
+- For plugin/package metadata changes, run `npx tsx tests/smoke-skill-distribution.ts` and verify `npm pack --dry-run --json` includes required plugin files without result/cache directories.
 
-Current transport is pi-ai + OpenRouter. To add a provider:
+## Adding workbench capabilities
 
-1. Add a new format value to `LLMConfig.format` in `src/benchmark/types.ts`.
-2. Implement the transport adapter alongside `src/runtime/pi/`.
-3. Update `createDefaultPiTaskGenerator`, `createDefaultPiCritic`, and the benchmark runner to branch on the new format.
+Keep new capabilities small and deterministic. Add validation in the relevant loader, tests in `tests/smoke-workbench-*.ts`, and docs in `skills/skill-optimizer/references/workbench.md` or `docs/workbench.md` when users need to author new YAML fields or understand new runtime behavior.
 
 ## Commit style
 
diff --git a/GEMINI.md b/GEMINI.md
new file mode 100644
index 0000000..8f6d532
--- /dev/null
+++ b/GEMINI.md
@@ -0,0 +1,5 @@
+@./AGENTS.md
+@./README.md
+@./CONTRIBUTING.md
+@./skills/skill-optimizer/SKILL.md
+@./skills/skill-optimizer/references/workbench.md
diff --git a/README.md b/README.md
index 404058f..4dc6a6b 100644
--- a/README.md
+++ b/README.md
@@ -1,229 +1,182 @@
 # skill-optimizer
 
-Benchmark and self-optimize SDK, CLI, and MCP guidance so every agent model can use your tool reliably.
+Docker workbench and Agent Skill for running deterministic evals against agent skills.
 
-skill-optimizer runs your SDK / CLI / MCP docs against multiple LLMs, measures whether they call the right actions with the right arguments, and iteratively rewrites your `SKILL.md` / docs until a floor score is met across every model.
+Use this repo in two ways:
 
-Built by the team at [Fast](https://fast.xyz/) — payment infrastructure for AI agents. [Give your agent a wallet](https://github.com/fastxyz/fast-sdk) in 3 lines of code.
+- Install the `skill-optimizer` skill/plugin into your agent so it can author and debug eval suites.
+- Run the local CLI to execute cases and suites in Docker against OpenRouter models.
 
-**Requirements:** Node.js 20+, plus either an [OpenRouter](https://openrouter.ai) API key or a local Codex login when using direct OpenAI models.
+## Installation
 
-## How it works — at a glance
+Installation differs by agent. The canonical skill is `skills/skill-optimizer/SKILL.md`; every plugin manifest points at that same file.
 
-![Optimizer Loop](https://raw.githubusercontent.com/fastxyz/skill-optimizer/main/docs/images/optimizer-loop.svg)
+### Claude Code
 
-`skill-optimizer run` benchmarks your callable surface against multiple LLMs — it discovers actions, generates tasks, calls each model, and statically evaluates action recall and argument accuracy to produce a PASS/FAIL verdict (exit 0/1) usable in CI.
+Register this repository as a Claude Code plugin marketplace:
 
-`skill-optimizer optimize` runs the benchmark as a feedback loop: it copies your SKILL.md, mutates it with an LLM agent, re-benchmarks, accepts only when scores improve, and repeats until stable. Your original SKILL.md is never modified.
+```text
+/plugin marketplace add fastxyz/skill-optimizer
+```
 
-## Installation
+Then install the plugin:
 
-```bash
-git clone https://github.com/fastxyz/skill-optimizer
-cd skill-optimizer
-npm install
-npm run build
-npm link        # makes `skill-optimizer` available globally
+```text
+/plugin install skill-optimizer@skill-optimizer
 ```
 
-## Quickstart
+### OpenAI Codex CLI
+
+Register this repository as a Codex plugin marketplace:
 
 ```bash
-export OPENROUTER_API_KEY=sk-or-...
+codex plugin marketplace add fastxyz/skill-optimizer
 ```
 
-For direct OpenAI API calls you can use your local Codex browser login instead of exporting `OPENAI_API_KEY` — set `format: "openai"` and `authMode: "codex"`:
+Then open the plugin search interface:
 
-```json
-{
-  "benchmark": {
-    "format": "openai",
-    "authMode": "codex",
-    "models": [
-      { "id": "openai/gpt-5.4", "name": "GPT-5.4", "tier": "flagship" }
-    ]
-  }
-}
+```text
+/plugins
 ```
 
-Codex auth reads a browser-login JWT or a static `OPENAI_API_KEY` from `~/.codex/auth.json`. It only applies to `openai/` model refs; `openrouter/` models always use `OPENROUTER_API_KEY`.
+Select `skill-optimizer` and install it.
 
-**Step 1 — Scaffold config** (run from your project root):
+### OpenAI Codex App
 
-```bash
-npx skill-optimizer init cli       # or: init sdk, init mcp, init prompt
-```
-
-The wizard asks for your repo path, models to benchmark, and where your `SKILL.md` lives. It creates a `.skill-optimizer/` directory:
-- `.skill-optimizer/skill-optimizer.json` — the main config (commit this)
-- `.skill-optimizer/cli-commands.json` — CLI surface manifest (template to edit, or auto-extracted)
-- `.skill-optimizer/tools.json` — MCP surface manifest (template to edit)
+In the Codex app, open Plugins from the sidebar, search for `skill-optimizer`, and install it from the Coding section.
 
-**Step 2 — (CLI/MCP only) Extract your surface** if code-first discovery yields nothing:
+If it is not listed, register this repository as a plugin marketplace from Codex CLI first:
 
 ```bash
-npx skill-optimizer import-commands --from ./src/cli.ts
-# or for a compiled binary:
-npx skill-optimizer import-commands --from my-cli --scrape
+codex plugin marketplace add fastxyz/skill-optimizer
 ```
 
-**Step 3 — Run a benchmark:**
-
-```bash
-npx skill-optimizer run --config ./.skill-optimizer/skill-optimizer.json
-```
+### Cursor
 
-**Step 4 — Run the optimizer** (iteratively improves your `SKILL.md`):
+Install the skill with the open skills CLI:
 
 ```bash
-npx skill-optimizer optimize --config ./.skill-optimizer/skill-optimizer.json
+npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a cursor -y
 ```
 
-The optimizer never modifies your original `SKILL.md` — it works from versioned local copies in `.skill-optimizer/` and prints a progress table at the end showing per-model improvement.
+Cursor can also import the skill from GitHub via Settings -> Rules -> Project Rules -> Add Rule -> Remote Rule (Github). The Cursor plugin metadata lives at `.cursor-plugin/plugin.json`.
 
----
+### OpenCode
 
-**Non-interactive / CI mode:**
+Tell OpenCode:
 
-```bash
-# Accept all wizard defaults without prompts
-npx skill-optimizer init cli --yes
-
-# Load answers from a JSON file
-npx skill-optimizer init --answers answers.json
+```text
+Fetch and follow instructions from https://raw.githubusercontent.com/fastxyz/skill-optimizer/refs/heads/main/.opencode/INSTALL.md
 ```
 
-`answers.json` format:
+Or add the plugin to `opencode.json` at user or project scope:
+
 ```json
 {
-  "surface": "cli",
-  "repoPath": "/absolute/path/to/your-repo",
-  "models": [
-    "openrouter/anthropic/claude-sonnet-4.6",
-    "openrouter/deepseek/deepseek-v3.2",
-    "openrouter/google/gemini-2.5-flash",
-    "openrouter/qwen/qwen3.5-397b-a17b",
-    "openrouter/moonshotai/kimi-k2.5",
-    "openrouter/z-ai/glm-5.1",
-    "openrouter/minimax/minimax-m2.7",
-    "openrouter/google/gemma-4-31b-it",
-    "openrouter/meta-llama/llama-4-maverick"
-  ],
-  "maxTasks": 20,
-  "maxIterations": 5,
-  "entryFile": "src/cli.ts"
+  "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git"]
 }
 ```
 
-**Key config fields** in `.skill-optimizer/skill-optimizer.json`:
+Restart OpenCode. See `docs/README.opencode.md` for details.
 
-| Field | What it does | Set it to |
-|-------|-------------|-----------|
-| `target.repoPath` | Root of the project being benchmarked | Absolute or relative path to your repo |
-| `target.discovery.sources` | Source files to scan for callable methods/commands/tools | e.g. `["../src/index.ts"]` or `["../src/server.ts"]` |
-| `target.skill` | Docs file the optimizer will edit | Path to your `SKILL.md` or equivalent guidance doc |
-| `benchmark.models` | Models to benchmark | Model IDs with provider prefix: `openrouter/<provider>/<model>` (via OpenRouter), `anthropic/<model>` (direct Anthropic), `openai/<model>` (direct OpenAI) |
-| `benchmark.authMode` | How model auth is resolved | `env` (default), `codex`, or `auto` |
+### Gemini CLI
 
-### Prompt templates / Claude Code skills
-
-Benchmark how well models follow your prompt templates:
+Install the Gemini extension from GitHub:
 
 ```bash
-skill-optimizer init prompt
-skill-optimizer run
+gemini extensions install https://github.com/fastxyz/skill-optimizer
 ```
 
-The prompt surface discovers phases and capabilities from your SKILL.md,
-generates scenario-based tasks, and evaluates output quality — not just
-tool calls. Each task is tagged with the specific capability it exercises
-(`capabilityId`), and scoring is performed against that capability's
-criteria — not the first discovered capability. It scores responses on
-required sections, format patterns, forbidden keywords, and structural
-elements (code blocks, numbered lists, tables). Coverage violations do
-not hard-fail prompt runs; coverage is informational for the prompt
-surface. This lets you optimize prompt templates the same way you
-optimize SDK/CLI/MCP guidance.
-
-## How it works
+To update:
 
-1. **Discover** callable surface (SDK methods / CLI commands / MCP tools / prompt phases) via tree-sitter, manifest, or markdown parsing.
-2. **Scope** the surface with `target.scope.include` / `target.scope.exclude` globs.
-3. **Generate tasks** — one prompt per in-scope action, coverage-guaranteed.
-4. **Benchmark** — every configured model attempts every task; static evaluator checks action calls + args.
-5. **Verdict** — PASS/FAIL against two gates (per-model floor, weighted average).
-6. **Optimize** — create a local versioned copy of your `SKILL.md` (`skill-v{N}.md` in `.skill-optimizer/`), mutate it, re-benchmark, accept only if both gates hold, rollback if not. The target repo's original skill file is never modified.
-7. **Recommendations** — on FAIL, one critic call summarizes what to improve manually.
-8. **Progress table** — after the optimizer finishes, a per-model table shows Baseline → each iteration → Final → Δ so you can see exactly where each model improved.
+```bash
+gemini extensions update skill-optimizer
+```
 
-## Configuration reference
+### Skill-Only Install
 
-See [docs/reference/config-schema.md](https://github.com/fastxyz/skill-optimizer/blob/main/docs/reference/config-schema.md) for the full generated config reference — auto-updated at every build.
+If you only want the skill files without plugin metadata, use the open skills CLI:
 
-See [docs/reference/errors.md](https://github.com/fastxyz/skill-optimizer/blob/main/docs/reference/errors.md) for all error codes, descriptions, and fix instructions.
+```bash
+npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a claude-code -a opencode -a codex -a cursor -y
+```
 
-## Interpreting the verdict
+## Local CLI Setup
 
-Every benchmark run produces one of two verdicts: **PASS** or **FAIL**.
+Requirements:
 
-Two gates must both be satisfied for a PASS:
+- Node.js 20+
+- Docker
+- `OPENROUTER_API_KEY` for real model runs
 
-- **`benchmark.verdict.perModelFloor`** (default `0.6`): every model must pass at least this fraction of tasks. A single model below the floor fails the run, regardless of the average.
-- **`benchmark.verdict.targetWeightedAverage`** (default `0.7`): the weighted average score across all models must reach this threshold.
+Install and build:
 
-**`benchmark.models[].weight`** (default `1.0`): heavier-weighted models count more toward the weighted average. Use higher weights for flagship models you care most about.
+```bash
+npm install
+npm run build
+```
 
-The **optimizer** only accepts a mutation when:
-1. the weighted average improves by at least `minImprovement`, AND
-2. no model that was above the floor drops below it.
+Only `openrouter/...` model refs are supported.
 
-**Exit codes**: `0` = PASS, `1` = FAIL — usable directly in CI pipelines.
+## Quick Start
 
-## Scope & coverage
+Run the example PDF suite against the models listed in its `suite.yml`:
 
-Control which actions are benchmarked with `target.scope`:
+```bash
+npx tsx src/cli.ts run-suite examples/workbench/pdf/suite.yml --trials 1
+```
 
-- **`target.scope.include`** (default `["*"]`): glob patterns for actions to include.
-- **`target.scope.exclude`** (default `[]`): glob patterns for actions to exclude.
+Run one case directly:
 
-The `*` wildcard matches any sequence of characters including dots and slashes — it is not limited to a single path segment.
+```bash
+npx tsx src/cli.ts run-case ./case.yml --model openrouter/google/gemini-2.5-flash
+```
 
-Examples:
-- `"Wallet.*"` — includes all Wallet methods
-- `"*.internal*"` — excludes anything with "internal" anywhere in the name
-- `"get_*"` — includes only getter actions
+CLI help:
 
-Task generation is **coverage-guaranteed**: every in-scope action gets at least one task. If the first generation pass misses any, a targeted retry runs (max 2 iterations). If coverage still fails, an error names the uncovered actions and suggests either fixing SKILL.md guidance or adding them to `scope.exclude`.
+```bash
+npx tsx src/cli.ts --help
+npx tsx src/cli.ts run-case --help
+npx tsx src/cli.ts run-suite --help
+```
 
-## Cost notes
+## How The Workbench Works
 
-Rough LLM spend per run:
+The workbench gives an agent a skill/reference folder, an isolated `/work` directory, and deterministic graders. It is designed for evals where success can be verified from files, command logs, SQL, generated artifacts, or other local state.
 
-- **Baseline benchmark**: N models × M tasks LLM calls.
-- **Optimizer iteration**: 1 mutation call + N models × M tasks re-benchmark per iteration.
-- **Recommendations**: 1 critic call, only on FAIL verdict.
+Core concepts:
 
-No per-failure LLM calls — feedback is deterministic (structured failure details + patterns + passing/failing diffs).
+- A case is one user-like task plus one or more graders.
+- A suite is a matrix of cases and OpenRouter models.
+- `references/` is copied into `/work`; this is where the skill under test lives.
+- The agent phase sees only `/work`, not graders, hidden answers, `/case`, or `/results`.
+- Graders run after the agent with `$CASE`, `$WORK`, and `$RESULTS` available.
+- Graders are the acceptance contract. They can inspect workspace files and artifacts, `answer.json`, `trace.jsonl`, and result state under `$RESULTS`.
 
-## Dependencies
+Read `docs/workbench.md` for the full model: directory layout, Docker phases, graders, outputs, and debugging.
 
-The optimizer's coding agent is powered by `@mariozechner/pi-coding-agent`. OpenRouter-backed runs still use your configured API key env var. Direct OpenAI runs can use either `OPENAI_API_KEY` or the browser-login tokens that Codex stores in `~/.codex/auth.json`.
+## Examples
 
-## Troubleshooting
+Tracked examples live under `examples/workbench/`. The PDF example includes positive PDF extraction/splitting/creation cases and a negative case that checks that the agent did not read the PDF skill file for a non-PDF task. The MCP example shows a local calculator server started as a hidden Docker service and exposed through the workbench `mcp` command.
 
-**Missing `OPENROUTER_API_KEY`**: Set it in your shell before running:
 ```bash
-export OPENROUTER_API_KEY=sk-or-...
+npx tsx src/cli.ts run-suite examples/workbench/pdf/suite.yml --trials 1
+npx tsx src/cli.ts run-suite examples/workbench/mcp/suite.yml --trials 1
 ```
 
-**Using Codex auth**: Set `benchmark.authMode` (and optionally `optimize.authMode`) to `"codex"` or `"auto"` and use direct OpenAI model refs such as `openai/gpt-5.4`. Codex auth only applies to the `openai` provider and reads either a browser-login access token or `OPENAI_API_KEY` from `~/.codex/auth.json`. Alternatively, set `benchmark.format` to `"openai"` with `authMode: "codex"` and `openai/...` model IDs — the client bridges to the Pi/Codex path automatically.
-
-**Dirty git**: The optimizer requires a clean git state in the target repo (`requireCleanGit: true` by default). Commit or stash uncommitted changes before running. Note: the optimizer never writes to the target repo's skill file — it works from local versioned copies in `.skill-optimizer/`.
+## Development
 
-**`maxTasks < scope_size`**: `benchmark.taskGeneration.maxTasks` must be >= the number of in-scope actions. Run `npx skill-optimizer --dry-run --config .skill-optimizer/skill-optimizer.json` to see the count without making LLM calls.
+```bash
+npm run typecheck
+npm test
+npm run build
+npx tsx src/cli.ts --help
+```
 
-**Empty scope**: `target.scope.include` matched nothing. Check your glob patterns — remember `*` matches everything including dots.
+For Docker runner or image changes:
 
-## Contributing
+```bash
+docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .
+```
 
-See [CONTRIBUTING.md](https://github.com/fastxyz/skill-optimizer/blob/main/CONTRIBUTING.md).
+Do not commit `.skill-eval/`, `.results/`, `.env`, or credentials.
diff --git a/SKILL/SKILL.md b/SKILL/SKILL.md
deleted file mode 100644
index 38aa1ca..0000000
--- a/SKILL/SKILL.md
+++ /dev/null
@@ -1,160 +0,0 @@
----
-name: skill-optimizer
-description: >
-  Benchmark and optimize SDK, CLI, MCP, and prompt documentation so every LLM
-  model can reliably call the right actions with correct arguments. Use when
-  setting up skill-optimizer for a project, running benchmarks, interpreting
-  results, optimizing SKILL.md files, or diagnosing configuration issues. Also
-  use when working inside the skill-optimizer repository itself — for running
-  against mock repos, testing changes, or understanding the codebase.
----
-
-# skill-optimizer
-
-Benchmark your SDK / CLI / MCP / prompt docs against multiple LLMs, measure whether they call the right actions with the right arguments, and iteratively rewrite your guidance until a quality floor is met across every model.
-
-## Context Detection
-
-Before doing anything, figure out where you are:
-
-1. **Look for `skill-optimizer.json`** (in CWD or parent directories). If found, you are in a **configured target project**. Use that file path as `<config-path>` in all commands below.
-
-2. **Look for `src/cli.ts` and a `package.json` with `"name": "skill-optimizer"`**. If found, you are in the **optimizer repo itself**. You can use dev commands directly (`npm run build`, `npm test`, `npx tsx src/cli.ts`). To benchmark a target, either use the mock repos in `mock-repos/` or point `--config` at an external project's config.
-
-3. **Neither found** — you are in an **unconfigured target project**. Read `references/setup.md` to scaffold a config before proceeding.
-
-## Quick Reference
-
-| Task | Command |
-|------|---------|
-| Init config (interactive) | `npx skill-optimizer init cli\|sdk\|mcp\|prompt` |
-| Init (non-interactive, explicit surface) | `npx skill-optimizer init cli --yes` |
-| Init (auto-detect surface, non-interactive) | `npx skill-optimizer init --auto --yes` |
-| Import CLI commands | `npx skill-optimizer import-commands --from ./src/cli.ts` |
-| Import with output file | `npx skill-optimizer import-commands --from ./src/cli.ts --out ./commands.json` |
-| Import (overwrite existing) | `npx skill-optimizer import-commands --from ./src/cli.ts --out ./commands.json --force` |
-| Import (binary scrape) | `npx skill-optimizer import-commands --from my-cli --scrape --depth 3` |
-| Diagnose config | `npx skill-optimizer doctor --config <config-path>` |
-| Diagnose (skip code discovery) | `npx skill-optimizer doctor --config <config-path> --static` |
-| Diagnose (verify model access) | `npx skill-optimizer doctor --config <config-path> --check-models` |
-| Auto-fix config | `npx skill-optimizer doctor --fix --config <config-path>` |
-| Dry run (no LLM calls) | `npx skill-optimizer run --dry-run --config <config-path>` |
-| Run benchmark | `npx skill-optimizer run --config <config-path>` |
-| Run (filter by model tier) | `npx skill-optimizer run --config <config-path> --tier flagship` |
-| Generate tasks only | `npx skill-optimizer generate-tasks --config <config-path>` |
-| Run optimizer | `npx skill-optimizer optimize --config <config-path>` |
-| Compare two runs | `npx skill-optimizer compare --baseline a.json --current b.json` |
-
-`<config-path>` is the path to your `skill-optimizer.json` — typically `./.skill-optimizer/skill-optimizer.json` after running `init`, or wherever you placed it.
-
-## What Do You Need?
-
-Read the reference file that matches your current goal:
-
-| Goal | Reference |
-|------|-----------|
-| Set up skill-optimizer for a project (first time) | Read `references/setup.md` |
-| Run a benchmark or understand results | Read `references/benchmark.md` |
-| Automatically optimize a SKILL.md | Read `references/optimize.md` |
-| Understand config options | Read `references/config.md` |
-
-If you are in an **unconfigured project** (context detection case 3), start with `references/setup.md`.
-
-## Command Details
-
-### `init` — scaffold a skill-optimizer config
-
-The `init` command has three modes:
-
-1. **Interactive wizard** (default): `npx skill-optimizer init [surface]` — prompts you through setup. Optionally pass `cli`, `sdk`, `mcp`, or `prompt` as a positional argument to pre-select the surface type.
-
-2. **Non-interactive with explicit surface**: `npx skill-optimizer init <surface> --yes` — accepts all defaults for the named surface without prompting.
-
-3. **Auto-detect + non-interactive** (fully automated, zero prompts): `npx skill-optimizer init --auto --yes` — inspects the current directory to detect the surface type, then applies defaults without prompting. This is the right choice when the task says "initialize without prompts", "fully automated setup", or "detect and scaffold" — especially when the surface type isn't stated.
-
-Key parameters:
-
-| Parameter | Meaning | Notes |
-|-----------|---------|-------|
-| `[surface]` | Positional: `cli`, `sdk`, `mcp`, or `prompt` | Optional; omit when using `--auto` or running the interactive wizard |
-| `--auto` | Auto-detect surface type from CWD | Detects surface; still prompts unless combined with `--yes` |
-| `--yes` | Accept all defaults without prompting | Alone: needs explicit surface. With `--auto`: fully non-interactive. |
-| `--answers <file.json>` | Load answers from a JSON file | For CI pipelines with a pre-built answers file |
-
-**Critical:** `--auto` and `--yes` have independent effects. `--yes` alone still requires a surface name. `--auto` alone still opens the interactive wizard (pre-filled). Only `--auto --yes` together produces a completely non-interactive run.
-
-```
-# Fully automated: detect surface + accept defaults (no prompts at all)
-npx skill-optimizer init --auto --yes
-
-# Explicit surface, no prompts
-npx skill-optimizer init cli --yes
-
-# Interactive wizard for MCP surface
-npx skill-optimizer init mcp
-```
-
-### `doctor` — diagnose your configuration
-
-The base command validates your `skill-optimizer.json` and checks that discovered surfaces are intact. Two optional flags activate additional checks that are *off by default*:
-
-- `--static` — skip live code discovery (tree-sitter analysis). Use this when you want to validate config and manifests without requiring the project source to be present, or to speed up CI checks. **Do not confuse with `--no-discovery` — the correct flag is `--static`.**
-- `--check-models` — ping each configured model to verify API credentials and routing are working. Use this when you suspect auth issues or want to confirm model availability before a benchmark run. **The flag is `--check-models`, not `--ping` or `--verify-models`.**
-
-These flags are independent and can be combined:
-```
-npx skill-optimizer doctor --config ./skill-optimizer.json --static
-npx skill-optimizer doctor --config ./skill-optimizer.json --check-models
-npx skill-optimizer doctor --config ./skill-optimizer.json --static --check-models
-```
-
-### `import-commands` — extract CLI surface from source or binary
-
-Discovery mode is determined by whether `--scrape` is present:
-
-- **Source mode** (default): `--from` points to a TypeScript/JavaScript file (e.g. `./src/cli.ts`). Tree-sitter parses commands statically.
-- **Scrape mode**: Add `--scrape` to invoke the binary named in `--from` and walk its `--help` output.
-
-Key parameters:
-
-| Parameter | Meaning | Notes |
-|-----------|---------|-------|
-| `--from <source>` | File path or binary name to import from | Required |
-| `--out <path>` | Write discovered commands to this JSON file | Optional; without it, output goes to stdout |
-| `--force` | Overwrite `--out` file if it already exists | Required when the output file exists; without it the command refuses to overwrite |
-| `--scrape` | Invoke as a binary and parse `--help` output | Enables scrape mode |
-| `--depth <n>` | Max subcommand depth to explore during scrape | Only meaningful with `--scrape`; **the flag is `--depth`, not `--max-depth`** |
-
-Output goes to the `--out` file — **do not use shell redirection (`>`) to capture output** because the tool writes structured JSON with metadata that is not suitable for piping.
-
-```
-# Source import, write to file (safe to re-run with --force)
-npx skill-optimizer import-commands --from ./src/cli.ts --out ./commands.json --force
-
-# Scrape a binary, limit depth to 3 levels
-npx skill-optimizer import-commands --from my-app --scrape --depth 3
-```
-
-### `run` — execute the benchmark
-
-Filterable via:
-- `--tier <name>` — only run models whose tier matches. Valid values: `flagship`, `mid`, `budget`. **The flag is `--tier`, not `--model-tier`.**
-- `--model <id>` — run a single specific model.
-- `--dry-run` — generate prompts and tasks without making LLM calls.
-
-```
-npx skill-optimizer run --config ./skill-optimizer.json --tier flagship
-npx skill-optimizer run --config ./skill-optimizer.json --tier mid
-```
-
-## Key Concepts
-
-**Surfaces** — The callable interface of your project: SDK methods, CLI commands, MCP tools, or prompt templates. Skill-optimizer discovers these via tree-sitter code analysis, manifest files, or markdown parsing.
-
-**Static evaluation** — Benchmark evaluation never executes generated code. Actions are extracted from model responses via pattern matching and compared structurally against expected calls. This makes benchmarks safe and repeatable.
-
-**Verdict gates** — Two thresholds must both pass for a benchmark to receive a PASS verdict: `perModelFloor` (each model individually meets a minimum score) and `targetWeightedAverage` (the weighted mean across all models meets a target). A single model below the floor fails the entire run.
-
-**Safety boundary** — The optimizer never modifies your original SKILL.md. It creates versioned copies in `.skill-optimizer/skill-v{N}.md` and only accepts mutations that improve scores without dropping any model below the floor. It does not modify tracked source files, but the generated artifacts appear under `.skill-optimizer/` — add that directory to your `.gitignore`.
-
-**LLM routing** — By default (`format: "pi"`), all benchmark calls route through [OpenRouter](https://openrouter.ai) and need `OPENROUTER_API_KEY`. You can also call providers directly: `format: "anthropic"` uses the Anthropic API directly (`ANTHROPIC_API_KEY`), and `format: "openai"` uses the OpenAI API directly (`OPENAI_API_KEY`), with optional Codex browser-login auth via `authMode: "codex"`. The model ID prefix must match the format — see `references/config.md` for the full mapping.
diff --git a/SKILL/references/benchmark.md b/SKILL/references/benchmark.md
deleted file mode 100644
index c7dfd20..0000000
--- a/SKILL/references/benchmark.md
+++ /dev/null
@@ -1,152 +0,0 @@
-# Running & Interpreting Benchmarks
-
-This guide covers running benchmarks, reading results, diagnosing failures, and comparing runs.
-
-## 1. Pre-flight Check
-
-Before running a benchmark, verify:
-
-```bash
-# Config is valid
-npx skill-optimizer doctor --config <config-path>
-
-# API key is set
-echo $OPENROUTER_API_KEY  # should print sk-or-...
-
-# Git is clean (if requireCleanGit is true, which is the default)
-git status  # should show "nothing to commit, working tree clean"
-```
-
-## 2. Dry Run First
-
-Always start with a dry run to check scope and estimate cost:
-
-```bash
-npx skill-optimizer run --dry-run --config <config-path>
-```
-
-This shows:
-- How many actions were discovered
-- How many are in scope after filtering
-- How many tasks would be generated
-- Which models would be called
-
-No LLM calls are made. Use this to verify your scope and estimate cost (N models x M tasks = total calls).
-
-## 3. Run the Benchmark
-
-```bash
-npx skill-optimizer run --config <config-path>
-```
-
-Optional run flags:
-
-| Flag | Effect | Note |
-|------|--------|------|
-| `--tier <name>` | Only run models whose tier matches. | Valid values: `flagship`, `mid`, `budget`. Flag is `--tier`, not `--model-tier`. |
-| `--model <id>` | Run a single specific model. | Pass the full model ID. |
-| `--task <id>` | Run a single task by ID. | Stable IDs from `tasks.generated.json`. |
-| `--no-cache` | Force fresh skill fetch. | |
-| `--dry-run` | Preview scope without making LLM calls. | |
-
-```bash
-# Run only flagship models
-npx skill-optimizer run --config <config-path> --tier flagship
-
-# Debug a single task
-npx skill-optimizer run --config <config-path> --task <task-id>
-```
-
-What happens at each stage:
-
-1. **Discover** — find callable actions via tree-sitter or manifest
-2. **Scope** — apply `include`/`exclude` filters
-3. **Generate tasks** — create one prompt per in-scope action (coverage-guaranteed: every action gets at least one task)
-4. **Call models** — each configured model attempts each task
-5. **Extract** — pull action calls from model responses via pattern matching
-6. **Evaluate** — compare extracted actions against expected actions
-7. **Verdict** — PASS or FAIL based on two gates
-
-## 4. Reading the Output
-
-The benchmark produces:
-
-- **Per-model score table** — each model's pass rate as a fraction (e.g., `Claude Sonnet: 18/20 (0.90)`)
-- **Weighted average** — computed from individual scores and model weights
-- **Verdict** — `PASS` (both gates satisfied) or `FAIL` (at least one gate missed)
-- **Exit code** — `0` for PASS, `1` for FAIL
-
-## 5. Verdict Gates
-
-Two gates must **both** pass for a PASS verdict:
-
-**`perModelFloor`** (default: `0.6`)
-Every model must individually score at or above this threshold. If any single model scores below, the entire benchmark fails — regardless of how well other models did. This prevents one weak model from hiding behind a strong average.
-
-**`targetWeightedAverage`** (default: `0.7`)
-The weighted mean across all models must reach this threshold. Models with higher `weight` values count more. This ensures overall quality, not just per-model minimums.
-
-**Model `weight`** (default: `1.0`)
-Controls how much each model influences the weighted average. Set flagship models to `2.0` and budget models to `0.5` if you care more about flagship performance.
-
-## 6. Diagnosing Failures
-
-When a benchmark fails, look at the per-task breakdown to identify patterns:
-
-**Hallucinated actions** — the model calls functions that don't exist in your API.
-- *Cause:* SKILL.md describes features ambiguously or mentions non-existent methods
-- *Fix:* Tighten your docs. Remove references to deprecated methods. Be explicit about what exists.
-
-**Missing arguments** — the model calls the right action but with wrong or missing arguments.
-- *Cause:* Documentation doesn't clearly specify required parameters or their types
-- *Fix:* Add explicit parameter sections with types, defaults, and examples
-
-**Wrong tool selection** — the model calls a related but incorrect action (e.g., `deleteTask` instead of `removeTask`).
-- *Cause:* Action names are ambiguous or the docs don't distinguish between similar actions
-- *Fix:* Add disambiguation notes or rename actions to be more distinct
-
-**One model fails, others pass** — a specific model consistently underperforms.
-- *Cause:* That model may need more explicit guidance or has known weaknesses with your API style
-- *Fix:* Consider adjusting its `weight`, adding model-specific notes to your docs, or accepting the floor as-is
-
-## 7. Comparing Runs
-
-After making changes to your SKILL.md, compare before and after:
-
-```bash
-npx skill-optimizer compare --baseline report-before.json --current report-after.json
-```
-
-This shows:
-- Per-model score deltas (e.g. `Claude Sonnet: 0.75 → 0.90 (+0.15)`)
-- Per-task deltas — which tasks improved, which regressed
-- Overall weighted average change
-
-**Finding the report files:** The benchmark writes its report JSON to the `output.dir` configured in your `skill-optimizer.json` (default: `benchmark-results/`). Each run creates a timestamped file there.
-
-## 8. Cost Awareness
-
-Each benchmark run makes `N models x M tasks` LLM calls. To minimize cost while iterating:
-
-- **Start narrow** — use `scope.include` to benchmark only your most important actions first
-- **Few models first** — start with 2-3 models, expand after the skill stabilizes
-- **Dry run** — always check scope size with `--dry-run` before committing to a full run
-- **Iterate on docs first** — fix obvious SKILL.md gaps before re-running. Each run costs real money.
-
-## 9. CI Integration
-
-The exit code (`0` = PASS, `1` = FAIL) makes skill-optimizer suitable for CI pipelines:
-
-```bash
-# In a CI script or Makefile
-npx skill-optimizer run --config <config-path>
-# Exits 0 on PASS, 1 on FAIL — use as a gate step
-```
-
-This lets you catch regressions in documentation quality as part of your CI workflow.
-
-## Next Steps
-
-If the benchmark fails and the issues are scattered (not one obvious fix), read `references/optimize.md` to run the automatic optimization loop.
-
-If you need to adjust config (models, scope, thresholds), read `references/config.md`.
diff --git a/SKILL/references/config.md b/SKILL/references/config.md
deleted file mode 100644
index 8a40364..0000000
--- a/SKILL/references/config.md
+++ /dev/null
@@ -1,193 +0,0 @@
-# Configuration Reference
-
-Complete reference for `skill-optimizer.json`. For auto-generated schema docs, see `docs/reference/config-schema.md` in the skill-optimizer repo.
-
-## Minimal Working Configs
-
-### CLI surface
-
-```json
-{
-  "name": "my-cli-tool",
-  "target": {
-    "surface": "cli",
-    "repoPath": "/path/to/my-project",
-    "skill": "./SKILL.md",
-    "discovery": {
-      "mode": "auto",
-      "sources": ["src/cli.ts"]
-    }
-  },
-  "benchmark": {
-    "format": "pi",
-    "models": [
-      { "id": "openrouter/anthropic/claude-sonnet-4.6", "name": "Claude Sonnet", "tier": "flagship" }
-    ]
-  }
-}
-```
-
-### SDK surface
-
-```json
-{
-  "name": "my-sdk",
-  "target": {
-    "surface": "sdk",
-    "repoPath": "/path/to/my-sdk",
-    "skill": "./SKILL.md",
-    "discovery": {
-      "mode": "auto",
-      "sources": ["src/index.ts"],
-      "language": "typescript"
-    }
-  },
-  "benchmark": {
-    "format": "pi",
-    "models": [
-      { "id": "openrouter/anthropic/claude-sonnet-4.6", "name": "Claude Sonnet", "tier": "flagship" }
-    ]
-  }
-}
-```
-
-### MCP surface
-
-```json
-{
-  "name": "my-mcp-server",
-  "target": {
-    "surface": "mcp",
-    "repoPath": "/path/to/my-mcp-server",
-    "skill": "./SKILL.md",
-    "discovery": {
-      "mode": "auto",
-      "sources": ["src/server.ts"]
-    }
-  },
-  "benchmark": {
-    "format": "pi",
-    "models": [
-      { "id": "openrouter/anthropic/claude-sonnet-4.6", "name": "Claude Sonnet", "tier": "flagship" }
-    ]
-  }
-}
-```
-
-### Prompt surface
-
-```json
-{
-  "name": "my-skill-doc",
-  "target": {
-    "surface": "prompt",
-    "repoPath": "/path/to/my-project",
-    "skill": "./SKILL.md"
-  },
-  "benchmark": {
-    "format": "pi",
-    "models": [
-      { "id": "openrouter/anthropic/claude-sonnet-4.6", "name": "Claude Sonnet", "tier": "flagship" }
-    ]
-  }
-}
-```
-
-## Field-by-Field Reference
-
-### `target` — What You're Benchmarking
-
-| Field | Required | Default | Description |
-|-------|----------|---------|-------------|
-| `surface` | Yes | — | `"cli"`, `"sdk"`, `"mcp"`, or `"prompt"` |
-| `repoPath` | Yes | — | Absolute or config-relative path to your project root |
-| `skill` | Yes | — | Path to your SKILL.md or guidance doc, relative to `repoPath` |
-| `discovery.mode` | No | `"auto"` | `"auto"` (tree-sitter) or `"manifest"` (hand-written JSON) |
-| `discovery.sources` | No | `[]` | Source files for tree-sitter to parse, relative to `repoPath` |
-| `discovery.language` | No | — | SDK only: `"typescript"`, `"python"`, or `"rust"` |
-| `scope.include` | No | `["*"]` | Glob patterns for actions to include |
-| `scope.exclude` | No | `[]` | Glob patterns for actions to exclude |
-
-### `benchmark` — How to Test
-
-| Field | Required | Default | Description |
-|-------|----------|---------|-------------|
-| `format` | No | `"pi"` | `"pi"` — route through OpenRouter (default); `"openai"` — call OpenAI API directly; `"anthropic"` — call Anthropic API directly |
-| `authMode` | No | `"env"` | `"env"` — read key from env var (default); `"codex"` — read from `~/.codex/auth.json` (OpenAI only); `"auto"` — try env first, fall back to codex for OpenAI |
-| `apiKeyEnv` | No | provider default | Env var holding the API key. Defaults: `OPENROUTER_API_KEY` for `pi`, `OPENAI_API_KEY` for `openai`, `ANTHROPIC_API_KEY` for `anthropic` |
-| `baseUrl` | No | — | Override the API base URL (e.g. for a custom OpenAI-compatible endpoint) |
-| `models[].id` | Yes | — | Model ID with provider prefix: `openrouter/<p>/<model>` (OpenRouter — dots in version segments, e.g. `openrouter/anthropic/claude-sonnet-4.6`), `anthropic/<model>` (direct Anthropic — hyphens, e.g. `anthropic/claude-sonnet-4-6`), `openai/<model>` (direct OpenAI — dots, e.g. `openai/gpt-5.4`). |
-| `models[].name` | No | — | Human-readable label for output tables |
-| `models[].tier` | No | — | `"flagship"`, `"mid"`, or `"low"` (informational only) |
-| `models[].weight` | No | `1.0` | Influence on weighted average (higher = counts more) |
-| `verdict.perModelFloor` | No | `0.6` | Minimum score each model must reach individually |
-| `verdict.targetWeightedAverage` | No | `0.7` | Minimum weighted average across all models |
-| `taskGeneration.enabled` | No | `true` | Whether to auto-generate tasks |
-| `taskGeneration.maxTasks` | No | `20` | Upper bound on tasks (must be >= in-scope action count) |
-| `taskGeneration.outputDir` | No | `".skill-optimizer"` | Where to write task artifacts |
-
-### `optimize` — How to Improve
-
-| Field | Required | Default | Description |
-|-------|----------|---------|-------------|
-| `enabled` | No | `true` | Whether optimization is allowed |
-| `mode` | No | `"stable-surface"` | `"stable-surface"` (reuse tasks) or `"surface-changing"` (regenerate per iteration) |
-| `model` | No | `"openrouter/anthropic/claude-sonnet-4.6"` | Which LLM writes mutations |
-| `maxIterations` | No | `5` | Maximum optimization rounds |
-| `minImprovement` | No | `0.02` | Minimum delta in weighted average required to accept a mutation |
-| `allowedPaths` | No | `["SKILL.md"]` | Files the mutation agent may edit |
-| `requireCleanGit` | No | `true` | Block optimizer if target repo has uncommitted changes |
-
-## Model ID and Auth Quick Guide
-
-The `benchmark.format` field controls which API receives requests. Pick the combination that matches your setup:
-
-| Format | Model ID prefix | Required env var | Notes |
-|--------|----------------|------------------|-------|
-| `"pi"` (default) | `openrouter/<provider>/<model>` | `OPENROUTER_API_KEY` | All providers via OpenRouter |
-| `"openai"` | `openai/<model>` | `OPENAI_API_KEY` | Direct OpenAI API |
-| `"openai"` + `authMode: "codex"` | `openai/<model>` | `~/.codex/auth.json` | Codex browser-login auth |
-| `"anthropic"` | `anthropic/<model>` | `ANTHROPIC_API_KEY` | Direct Anthropic API |
-
-**Codex auth** (`authMode: "codex"`) reads credentials from `~/.codex/auth.json` in this priority order:
-1. `tokens.access_token` — browser-login JWT (checked for expiry)
-2. `tokens.OPENAI_API_KEY` — static key nested under tokens
-3. `OPENAI_API_KEY` — root-level static key
-
-Codex auth only works with the OpenAI provider. For `openrouter/` or `anthropic/` models, use `authMode: "env"`.
-
-**`authMode: "auto"`** tries the env var first; for OpenAI models, falls back to `~/.codex/auth.json` if the env var is unset.
-
-## Model Configuration Tips
-
-- Browse available models at [openrouter.ai/models](https://openrouter.ai/models)
-- **Recommended starter set:** one flagship (Claude Sonnet or GPT-4o) + one budget model (Gemini Flash or Haiku) to test both capability ends
-- **Weighting strategy:** set flagship models to `weight: 2.0` and budget to `weight: 0.5` if flagship performance matters most to you
-- `tier` is informational only — it appears in output tables but doesn't affect scoring
-
-## Scope Patterns
-
-The `*` wildcard matches any sequence of characters, including dots and slashes. It is not limited to a single path segment like filesystem globs.
-
-| Pattern | Matches |
-|---------|---------|
-| `"Wallet.*"` | All Wallet methods (`Wallet.create`, `Wallet.balance`, etc.) |
-| `"*.internal*"` | Anything with "internal" in the name |
-| `"get_*"` | Only getter actions |
-| `["create_*", "update_*", "delete_*"]` | Only mutation actions |
-
-Task generation is **coverage-guaranteed**: every in-scope action gets at least one task. If coverage fails after retries, an error names the uncovered actions and suggests either fixing SKILL.md guidance or excluding them.
-
-## Common Error Codes
-
-| Code | Meaning | Fix |
-|------|---------|-----|
-| `E_MISSING_SKILL` | `target.skill` file not found | Create the file or fix the path in config |
-| `E_INVALID_SURFACE` | `target.surface` is not cli/sdk/mcp/prompt | Use one of the four valid values |
-| `E_DIRTY_GIT` | Uncommitted changes in target repo | Commit or stash, or set `requireCleanGit: false` |
-| `E_EMPTY_SCOPE` | Scope filters matched no actions | Check your `include`/`exclude` patterns |
-| `E_MISSING_API_KEY` | API key env var not set | `export OPENROUTER_API_KEY=sk-or-...` (or `OPENAI_API_KEY` / `ANTHROPIC_API_KEY` for direct formats) |
-
-Full error reference with detailed descriptions: `docs/reference/errors.md`
-
-Full config schema reference (auto-generated from Zod): `docs/reference/config-schema.md`
diff --git a/SKILL/references/optimize.md b/SKILL/references/optimize.md
deleted file mode 100644
index 12996e8..0000000
--- a/SKILL/references/optimize.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# Optimization Loop
-
-This guide covers when and how to use the automatic optimizer, how to interpret its results, and what to do when it doesn't converge.
-
-## 1. When to Optimize vs. Fix Manually
-
-**Fix manually** when the benchmark reveals a clear, localized problem — a missing section, a wrong example, an outdated method name. Manual fixes are faster and more precise for known issues.
-
-**Run the optimizer** when failures are scattered across multiple models and tasks with no obvious single fix. The optimizer systematically tries mutations to your SKILL.md and keeps only changes that improve scores.
-
-A good workflow: run a benchmark, fix the obvious stuff by hand, re-benchmark, then let the optimizer handle whatever's left.
-
-## 2. How the Loop Works
-
-1. **Baseline benchmark** — establish starting scores for all models
-2. **Copy** — your SKILL.md is copied to `.skill-optimizer/skill-v0.md` (original is never touched)
-3. **Failure analysis** — identify patterns in what models get wrong
-4. **Mutation** — a mutation agent (powered by `optimize.model`, defaults to Claude Opus via OpenRouter) proposes edits to the versioned copy
-5. **Re-benchmark** — run all models against all tasks using the mutated skill
-6. **Accept or reject** — the mutation is accepted only if:
-   - The weighted average improves by at least `minImprovement`
-   - No model that was above the floor drops below it
-7. **Rollback** if rejected — revert to the previous version
-8. **Repeat** up to `maxIterations` times
-9. **Progress table** — final output shows Baseline -> each iteration -> Final -> delta per model
-
-## 3. Safety Guarantees
-
-The optimizer is designed to be safe to run:
-
-- **Your original SKILL.md is never modified.** All edits happen on versioned copies in `.skill-optimizer/skill-v0.md`, `skill-v1.md`, etc.
-- **`requireCleanGit`** is enforced by default — the optimizer won't run if your target repo has uncommitted changes
-- **`allowedPaths`** constrains which files the mutation agent can edit (defaults to just the skill file)
-- **Stabilization window** prevents oscillation — if the same mutation keeps getting accepted and rejected, the optimizer exits early
-
-## 4. Running the Optimizer
-
-```bash
-npx skill-optimizer optimize --config <config-path>
-```
-
-Output during the run:
-- Current iteration number and total
-- Per-model scores after each mutation attempt
-- Accept/reject decision with reasoning
-- Running progress table
-
-The optimizer can take several minutes per iteration (it runs a full benchmark each time).
-
-## 5. Key Config Knobs
-
-| Setting | Default | What it controls |
-|---------|---------|------------------|
-| `optimize.maxIterations` | `5` | Upper bound on optimization rounds |
-| `optimize.mode` | `"stable-surface"` | `"stable-surface"`: reuse tasks across iterations (faster, apples-to-apples). `"surface-changing"`: regenerate tasks each iteration (if skill changes might affect task phrasing) |
-| `optimize.model` | `"openrouter/anthropic/claude-sonnet-4.6"` | Which LLM writes mutations |
-| `optimize.enabled` | `true` | Set to `false` to skip optimization (useful in CI) |
-| `optimize.requireCleanGit` | `true` | Block optimizer if target repo has uncommitted changes |
-
-## 6. Interpreting Results
-
-**Progress table** — rows are models, columns are iterations. Shows the score trajectory for each model across the optimization run.
-
-**Accepted iteration** — the mutation improved scores without violating either gate. The versioned copy advances to `skill-v{N+1}.md`.
-
-**Rejected iteration** — the mutation either didn't improve the weighted average enough, or it caused a model to drop below the floor. The previous version is kept and the optimizer tries a different mutation.
-
-**Early exit** — if scores plateau for consecutive iterations, the optimizer may stop before reaching `maxIterations`. This is normal and means further mutations aren't producing meaningful improvements.
-
-## 7. After Optimization
-
-The best version is the highest-numbered `skill-v{N}.md` in `.skill-optimizer/`. To apply it:
-
-```bash
-# 1. See what changed
-diff SKILL.md .skill-optimizer/skill-v3.md   # adjust N to your highest version
-
-# 2. Review the diff — the optimizer is a tool, not an oracle
-#    Look for: overly specific examples, removed important context, awkward phrasing
-
-# 3. Copy it back
-cp .skill-optimizer/skill-v3.md SKILL.md
-
-# 4. Commit
-git add SKILL.md
-git commit -m "docs: apply skill-optimizer improvements (v3)"
-```
-
-## 8. When It Doesn't Converge
-
-If the optimizer oscillates or plateaus without reaching your target scores:
-
-**Narrow the scope** — exclude actions that are inherently ambiguous or rarely used. A smaller, cleaner scope gives the optimizer more room to improve what matters.
-
-**Improve discovery** — make sure `discovery.sources` points at the right files. If the surface is incomplete (missing actions), the optimizer is working with bad data.
-
-**Manual intervention** — read the failure analysis output from the last iteration. It often reveals patterns that a targeted manual edit can fix more effectively than automated mutation.
-
-**Adjust gates** — if `perModelFloor` or `targetWeightedAverage` are set very high, lower them to something achievable first. Optimize to hit that floor, then ratchet up gradually.
-
-**Try different models** — change `optimize.model` to a different LLM. Different models have different strengths in rewriting documentation.
diff --git a/SKILL/references/setup.md b/SKILL/references/setup.md
deleted file mode 100644
index 159b80c..0000000
--- a/SKILL/references/setup.md
+++ /dev/null
@@ -1,184 +0,0 @@
-# Setup & Init
-
-This guide walks through setting up skill-optimizer for your project, from prerequisites to a verified configuration.
-
-## 1. Prerequisites
-
-Before starting, verify these three requirements:
-
-**Node.js 20+:**
-```bash
-node --version
-# Expected: v20.x.x or higher
-```
-
-**API key** (which one depends on your `benchmark.format`):
-```bash
-# Default — OpenRouter (format: "pi"):
-export OPENROUTER_API_KEY=sk-or-your-key-here
-
-# Direct OpenAI (format: "openai"):
-export OPENAI_API_KEY=sk-your-key-here
-
-# Direct Anthropic (format: "anthropic"):
-export ANTHROPIC_API_KEY=sk-ant-your-key-here
-```
-If you're just getting started, use OpenRouter — one key covers all providers.
-
-**skill-optimizer available:**
-```bash
-npx skill-optimizer --help
-# Expected: Usage information
-# If not installed globally, install from the repo:
-#   cd /path/to/skill-optimizer && npm install && npm run build && npm link
-```
-
-## 2. Determine Your Surface Type
-
-skill-optimizer supports four surface types. Pick the one that matches your project:
-
-| Surface | Your project exposes... | Examples |
-|---------|------------------------|----------|
-| `cli` | CLI commands or a binary | Yargs, Commander, @optique/core, argparse, Click, Clap |
-| `sdk` | Library methods users call in code | TypeScript/Python/Rust SDKs |
-| `mcp` | MCP tool handlers | MCP servers with `server.tool()` definitions |
-| `prompt` | Prompt templates or agent skill docs | SKILL.md files, Claude Code skills, agent instructions |
-
-If unsure: does your user run commands in a terminal (`cli`), import your package and call functions (`sdk`), connect an AI agent to your tool server (`mcp`), or follow a prompt template / skill document (`prompt`)?
-
-## 3. Run the Init Wizard
-
-From your project root:
-
-```bash
-npx skill-optimizer init <surface>
-# Example: npx skill-optimizer init cli
-```
-
-The wizard prompts for:
-
-- **Repo path** — absolute path to your project root (defaults to CWD)
-- **Models** — model IDs to benchmark against (e.g., `openrouter/anthropic/claude-sonnet-4.6`)
-- **SKILL.md location** — path to your existing documentation or guidance file
-- **Discovery sources** — source files for tree-sitter to parse (e.g., `src/cli.ts`, `src/index.ts`)
-- **Max tasks** — upper bound on generated benchmark tasks (default: 20)
-
-**Non-interactive mode** (for CI or scripting):
-
-The `--auto` and `--yes` flags are independent and serve different purposes:
-
-| Flag | Effect |
-|------|--------|
-| `--yes` | Accept all defaults without prompting. Still requires a surface name unless combined with `--auto`. |
-| `--auto` | Auto-detect the surface type from the current directory. Still opens the interactive wizard (pre-filled) unless combined with `--yes`. |
-| `--auto --yes` | **Fully non-interactive**: detect surface + accept all defaults. Use this for automated pipelines where the surface type isn't known in advance. |
-
-```bash
-# Explicit surface, no prompts
-npx skill-optimizer init cli --yes
-
-# Auto-detect surface + no prompts (fully automated, zero interaction)
-npx skill-optimizer init --auto --yes
-
-# Load answers from a file
-npx skill-optimizer init --answers answers.json
-```
-
-`answers.json` format:
-```json
-{
-  "surface": "cli",
-  "repoPath": "/absolute/path/to/your-repo",
-  "models": ["openrouter/anthropic/claude-sonnet-4.6", "openrouter/openai/gpt-4o-mini"],
-  "maxTasks": 20,
-  "maxIterations": 5,
-  "entryFile": "src/cli.ts"
-}
-```
-
-## 4. Surface Discovery
-
-After init, skill-optimizer needs to know what actions your project exposes. There are two discovery modes:
-
-**Code-first (auto)** — tree-sitter parses your source files automatically. This works for:
-- TypeScript: Yargs, Commander, @optique/core CLI frameworks
-- TypeScript/Python/Rust: SDK method extraction
-- TypeScript: MCP `server.tool()` definitions
-
-If auto-discovery finds your actions, you're done. Check with:
-```bash
-npx skill-optimizer run --dry-run --config <config-path>
-# Look for "Discovered N actions" in the output
-```
-
-**Manual / import** — if auto-discovery yields nothing or misses actions:
-
-```bash
-# Extract from TypeScript source, write to file
-npx skill-optimizer import-commands --from ./src/cli.ts --out ./.skill-optimizer/cli-commands.json
-
-# Overwrite an existing output file (required when the file already exists)
-npx skill-optimizer import-commands --from ./src/cli.ts --out ./.skill-optimizer/cli-commands.json --force
-
-# Extract from a compiled binary's help text, limit subcommand depth
-npx skill-optimizer import-commands --from my-cli --scrape --depth 3
-```
-
-Key `import-commands` flags:
-
-| Flag | Meaning |
-|------|---------|
-| `--from <path>` | Source file or binary name (required) |
-| `--out <path>` | Write output to this file. Without `--out`, output goes to stdout. Do not use `>` shell redirection — it produces malformed output. |
-| `--force` | Overwrite `--out` file if it already exists. Required on re-runs. |
-| `--scrape` | Invoke as a binary and parse `--help` output instead of reading source |
-| `--depth <n>` | Max subcommand depth during scrape. Flag is `--depth`, not `--max-depth`. |
-
-This populates `.skill-optimizer/cli-commands.json` (CLI) or `.skill-optimizer/tools.json` (MCP). You can also edit these manifest files by hand.
-
-## 5. Verify with Doctor
-
-Run the config diagnostics to catch problems early:
-
-```bash
-npx skill-optimizer doctor --config <config-path>
-```
-
-If issues are found, auto-fix what's fixable:
-
-```bash
-npx skill-optimizer doctor --fix --config <config-path>
-```
-
-Two optional flags activate additional checks that are off by default:
-
-| Flag | Effect | Note |
-|------|--------|------|
-| `--static` | Skip live code discovery (tree-sitter). Validates config and manifests only. | Flag is `--static`, not `--no-discovery`. |
-| `--check-models` | Ping each configured model to verify API credentials and routing. | Flag is `--check-models`, not `--ping` or `--verify-models`. |
-
-```bash
-# Validate config without running discovery (fast, works without project source)
-npx skill-optimizer doctor --config <config-path> --static
-
-# Verify model API keys are working
-npx skill-optimizer doctor --config <config-path> --check-models
-```
-
-## 6. What You Should Have Now
-
-After successful setup:
-
-- **`skill-optimizer.json`** — main config file (commit this); when created by `init`, the default location is `./.skill-optimizer/skill-optimizer.json`
-- **`.skill-optimizer/`** — working directory for task artifacts, surface manifests, and versioned skill copies (gitignored)
-
-Your project is ready for benchmarking. Read `references/benchmark.md` for next steps.
-
-## 7. Common Pitfalls
-
-| Problem | Cause | Fix |
-|---------|-------|-----|
-| "Config not found" | Wrong path to `skill-optimizer.json` | Use `--config` with the full path |
-| "No actions discovered" | `discovery.sources` points at wrong files | Check paths are relative to `repoPath` |
-| "Skill file not found" | `target.skill` path is wrong | Path is relative to `repoPath` — verify it exists |
-| "repoPath not found" | Relative path resolved wrong | Use absolute path, or make it relative to config file location |
diff --git a/docker/workbench-runner.Dockerfile b/docker/workbench-runner.Dockerfile
new file mode 100644
index 0000000..73591f7
--- /dev/null
+++ b/docker/workbench-runner.Dockerfile
@@ -0,0 +1,42 @@
+FROM node:22-bookworm
+# Workbench runner image. PATH prefers the app's npm binaries and /work/.venv/bin (presumably a Python venv created at run time - confirm); PIP_REQUIRE_VIRTUALENV=1 makes pip refuse to run outside a virtualenv.
+ENV PATH="/app/node_modules/.bin:/work/.venv/bin:${PATH}" \
+    PIP_REQUIRE_VIRTUALENV=1
+# The build steps below run from /app.
+WORKDIR /app
+# Install shell, Python, search, and archive tooling without recommended extras, then remove the apt package lists from the layer.
+RUN apt-get update \
+  && apt-get install -y --no-install-recommends \
+    bash \
+    ca-certificates \
+    coreutils \
+    curl \
+    file \
+    findutils \
+    gawk \
+    git \
+    grep \
+    jq \
+    less \
+    python-is-python3 \
+    python3 \
+    python3-pip \
+    python3-venv \
+    ripgrep \
+    sed \
+    unzip \
+    wget \
+    zip \
+  && rm -rf /var/lib/apt/lists/*
+# Copy the package manifests, TypeScript config, and the src/scripts/docs trees into /app for the build.
+COPY package.json package-lock.json tsconfig.json ./
+COPY src ./src
+COPY scripts ./scripts
+COPY docs ./docs
+# Install lockfile-pinned dependencies and build as root, then create the unprivileged `agent` user (UID 10001) that the container runs as.
+RUN npm ci \
+  && npm run build \
+  && useradd -m -u 10001 agent
+USER agent
+# Container entry point: the built runner script (NOTE(review): presumably emitted to /app/dist by `npm run build` - confirm).
+ENTRYPOINT ["node", "/app/dist/workbench/container-runner.js"]
diff --git a/docs/README.codex.md b/docs/README.codex.md
new file mode 100644
index 0000000..6401f37
--- /dev/null
+++ b/docs/README.codex.md
@@ -0,0 +1,31 @@
+# Codex Install
+
+`skill-optimizer` can be used in Codex as either a plugin or a plain Agent Skill.
+
+## Plugin Install
+
+Register this repository as a plugin marketplace:
+
+```bash
+codex plugin marketplace add fastxyz/skill-optimizer
+```
+
+Open `/plugins`, select the `skill-optimizer` marketplace, and install the `skill-optimizer` plugin.
+
+Codex reads the repo marketplace from `.agents/plugins/marketplace.json`. That marketplace points at the repository root, where the plugin manifest lives at `.codex-plugin/plugin.json`; bundled skills are read from `skills/`.
+
+To pin a Git ref while installing the marketplace:
+
+```bash
+codex plugin marketplace add fastxyz/skill-optimizer --ref main
+```
+
+## Skill-Only Install
+
+Install only the skill files (no plugin) with the open `skills` CLI:
+
+```bash
+npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a codex -y
+```
+
+Restart Codex if the skill does not appear immediately. The canonical skill path is `skills/skill-optimizer/SKILL.md`.
diff --git a/docs/README.opencode.md b/docs/README.opencode.md
new file mode 100644
index 0000000..93ff893
--- /dev/null
+++ b/docs/README.opencode.md
@@ -0,0 +1,35 @@
+# skill-optimizer for OpenCode
+
+Use `skill-optimizer` in OpenCode through the bundled OpenCode plugin.
+
+## Installation
+
+Add the plugin to `opencode.json` at user or project scope:
+
+```json
+{
+  "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git"]
+}
+```
+
+Restart OpenCode. The plugin registers this repository's `skills/` directory so OpenCode can discover `skill-optimizer` without symlinks.
+
+## Verify
+
+Use OpenCode's native `skill` tool to list skills or load `skill-optimizer`.
+
+## Updating
+
+OpenCode reinstalls git plugins when it starts. To pin a tag or commit, append `#<ref>` to the plugin's git URL:
+
+```json
+{
+  "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git#v2.0.0"]
+}
+```
+
+## How It Works
+
+The plugin exposes `.opencode/plugins/skill-optimizer.js` and adds the repository `skills/` directory to `config.skills.paths`.
+
+The canonical skill is `skills/skill-optimizer/SKILL.md`.
diff --git a/docs/reference/config-schema.md b/docs/reference/config-schema.md
deleted file mode 100644
index 328a086..0000000
--- a/docs/reference/config-schema.md
+++ /dev/null
@@ -1,46 +0,0 @@
-<!-- AUTO-GENERATED — do not edit. Run `npm run gen-docs` to regenerate. -->
-
-
-# Config Schema Reference
-
-All configuration lives in a single `skill-optimizer.json` file.
-Paths in the config are relative to the config file location.
-
-| Field | Type | Default | Description |
-|---|---|---|---|
-| `name` | `string` | — | Human-readable project name |
-| `target.surface` | `"sdk" | "cli" | "mcp" | "prompt"` | — | Type of callable surface |
-| `target.repoPath` | `string` | — | Path to the target repo (default ".") |
-| `target.skill` | `string | object` | — | Path to SKILL.md or { source, cache } object |
-| `target.discovery.mode` | `"auto" | "manifest"` | — | "auto" = code-first tree-sitter; "manifest" = use provided file only |
-| `target.discovery.sources` | `string[]` | — | Source files to scan for callable methods/commands/tools |
-| `target.discovery.fallbackManifest` | `string` | — | Path to manifest JSON when code-first discovery is incomplete |
-| `target.discovery.language` | `"typescript" | "python" | "rust"` | — | Language for code-first discovery |
-| `target.sdk.language` | `"typescript" | "python" | "rust"` | — | SDK language |
-| `target.sdk.entrypoints` | `string[]` | — | SDK entry files for discovery |
-| `target.cli.commands` | `string` | — | Path to CLI commands manifest JSON (CliCommandDefinition[]) |
-| `target.mcp.tools` | `string` | — | Path to MCP tools manifest JSON (OpenAI function tool definitions) |
-| `target.scope.include` | `string[]` | — | Glob patterns for actions to include (default ["*"]) |
-| `target.scope.exclude` | `string[]` | — | Glob patterns for actions to exclude (default []) |
-| `benchmark.format` | `"pi" | "openai" | "anthropic"` | — | LLM transport format: "pi" routes through OpenRouter/Pi (use openrouter/* or openai/* model refs); "openai" calls the OpenAI API directly (supports Codex auth); "anthropic" calls the Anthropic API directly |
-| `benchmark.authMode` | `"env" | "codex" | "auto"` | — | How to resolve credentials: env var, ~/.codex/auth.json browser-login tokens, or env-then-codex fallback |
-| `benchmark.apiKeyEnv` | `string` | — | Env var name for the API key (default is determined by the model provider prefix: openrouter/ → OPENROUTER_API_KEY, openai/ → OPENAI_API_KEY, anthropic/ → ANTHROPIC_API_KEY; leave unset to use the per-provider default) |
-| `benchmark.timeout` | `integer` | — | Milliseconds per model call (default 240000) |
-| `benchmark.models` | `object[]` | — | Models to benchmark — at least one required |
-| `benchmark.taskGeneration.enabled` | `boolean` | — | Whether to generate tasks automatically (default false) |
-| `benchmark.taskGeneration.maxTasks` | `integer` | — | Max tasks to generate — must be >= in-scope action count (default 10) |
-| `benchmark.taskGeneration.seed` | `integer` | — | RNG seed for reproducible generation (default 1) |
-| `benchmark.taskGeneration.outputDir` | `string` | — | Where to write generated task artifacts (default ".skill-optimizer") |
-| `benchmark.output.dir` | `string` | — | Directory where reports are saved (default "benchmark-results/") |
-| `benchmark.verdict.perModelFloor` | `number` | — | Minimum per-model pass fraction for PASS verdict (default 0.6) |
-| `benchmark.verdict.targetWeightedAverage` | `number` | — | Minimum weighted average across all models for PASS (default 0.7) |
-| `optimize.model` | `string` | — | Model for mutation, e.g. openrouter/anthropic/claude-sonnet-4.6 |
-| `optimize.authMode` | `"env" | "codex" | "auto"` | — | How to resolve optimizer credentials: env var, ~/.codex/auth.json browser-login tokens, or env-then-codex fallback |
-| `optimize.apiKeyEnv` | `string` | — | Env var for the optimizer API key |
-| `optimize.thinkingLevel` | `"off" | "minimal" | "low" | "medium" | "high" | "xhigh"` | — | Reasoning depth for mutation calls (default "medium") |
-| `optimize.allowedPaths` | `string[]` | — | Paths the optimizer may edit — safety boundary |
-| `optimize.validation` | `string[]` | — | Shell commands to run to validate each mutation |
-| `optimize.requireCleanGit` | `boolean` | — | Require clean git state before starting (default true) |
-| `optimize.maxIterations` | `integer` | — | Maximum optimization iterations (default 5) |
-| `optimize.minImprovement` | `number` | — | Minimum weighted-average gain per accepted iteration (default 0.02) |
-| `optimize.reportContextMaxBytes` | `integer` | — | Byte budget for mutation context (default 16000) |
diff --git a/docs/reference/errors.md b/docs/reference/errors.md
deleted file mode 100644
index 4388373..0000000
--- a/docs/reference/errors.md
+++ /dev/null
@@ -1,226 +0,0 @@
-<!-- AUTO-GENERATED — do not edit. Run `npm run gen-docs` to regenerate. -->
-
-
-# Error Reference
-
-Every `skill-optimizer` error has a code, a short message, and a fix list.
-The catch-all `E_UNEXPECTED` appears if an error slips past the known list.
-
-## Summary
-
-| Code | Description | Quick fix |
-|---|---|---|
-| `E_INVALID_SURFACE` | Invalid surface value | Set target.surface to one of: sdk, cli, mcp, prompt |
-| `E_MODELS_EMPTY` | benchmark.models is empty or missing | Add at least one model to benchmark.models, e.g.: |
-| `E_MODEL_ID_FORMAT` | Model ID is missing a provider prefix | Prefix all model IDs with a supported provider prefix: |
-| `E_VERDICT_OUT_OF_RANGE` | Verdict threshold is out of range | Set benchmark.verdict.perModelFloor and targetWeightedAverage to values between 0.0 and 1.0 |
-| `E_MAX_ITERATIONS_ZERO` | optimize.maxIterations must be a positive integer | Set optimize.maxIterations to a positive integer, e.g. 5 |
-| `E_INVALID_FORMAT` | Invalid benchmark.format value | Set benchmark.format to one of: pi, openai, anthropic |
-| `E_REPO_NOT_FOUND` | target.repoPath does not exist or is not a directory | Fix target.repoPath in your skill-optimizer.json to point at an existing directory |
-| `E_MISSING_SKILL` | target.skill file not found | Create a SKILL.md at the path specified in target.skill |
-| `E_SOURCES_NOT_FOUND` | One or more target.discovery.sources files do not exist | Check that all paths in target.discovery.sources exist in your repo |
-| `E_CLI_MANIFEST_NOT_FOUND` | target.cli.commands manifest file not found | Run: skill-optimizer import-commands --from <entry-file> to auto-extract |
-| `E_MCP_MANIFEST_NOT_FOUND` | target.mcp.tools manifest file not found | Create the tools.json file at the path specified in target.mcp.tools |
-| `E_ALLOWED_PATHS_ESCAPE` | optimize.allowedPaths contains a path outside target.repoPath | All paths in optimize.allowedPaths must be inside target.repoPath |
-| `E_OUTPUT_DIR_NOT_WRITABLE` | benchmark.output.dir is not writable | Check directory permissions for the path set in benchmark.output.dir |
-| `E_MISSING_API_KEY` | API key environment variable is not set | Export your OpenRouter API key before running: export OPENROUTER_API_KEY=sk-or-... |
-| `E_DISCOVERY_EMPTY` | Discovery found zero callable actions | Check that target.discovery.sources points at the right entry file |
-| `E_MAXTASKS_TOO_LOW` | benchmark.taskGeneration.maxTasks is less than the in-scope action count | Raise benchmark.taskGeneration.maxTasks to at least the number of in-scope actions |
-| `E_COVERAGE_EXHAUSTED` | Task generation could not cover all in-scope actions after 2 retry passes | Add guidance for the uncovered actions to your SKILL.md |
-| `E_DIRTY_GIT` | Target repo has uncommitted changes | Commit or stash changes in target.repoPath before running the optimizer |
-| `E_GIT_CHECKPOINT_FAILED` | Git checkpoint creation failed | Check disk space and git permissions in target.repoPath |
-| `E_VALIDATION_FAILED` | Configured validation command exited non-zero | Fix the issue flagged by the validation command before retrying |
-| `E_INIT_AUTO_LOW_CONFIDENCE` | init --auto --yes requires high confidence detection | Run init interactively to review and confirm detection: skill-optimizer init --auto |
-| `E_UNEXPECTED` | An unexpected error occurred | Check the full error message and stack trace above for details |
-
-## Details
-
-### `E_INVALID_SURFACE`
-
-**Invalid surface value**
-
-**How to fix:**
-- Set target.surface to one of: sdk, cli, mcp, prompt
-- sdk = TypeScript/Python/Rust library, cli = command-line tool, mcp = MCP server, prompt = prompt template / skill document
-
-### `E_MODELS_EMPTY`
-
-**benchmark.models is empty or missing**
-
-**How to fix:**
-- Add at least one model to benchmark.models, e.g.:
--   { "id": "openrouter/anthropic/claude-sonnet-4.6", "name": "Claude Sonnet", "tier": "flagship" }
-
-### `E_MODEL_ID_FORMAT`
-
-**Model ID is missing a provider prefix**
-
-**How to fix:**
-- Prefix all model IDs with a supported provider prefix:
--   openrouter/<provider>/<model>  — routed via OpenRouter (e.g. openrouter/anthropic/claude-sonnet-4.6)
--   anthropic/<model>              — direct Anthropic API (e.g. anthropic/claude-sonnet-4-6)
--   openai/<model>                 — direct OpenAI API (e.g. openai/gpt-4.1)
-- Browse OpenRouter models at https://openrouter.ai/models
-
-### `E_VERDICT_OUT_OF_RANGE`
-
-**Verdict threshold is out of range**
-
-**How to fix:**
-- Set benchmark.verdict.perModelFloor and targetWeightedAverage to values between 0.0 and 1.0
-- Typical values: perModelFloor=0.6, targetWeightedAverage=0.7
-
-### `E_MAX_ITERATIONS_ZERO`
-
-**optimize.maxIterations must be a positive integer**
-
-**How to fix:**
-- Set optimize.maxIterations to a positive integer, e.g. 5
-
-### `E_INVALID_FORMAT`
-
-**Invalid benchmark.format value**
-
-**How to fix:**
-- Set benchmark.format to one of: pi, openai, anthropic
-
-### `E_REPO_NOT_FOUND`
-
-**target.repoPath does not exist or is not a directory**
-
-**How to fix:**
-- Fix target.repoPath in your skill-optimizer.json to point at an existing directory
-- Paths in the config are relative to the config file location
-
-### `E_MISSING_SKILL`
-
-**target.skill file not found**
-
-**How to fix:**
-- Create a SKILL.md at the path specified in target.skill
-- Or update target.skill in your config to point at an existing file
-
-### `E_SOURCES_NOT_FOUND`
-
-**One or more target.discovery.sources files do not exist**
-
-**How to fix:**
-- Check that all paths in target.discovery.sources exist in your repo
-- Paths are relative to target.repoPath
-- For CLI: point at your main entry file (e.g. src/cli.ts)
-- For MCP: point at your server entry file (e.g. src/server.ts)
-
-### `E_CLI_MANIFEST_NOT_FOUND`
-
-**target.cli.commands manifest file not found**
-
-**How to fix:**
-- Run: skill-optimizer import-commands --from <entry-file> to auto-extract
-- Or create the file manually and populate it with your CLI commands
-- Format: Array of { command, description, options[] }
-
-### `E_MCP_MANIFEST_NOT_FOUND`
-
-**target.mcp.tools manifest file not found**
-
-**How to fix:**
-- Create the tools.json file at the path specified in target.mcp.tools
-- Format: Array of OpenAI function tool definitions { type: "function", function: { name, description, parameters } }
-
-### `E_ALLOWED_PATHS_ESCAPE`
-
-**optimize.allowedPaths contains a path outside target.repoPath**
-
-**How to fix:**
-- All paths in optimize.allowedPaths must be inside target.repoPath
-- This is a safety boundary — the optimizer will only edit files within this list
-
-### `E_OUTPUT_DIR_NOT_WRITABLE`
-
-**benchmark.output.dir is not writable**
-
-**How to fix:**
-- Check directory permissions for the path set in benchmark.output.dir
-- Or change benchmark.output.dir to a path you have write access to
-
-### `E_MISSING_API_KEY`
-
-**API key environment variable is not set**
-
-**How to fix:**
-- Export your OpenRouter API key before running: export OPENROUTER_API_KEY=sk-or-...
-- Or add it to a .env file alongside your skill-optimizer.json
-- Get a key at https://openrouter.ai/keys
-
-### `E_DISCOVERY_EMPTY`
-
-**Discovery found zero callable actions**
-
-**How to fix:**
-- Check that target.discovery.sources points at the right entry file
-- For SDK: should be your public API entry (e.g. src/index.ts)
-- For CLI: should be the file that registers all subcommands
-- For MCP: should be the file that registers all tools
-- Add a fallback manifest: target.discovery.fallbackManifest or target.cli.commands / target.mcp.tools
-
-### `E_MAXTASKS_TOO_LOW`
-
-**benchmark.taskGeneration.maxTasks is less than the in-scope action count**
-
-**How to fix:**
-- Raise benchmark.taskGeneration.maxTasks to at least the number of in-scope actions
-- Run: skill-optimizer --dry-run --config ./skill-optimizer.json to see the action count
-- Or narrow the scope with target.scope.exclude to reduce the action count
-
-### `E_COVERAGE_EXHAUSTED`
-
-**Task generation could not cover all in-scope actions after 2 retry passes**
-
-**How to fix:**
-- Add guidance for the uncovered actions to your SKILL.md
-- The error message above names the specific uncovered actions
-- Or exclude them with target.scope.exclude if they should not be benchmarked
-
-### `E_DIRTY_GIT`
-
-**Target repo has uncommitted changes**
-
-**How to fix:**
-- Commit or stash changes in target.repoPath before running the optimizer
-- Run: git -C <repoPath> stash
-- Or: git -C <repoPath> add -A && git -C <repoPath> commit -m "wip: before optimizer run"
-
-### `E_GIT_CHECKPOINT_FAILED`
-
-**Git checkpoint creation failed**
-
-**How to fix:**
-- Check disk space and git permissions in target.repoPath
-- Make sure the directory is a valid git repository
-- Run: git -C <repoPath> status to verify git state
-
-### `E_VALIDATION_FAILED`
-
-**Configured validation command exited non-zero**
-
-**How to fix:**
-- Fix the issue flagged by the validation command before retrying
-- The failing command is listed in optimize.validation in your config
-- Run the validation command manually to see the full error output
-
-### `E_INIT_AUTO_LOW_CONFIDENCE`
-
-**init --auto --yes requires high confidence detection**
-
-**How to fix:**
-- Run init interactively to review and confirm detection: skill-optimizer init --auto
-- Or supply a pre-filled answers file: skill-optimizer init --answers answers.json
-- See README for the answers.json format
-
-### `E_UNEXPECTED`
-
-**An unexpected error occurred**
-
-**How to fix:**
-- Check the full error message and stack trace above for details
-- File an issue at https://github.com/fastxyz/skill-optimizer/issues with the full output
diff --git a/docs/workbench.md b/docs/workbench.md
new file mode 100644
index 0000000..2bb344d
--- /dev/null
+++ b/docs/workbench.md
@@ -0,0 +1,270 @@
+# Workbench Guide
+
+`skill-optimizer` runs agent skill evals in a Docker workbench. A model receives a normal user task plus files under `/work`; deterministic graders inspect the final workspace and trace to decide whether the attempt passed.
+
+## Mental Model
+
+- A case is one user-like task plus one or more deterministic graders.
+- A suite is a matrix of cases and OpenRouter models.
+- `references/` is copied into `/work` before the agent starts. Put the skill under test here.
+- `workspace/` is copied into `/work` after `references/`. Use it to seed starter files or repos.
+- `checks/` and `bin/` are case support files mounted under `$CASE` for setup and grading. `checks/` is hidden from the agent; `bin/` is also copied to `/work/bin` so the agent can run its fake CLIs.
+- The agent phase sees only `/work`. It cannot see `/case`, `/results`, graders, hidden answers, or hidden metadata.
+- Graders define acceptance. They inspect files, command logs, generated artifacts, `answer.json`, `trace.jsonl`, and result state.
+
+## Directory Layout
+
+```text
+my-eval/
+  suite.yml
+  references/
+    my-skill/SKILL.md
+    my-skill/references/api.md
+  checks/
+    create-inputs.mjs
+    grade-output.mjs
+  bin/
+    fake-product-cli
+  workspace/
+    starter-repo/
+```
+
+Support directory behavior:
+
+| Directory | Visible To Agent | Purpose |
+|-----------|------------------|---------|
+| `references/` | yes, copied into `/work` | Skills and reference docs under test |
+| `workspace/` | yes, copied into `/work` | Starter repos, input files, seed state |
+| `checks/` | no during agent phase | Setup helpers and graders under `$CASE/checks` |
+| `bin/` | yes, copied to `/work/bin` | Fake CLIs and command recorders; also mounted under `$CASE/bin` for setup/grading |
+
+## Suite And Case Files
+
+Minimal suite:
+
+```yaml
+name: pdf-skill-eval
+references: ./references
+models:
+  - openrouter/google/gemini-2.5-flash
+env:
+  - OPENROUTER_API_KEY
+timeoutSeconds: 600
+setup:
+  - node $CASE/checks/create-inputs.mjs
+appendSystemPrompt: |
+  Keep task outputs at the top level of /work unless the user asks otherwise.
+cases:
+  - name: extract-pdf-facts
+    task: |
+      Read statement.pdf and write answer.json with the account, quarter, approval code, and risk flags.
+    graders:
+      - name: answer-json
+        command: node $CASE/checks/extract-pdf-facts.mjs
+```
+
+Case fields:
+
+- `name`: human-readable case name; inline suite cases use this to derive result slugs.
+- `references`: directory copied into `/work`; required for standalone cases and defaulted by suites.
+- `task`: natural user request sent to the agent.
+- `graders`: shell commands run after the agent; every grader must pass for the case to pass.
+- `setup`: shell commands run before the agent.
+- `cleanup`: optional shell commands run after grading.
+- `env`: host environment variable names forwarded into setup, agent, grading, and cleanup.
+- `mcpServers`: optional MCP server map exposed through the agent's `mcp` tool.
+- `mcpServices`: optional hidden Docker MCP services started beside the agent container.
+- `model`: default model for `run-case`.
+- `timeoutSeconds`: agent timeout, default `600`.
+
+Task prompts should not mention graders, hidden answers, `/case`, `/results`, or eval internals. Ask for the real deliverable just like a user would.
+
+## MCP Servers
+
+Cases may define `mcpServers`, and suites may define them as defaults for their inline cases. The workbench writes a per-trial `/work/mcporter.json` with only those servers and `imports: []`, then exposes an `mcp` command on `PATH` for the agent. The command delegates to `mcporter` inside the workbench image.
+
+Example:
+
+```yaml
+mcpServers:
+  calculator:
+    baseUrl: http://calculator:3000/mcp
+
+  context7:
+    baseUrl: https://mcp.context7.com/mcp
+    headers:
+      Authorization: "Bearer ${CONTEXT7_API_KEY}"
+env:
+  - OPENROUTER_API_KEY
+  - CONTEXT7_API_KEY
+
+mcpServices:
+  calculator:
+    command: node
+    args:
+      - calculator-server.mjs
+```
+
+Suite-level `mcpServers` apply to inline cases. Inline case definitions merge by server name, with the inline case winning. External case files do not inherit suite defaults.
+
+Use `mcpServices` for local MCP servers whose source should not be visible to the agent. Service files live under the case `mcp/` support directory. During `run-case` and `run-suite`, Docker mounts that directory read-only into separate service containers at `/mcp`, joins those containers to a private Docker network, and joins the agent container to the same network. The agent sees only the configured `mcpServers` URL such as `http://calculator:3000/mcp`; it does not mount `/case` or the `mcp/` source directory. Set service ports in the matching `mcpServers` URL rather than in `mcpServices`.
+
+Remote HTTP/SSE servers must be reachable from Docker. Host-local endpoints need Docker-reachable addresses such as `host.docker.internal`. Direct stdio `mcpServers.command` entries run inside the agent container and are only appropriate when the server implementation is intentionally agent-visible.
+
+OAuth/browser auth is not supported in v1. Use non-interactive headers, bearer tokens, or environment-variable placeholders. Only env names listed in `env` are forwarded into the containers.
+
+## Docker Execution Phases
+
+`run-case` and `run-suite` use Docker for model attempts. Each trial has a prepared case directory, work directory, and result directory on the host; Docker mounts them into phase containers.
+
+| Phase | Docker Mounts | Working Dir | What Happens |
+|-------|---------------|-------------|--------------|
+| setup | `/case:ro`, `/work:rw` | `/work` | Runs setup commands and prepares inputs |
+| agent | `/work:rw` only | `/work` | Runs the agent/model with the user task |
+| grade | `/case:ro`, `/work:rw`, `/results:rw` | `/work` | Runs graders and writes result files |
+| cleanup | `/case:ro`, `/work:rw`, `/results:rw` | `/work` | Runs optional cleanup commands |
+
+Important agent-phase constraints:
+
+- The agent cannot see `/case` or `/results`.
+- The Docker socket is not mounted.
+- Global/user Pi skills are not mounted.
+- Additional skills are discovered from `/work`.
+- If configured, MCP servers are exposed through the `mcp` command using `/work/mcporter.json`.
+- Python installs should use `/work/.venv`.
+- Environment variables listed in `env` are available unchanged to the agent.
+
+Use dedicated test accounts and scoped credentials for live integration evals. Treat `trace.jsonl`, `result.json`, grader evidence, stdout/stderr, and preserved workspaces as potentially sensitive.
+
+## Graders
+
+Graders are shell commands. They run from `/work` with these environment variables:
+
+| Variable | Meaning |
+|----------|---------|
+| `$CASE` | Read-only case directory mounted at `/case` |
+| `$WORK` | Mutable workspace used by the agent |
+| `$RESULTS` | Result directory containing `trace.jsonl` |
+
+Preferred grader output is one JSON object on stdout:
+
+```json
+{ "pass": false, "score": 0, "evidence": ["answer.json missing approvalCode"] }
+```
+
+If no JSON object is printed, exit code `0` passes and non-zero fails. Keep graders deterministic and local. Do not use an LLM judge unless the eval explicitly requires one.
+
+Good graders check one thing when practical:
+
+- Exact JSON shape and values.
+- PDF, DOCX, PPTX, XLSX, image, ZIP, or database structure.
+- Command calls recorded by a fake CLI.
+- Static SQL, source code, diffs, or generated files.
+- `trace.jsonl` for negative behavior, such as reading an irrelevant skill file.
+
+## Acceptance Contract
+
+Graders are the only source of truth for pass/fail. Design graders to inspect whatever local evidence the task should produce, including:
+
+- Workspace files and generated artifacts under `$WORK`
+- Structured outputs such as `answer.json`
+- Agent behavior captured in `$RESULTS/trace.jsonl`
+- Any additional result-state files your setup/graders write under `$RESULTS`
+
+Keep graders deterministic and local so acceptance criteria stay stable across model runs.
+
+## Running Evals
+
+Run one case:
+
+```bash
+npx tsx src/cli.ts run-case ./case.yml
+```
+
+Run a case across models:
+
+```bash
+npx tsx src/cli.ts run-case ./case.yml \
+  --models openrouter/google/gemini-2.5-flash,openrouter/openai/gpt-5.4 \
+  --trials 3 \
+  --concurrency 2
+```
+
+Run a suite:
+
+```bash
+npx tsx src/cli.ts run-suite ./suite.yml --trials 3 --concurrency 2
+```
+
+Useful options:
+
+- `--out <path>`: results root, default `<case-dir>/.results` or `<suite-dir>/.results`.
+- `--model <model>`: single `run-case` model override.
+- `--models <models>`: comma-separated `run-case` model list.
+- `--trials <n>`: independent trials per model/case, default `1`.
+- `--concurrency <n>`: maximum concurrent trial containers, default `1`.
+- `--image <image>`: Docker image name, default `skill-optimizer-workbench:local`.
+- `--keep-workspace`: copy final `/work` to results; failed trials are always preserved.
+
+`run-suite` always uses `models:` from `suite.yml`; it does not have a model override flag.
+
+## Outputs
+
+Single-trial `run-case` output:
+
+```text
+case/.results/<run-id>/
+  trace.jsonl
+  result.json
+  summary.json
+  workspace/        # on failure or --keep-workspace
+```
+
+Matrix `run-case` output:
+
+```text
+case/.results/<run-id>/
+  run-result.json
+  trials/<model-slug>--001/trace.jsonl
+  trials/<model-slug>--001/result.json
+```
+
+`run-suite` output:
+
+```text
+suite/.results/<run-id>/
+  suite-result.json
+  trials/<case-slug>--<model-slug>--001/trace.jsonl
+  trials/<case-slug>--<model-slug>--001/result.json
+```
+
+`result.json` includes `pass`, `score`, `evidence`, per-grader results, duration, turns, tool counts, tokens, and cost. Aggregates include trial pass rate, pass@k, pass^k, mean score, and relative result/trace paths.
+
+`trace.jsonl` is the primary debugging source. It records assistant messages, tool calls, tool results, stop reasons, and errors. Use it to understand why a model failed or to grade negative cases.
+
+## Debugging Failed Runs
+
+1. Read the failing trial `result.json` evidence.
+2. Inspect `graders[]` to identify the failed grader.
+3. Open `summary.json` for final assistant text and commands.
+4. Open `trace.jsonl` to inspect tool calls and file reads.
+5. Inspect preserved `workspace/` for failed trials.
+6. Classify the failure as unclear skill guidance, missing reference material, brittle grader, unrealistic input data, task ambiguity, or product/code bug.
+7. Update the target skill, references, inputs, graders, or code according to that diagnosis.
+8. Re-run the same case or suite and compare grader evidence across the target models/trials.
+
+## Example
+
+The tracked PDF demo is the best starting point; the MCP calculator demo additionally shows `mcpServers` and `mcpServices` in use:
+
+```bash
+npx tsx src/cli.ts run-suite examples/workbench/pdf/suite.yml --trials 1
+npx tsx src/cli.ts run-suite examples/workbench/mcp/suite.yml --trials 1
+```
+
+Useful files:
+
+- `examples/workbench/pdf/suite.yml`: inline suite with models, setup, graders, and append prompt.
+- `examples/workbench/pdf/references/pdf-skill/SKILL.md`: skill under test copied into `/work`.
+- `examples/workbench/pdf/checks/*.mjs`: deterministic graders and setup helpers.
+- `examples/workbench/pdf/README.md`: demo walkthrough.
diff --git a/examples/workbench/README.md b/examples/workbench/README.md
new file mode 100644
index 0000000..47377ca
--- /dev/null
+++ b/examples/workbench/README.md
@@ -0,0 +1,23 @@
+# Workbench Examples
+
+These examples are small, demoable suites that exercise the Docker workbench end to end.
+
+See `../../docs/workbench.md` for the full workbench model, including cases, suites, graders, Docker phases, and result files.
+
+## PDF Skill Demo
+
+```bash
+npx tsx src/cli.ts run-suite examples/workbench/pdf/suite.yml --trials 1
+```
+
+Graders are the acceptance contract. They evaluate agent outputs from `/work`, generated artifacts, `answer.json`, and behavior captured in `trace.jsonl`.
+
+`run-suite` runs the configured model matrix and writes `trace.jsonl`, `result.json`, and failed workspaces under `examples/workbench/pdf/.results/<run-id>/`.
+
+## MCP Calculator Demo
+
+```bash
+npx tsx src/cli.ts run-suite examples/workbench/mcp/suite.yml --trials 1
+```
+
+The MCP demo starts a local calculator MCP server as a separate hidden Docker service and exposes it through the workbench `mcp` command. The server has `add`, `subtract`, `multiply`, and `divide`; the grader checks both `answer.json` and `trace.jsonl` for MCP usage.
diff --git a/examples/workbench/mcp/README.md b/examples/workbench/mcp/README.md
new file mode 100644
index 0000000..5b84aca
--- /dev/null
+++ b/examples/workbench/mcp/README.md
@@ -0,0 +1,11 @@
+# MCP Calculator Workbench Example
+
+This example shows a local MCP server started as a separate hidden Docker service beside the agent container. The agent sees the `calculator` MCP URL through the workbench `mcp` command, but it cannot read the server source file. The server exposes calculator tools: `add`, `subtract`, `multiply`, and `divide`.
+
+Run a model trial:
+
+```bash
+npx tsx src/cli.ts run-suite examples/workbench/mcp/suite.yml --trials 1
+```
+
+The case asks the agent to compute the expression and write `answer.json`. The grader checks the computed answer and verifies that the trace's bash tool calls include `mcp call calculator.add`, `calculator.multiply`, `calculator.subtract`, and `calculator.divide` (each must appear in at least one bash command).
diff --git a/examples/workbench/mcp/checks/calculator-answer.mjs b/examples/workbench/mcp/checks/calculator-answer.mjs
new file mode 100644
index 0000000..07c30de
--- /dev/null
+++ b/examples/workbench/mcp/checks/calculator-answer.mjs
@@ -0,0 +1,61 @@
+import { existsSync, readFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+const expectedExpression = '((17 + 25) * 3 - 18) / 6';
+const expectedResult = 18;
+const failures = [];
+
+const answerPath = join(process.env.WORK, 'answer.json');
+if (!existsSync(answerPath)) {
+  failures.push('answer.json was not created');
+} else {
+  try {
+    const answer = JSON.parse(readFileSync(answerPath, 'utf-8'));
+    if (answer.expression !== expectedExpression) {
+      failures.push(`expression mismatch: ${JSON.stringify(answer.expression)}`);
+    }
+    if (answer.result !== expectedResult) {
+      failures.push(`result mismatch: ${JSON.stringify(answer.result)}`);
+    }
+  } catch (error) {
+    failures.push(`answer.json is not valid JSON: ${error.message}`);
+  }
+}
+
+const tracePath = join(process.env.RESULTS, 'trace.jsonl');
+if (!existsSync(tracePath)) {
+  failures.push('trace.jsonl was not created');
+} else {
+  const trace = readFileSync(tracePath, 'utf-8').trim().split(/\r?\n/).flatMap((line) => {
+    try {
+      return [JSON.parse(line)];
+    } catch {
+      return [];
+    }
+  });
+  const requiredTools = [
+    { tool: 'add', pattern: /\bmcp\s+call\s+calculator\.add\b/ },
+    { tool: 'multiply', pattern: /\bmcp\s+call\s+calculator\.multiply\b/ },
+    { tool: 'subtract', pattern: /\bmcp\s+call\s+calculator\.subtract\b/ },
+    { tool: 'divide', pattern: /\bmcp\s+call\s+calculator\.divide\b/ },
+  ];
+  const bashCommands = trace.flatMap((entry) => {
+    if (entry.type !== 'tool_call' || entry.name !== 'bash') return [];
+    const args = entry.arguments ?? {};
+    return typeof args.command === 'string' ? [args.command] : [];
+  });
+  for (const { tool, pattern } of requiredTools) {
+    if (!bashCommands.some((command) => pattern.test(command))) {
+      failures.push(`trace does not contain calculator.${tool} MCP call`);
+    }
+  }
+}
+
+const pass = failures.length === 0;
+console.log(JSON.stringify({
+  pass,
+  score: pass ? 1 : 0,
+  evidence: pass ? ['answer matched and all calculator MCP tools were used'] : failures,
+}));
+
+process.exit(pass ? 0 : 1);
diff --git a/examples/workbench/mcp/mcp/calculator-server.mjs b/examples/workbench/mcp/mcp/calculator-server.mjs
new file mode 100644
index 0000000..5b12ebb
--- /dev/null
+++ b/examples/workbench/mcp/mcp/calculator-server.mjs
@@ -0,0 +1,120 @@
+import { randomUUID } from 'node:crypto';
+import { createRequire } from 'node:module';
+
+const requireFromApp = createRequire('/app/package.json');
+const { createMcpExpressApp } = requireFromApp('@modelcontextprotocol/sdk/server/express.js');
+const { Server } = requireFromApp('@modelcontextprotocol/sdk/server/index.js');
+const { StreamableHTTPServerTransport } = requireFromApp('@modelcontextprotocol/sdk/server/streamableHttp.js');
+const { CallToolRequestSchema, isInitializeRequest, ListToolsRequestSchema } = requireFromApp('@modelcontextprotocol/sdk/types.js');
+
+const tools = [
+  { name: 'add', description: 'Add two numbers.', inputSchema: binaryNumberSchema('a', 'b') },
+  { name: 'subtract', description: 'Subtract b from a.', inputSchema: binaryNumberSchema('a', 'b') },
+  { name: 'multiply', description: 'Multiply two numbers.', inputSchema: binaryNumberSchema('a', 'b') },
+  { name: 'divide', description: 'Divide a by b.', inputSchema: binaryNumberSchema('a', 'b') },
+];
+
+const transports = {};
+const app = createMcpExpressApp({ host: '0.0.0.0' });
+
+app.post('/mcp', async (req, res) => {
+  try {
+    const sessionId = req.headers['mcp-session-id'];
+    let transport = typeof sessionId === 'string' ? transports[sessionId] : undefined;
+
+    if (!transport && isInitializeRequest(req.body)) {
+      transport = new StreamableHTTPServerTransport({
+        sessionIdGenerator: () => randomUUID(),
+        onsessioninitialized: (newSessionId) => {
+          transports[newSessionId] = transport;
+        },
+      });
+      await createCalculatorServer().connect(transport);
+    }
+
+    if (!transport) {
+      res.status(400).json({ jsonrpc: '2.0', error: { code: -32000, message: 'Bad Request' }, id: null });
+      return;
+    }
+
+    await transport.handleRequest(req, res, req.body);
+  } catch (error) {
+    if (!res.headersSent) {
+      res.status(500).json({
+        jsonrpc: '2.0',
+        error: { code: -32603, message: error instanceof Error ? error.message : String(error) },
+        id: null,
+      });
+    }
+  }
+});
+
+app.get('/mcp', async (req, res) => {
+  const sessionId = req.headers['mcp-session-id'];
+  const transport = typeof sessionId === 'string' ? transports[sessionId] : undefined;
+  if (!transport) {
+    res.status(400).send('Invalid or missing session ID');
+    return;
+  }
+  await transport.handleRequest(req, res);
+});
+
+app.listen(3000, (error) => {
+  if (error) {
+    console.error(error);
+    process.exit(1);
+  }
+  console.error('calculator MCP server listening on :3000');
+});
+
+function createCalculatorServer() {
+  const server = new Server(
+    { name: 'calculator', version: '1.0.0' },
+    { capabilities: { tools: {} } },
+  );
+
+  server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools }));
+  server.setRequestHandler(CallToolRequestSchema, async (request) => {
+    const name = request.params.name;
+    const args = request.params.arguments ?? {};
+    const a = readNumber(args.a, 'a');
+    const b = readNumber(args.b, 'b');
+
+    if (name === 'add') return toolResult(a + b);
+    if (name === 'subtract') return toolResult(a - b);
+    if (name === 'multiply') return toolResult(a * b);
+    if (name === 'divide') {
+      if (b === 0) throw new Error('Cannot divide by zero');
+      return toolResult(a / b);
+    }
+    throw new Error(`Unknown calculator tool: ${name}`);
+  });
+
+  return server;
+}
+
+function toolResult(result) {
+  return {
+    content: [{ type: 'text', text: String(result) }],
+    structuredContent: { result },
+  };
+}
+
+function binaryNumberSchema(left, right) {
+  return {
+    type: 'object',
+    properties: {
+      [left]: { type: 'number' },
+      [right]: { type: 'number' },
+    },
+    required: [left, right],
+    additionalProperties: false,
+  };
+}
+
+function readNumber(value, name) {
+  if (typeof value !== 'number' || !Number.isFinite(value)) {
+    throw new Error(`Argument ${name} must be a finite number`);
+  }
+  return value;
+}
diff --git a/examples/workbench/mcp/references/README.md b/examples/workbench/mcp/references/README.md
new file mode 100644
index 0000000..6832bfa
--- /dev/null
+++ b/examples/workbench/mcp/references/README.md
@@ -0,0 +1,3 @@
+This directory is copied into `/work` for the agent.
+
+The calculator MCP server source is intentionally not here. It lives under the case's hidden `mcp/` support directory and is started by the Docker workbench as a separate service container.
diff --git a/examples/workbench/mcp/suite.yml b/examples/workbench/mcp/suite.yml
new file mode 100644
index 0000000..5ac689a
--- /dev/null
+++ b/examples/workbench/mcp/suite.yml
@@ -0,0 +1,30 @@
+name: mcp-calculator-example
+references: ./references
+models:
+  - openrouter/google/gemini-2.5-flash
+env:
+  - OPENROUTER_API_KEY
+timeoutSeconds: 600
+mcpServers:
+  calculator:
+    baseUrl: http://calculator:3000/mcp
+mcpServices:
+  calculator:
+    command: node
+    args:
+      - calculator-server.mjs
+cases:
+  - name: use-calculator-mcp
+    task: |
+      Compute this expression:
+
+      ((17 + 25) * 3 - 18) / 6
+
+      Write answer.json with this exact shape:
+      {
+        "expression": "((17 + 25) * 3 - 18) / 6",
+        "result": <number>
+      }
+    graders:
+      - name: calculator-answer
+        command: node $CASE/checks/calculator-answer.mjs
diff --git a/examples/workbench/pdf/README.md b/examples/workbench/pdf/README.md
new file mode 100644
index 0000000..3fdf617
--- /dev/null
+++ b/examples/workbench/pdf/README.md
@@ -0,0 +1,36 @@
+# PDF Workbench Demo
+
+This suite demonstrates the main workbench features with a PDF skill:
+
+- `models`: suite-owned model matrix
+- `env`: API key forwarding into the agent container
+- `appendSystemPrompt`: suite-wide prompt additions
+- `setup`: input generation before the agent starts
+- `graders`: deterministic post-run checks
+- trace grading: the `no-pdf-skill-needed` case inspects `trace.jsonl` and fails if the agent read the PDF skill file
+
+## Run The Demo
+
+```bash
+npx tsx src/cli.ts run-suite examples/workbench/pdf/suite.yml --trials 1
+```
+
+`run-suite` runs each case against the suite models. Results are written to:
+
+```text
+examples/workbench/pdf/.results/<run-id>/
+  suite-result.json
+  trials/<case>--<model>--001/result.json
+  trials/<case>--<model>--001/trace.jsonl
+```
+
+Failed trials also preserve `workspace/` so you can inspect exactly what the agent wrote.
+
+## Cases
+
+- `extract-pdf-facts`: reads `statement.pdf` and writes exact structured JSON.
+- `split-customer-packet`: keeps only customer-copy pages from a packet PDF.
+- `build-briefing-pdf`: creates a valid one-page briefing PDF.
+- `no-pdf-skill-needed`: writes a text file and fails if the agent reads `/work/pdf-skill/SKILL.md`.
+
+The example skill under `references/pdf-skill/` is intentionally small and demo-safe. Replace it with a real skill to evaluate production PDF guidance.
diff --git a/examples/workbench/pdf/checks/_pdf.mjs b/examples/workbench/pdf/checks/_pdf.mjs
new file mode 100644
index 0000000..2a263cb
--- /dev/null
+++ b/examples/workbench/pdf/checks/_pdf.mjs
@@ -0,0 +1,305 @@
+import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import { dirname, join } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { inflateSync } from 'node:zlib';
+
+function escapePdfLiteral(value) {
+  return String(value)
+    .replace(/\\/g, '\\\\')
+    .replace(/\(/g, '\\(')
+    .replace(/\)/g, '\\)');
+}
+
+function unescapePdfLiteral(value) {
+  let output = '';
+  for (let index = 0; index < value.length; index += 1) {
+    const char = value[index];
+    if (char !== '\\') {
+      output += char;
+      continue;
+    }
+
+    const next = value[index + 1];
+    if (next === undefined) {
+      output += '\\';
+      continue;
+    }
+
+    if (/[0-7]/.test(next)) {
+      const match = value.slice(index + 1).match(/^[0-7]{1,3}/)?.[0] ?? next;
+      output += String.fromCharCode(Number.parseInt(match, 8));
+      index += match.length;
+      continue;
+    }
+
+    index += 1;
+    if (next === 'n') output += '\n';
+    else if (next === 'r') output += '\r';
+    else if (next === 't') output += '\t';
+    else if (next === 'b') output += '\b';
+    else if (next === 'f') output += '\f';
+    else if (next === '\n' || next === '\r') output += '';
+    else output += next;
+  }
+  return output;
+}
+
+function contentStreamForPage(pageText) {
+  const lines = String(pageText).split(/\r?\n/);
+  return [
+    'BT',
+    '/F1 12 Tf',
+    '72 720 Td',
+    ...lines.flatMap((line, index) => [
+      index === 0 ? null : '0 -18 Td',
+      `(${escapePdfLiteral(line)}) Tj`,
+    ]).filter(Boolean),
+    'ET',
+  ].join('\n');
+}
+
+/**
+ * Writes a minimal, uncompressed PDF 1.4 file with one text page per entry.
+ *
+ * Object layout: 1 = catalog, 2 = page tree, 3 = Helvetica font, then for the
+ * page at index i: object 4 + 2i is the page and 5 + 2i its content stream.
+ *
+ * @param filePath - Destination path; missing parent directories are created.
+ * @param pages - Non-empty array of page texts (one string per page).
+ * @throws Error when `pages` is not a non-empty array, or when an object id in
+ *   the 1..max range was never defined.
+ */
+export function createPdf(filePath, pages) {
+  if (!Array.isArray(pages) || pages.length === 0) {
+    throw new Error('createPdf requires at least one page');
+  }
+
+  // Ids 1-3 are reserved for catalog, page tree and font; pages and their
+  // content streams alternate from id 4 upwards.
+  const pageObjectIds = pages.map((_, index) => 4 + index * 2);
+  const contentObjectIds = pages.map((_, index) => 5 + index * 2);
+  const objects = new Map();
+
+  objects.set(1, '<< /Type /Catalog /Pages 2 0 R >>');
+  objects.set(2, `<< /Type /Pages /Kids [${pageObjectIds.map((id) => `${id} 0 R`).join(' ')}] /Count ${pages.length} >>`);
+  objects.set(3, '<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>');
+
+  for (let index = 0; index < pages.length; index += 1) {
+    const pageObjectId = pageObjectIds[index];
+    const contentObjectId = contentObjectIds[index];
+    const stream = contentStreamForPage(pages[index]);
+    objects.set(pageObjectId, `<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 3 0 R >> >> /Contents ${contentObjectId} 0 R >>`);
+    objects.set(contentObjectId, `<< /Length ${Buffer.byteLength(stream, 'ascii')} >>\nstream\n${stream}\nendstream`);
+  }
+
+  const maxObjectId = Math.max(...objects.keys());
+  let pdf = '%PDF-1.4\n';
+  // offsets[id] is the byte position of "id 0 obj"; slot 0 is the free-list head entry.
+  const offsets = [0];
+  for (let objectId = 1; objectId <= maxObjectId; objectId += 1) {
+    const body = objects.get(objectId);
+    if (!body) {
+      throw new Error(`missing PDF object ${objectId}`);
+    }
+    // Record the offset before appending so it points at the start of this object.
+    offsets[objectId] = Buffer.byteLength(pdf, 'ascii');
+    pdf += `${objectId} 0 obj\n${body}\nendobj\n`;
+  }
+  const xrefOffset = Buffer.byteLength(pdf, 'ascii');
+  pdf += `xref\n0 ${maxObjectId + 1}\n`;
+  pdf += '0000000000 65535 f \n';
+  for (let objectId = 1; objectId <= maxObjectId; objectId += 1) {
+    // Each cross-reference entry is a 10-digit zero-padded offset, generation 0, in use.
+    pdf += `${String(offsets[objectId]).padStart(10, '0')} 00000 n \n`;
+  }
+  pdf += `trailer\n<< /Size ${maxObjectId + 1} /Root 1 0 R >>\nstartxref\n${xrefOffset}\n%%EOF\n`;
+
+  mkdirSync(dirname(filePath), { recursive: true });
+  writeFileSync(filePath, pdf, 'ascii');
+}
+
+export function readTextFile(filePath) {
+  return readFileSync(filePath, 'latin1');
+}
+
+export function isPdfFile(filePath) {
+  const raw = readTextFile(filePath);
+  return raw.startsWith('%PDF-') && raw.includes('%%EOF');
+}
+
+export function countPdfPages(filePath) {
+  const raw = readTextFile(filePath);
+  return [...raw.matchAll(/\/Type\s*\/Page\b/g)].length;
+}
+
+function readFilters(dictionary) {
+  const match = dictionary.match(/\/Filter\s*(\[[^\]]+\]|\/\w+)/);
+  if (!match) {
+    return [];
+  }
+  return [...match[1].matchAll(/\/(\w+)/g)].map((filter) => filter[1]);
+}
+
+function ascii85Decode(buffer) {
+  const input = buffer.toString('latin1').replace(/\s+/g, '').replace(/^<~/, '').replace(/~>$/, '');
+  const bytes = [];
+  let group = [];
+
+  const flush = (values, outputLength) => {
+    let value = 0;
+    for (const digit of values) {
+      value = value * 85 + digit;
+    }
+    const decoded = [
+      (value >>> 24) & 0xff,
+      (value >>> 16) & 0xff,
+      (value >>> 8) & 0xff,
+      value & 0xff,
+    ];
+    bytes.push(...decoded.slice(0, outputLength));
+  };
+
+  for (const char of input) {
+    if (char === 'z' && group.length === 0) {
+      bytes.push(0, 0, 0, 0);
+      continue;
+    }
+    group.push(char.charCodeAt(0) - 33);
+    if (group.length === 5) {
+      flush(group, 4);
+      group = [];
+    }
+  }
+
+  if (group.length > 0) {
+    const outputLength = group.length - 1;
+    while (group.length < 5) group.push(84);
+    flush(group, outputLength);
+  }
+
+  return Buffer.from(bytes);
+}
+
+function decodeStream(filters, streamContent) {
+  let buffer = Buffer.from(streamContent, 'latin1');
+  for (const filter of filters) {
+    if (filter === 'ASCII85Decode' || filter === 'A85') {
+      buffer = ascii85Decode(buffer);
+    } else if (filter === 'FlateDecode' || filter === 'Fl') {
+      buffer = inflateSync(buffer);
+    }
+  }
+  return buffer.toString('latin1');
+}
+
+function decodedContentStreams(raw) {
+  const streams = [raw];
+  const pattern = /(<<[\s\S]*?>>)\s*stream\r?\n([\s\S]*?)\r?\nendstream/g;
+  let match;
+  while ((match = pattern.exec(raw)) !== null) {
+    const filters = readFilters(match[1]);
+    try {
+      streams.push(decodeStream(filters, match[2]));
+    } catch {
+      streams.push(match[2]);
+    }
+  }
+  return streams;
+}
+
+function extractPdfStringLiterals(value) {
+  const texts = [];
+  const literalPattern = /\(((?:\\.|[^\\)])*)\)/g;
+  let match;
+  while ((match = literalPattern.exec(value)) !== null) {
+    texts.push(unescapePdfLiteral(match[1]));
+  }
+  return texts;
+}
+
+export function extractSimplePdfText(filePath) {
+  const raw = readTextFile(filePath);
+  const texts = [];
+
+  for (const stream of decodedContentStreams(raw)) {
+    const tjPattern = /\(((?:\\.|[^\\)])*)\)\s*Tj/g;
+    let tjMatch;
+    while ((tjMatch = tjPattern.exec(stream)) !== null) {
+      texts.push(unescapePdfLiteral(tjMatch[1]));
+    }
+
+    const tjArrayPattern = /\[([\s\S]*?)\]\s*TJ/g;
+    let arrayMatch;
+    while ((arrayMatch = tjArrayPattern.exec(stream)) !== null) {
+      texts.push(extractPdfStringLiterals(arrayMatch[1]).join(''));
+    }
+  }
+
+  return texts.join('\n');
+}
+
+export function readJson(filePath) {
+  return JSON.parse(readFileSync(filePath, 'utf-8'));
+}
+
+export function result(pass, evidence, score = pass ? 1 : 0) {
+  return {
+    pass,
+    score,
+    evidence: Array.isArray(evidence) ? evidence : [String(evidence)],
+  };
+}
+
+export function printResult(passOrResult, evidence, score) {
+  const output = typeof passOrResult === 'object' && passOrResult !== null
+    ? passOrResult
+    : result(passOrResult, evidence, score);
+  console.log(JSON.stringify(output));
+  process.exit(output.pass ? 0 : 1);
+}
+
+export function requireEnv(name) {
+  const value = process.env[name];
+  if (!value) {
+    printResult(false, `${name} env var is required`);
+  }
+  return value;
+}
+
+export function missingStrings(text, expected) {
+  return expected.filter((value) => !text.includes(value));
+}
+
+export function writeInputPdfs(rootDir) {
+  createPdf(join(rootDir, 'statement.pdf'), [
+    [
+      'Quarterly Statement',
+      'Account: Delta Orchard Cooperative',
+      'Quarter: Q4 2025',
+      'Total Revenue: $128,430.00',
+      'Risk Flag: inventory write-down',
+      'Risk Flag: late supplier audit',
+      'Approval Code: PDF-7429',
+    ].join('\n'),
+  ]);
+
+  createPdf(join(rootDir, 'customer-packet.pdf'), [
+    [
+      'CUSTOMER COPY',
+      'Invoice: C-204',
+      'Status: PAID',
+      'Customer: Northwind Labs',
+    ].join('\n'),
+    [
+      'INTERNAL NOTES',
+      'Do not share with customer.',
+      'Margin review pending.',
+    ].join('\n'),
+    [
+      'CUSTOMER COPY',
+      'Warranty Code: W-8832',
+      'Support Tier: Priority',
+    ].join('\n'),
+  ]);
+
+  createPdf(join(rootDir, 'briefing-source.pdf'), [
+    [
+      'Renewal Source Notes',
+      'Source: Alpine Sensors',
+      'Decision: approve expedited renewal',
+      'Deadline: 2026-05-14',
+      'draft-only note: internal discount floor is 18 percent',
+    ].join('\n'),
+  ]);
+}
+
+if (process.argv[1] === new URL(import.meta.url).pathname && process.argv[2] === 'write-inputs') {
+  const outputDir = process.argv[3];
+  if (!outputDir) {
+    throw new Error('Usage: node _pdf.mjs write-inputs <output-dir>');
+  }
+  writeInputPdfs(outputDir);
+}
diff --git a/examples/workbench/pdf/checks/_trace.mjs b/examples/workbench/pdf/checks/_trace.mjs
new file mode 100644
index 0000000..60aea18
--- /dev/null
+++ b/examples/workbench/pdf/checks/_trace.mjs
@@ -0,0 +1,60 @@
+import { readFileSync } from 'node:fs';
+
+export function readTraceJsonl(tracePath) {
+  return readFileSync(tracePath, 'utf-8')
+    .trim()
+    .split(/\r?\n/)
+    .filter(Boolean)
+    .flatMap((line) => {
+      try {
+        return [JSON.parse(line)];
+      } catch {
+        return [];
+      }
+    });
+}
+
+function readPathFromToolCall(entry) {
+  if (entry?.type !== 'tool_call' || entry.name !== 'read') {
+    return undefined;
+  }
+  const args = entry.arguments;
+  if (!args || typeof args !== 'object') {
+    return undefined;
+  }
+  if (typeof args.path === 'string') return args.path;
+  if (typeof args.filePath === 'string') return args.filePath;
+  return undefined;
+}
+
+function matchesPath(path, pattern) {
+  if (pattern instanceof RegExp) {
+    return pattern.test(path);
+  }
+  return path === String(pattern);
+}
+
+export function noReadPath(tracePath, forbiddenPath) {
+  const forbidden = readTraceJsonl(tracePath)
+    .map(readPathFromToolCall)
+    .filter((path) => typeof path === 'string' && matchesPath(path, forbiddenPath));
+
+  if (forbidden.length > 0) {
+    return {
+      pass: false,
+      score: 0,
+      evidence: forbidden.map((path) => `forbidden read path: ${path}`),
+    };
+  }
+
+  return {
+    pass: true,
+    score: 1,
+    evidence: ['no forbidden read paths found'],
+  };
+}
+
+export function printResult(result) {
+  console.log(JSON.stringify(result));
+  process.exit(result.pass ? 0 : 1);
+}
diff --git a/examples/workbench/pdf/checks/build-briefing-pdf.mjs b/examples/workbench/pdf/checks/build-briefing-pdf.mjs
new file mode 100644
index 0000000..5c4e137
--- /dev/null
+++ b/examples/workbench/pdf/checks/build-briefing-pdf.mjs
@@ -0,0 +1,21 @@
+import { existsSync } from 'node:fs';
+import { join } from 'node:path';
+
+import { countPdfPages, isPdfFile, printResult, requireEnv } from './_pdf.mjs';
+
+const outputPath = join(requireEnv('WORK'), 'briefing.pdf');
+
+if (!existsSync(outputPath)) {
+  printResult(false, 'briefing.pdf was not created');
+}
+if (!isPdfFile(outputPath)) {
+  printResult(false, 'briefing.pdf is not a valid-looking PDF with header and EOF marker');
+}
+
+const failures = [];
+const pageCount = countPdfPages(outputPath);
+if (pageCount !== 1) {
+  failures.push(`expected 1 page, found ${pageCount}`);
+}
+
+printResult(failures.length === 0, failures.length === 0 ? 'briefing.pdf is a valid one-page PDF' : failures);
diff --git a/examples/workbench/pdf/checks/extract-pdf-facts.mjs b/examples/workbench/pdf/checks/extract-pdf-facts.mjs
new file mode 100644
index 0000000..83af938
--- /dev/null
+++ b/examples/workbench/pdf/checks/extract-pdf-facts.mjs
@@ -0,0 +1,33 @@
+import { existsSync } from 'node:fs';
+import { join } from 'node:path';
+
+import { printResult, readJson, requireEnv } from './_pdf.mjs';
+
+const answerPath = join(requireEnv('WORK'), 'answer.json');
+
+if (!existsSync(answerPath)) {
+  printResult(false, 'answer.json was not created');
+}
+
+let answer;
+try {
+  answer = readJson(answerPath);
+} catch (error) {
+  printResult(false, `answer.json is not valid JSON: ${error instanceof Error ? error.message : String(error)}`);
+}
+
+const failures = [];
+if (answer.account !== 'Delta Orchard Cooperative') failures.push('account mismatch');
+if (answer.quarter !== 'Q4 2025') failures.push('quarter mismatch');
+if (answer.totalRevenue !== 128430) failures.push('totalRevenue must be numeric 128430');
+if (!Array.isArray(answer.riskFlags)) {
+  failures.push('riskFlags must be an array');
+} else {
+  for (const expected of ['inventory write-down', 'late supplier audit']) {
+    if (!answer.riskFlags.includes(expected)) failures.push(`missing risk flag: ${expected}`);
+  }
+  if (answer.riskFlags.length !== 2) failures.push('riskFlags should contain exactly the two source risk flags');
+}
+if (answer.approvalCode !== 'PDF-7429') failures.push('approvalCode mismatch');
+
+printResult(failures.length === 0, failures.length === 0 ? 'answer.json matched expected PDF facts' : failures);
diff --git a/examples/workbench/pdf/checks/no-pdf-skill.mjs b/examples/workbench/pdf/checks/no-pdf-skill.mjs
new file mode 100644
index 0000000..efd6bed
--- /dev/null
+++ b/examples/workbench/pdf/checks/no-pdf-skill.mjs
@@ -0,0 +1,28 @@
+import { existsSync, readFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+import { printResult as printPdfResult, requireEnv, result } from './_pdf.mjs';
+import { noReadPath } from './_trace.mjs';
+
+const workDir = requireEnv('WORK');
+const resultsDir = requireEnv('RESULTS');
+const notePath = join(workDir, 'note.txt');
+const tracePath = join(resultsDir, 'trace.jsonl');
+const failures = [];
+
+if (!existsSync(notePath)) {
+  failures.push('note.txt was not created');
+} else if (readFileSync(notePath, 'utf-8').trim() !== 'done') {
+  failures.push('note.txt did not contain exactly: done');
+}
+
+if (existsSync(tracePath)) {
+  const traceResult = noReadPath(tracePath, /\/pdf-skill\/SKILL\.md$/);
+  if (!traceResult.pass) {
+    failures.push(...traceResult.evidence);
+  }
+}
+
+printPdfResult(failures.length === 0
+  ? result(true, 'note.txt was created without reading the PDF skill')
+  : result(false, failures));
diff --git a/examples/workbench/pdf/checks/split-customer-packet.mjs b/examples/workbench/pdf/checks/split-customer-packet.mjs
new file mode 100644
index 0000000..c6e1182
--- /dev/null
+++ b/examples/workbench/pdf/checks/split-customer-packet.mjs
@@ -0,0 +1,33 @@
+import { existsSync } from 'node:fs';
+import { join } from 'node:path';
+
+import { countPdfPages, extractSimplePdfText, isPdfFile, missingStrings, printResult, requireEnv } from './_pdf.mjs';
+
+const outputPath = join(requireEnv('WORK'), 'customer-copy.pdf');
+
+if (!existsSync(outputPath)) {
+  printResult(false, 'customer-copy.pdf was not created');
+}
+if (!isPdfFile(outputPath)) {
+  printResult(false, 'customer-copy.pdf is not a valid-looking PDF with header and EOF marker');
+}
+
+const text = extractSimplePdfText(outputPath);
+const missing = missingStrings(text, [
+  'CUSTOMER COPY',
+  'Invoice: C-204',
+  'Status: PAID',
+  'Warranty Code: W-8832',
+  'Support Tier: Priority',
+]);
+const failures = [...missing.map((value) => `missing expected text: ${value}`)];
+
+if (text.includes('INTERNAL NOTES') || text.includes('Margin review pending')) {
+  failures.push('customer-copy.pdf includes internal-only page text');
+}
+const pageCount = countPdfPages(outputPath);
+if (pageCount !== 2) {
+  failures.push(`expected 2 pages, found ${pageCount}`);
+}
+
+printResult(failures.length === 0, failures.length === 0 ? 'customer-copy.pdf contains only customer pages' : failures);
diff --git a/examples/workbench/pdf/references/pdf-skill/SKILL.md b/examples/workbench/pdf/references/pdf-skill/SKILL.md
new file mode 100644
index 0000000..d8f9a45
--- /dev/null
+++ b/examples/workbench/pdf/references/pdf-skill/SKILL.md
@@ -0,0 +1,82 @@
+---
+name: pdf
+description: Use this skill when a task requires reading, creating, splitting, merging, or otherwise manipulating PDF files.
+---
+
+# PDF Skill Demo
+
+Use Python packages installed in `/work/.venv` for PDF work. Common choices:
+
+- `pypdf` for reading, splitting, and writing pages
+- `pdfplumber` for extracting text from PDFs
+- `reportlab` for creating new PDFs
+
+Always inspect the extracted text before writing parsing regexes. Do not guess labels or field formats from the task prompt.
+
+Example text extraction:
+
+```python
+from pypdf import PdfReader
+
+reader = PdfReader("input.pdf")
+text = "\n".join(page.extract_text() or "" for page in reader.pages)
+print(text)
+```
+
+Example structured extraction after inspecting text:
+
+```python
+from pypdf import PdfReader
+import json
+
+reader = PdfReader("statement.pdf")
+text = "\n".join(page.extract_text() or "" for page in reader.pages)
+lines = [line.strip() for line in text.splitlines() if line.strip()]
+
+answer = {"riskFlags": []}
+for line in lines:
+    if line.startswith("Account:"):
+        answer["account"] = line.split(":", 1)[1].strip()
+    elif line.startswith("Quarter:"):
+        answer["quarter"] = line.split(":", 1)[1].strip()
+    elif line.startswith("Total Revenue:"):
+        raw = line.split(":", 1)[1].strip().replace("$", "").replace(",", "")
+        answer["totalRevenue"] = float(raw)
+    elif line.startswith("Risk Flag:"):
+        answer["riskFlags"].append(line.split(":", 1)[1].strip())
+    elif line.startswith("Approval Code:"):
+        answer["approvalCode"] = line.split(":", 1)[1].strip()
+
+with open("answer.json", "w") as output:
+    json.dump(answer, output, indent=2)
+```
+
+Example page selection by index (keeps only the first page):
+
+```python
+from pypdf import PdfReader, PdfWriter
+
+reader = PdfReader("input.pdf")
+writer = PdfWriter()
+writer.add_page(reader.pages[0])
+
+with open("output.pdf", "wb") as output:
+    writer.write(output)
+```
+
+Example page filtering by extracted page text:
+
+```python
+from pypdf import PdfReader, PdfWriter
+
+reader = PdfReader("customer-packet.pdf")
+writer = PdfWriter()
+
+for page in reader.pages:
+    text = page.extract_text() or ""
+    if "CUSTOMER COPY" in text and "INTERNAL NOTES" not in text:
+        writer.add_page(page)
+
+with open("customer-copy.pdf", "wb") as output:
+    writer.write(output)
+```
diff --git a/examples/workbench/pdf/suite.yml b/examples/workbench/pdf/suite.yml
new file mode 100644
index 0000000..0dc13de
--- /dev/null
+++ b/examples/workbench/pdf/suite.yml
@@ -0,0 +1,57 @@
+name: pdf-workbench-example
+references: ./references
+appendSystemPrompt: |
+  Keep task outputs at the top level of /work unless the user asks for a different path.
+models:
+  - openrouter/google/gemini-2.5-flash
+env:
+  - OPENROUTER_API_KEY
+timeoutSeconds: 600
+setup:
+  - mkdir -p input
+  - node $CASE/checks/_pdf.mjs write-inputs input
+  - cp input/statement.pdf statement.pdf
+  - cp input/customer-packet.pdf customer-packet.pdf
+  - cp input/briefing-source.pdf briefing-source.pdf
+cases:
+  - name: extract-pdf-facts
+    task: |
+      Extract the key facts from statement.pdf and write answer.json with this exact schema:
+      {
+        "account": string,
+        "quarter": string,
+        "totalRevenue": number,
+        "riskFlags": string[],
+        "approvalCode": string
+      }
+      Use numeric dollars without commas or a currency symbol for totalRevenue.
+    graders:
+      - name: answer-json
+        command: node $CASE/checks/extract-pdf-facts.mjs
+
+  - name: split-customer-packet
+    task: |
+      Create customer-copy.pdf from customer-packet.pdf. Include only the pages marked CUSTOMER COPY, in their original order. Exclude the page marked INTERNAL NOTES. The output must be a PDF, not a text file.
+    graders:
+      - name: customer-copy-pdf
+        command: node $CASE/checks/split-customer-packet.mjs
+
+  - name: build-briefing-pdf
+    task: |
+      Create briefing.pdf as a one-page PDF briefing based on briefing-source.pdf. It must include these exact lines:
+      PDF Skill Briefing
+      Source: Alpine Sensors
+      Decision: approve expedited renewal
+      Deadline: 2026-05-14
+      Do not include the draft-only note from the source document.
+    graders:
+      - name: briefing-pdf
+        command: node $CASE/checks/build-briefing-pdf.mjs
+
+  - name: no-pdf-skill-needed
+    task: |
+      Write note.txt with exactly this text:
+      done
+    graders:
+      - name: note-without-pdf-skill
+        command: node $CASE/checks/no-pdf-skill.mjs
diff --git a/gemini-extension.json b/gemini-extension.json
new file mode 100644
index 0000000..18f86b3
--- /dev/null
+++ b/gemini-extension.json
@@ -0,0 +1,6 @@
+{
+  "name": "skill-optimizer",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "version": "2.0.0",
+  "contextFileName": "GEMINI.md"
+}
diff --git a/mock-repos/README.md b/mock-repos/README.md
deleted file mode 100644
index 264fee3..0000000
--- a/mock-repos/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# Mock Repos
-
-`mock-repos/` contains tracked end-to-end demo templates for manual benchmark and optimizer testing:
-
-- `mcp-tracker-demo` — MCP surface, `surface-changing` optimize mode
-- `sdk-counter-demo` — SDK surface, intentionally lossy SKILL.md
-- `cli-taskfile-demo` — CLI surface, intentionally lossy SKILL.md
-
-Use a tracked template directly for read-only benchmark runs.
-
-Materialize a standalone copy before running the optimizer so git checkpointing stays isolated:
-
-```bash
-tsx src/optimizer/materialize-mock-repo.ts mcp-tracker-demo ./.tmp/mock-repos
-npx skill-optimizer optimize --config ./.tmp/mock-repos/mcp-tracker-demo/skill-optimizer.json
-```
-
-Each demo repo's `skill-optimizer.json` is the unified config entry point for both benchmarking and optimization.
diff --git a/mock-repos/cli-taskfile-demo/README.md b/mock-repos/cli-taskfile-demo/README.md
deleted file mode 100644
index 6d54be6..0000000
--- a/mock-repos/cli-taskfile-demo/README.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# cli-taskfile-demo
-
-A CLI surface demo for skill-optimizer.
-
-This mock repo demonstrates optimizing a CLI tool's `SKILL.md` so that LLMs can use all five commands (`add`, `list`, `done`, `delete`, `update`) with the right flags. The included `SKILL.md` intentionally omits `delete`, `update`, and flag details like `--priority` and `--due` — leaving the optimizer room to improve coverage.
-
-## What's here
-
-| File | Purpose |
-|------|---------|
-| `skill-optimizer.json` | Unified benchmark + optimizer config |
-| `SKILL.md` | Intentionally incomplete docs — the optimizer rewrites this |
-| `src/commands.ts` | CLI command definitions (the surface being benchmarked) |
-
-## Quickstart
-
-Materialize an isolated copy before running the optimizer (required for git checkpointing):
-
-```bash
-tsx src/optimizer/materialize-mock-repo.ts cli-taskfile-demo ./.tmp/mock-repos
-npx tsx src/cli.ts optimize --config ./.tmp/mock-repos/cli-taskfile-demo/skill-optimizer.json
-```
-
-Or run a benchmark-only pass against the tracked template:
-
-```bash
-npx tsx src/cli.ts run --config mock-repos/cli-taskfile-demo/skill-optimizer.json
-```
diff --git a/mock-repos/cli-taskfile-demo/SKILL.md b/mock-repos/cli-taskfile-demo/SKILL.md
deleted file mode 100644
index 7bf84c4..0000000
--- a/mock-repos/cli-taskfile-demo/SKILL.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# taskfile CLI
-
-A simple command-line task manager.
-
-## Usage
-
-```bash
-taskfile add --title "Buy groceries"
-taskfile list
-taskfile done --id abc123
-```
-
-## Commands
-
-### add
-
-Add a new task.
-
-```bash
-taskfile add --title "Task title"
-```
-
-### list
-
-List your current tasks.
-
-```bash
-taskfile list
-```
-
-### done
-
-Mark a task as done using its ID.
-
-```bash
-taskfile done --id <id>
-```
diff --git a/mock-repos/cli-taskfile-demo/skill-optimizer.json b/mock-repos/cli-taskfile-demo/skill-optimizer.json
deleted file mode 100644
index 002bd21..0000000
--- a/mock-repos/cli-taskfile-demo/skill-optimizer.json
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "name": "cli-taskfile-demo",
-  "target": {
-    "surface": "cli",
-    "repoPath": ".",
-    "skill": "./SKILL.md",
-    "discovery": {
-      "mode": "auto",
-      "sources": ["./src/commands.ts"]
-    },
-    "scope": { "include": ["*"], "exclude": [] }
-  },
-  "benchmark": {
-    "format": "pi",
-    "apiKeyEnv": "OPENROUTER_API_KEY",
-    "models": [
-      { "id": "openrouter/openai/gpt-4o-mini", "name": "GPT-4o mini", "tier": "low" },
-      { "id": "openrouter/anthropic/claude-sonnet-4.6", "name": "Claude 4.6", "tier": "mid" }
-    ],
-    "verdict": { "perModelFloor": 0.6, "targetWeightedAverage": 0.7 },
-    "taskGeneration": { "enabled": true, "maxTasks": 10, "seed": 1, "outputDir": "./.skill-optimizer" }
-  },
-  "optimize": {
-    "enabled": true,
-    "model": "openrouter/anthropic/claude-sonnet-4.6",
-    "apiKeyEnv": "OPENROUTER_API_KEY",
-    "allowedPaths": ["SKILL.md"],
-    "validation": [],
-    "maxIterations": 3,
-    "minImprovement": 0.02,
-    "reportContextMaxBytes": 16000
-  }
-}
diff --git a/mock-repos/cli-taskfile-demo/src/commands.ts b/mock-repos/cli-taskfile-demo/src/commands.ts
deleted file mode 100644
index 2f825bf..0000000
--- a/mock-repos/cli-taskfile-demo/src/commands.ts
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * CLI command definitions for the taskfile demo.
- * Exported as a literal array so the skill-optimizer can discover the surface
- * via static analysis — no runtime evaluation needed.
- */
-export const COMMANDS = [
-  {
-    command: "add",
-    description: "Add a new task to the list",
-    options: [
-      { name: "title", takesValue: true, description: "Task title (required)" },
-      { name: "priority", takesValue: true, description: "Priority level: low, medium, or high (default: medium)" },
-      { name: "due", takesValue: true, description: "Due date in YYYY-MM-DD format" },
-    ],
-  },
-  {
-    command: "list",
-    description: "List tasks, optionally filtered",
-    options: [
-      { name: "status", takesValue: true, description: "Filter by status: pending, done, or all (default: pending)" },
-      { name: "priority", takesValue: true, description: "Filter by priority level" },
-    ],
-  },
-  {
-    command: "done",
-    description: "Mark a task as completed",
-    options: [
-      { name: "id", takesValue: true, description: "Task ID to mark as done (required)" },
-    ],
-  },
-  {
-    command: "delete",
-    description: "Permanently delete a task",
-    options: [
-      { name: "id", takesValue: true, description: "Task ID to delete (required)" },
-      { name: "force", takesValue: false, description: "Skip the confirmation prompt" },
-    ],
-  },
-  {
-    command: "update",
-    description: "Update one or more fields of an existing task",
-    options: [
-      { name: "id", takesValue: true, description: "Task ID to update (required)" },
-      { name: "title", takesValue: true, description: "New task title" },
-      { name: "priority", takesValue: true, description: "New priority level" },
-      { name: "due", takesValue: true, description: "New due date in YYYY-MM-DD format" },
-    ],
-  },
-];
diff --git a/mock-repos/mcp-tracker-demo/.gitignore b/mock-repos/mcp-tracker-demo/.gitignore
deleted file mode 100644
index d36689c..0000000
--- a/mock-repos/mcp-tracker-demo/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.skill-benchmark-optimize/
-.skill-optimizer/
diff --git a/mock-repos/mcp-tracker-demo/README.md b/mock-repos/mcp-tracker-demo/README.md
deleted file mode 100644
index 15a26da..0000000
--- a/mock-repos/mcp-tracker-demo/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# mcp-tracker-demo
-
-A minimal MCP server used to demonstrate `skill-optimizer` end-to-end.
-
-## What this shows
-
-- How to configure `skill-optimizer.json` for an MCP surface
-- Task generation, benchmarking, and optimization against a small tool set
-
-## Quickstart
-
-```bash
-# From the skill-optimizer repo root:
-export OPENROUTER_API_KEY=sk-or-...
-
-# Preview the surface without any LLM calls:
-npx skill-optimizer --dry-run --config mock-repos/mcp-tracker-demo/skill-optimizer.json
-
-# Run the benchmark only:
-npx skill-optimizer run --config mock-repos/mcp-tracker-demo/skill-optimizer.json
-
-# Run the full optimization loop:
-npx skill-optimizer optimize --config mock-repos/mcp-tracker-demo/skill-optimizer.json
-```
-
-## Files
-
-- `SKILL.md` — the guidance document being evaluated and improved
-- `tools.json` — MCP tool definitions (used for manifest discovery)
-- `src/server.ts` — the actual server implementation (used for code-first discovery)
-- `skill-optimizer.json` — benchmark + optimizer config
diff --git a/mock-repos/mcp-tracker-demo/SKILL.md b/mock-repos/mcp-tracker-demo/SKILL.md
deleted file mode 100644
index 63edb6f..0000000
--- a/mock-repos/mcp-tracker-demo/SKILL.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Tracker MCP Notes
-
-There are tracker tools for creating and reading tickets.
-
-Use `tkt_new` when a fresh issue is needed.
-Use `get_tkt` when checking an existing issue.
-
-Keep state and comments in sync during longer work.
diff --git a/mock-repos/mcp-tracker-demo/skill-optimizer.json b/mock-repos/mcp-tracker-demo/skill-optimizer.json
deleted file mode 100644
index a11ebab..0000000
--- a/mock-repos/mcp-tracker-demo/skill-optimizer.json
+++ /dev/null
@@ -1,69 +0,0 @@
-{
-  "name": "mcp-tracker-demo",
-  "target": {
-    "surface": "mcp",
-    "repoPath": ".",
-    "skill": "./SKILL.md",
-    "scope": {
-      "include": ["*"],
-      "exclude": []
-    },
-    "discovery": {
-      "mode": "auto",
-      "sources": [
-        "./src/server.ts"
-      ]
-    }
-  },
-  "benchmark": {
-    "format": "pi",
-    "apiKeyEnv": "OPENROUTER_API_KEY",
-    "models": [
-      {
-        "id": "openrouter/openai/gpt-5.4",
-        "name": "GPT-5.4",
-        "tier": "flagship"
-      },
-      {
-        "id": "openrouter/anthropic/claude-sonnet-4.6",
-        "name": "Claude Sonnet 4.6",
-        "tier": "mid"
-      },
-      {
-        "id": "openrouter/google/gemini-3-flash-preview",
-        "name": "Gemini 3 Flash Preview",
-        "tier": "low"
-      }
-    ],
-    "taskGeneration": {
-      "enabled": true,
-      "maxTasks": 8,
-      "seed": 1,
-      "outputDir": "./.skill-optimizer"
-    },
-    "output": {
-      "dir": "./benchmark-results"
-    },
-    "verdict": {
-      "perModelFloor": 0.6,
-      "targetWeightedAverage": 0.7
-    }
-  },
-  "optimize": {
-    "enabled": true,
-    "mode": "surface-changing",
-    "model": "openrouter/openai/gpt-5.4",
-    "apiKeyEnv": "OPENROUTER_API_KEY",
-    "thinkingLevel": "medium",
-    "allowedPaths": [
-      "src",
-      "SKILL.md",
-      "README.md"
-    ],
-    "validation": [],
-    "maxIterations": 5,
-    "stabilityWindow": 2,
-    "minImprovement": 0.02,
-    "reportContextMaxBytes": 16000
-  }
-}
diff --git a/mock-repos/mcp-tracker-demo/src/server.ts b/mock-repos/mcp-tracker-demo/src/server.ts
deleted file mode 100644
index 0759bc9..0000000
--- a/mock-repos/mcp-tracker-demo/src/server.ts
+++ /dev/null
@@ -1,70 +0,0 @@
-export const TRACKER_TOOLS = [
-  {
-    type: 'function',
-    function: {
-      name: 'tkt_new',
-      description: 'make ticket row',
-      parameters: {
-        type: 'object',
-        properties: {
-          t: { type: 'string', description: 'title' },
-          d: { type: 'string', description: 'desc text' },
-          p: { type: 'string', description: 'priority code' },
-          usr: { type: 'string', description: 'owner handle' },
-        },
-        required: ['t', 'd', 'p'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'get_tkt',
-      description: 'pull one ticket by id',
-      parameters: {
-        type: 'object',
-        properties: {
-          id: { type: 'string', description: 'ticket key' },
-        },
-        required: ['id'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'update_tkt_state',
-      description: 'change state to another',
-      parameters: {
-        type: 'object',
-        properties: {
-          id: { type: 'string' },
-          to: { type: 'string', description: 'new state label' },
-        },
-        required: ['id', 'to'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'add_cmnt',
-      description: 'append comment on ticket',
-      parameters: {
-        type: 'object',
-        properties: {
-          tkt: { type: 'string', description: 'ticket key' },
-          body: { type: 'string', description: 'comment body' },
-          author: { type: 'string', description: 'user handle' },
-        },
-        required: ['tkt', 'body'],
-        additionalProperties: false,
-      },
-    },
-  },
-] as const;
-
-export default TRACKER_TOOLS;
diff --git a/mock-repos/mcp-tracker-demo/tools.json b/mock-repos/mcp-tracker-demo/tools.json
deleted file mode 100644
index 87011a2..0000000
--- a/mock-repos/mcp-tracker-demo/tools.json
+++ /dev/null
@@ -1,109 +0,0 @@
-[
-  {
-    "type": "function",
-    "function": {
-      "name": "tkt_new",
-      "description": "make ticket row",
-      "parameters": {
-        "type": "object",
-        "properties": {
-          "t": {
-            "type": "string",
-            "description": "title"
-          },
-          "d": {
-            "type": "string",
-            "description": "desc text"
-          },
-          "p": {
-            "type": "string",
-            "description": "priority code"
-          },
-          "usr": {
-            "type": "string",
-            "description": "owner handle"
-          }
-        },
-        "required": [
-          "t",
-          "d",
-          "p"
-        ],
-        "additionalProperties": false
-      }
-    }
-  },
-  {
-    "type": "function",
-    "function": {
-      "name": "get_tkt",
-      "description": "pull one ticket by id",
-      "parameters": {
-        "type": "object",
-        "properties": {
-          "id": {
-            "type": "string",
-            "description": "ticket key"
-          }
-        },
-        "required": [
-          "id"
-        ],
-        "additionalProperties": false
-      }
-    }
-  },
-  {
-    "type": "function",
-    "function": {
-      "name": "update_tkt_state",
-      "description": "change state to another",
-      "parameters": {
-        "type": "object",
-        "properties": {
-          "id": {
-            "type": "string"
-          },
-          "to": {
-            "type": "string",
-            "description": "new state label"
-          }
-        },
-        "required": [
-          "id",
-          "to"
-        ],
-        "additionalProperties": false
-      }
-    }
-  },
-  {
-    "type": "function",
-    "function": {
-      "name": "add_cmnt",
-      "description": "append comment on ticket",
-      "parameters": {
-        "type": "object",
-        "properties": {
-          "tkt": {
-            "type": "string",
-            "description": "ticket key"
-          },
-          "body": {
-            "type": "string",
-            "description": "comment body"
-          },
-          "author": {
-            "type": "string",
-            "description": "user handle"
-          }
-        },
-        "required": [
-          "tkt",
-          "body"
-        ],
-        "additionalProperties": false
-      }
-    }
-  }
-]
diff --git a/mock-repos/sdk-counter-demo/README.md b/mock-repos/sdk-counter-demo/README.md
deleted file mode 100644
index f1da732..0000000
--- a/mock-repos/sdk-counter-demo/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# sdk-counter-demo
-
-A minimal TypeScript SDK used to demonstrate `skill-optimizer` end-to-end.
-
-The bundled `SKILL.md` intentionally omits the `amount` parameter, the `reset` method, and the `start` option — so the first benchmark run fails, then the optimizer proposes improvements.
-
-## Quickstart
-
-```bash
-# From the skill-optimizer repo root:
-export OPENROUTER_API_KEY=sk-or-...
-
-# Preview the surface without any LLM calls:
-npx skill-optimizer --dry-run --config mock-repos/sdk-counter-demo/skill-optimizer.json
-
-# Run the benchmark only:
-npx skill-optimizer run --config mock-repos/sdk-counter-demo/skill-optimizer.json
-
-# Run the full optimization loop:
-npx skill-optimizer optimize --config mock-repos/sdk-counter-demo/skill-optimizer.json
-```
-
-## Files
-
-- `SKILL.md` — the guidance document being evaluated and improved
-- `src/counter.ts` — the SDK source (used for code-first discovery)
-- `skill-optimizer.json` — benchmark + optimizer config
diff --git a/mock-repos/sdk-counter-demo/SKILL.md b/mock-repos/sdk-counter-demo/SKILL.md
deleted file mode 100644
index 519578c..0000000
--- a/mock-repos/sdk-counter-demo/SKILL.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Counter SDK
-
-A small counter utility.
-
-## Usage
-
-Import from `./counter.ts` and build a counter.
-
-```ts
-import { createCounter } from './counter';
-const c = createCounter();
-c.increment();
-```
-
-That's it. Use `.value()` for the current value.
diff --git a/mock-repos/sdk-counter-demo/skill-optimizer.json b/mock-repos/sdk-counter-demo/skill-optimizer.json
deleted file mode 100644
index 17bd952..0000000
--- a/mock-repos/sdk-counter-demo/skill-optimizer.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "name": "sdk-counter-demo",
-  "target": {
-    "surface": "sdk",
-    "repoPath": ".",
-    "skill": "./SKILL.md",
-    "discovery": {
-      "mode": "auto",
-      "sources": ["./src/counter.ts"],
-      "language": "typescript"
-    },
-    "sdk": { "language": "typescript" },
-    "scope": { "include": ["*"], "exclude": [] }
-  },
-  "benchmark": {
-    "format": "pi",
-    "apiKeyEnv": "OPENROUTER_API_KEY",
-    "models": [
-      { "id": "openrouter/openai/gpt-4o-mini", "name": "GPT-4o mini", "tier": "low" },
-      { "id": "openrouter/anthropic/claude-sonnet-4.6", "name": "Claude 4.6", "tier": "mid" }
-    ],
-    "verdict": { "perModelFloor": 0.6, "targetWeightedAverage": 0.7 },
-    "taskGeneration": { "enabled": true, "maxTasks": 8, "seed": 1, "outputDir": "./.skill-optimizer" }
-  },
-  "optimize": {
-    "enabled": true,
-    "model": "openrouter/anthropic/claude-sonnet-4.6",
-    "apiKeyEnv": "OPENROUTER_API_KEY",
-    "allowedPaths": ["SKILL.md"],
-    "validation": [],
-    "maxIterations": 3,
-    "minImprovement": 0.02,
-    "reportContextMaxBytes": 16000
-  }
-}
diff --git a/mock-repos/sdk-counter-demo/src/counter.ts b/mock-repos/sdk-counter-demo/src/counter.ts
deleted file mode 100644
index 3147073..0000000
--- a/mock-repos/sdk-counter-demo/src/counter.ts
+++ /dev/null
@@ -1,25 +0,0 @@
-// src/counter.ts
-
-/** Creates a new counter, optionally starting at a given value. */
-export function createCounter(options?: { start?: number }): Counter {
-  return new Counter(options?.start ?? 0);
-}
-
-export class Counter {
-  #value: number;
-  constructor(start: number) { this.#value = start; }
-
-  /** Advances the counter and returns the new value. */
-  increment(amount?: number): number {
-    this.#value += amount ?? 1;
-    return this.#value;
-  }
-
-  /** Resets the counter to 0 (or the given value). */
-  reset(to?: number): number {
-    this.#value = to ?? 0;
-    return this.#value;
-  }
-
-  value(): number { return this.#value; }
-}
diff --git a/package-lock.json b/package-lock.json
index 373185c..d985d28 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,23 +1,21 @@
 {
   "name": "skill-optimizer",
-  "version": "1.1.0",
+  "version": "2.0.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "skill-optimizer",
-      "version": "1.1.0",
+      "version": "2.0.0",
       "license": "MIT",
+      "main": ".opencode/plugins/skill-optimizer.js",
       "dependencies": {
-        "@clack/prompts": "^1.2.0",
         "@mariozechner/pi-agent-core": "^0.66.1",
         "@mariozechner/pi-ai": "^0.66.1",
         "@mariozechner/pi-coding-agent": "^0.66.1",
         "dotenv": "^17.4.1",
-        "tree-sitter-wasms": "^0.1.13",
-        "web-tree-sitter": "^0.24.7",
-        "zod": "^4.3.6",
-        "zod-to-json-schema": "^3.25.2"
+        "mcporter": "^0.9.0",
+        "yaml": "^2.8.2"
       },
       "bin": {
         "skill-optimizer": "dist/cli.js"
@@ -765,26 +763,35 @@
         "url": "https://github.com/sponsors/Borewit"
       }
     },
-    "node_modules/@clack/core": {
-      "version": "1.2.0",
-      "resolved": "https://registry.npmjs.org/@clack/core/-/core-1.2.0.tgz",
-      "integrity": "sha512-qfxof/3T3t9DPU/Rj3OmcFyZInceqj/NVtO9rwIuJqCUgh32gwPjpFQQp/ben07qKlhpwq7GzfWpST4qdJ5Drg==",
+    "node_modules/@emnapi/core": {
+      "version": "1.9.2",
+      "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.9.2.tgz",
+      "integrity": "sha512-UC+ZhH3XtczQYfOlu3lNEkdW/p4dsJ1r/bP7H8+rhao3TTTMO1ATq/4DdIi23XuGoFY+Cz0JmCbdVl0hz9jZcA==",
       "license": "MIT",
+      "optional": true,
       "dependencies": {
-        "fast-wrap-ansi": "^0.1.3",
-        "sisteransi": "^1.0.5"
+        "@emnapi/wasi-threads": "1.2.1",
+        "tslib": "^2.4.0"
       }
     },
-    "node_modules/@clack/prompts": {
-      "version": "1.2.0",
-      "resolved": "https://registry.npmjs.org/@clack/prompts/-/prompts-1.2.0.tgz",
-      "integrity": "sha512-4jmztR9fMqPMjz6H/UZXj0zEmE43ha1euENwkckKKel4XpSfokExPo5AiVStdHSAlHekz4d0CA/r45Ok1E4D3w==",
+    "node_modules/@emnapi/runtime": {
+      "version": "1.9.2",
+      "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.9.2.tgz",
+      "integrity": "sha512-3U4+MIWHImeyu1wnmVygh5WlgfYDtyf0k8AbLhMFxOipihf6nrWC4syIm/SwEeec0mNSafiiNnMJwbza/Is6Lw==",
+      "license": "MIT",
+      "optional": true,
+      "dependencies": {
+        "tslib": "^2.4.0"
+      }
+    },
+    "node_modules/@emnapi/wasi-threads": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz",
+      "integrity": "sha512-uTII7OYF+/Mes/MrcIOYp5yOtSMLBWSIoLPpcgwipoiKbli6k322tcoFsxoIIxPDqW01SQGAgko4EzZi2BNv2w==",
       "license": "MIT",
+      "optional": true,
       "dependencies": {
-        "@clack/core": "1.2.0",
-        "fast-string-width": "^1.1.0",
-        "fast-wrap-ansi": "^0.1.3",
-        "sisteransi": "^1.0.5"
+        "tslib": "^2.4.0"
       }
     },
     "node_modules/@esbuild/aix-ppc64": {
@@ -1252,6 +1259,24 @@
         }
       }
     },
+    "node_modules/@hono/node-server": {
+      "version": "1.19.14",
+      "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.14.tgz",
+      "integrity": "sha512-GwtvgtXxnWsucXvbQXkRgqksiH2Qed37H9xHZocE5sA3N8O8O8/8FA3uclQXxXVzc9XBZuEOMK7+r02FmSpHtw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18.14.1"
+      },
+      "peerDependencies": {
+        "hono": "^4"
+      }
+    },
+    "node_modules/@iarna/toml": {
+      "version": "2.2.5",
+      "resolved": "https://registry.npmjs.org/@iarna/toml/-/toml-2.2.5.tgz",
+      "integrity": "sha512-trnsAYxU3xnS1gPHPyU961coFyLkh4gAD/0zQ5mymY4yOZ+CYvsPqUbOFSw0aDM4y0tV7tiFxL/1XfXPNC6IPg==",
+      "license": "ISC"
+    },
     "node_modules/@mariozechner/clipboard": {
       "version": "0.3.2",
       "resolved": "https://registry.npmjs.org/@mariozechner/clipboard/-/clipboard-0.3.2.tgz",
@@ -1549,6 +1574,73 @@
         "zod-to-json-schema": "^3.24.1"
       }
     },
+    "node_modules/@modelcontextprotocol/sdk": {
+      "version": "1.29.0",
+      "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.29.0.tgz",
+      "integrity": "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@hono/node-server": "^1.19.9",
+        "ajv": "^8.17.1",
+        "ajv-formats": "^3.0.1",
+        "content-type": "^1.0.5",
+        "cors": "^2.8.5",
+        "cross-spawn": "^7.0.5",
+        "eventsource": "^3.0.2",
+        "eventsource-parser": "^3.0.0",
+        "express": "^5.2.1",
+        "express-rate-limit": "^8.2.1",
+        "hono": "^4.11.4",
+        "jose": "^6.1.3",
+        "json-schema-typed": "^8.0.2",
+        "pkce-challenge": "^5.0.0",
+        "raw-body": "^3.0.0",
+        "zod": "^3.25 || ^4.0",
+        "zod-to-json-schema": "^3.25.1"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "@cfworker/json-schema": "^4.1.1",
+        "zod": "^3.25 || ^4.0"
+      },
+      "peerDependenciesMeta": {
+        "@cfworker/json-schema": {
+          "optional": true
+        },
+        "zod": {
+          "optional": false
+        }
+      }
+    },
+    "node_modules/@napi-rs/wasm-runtime": {
+      "version": "1.1.4",
+      "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-1.1.4.tgz",
+      "integrity": "sha512-3NQNNgA1YSlJb/kMH1ildASP9HW7/7kYnRI2szWJaofaS1hWmbGI4H+d3+22aGzXXN9IJ+n+GiFVcGipJP18ow==",
+      "license": "MIT",
+      "optional": true,
+      "dependencies": {
+        "@tybys/wasm-util": "^0.10.1"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/Brooooooklyn"
+      },
+      "peerDependencies": {
+        "@emnapi/core": "^1.7.1",
+        "@emnapi/runtime": "^1.7.1"
+      }
+    },
+    "node_modules/@oxc-project/types": {
+      "version": "0.126.0",
+      "resolved": "https://registry.npmjs.org/@oxc-project/types/-/types-0.126.0.tgz",
+      "integrity": "sha512-oGfVtjAgwQVVpfBrbtk4e1XDyWHRFta6BS3GWVzrF8xYBT2VGQAk39yJS/wFSMrZqoiCU4oghT3Ch0HaHGIHcQ==",
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/Boshen"
+      }
+    },
     "node_modules/@protobufjs/aspromise": {
       "version": "1.1.2",
       "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
@@ -1613,6 +1705,254 @@
       "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==",
       "license": "BSD-3-Clause"
     },
+    "node_modules/@rolldown/binding-android-arm64": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-android-arm64/-/binding-android-arm64-1.0.0-rc.16.tgz",
+      "integrity": "sha512-rhY3k7Bsae9qQfOtph2Pm2jZEA+s8Gmjoz4hhmx70K9iMQ/ddeae+xhRQcM5IuVx5ry1+bGfkvMn7D6MJggVSA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-darwin-arm64": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-darwin-arm64/-/binding-darwin-arm64-1.0.0-rc.16.tgz",
+      "integrity": "sha512-rNz0yK078yrNn3DrdgN+PKiMOW8HfQ92jQiXxwX8yW899ayV00MLVdaCNeVBhG/TbH3ouYVObo8/yrkiectkcQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-darwin-x64": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-darwin-x64/-/binding-darwin-x64-1.0.0-rc.16.tgz",
+      "integrity": "sha512-r/OmdR00HmD4i79Z//xO06uEPOq5hRXdhw7nzkxQxwSavs3PSHa1ijntdpOiZ2mzOQ3fVVu8C1M19FoNM+dMUQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-freebsd-x64": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-freebsd-x64/-/binding-freebsd-x64-1.0.0-rc.16.tgz",
+      "integrity": "sha512-KcRE5w8h0OnjUatG8pldyD14/CQ5Phs1oxfR+3pKDjboHRo9+MkqQaiIZlZRpsxC15paeXme/I127tUa9TXJ6g==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-linux-arm-gnueabihf": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-linux-arm-gnueabihf/-/binding-linux-arm-gnueabihf-1.0.0-rc.16.tgz",
+      "integrity": "sha512-bT0guA1bpxEJ/ZhTRniQf7rNF8ybvXOuWbNIeLABaV5NGjx4EtOWBTSRGWFU9ZWVkPOZ+HNFP8RMcBokBiZ0Kg==",
+      "cpu": [
+        "arm"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-linux-arm64-gnu": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-linux-arm64-gnu/-/binding-linux-arm64-gnu-1.0.0-rc.16.tgz",
+      "integrity": "sha512-+tHktCHWV8BDQSjemUqm/Jl/TPk3QObCTIjmdDy/nlupcujZghmKK2962LYrqFpWu+ai01AN/REOH3NEpqvYQg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-linux-arm64-musl": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-linux-arm64-musl/-/binding-linux-arm64-musl-1.0.0-rc.16.tgz",
+      "integrity": "sha512-3fPzdREH806oRLxpTWW1Gt4tQHs0TitZFOECB2xzCFLPKnSOy90gwA7P29cksYilFO6XVRY1kzga0cL2nRjKPg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-linux-ppc64-gnu": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-linux-ppc64-gnu/-/binding-linux-ppc64-gnu-1.0.0-rc.16.tgz",
+      "integrity": "sha512-EKwI1tSrLs7YVw+JPJT/G2dJQ1jl9qlTTTEG0V2Ok/RdOenRfBw2PQdLPyjhIu58ocdBfP7vIRN/pvMsPxs/AQ==",
+      "cpu": [
+        "ppc64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-linux-s390x-gnu": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-linux-s390x-gnu/-/binding-linux-s390x-gnu-1.0.0-rc.16.tgz",
+      "integrity": "sha512-Uknladnb3Sxqu6SEcqBldQyJUpk8NleooZEc0MbRBJ4inEhRYWZX0NJu12vNf2mqAq7gsofAxHrGghiUYjhaLQ==",
+      "cpu": [
+        "s390x"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-linux-x64-gnu": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-linux-x64-gnu/-/binding-linux-x64-gnu-1.0.0-rc.16.tgz",
+      "integrity": "sha512-FIb8+uG49sZBtLTn+zt1AJ20TqVcqWeSIyoVt0or7uAWesgKaHbiBh6OpA/k9v0LTt+PTrb1Lao133kP4uVxkg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-linux-x64-musl": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-linux-x64-musl/-/binding-linux-x64-musl-1.0.0-rc.16.tgz",
+      "integrity": "sha512-RuERhF9/EgWxZEXYWCOaViUWHIboceK4/ivdtQ3R0T44NjLkIIlGIAVAuCddFxsZ7vnRHtNQUrt2vR2n2slB2w==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-openharmony-arm64": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-openharmony-arm64/-/binding-openharmony-arm64-1.0.0-rc.16.tgz",
+      "integrity": "sha512-mXcXnvd9GpazCxeUCCnZ2+YF7nut+ZOEbE4GtaiPtyY6AkhZWbK70y1KK3j+RDhjVq5+U8FySkKRb/+w0EeUwA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "openharmony"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-wasm32-wasi": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-wasm32-wasi/-/binding-wasm32-wasi-1.0.0-rc.16.tgz",
+      "integrity": "sha512-3Q2KQxnC8IJOLqXmUMoYwyIPZU9hzRbnHaoV3Euz+VVnjZKcY8ktnNP8T9R4/GGQtb27C/UYKABxesKWb8lsvQ==",
+      "cpu": [
+        "wasm32"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "dependencies": {
+        "@emnapi/core": "1.9.2",
+        "@emnapi/runtime": "1.9.2",
+        "@napi-rs/wasm-runtime": "^1.1.4"
+      },
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-win32-arm64-msvc": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-win32-arm64-msvc/-/binding-win32-arm64-msvc-1.0.0-rc.16.tgz",
+      "integrity": "sha512-tj7XRemQcOcFwv7qhpUxMTBbI5mWMlE4c1Omhg5+h8GuLXzyj8HviYgR+bB2DMDgRqUE+jiDleqSCRjx4aYk/Q==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/binding-win32-x64-msvc": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/binding-win32-x64-msvc/-/binding-win32-x64-msvc-1.0.0-rc.16.tgz",
+      "integrity": "sha512-PH5DRZT+F4f2PTXRXR8uJxnBq2po/xFtddyabTJVJs/ZYVHqXPEgNIr35IHTEa6bpa0Q8Awg+ymkTaGnKITw4g==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      }
+    },
+    "node_modules/@rolldown/pluginutils": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-rc.16.tgz",
+      "integrity": "sha512-45+YtqxLYKDWQouLKCrpIZhke+nXxhsw+qAHVzHDVwttyBlHNBVs2K25rDXrZzhpTp9w1FlAlvweV1H++fdZoA==",
+      "license": "MIT"
+    },
     "node_modules/@silvia-odwyer/photon-node": {
       "version": "0.3.4",
       "resolved": "https://registry.npmjs.org/@silvia-odwyer/photon-node/-/photon-node-0.3.4.tgz",
@@ -2292,6 +2632,16 @@
       "integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==",
       "license": "MIT"
     },
+    "node_modules/@tybys/wasm-util": {
+      "version": "0.10.1",
+      "resolved": "https://registry.npmjs.org/@tybys/wasm-util/-/wasm-util-0.10.1.tgz",
+      "integrity": "sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==",
+      "license": "MIT",
+      "optional": true,
+      "dependencies": {
+        "tslib": "^2.4.0"
+      }
+    },
     "node_modules/@types/mime-types": {
       "version": "2.1.4",
       "resolved": "https://registry.npmjs.org/@types/mime-types/-/mime-types-2.1.4.tgz",
@@ -2323,6 +2673,31 @@
         "@types/node": "*"
       }
     },
+    "node_modules/accepts": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/accepts/-/accepts-2.0.0.tgz",
+      "integrity": "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==",
+      "license": "MIT",
+      "dependencies": {
+        "mime-types": "^3.0.0",
+        "negotiator": "^1.0.0"
+      },
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/acorn": {
+      "version": "8.16.0",
+      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz",
+      "integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==",
+      "license": "MIT",
+      "bin": {
+        "acorn": "bin/acorn"
+      },
+      "engines": {
+        "node": ">=0.4.0"
+      }
+    },
     "node_modules/agent-base": {
       "version": "7.1.4",
       "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
@@ -2457,6 +2832,30 @@
         "node": "*"
       }
     },
+    "node_modules/body-parser": {
+      "version": "2.2.2",
+      "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz",
+      "integrity": "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==",
+      "license": "MIT",
+      "dependencies": {
+        "bytes": "^3.1.2",
+        "content-type": "^1.0.5",
+        "debug": "^4.4.3",
+        "http-errors": "^2.0.0",
+        "iconv-lite": "^0.7.0",
+        "on-finished": "^2.4.1",
+        "qs": "^6.14.1",
+        "raw-body": "^3.0.1",
+        "type-is": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
     "node_modules/bowser": {
       "version": "2.14.1",
       "resolved": "https://registry.npmjs.org/bowser/-/bowser-2.14.1.tgz",
@@ -2490,18 +2889,71 @@
       "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==",
       "license": "BSD-3-Clause"
     },
-    "node_modules/chalk": {
-      "version": "5.6.2",
-      "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.6.2.tgz",
-      "integrity": "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==",
+    "node_modules/bytes": {
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
+      "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==",
       "license": "MIT",
       "engines": {
-        "node": "^12.17.0 || ^14.13 || >=16.0.0"
-      },
+        "node": ">= 0.8"
+      }
+    },
+    "node_modules/call-bind-apply-helpers": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
+      "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0",
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/call-bound": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz",
+      "integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==",
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.2",
+        "get-intrinsic": "^1.3.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/chalk": {
+      "version": "5.6.2",
+      "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.6.2.tgz",
+      "integrity": "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==",
+      "license": "MIT",
+      "engines": {
+        "node": "^12.17.0 || ^14.13 || >=16.0.0"
+      },
       "funding": {
         "url": "https://github.com/chalk/chalk?sponsor=1"
       }
     },
+    "node_modules/cli-cursor": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-5.0.0.tgz",
+      "integrity": "sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw==",
+      "license": "MIT",
+      "dependencies": {
+        "restore-cursor": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/cli-highlight": {
       "version": "2.1.11",
       "resolved": "https://registry.npmjs.org/cli-highlight/-/cli-highlight-2.1.11.tgz",
@@ -2539,6 +2991,18 @@
         "url": "https://github.com/chalk/chalk?sponsor=1"
       }
     },
+    "node_modules/cli-spinners": {
+      "version": "3.4.0",
+      "resolved": "https://registry.npmjs.org/cli-spinners/-/cli-spinners-3.4.0.tgz",
+      "integrity": "sha512-bXfOC4QcT1tKXGorxL3wbJm6XJPDqEnij2gQ2m7ESQuE+/z9YFIWnl/5RpTiKWbMq3EVKR4fRLJGn6DVfu0mpw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18.20"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/cliui": {
       "version": "7.0.4",
       "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz",
@@ -2589,6 +3053,86 @@
       "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
       "license": "MIT"
     },
+    "node_modules/commander": {
+      "version": "14.0.3",
+      "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.3.tgz",
+      "integrity": "sha512-H+y0Jo/T1RZ9qPP4Eh1pkcQcLRglraJaSLoyOtHxu6AapkjWVCy2Sit1QQ4x3Dng8qDlSsZEet7g5Pq06MvTgw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=20"
+      }
+    },
+    "node_modules/content-disposition": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.1.0.tgz",
+      "integrity": "sha512-5jRCH9Z/+DRP7rkvY83B+yGIGX96OYdJmzngqnw2SBSxqCFPd0w2km3s5iawpGX8krnwSGmF0FW5Nhr0Hfai3g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
+    "node_modules/content-type": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz",
+      "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/cookie": {
+      "version": "0.7.2",
+      "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz",
+      "integrity": "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/cookie-signature": {
+      "version": "1.2.2",
+      "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.2.2.tgz",
+      "integrity": "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.6.0"
+      }
+    },
+    "node_modules/cors": {
+      "version": "2.8.6",
+      "resolved": "https://registry.npmjs.org/cors/-/cors-2.8.6.tgz",
+      "integrity": "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw==",
+      "license": "MIT",
+      "dependencies": {
+        "object-assign": "^4",
+        "vary": "^1"
+      },
+      "engines": {
+        "node": ">= 0.10"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
+    "node_modules/cross-spawn": {
+      "version": "7.0.6",
+      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+      "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
+      "license": "MIT",
+      "dependencies": {
+        "path-key": "^3.1.0",
+        "shebang-command": "^2.0.0",
+        "which": "^2.0.1"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
     "node_modules/data-uri-to-buffer": {
       "version": "4.0.1",
       "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz",
@@ -2629,6 +3173,15 @@
         "node": ">= 14"
       }
     },
+    "node_modules/depd": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz",
+      "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
     "node_modules/diff": {
       "version": "8.0.4",
       "resolved": "https://registry.npmjs.org/diff/-/diff-8.0.4.tgz",
@@ -2650,6 +3203,20 @@
         "url": "https://dotenvx.com"
       }
     },
+    "node_modules/dunder-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
+      "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.1",
+        "es-errors": "^1.3.0",
+        "gopd": "^1.2.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
     "node_modules/ecdsa-sig-formatter": {
       "version": "1.0.11",
       "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz",
@@ -2659,12 +3226,27 @@
         "safe-buffer": "^5.0.1"
       }
     },
+    "node_modules/ee-first": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
+      "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==",
+      "license": "MIT"
+    },
     "node_modules/emoji-regex": {
       "version": "8.0.0",
       "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
       "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
       "license": "MIT"
     },
+    "node_modules/encodeurl": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
+      "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
     "node_modules/end-of-stream": {
       "version": "1.4.5",
       "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
@@ -2674,6 +3256,46 @@
         "once": "^1.4.0"
       }
     },
+    "node_modules/es-define-property": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
+      "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-errors": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
+      "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-object-atoms": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
+      "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-toolkit": {
+      "version": "1.46.1",
+      "resolved": "https://registry.npmjs.org/es-toolkit/-/es-toolkit-1.46.1.tgz",
+      "integrity": "sha512-5eNtXOs3tbfxXOj04tjjseeWkRWaoCjdEI+96DgwzZoe6c9juL49pXlzAFTI72aWC9Y8p7168g6XIKjh7k6pyQ==",
+      "license": "MIT",
+      "workspaces": [
+        "docs",
+        "benchmarks"
+      ]
+    },
     "node_modules/esbuild": {
       "version": "0.27.7",
       "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.7.tgz",
@@ -2725,6 +3347,12 @@
         "node": ">=6"
       }
     },
+    "node_modules/escape-html": {
+      "version": "1.0.3",
+      "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
+      "integrity": "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==",
+      "license": "MIT"
+    },
     "node_modules/escodegen": {
       "version": "2.1.0",
       "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz",
@@ -2777,6 +3405,97 @@
         "node": ">=0.10.0"
       }
     },
+    "node_modules/etag": {
+      "version": "1.8.1",
+      "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
+      "integrity": "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/eventsource": {
+      "version": "3.0.7",
+      "resolved": "https://registry.npmjs.org/eventsource/-/eventsource-3.0.7.tgz",
+      "integrity": "sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==",
+      "license": "MIT",
+      "dependencies": {
+        "eventsource-parser": "^3.0.1"
+      },
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
+    "node_modules/eventsource-parser": {
+      "version": "3.0.8",
+      "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.8.tgz",
+      "integrity": "sha512-70QWGkr4snxr0OXLRWsFLeRBIRPuQOvt4s8QYjmUlmlkyTZkRqS7EDVRZtzU3TiyDbXSzaOeF0XUKy8PchzukQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
+    "node_modules/express": {
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz",
+      "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==",
+      "license": "MIT",
+      "dependencies": {
+        "accepts": "^2.0.0",
+        "body-parser": "^2.2.1",
+        "content-disposition": "^1.0.0",
+        "content-type": "^1.0.5",
+        "cookie": "^0.7.1",
+        "cookie-signature": "^1.2.1",
+        "debug": "^4.4.0",
+        "depd": "^2.0.0",
+        "encodeurl": "^2.0.0",
+        "escape-html": "^1.0.3",
+        "etag": "^1.8.1",
+        "finalhandler": "^2.1.0",
+        "fresh": "^2.0.0",
+        "http-errors": "^2.0.0",
+        "merge-descriptors": "^2.0.0",
+        "mime-types": "^3.0.0",
+        "on-finished": "^2.4.1",
+        "once": "^1.4.0",
+        "parseurl": "^1.3.3",
+        "proxy-addr": "^2.0.7",
+        "qs": "^6.14.0",
+        "range-parser": "^1.2.1",
+        "router": "^2.2.0",
+        "send": "^1.1.0",
+        "serve-static": "^2.2.0",
+        "statuses": "^2.0.1",
+        "type-is": "^2.0.1",
+        "vary": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 18"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
+    "node_modules/express-rate-limit": {
+      "version": "8.4.1",
+      "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.4.1.tgz",
+      "integrity": "sha512-NGVYwQSAyEQgzxX1iCM978PP9AdO/hW93gMcF6ZwQCm+rFvLsBH6w4xcXWTcliS8La5EPRN3p9wzItqBwJrfNw==",
+      "license": "MIT",
+      "dependencies": {
+        "ip-address": "10.1.0"
+      },
+      "engines": {
+        "node": ">= 16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/express-rate-limit"
+      },
+      "peerDependencies": {
+        "express": ">= 4.11"
+      }
+    },
     "node_modules/extend": {
       "version": "3.0.2",
       "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
@@ -2809,21 +3528,6 @@
       "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
       "license": "MIT"
     },
-    "node_modules/fast-string-truncated-width": {
-      "version": "1.2.1",
-      "resolved": "https://registry.npmjs.org/fast-string-truncated-width/-/fast-string-truncated-width-1.2.1.tgz",
-      "integrity": "sha512-Q9acT/+Uu3GwGj+5w/zsGuQjh9O1TyywhIwAxHudtWrgF09nHOPrvTLhQevPbttcxjr/SNN7mJmfOw/B1bXgow==",
-      "license": "MIT"
-    },
-    "node_modules/fast-string-width": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/fast-string-width/-/fast-string-width-1.1.0.tgz",
-      "integrity": "sha512-O3fwIVIH5gKB38QNbdg+3760ZmGz0SZMgvwJbA1b2TGXceKE6A2cOlfogh1iw8lr049zPyd7YADHy+B7U4W9bQ==",
-      "license": "MIT",
-      "dependencies": {
-        "fast-string-truncated-width": "^1.2.0"
-      }
-    },
     "node_modules/fast-uri": {
       "version": "3.1.0",
       "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
@@ -2840,15 +3544,6 @@
       ],
       "license": "BSD-3-Clause"
     },
-    "node_modules/fast-wrap-ansi": {
-      "version": "0.1.6",
-      "resolved": "https://registry.npmjs.org/fast-wrap-ansi/-/fast-wrap-ansi-0.1.6.tgz",
-      "integrity": "sha512-HlUwET7a5gqjURj70D5jl7aC3Zmy4weA1SHUfM0JFI0Ptq987NH2TwbBFLoERhfwk+E+eaq4EK3jXoT+R3yp3w==",
-      "license": "MIT",
-      "dependencies": {
-        "fast-string-width": "^1.1.0"
-      }
-    },
     "node_modules/fast-xml-builder": {
       "version": "1.1.5",
       "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.5.tgz",
@@ -2934,6 +3629,27 @@
         "url": "https://github.com/sindresorhus/file-type?sponsor=1"
       }
     },
+    "node_modules/finalhandler": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-2.1.1.tgz",
+      "integrity": "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==",
+      "license": "MIT",
+      "dependencies": {
+        "debug": "^4.4.0",
+        "encodeurl": "^2.0.0",
+        "escape-html": "^1.0.3",
+        "on-finished": "^2.4.1",
+        "parseurl": "^1.3.3",
+        "statuses": "^2.0.1"
+      },
+      "engines": {
+        "node": ">= 18.0.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
     "node_modules/formdata-polyfill": {
       "version": "4.0.10",
       "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz",
@@ -2946,6 +3662,24 @@
         "node": ">=12.20.0"
       }
     },
+    "node_modules/forwarded": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz",
+      "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/fresh": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/fresh/-/fresh-2.0.0.tgz",
+      "integrity": "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
     "node_modules/fsevents": {
       "version": "2.3.3",
       "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
@@ -2961,6 +3695,15 @@
         "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
       }
     },
+    "node_modules/function-bind": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
+      "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
     "node_modules/gaxios": {
       "version": "7.1.4",
       "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-7.1.4.tgz",
@@ -3010,6 +3753,43 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/get-intrinsic": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
+      "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.2",
+        "es-define-property": "^1.0.1",
+        "es-errors": "^1.3.0",
+        "es-object-atoms": "^1.1.1",
+        "function-bind": "^1.1.2",
+        "get-proto": "^1.0.1",
+        "gopd": "^1.2.0",
+        "has-symbols": "^1.1.0",
+        "hasown": "^2.0.2",
+        "math-intrinsics": "^1.1.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/get-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
+      "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
+      "license": "MIT",
+      "dependencies": {
+        "dunder-proto": "^1.0.1",
+        "es-object-atoms": "^1.0.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
     "node_modules/get-stream": {
       "version": "5.2.0",
       "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
@@ -3104,6 +3884,18 @@
         "node": ">=14"
       }
     },
+    "node_modules/gopd": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
+      "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
     "node_modules/graceful-fs": {
       "version": "4.2.11",
       "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
@@ -3119,6 +3911,30 @@
         "node": ">=8"
       }
     },
+    "node_modules/has-symbols": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
+      "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/hasown": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.3.tgz",
+      "integrity": "sha512-ej4AhfhfL2Q2zpMmLo7U1Uv9+PyhIZpgQLGT1F9miIGmiCJIoCgSmczFdrc97mWT4kVY72KA+WnnhJ5pghSvSg==",
+      "license": "MIT",
+      "dependencies": {
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
     "node_modules/highlight.js": {
       "version": "10.7.3",
       "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.7.3.tgz",
@@ -3128,6 +3944,15 @@
         "node": "*"
       }
     },
+    "node_modules/hono": {
+      "version": "4.12.16",
+      "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.16.tgz",
+      "integrity": "sha512-jN0ZewiNAWSe5khM3EyCmBb250+b40wWbwNILNfEvq84VREWwOIkuUsFONk/3i3nqkz7Oe1PcpM2mwQEK2L9Kg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=16.9.0"
+      }
+    },
     "node_modules/hosted-git-info": {
       "version": "9.0.2",
       "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-9.0.2.tgz",
@@ -3140,19 +3965,39 @@
         "node": "^20.17.0 || >=22.9.0"
       }
     },
-    "node_modules/http-proxy-agent": {
-      "version": "7.0.2",
-      "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
-      "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
+    "node_modules/http-errors": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz",
+      "integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==",
       "license": "MIT",
       "dependencies": {
-        "agent-base": "^7.1.0",
-        "debug": "^4.3.4"
+        "depd": "~2.0.0",
+        "inherits": "~2.0.4",
+        "setprototypeof": "~1.2.0",
+        "statuses": "~2.0.2",
+        "toidentifier": "~1.0.1"
       },
       "engines": {
-        "node": ">= 14"
-      }
-    },
+        "node": ">= 0.8"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
+    "node_modules/http-proxy-agent": {
+      "version": "7.0.2",
+      "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
+      "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
+      "license": "MIT",
+      "dependencies": {
+        "agent-base": "^7.1.0",
+        "debug": "^4.3.4"
+      },
+      "engines": {
+        "node": ">= 14"
+      }
+    },
     "node_modules/https-proxy-agent": {
       "version": "7.0.6",
       "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
@@ -3166,6 +4011,22 @@
         "node": ">= 14"
       }
     },
+    "node_modules/iconv-lite": {
+      "version": "0.7.2",
+      "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz",
+      "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==",
+      "license": "MIT",
+      "dependencies": {
+        "safer-buffer": ">= 2.1.2 < 3.0.0"
+      },
+      "engines": {
+        "node": ">=0.10.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
     "node_modules/ieee754": {
       "version": "1.2.1",
       "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
@@ -3195,6 +4056,12 @@
         "node": ">= 4"
       }
     },
+    "node_modules/inherits": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
+      "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
+      "license": "ISC"
+    },
     "node_modules/ip-address": {
       "version": "10.1.0",
       "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
@@ -3204,6 +4071,15 @@
         "node": ">= 12"
       }
     },
+    "node_modules/ipaddr.js": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
+      "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.10"
+      }
+    },
     "node_modules/is-fullwidth-code-point": {
       "version": "3.0.0",
       "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
@@ -3213,6 +4089,51 @@
         "node": ">=8"
       }
     },
+    "node_modules/is-interactive": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/is-interactive/-/is-interactive-2.0.0.tgz",
+      "integrity": "sha512-qP1vozQRI+BMOPcjFzrjXuQvdak2pHNUMZoeG2eRbiSqyvbEf/wQtEOTOX1guk6E3t36RkaqiSt8A/6YElNxLQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/is-promise": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz",
+      "integrity": "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==",
+      "license": "MIT"
+    },
+    "node_modules/is-unicode-supported": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/is-unicode-supported/-/is-unicode-supported-2.1.0.tgz",
+      "integrity": "sha512-mE00Gnza5EEB3Ds0HfMyllZzbBrmLOX3vfWoj9A9PEnTfratQ/BcaJOuMhnkhjXvb2+FkY3VuHqtAGpTPmglFQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/isexe": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
+      "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
+      "license": "ISC"
+    },
+    "node_modules/jose": {
+      "version": "6.2.3",
+      "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.3.tgz",
+      "integrity": "sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==",
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/panva"
+      }
+    },
     "node_modules/json-bigint": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz",
@@ -3241,6 +4162,18 @@
       "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
       "license": "MIT"
     },
+    "node_modules/json-schema-typed": {
+      "version": "8.0.2",
+      "resolved": "https://registry.npmjs.org/json-schema-typed/-/json-schema-typed-8.0.2.tgz",
+      "integrity": "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA==",
+      "license": "BSD-2-Clause"
+    },
+    "node_modules/jsonc-parser": {
+      "version": "3.3.1",
+      "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.3.1.tgz",
+      "integrity": "sha512-HUgH65KyejrUFPvHFPbqOY0rsFip3Bo5wb4ngvdi1EpCYWUQDC5V+Y7mZws+DLkr4M//zQJoanu1SP+87Dv1oQ==",
+      "license": "MIT"
+    },
     "node_modules/jwa": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz",
@@ -3273,6 +4206,22 @@
         "url": "https://liberapay.com/Koromix"
       }
     },
+    "node_modules/log-symbols": {
+      "version": "7.0.1",
+      "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-7.0.1.tgz",
+      "integrity": "sha512-ja1E3yCr9i/0hmBVaM0bfwDjnGy8I/s6PP4DFp+yP+a+mrHO4Rm7DtmnqROTUkHIkqffC84YY7AeqX6oFk0WFg==",
+      "license": "MIT",
+      "dependencies": {
+        "is-unicode-supported": "^2.0.0",
+        "yoctocolors": "^2.1.1"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/long": {
       "version": "5.3.2",
       "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
@@ -3300,6 +4249,59 @@
         "node": ">= 18"
       }
     },
+    "node_modules/math-intrinsics": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
+      "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/mcporter": {
+      "version": "0.9.0",
+      "resolved": "https://registry.npmjs.org/mcporter/-/mcporter-0.9.0.tgz",
+      "integrity": "sha512-zbvhQpBUL0DME8H0cYlNDQDLjvdhk1Lpy0QLGxJ6T2eCPyfs71GnP3uFP4u60vO5p/jdDsWpkZsmAdlA7OZ61w==",
+      "license": "MIT",
+      "dependencies": {
+        "@iarna/toml": "^2.2.5",
+        "@modelcontextprotocol/sdk": "^1.29.0",
+        "acorn": "^8.16.0",
+        "commander": "^14.0.3",
+        "es-toolkit": "^1.45.1",
+        "jsonc-parser": "^3.3.1",
+        "ora": "^9.3.0",
+        "rolldown": "1.0.0-rc.16",
+        "zod": "^4.3.6"
+      },
+      "bin": {
+        "mcporter": "dist/cli.js"
+      },
+      "engines": {
+        "node": ">=20.11.0"
+      }
+    },
+    "node_modules/media-typer": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-1.1.0.tgz",
+      "integrity": "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
+    "node_modules/merge-descriptors": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-2.0.0.tgz",
+      "integrity": "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/mime-db": {
       "version": "1.54.0",
       "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz",
@@ -3325,6 +4327,18 @@
         "url": "https://opencollective.com/express"
       }
     },
+    "node_modules/mimic-function": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/mimic-function/-/mimic-function-5.0.1.tgz",
+      "integrity": "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/minimatch": {
       "version": "10.2.5",
       "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.5.tgz",
@@ -3366,6 +4380,15 @@
         "thenify-all": "^1.0.0"
       }
     },
+    "node_modules/negotiator": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-1.0.0.tgz",
+      "integrity": "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
     "node_modules/netmask": {
       "version": "2.1.1",
       "resolved": "https://registry.npmjs.org/netmask/-/netmask-2.1.1.tgz",
@@ -3422,6 +4445,30 @@
         "node": ">=0.10.0"
       }
     },
+    "node_modules/object-inspect": {
+      "version": "1.13.4",
+      "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz",
+      "integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/on-finished": {
+      "version": "2.4.1",
+      "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz",
+      "integrity": "sha512-oVlzkg3ENAhCk2zdv7IJwd/QUD4z2RxRwpkcGY8psCVcCYZNq4wYnVWALHM+brtuJjePWiYF/ClmuDr8Ch5+kg==",
+      "license": "MIT",
+      "dependencies": {
+        "ee-first": "1.1.1"
+      },
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
     "node_modules/once": {
       "version": "1.4.0",
       "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
@@ -3431,6 +4478,21 @@
         "wrappy": "1"
       }
     },
+    "node_modules/onetime": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/onetime/-/onetime-7.0.0.tgz",
+      "integrity": "sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==",
+      "license": "MIT",
+      "dependencies": {
+        "mimic-function": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/openai": {
       "version": "6.26.0",
       "resolved": "https://registry.npmjs.org/openai/-/openai-6.26.0.tgz",
@@ -3452,6 +4514,44 @@
         }
       }
     },
+    "node_modules/ora": {
+      "version": "9.4.0",
+      "resolved": "https://registry.npmjs.org/ora/-/ora-9.4.0.tgz",
+      "integrity": "sha512-84cglkRILFxdtA8hAvLNdMrtBpPNBTrQ9/ulg0FA7xLMnD6mifv+enAIeRmvtv+WgdCE+LPGOfQmtJRrVaIVhQ==",
+      "license": "MIT",
+      "dependencies": {
+        "chalk": "^5.6.2",
+        "cli-cursor": "^5.0.0",
+        "cli-spinners": "^3.2.0",
+        "is-interactive": "^2.0.0",
+        "is-unicode-supported": "^2.1.0",
+        "log-symbols": "^7.0.1",
+        "stdin-discarder": "^0.3.2",
+        "string-width": "^8.1.0"
+      },
+      "engines": {
+        "node": ">=20"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/ora/node_modules/string-width": {
+      "version": "8.2.1",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-8.2.1.tgz",
+      "integrity": "sha512-IIaP0g3iy9Cyy18w3M9YcaDudujEAVHKt3a3QJg1+sr/oX96TbaGUubG0hJyCjCBThFH+tFpcIyoUHUn1ogaLA==",
+      "license": "MIT",
+      "dependencies": {
+        "get-east-asian-width": "^1.5.0",
+        "strip-ansi": "^7.1.2"
+      },
+      "engines": {
+        "node": ">=20"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/p-retry": {
       "version": "4.6.2",
       "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-4.6.2.tgz",
@@ -3518,6 +4618,15 @@
       "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==",
       "license": "MIT"
     },
+    "node_modules/parseurl": {
+      "version": "1.3.3",
+      "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
+      "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
     "node_modules/partial-json": {
       "version": "0.1.7",
       "resolved": "https://registry.npmjs.org/partial-json/-/partial-json-0.1.7.tgz",
@@ -3539,6 +4648,15 @@
         "node": ">=14.0.0"
       }
     },
+    "node_modules/path-key": {
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
+      "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
     "node_modules/path-scurry": {
       "version": "2.0.2",
       "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-2.0.2.tgz",
@@ -3555,12 +4673,31 @@
         "url": "https://github.com/sponsors/isaacs"
       }
     },
+    "node_modules/path-to-regexp": {
+      "version": "8.4.2",
+      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-8.4.2.tgz",
+      "integrity": "sha512-qRcuIdP69NPm4qbACK+aDogI5CBDMi1jKe0ry5rSQJz8JVLsC7jV8XpiJjGRLLol3N+R5ihGYcrPLTno6pAdBA==",
+      "license": "MIT",
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
     "node_modules/pend": {
       "version": "1.2.0",
       "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
       "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
       "license": "MIT"
     },
+    "node_modules/pkce-challenge": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/pkce-challenge/-/pkce-challenge-5.0.1.tgz",
+      "integrity": "sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=16.20.0"
+      }
+    },
     "node_modules/proper-lockfile": {
       "version": "4.1.2",
       "resolved": "https://registry.npmjs.org/proper-lockfile/-/proper-lockfile-4.1.2.tgz",
@@ -3605,6 +4742,19 @@
         "node": ">=12.0.0"
       }
     },
+    "node_modules/proxy-addr": {
+      "version": "2.0.7",
+      "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz",
+      "integrity": "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==",
+      "license": "MIT",
+      "dependencies": {
+        "forwarded": "0.2.0",
+        "ipaddr.js": "1.9.1"
+      },
+      "engines": {
+        "node": ">= 0.10"
+      }
+    },
     "node_modules/proxy-agent": {
       "version": "6.5.0",
       "resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz",
@@ -3649,6 +4799,45 @@
         "once": "^1.3.1"
       }
     },
+    "node_modules/qs": {
+      "version": "6.15.1",
+      "resolved": "https://registry.npmjs.org/qs/-/qs-6.15.1.tgz",
+      "integrity": "sha512-6YHEFRL9mfgcAvql/XhwTvf5jKcOiiupt2FiJxHkiX1z4j7WL8J/jRHYLluORvc1XxB5rV20KoeK00gVJamspg==",
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "side-channel": "^1.1.0"
+      },
+      "engines": {
+        "node": ">=0.6"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/range-parser": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
+      "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/raw-body": {
+      "version": "3.0.2",
+      "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-3.0.2.tgz",
+      "integrity": "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==",
+      "license": "MIT",
+      "dependencies": {
+        "bytes": "~3.1.2",
+        "http-errors": "~2.0.1",
+        "iconv-lite": "~0.7.0",
+        "unpipe": "~1.0.0"
+      },
+      "engines": {
+        "node": ">= 0.10"
+      }
+    },
     "node_modules/require-directory": {
       "version": "2.1.1",
       "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
@@ -3677,6 +4866,34 @@
         "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
       }
     },
+    "node_modules/restore-cursor": {
+      "version": "5.1.0",
+      "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-5.1.0.tgz",
+      "integrity": "sha512-oMA2dcrw6u0YfxJQXm342bFKX/E4sG9rbTzO9ptUcR/e8A33cHuvStiYOwH7fszkZlZ1z/ta9AAoPk2F4qIOHA==",
+      "license": "MIT",
+      "dependencies": {
+        "onetime": "^7.0.0",
+        "signal-exit": "^4.1.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/restore-cursor/node_modules/signal-exit": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
+      "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==",
+      "license": "ISC",
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
     "node_modules/retry": {
       "version": "0.13.1",
       "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz",
@@ -3686,6 +4903,55 @@
         "node": ">= 4"
       }
     },
+    "node_modules/rolldown": {
+      "version": "1.0.0-rc.16",
+      "resolved": "https://registry.npmjs.org/rolldown/-/rolldown-1.0.0-rc.16.tgz",
+      "integrity": "sha512-rzi5WqKzEZw3SooTt7cgm4eqIoujPIyGcJNGFL7iPEuajQw7vxMHUkXylu4/vhCkJGXsgRmxqMKXUpT6FEgl0g==",
+      "license": "MIT",
+      "dependencies": {
+        "@oxc-project/types": "=0.126.0",
+        "@rolldown/pluginutils": "1.0.0-rc.16"
+      },
+      "bin": {
+        "rolldown": "bin/cli.mjs"
+      },
+      "engines": {
+        "node": "^20.19.0 || >=22.12.0"
+      },
+      "optionalDependencies": {
+        "@rolldown/binding-android-arm64": "1.0.0-rc.16",
+        "@rolldown/binding-darwin-arm64": "1.0.0-rc.16",
+        "@rolldown/binding-darwin-x64": "1.0.0-rc.16",
+        "@rolldown/binding-freebsd-x64": "1.0.0-rc.16",
+        "@rolldown/binding-linux-arm-gnueabihf": "1.0.0-rc.16",
+        "@rolldown/binding-linux-arm64-gnu": "1.0.0-rc.16",
+        "@rolldown/binding-linux-arm64-musl": "1.0.0-rc.16",
+        "@rolldown/binding-linux-ppc64-gnu": "1.0.0-rc.16",
+        "@rolldown/binding-linux-s390x-gnu": "1.0.0-rc.16",
+        "@rolldown/binding-linux-x64-gnu": "1.0.0-rc.16",
+        "@rolldown/binding-linux-x64-musl": "1.0.0-rc.16",
+        "@rolldown/binding-openharmony-arm64": "1.0.0-rc.16",
+        "@rolldown/binding-wasm32-wasi": "1.0.0-rc.16",
+        "@rolldown/binding-win32-arm64-msvc": "1.0.0-rc.16",
+        "@rolldown/binding-win32-x64-msvc": "1.0.0-rc.16"
+      }
+    },
+    "node_modules/router": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmjs.org/router/-/router-2.2.0.tgz",
+      "integrity": "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==",
+      "license": "MIT",
+      "dependencies": {
+        "debug": "^4.4.0",
+        "depd": "^2.0.0",
+        "is-promise": "^4.0.0",
+        "parseurl": "^1.3.3",
+        "path-to-regexp": "^8.0.0"
+      },
+      "engines": {
+        "node": ">= 18"
+      }
+    },
     "node_modules/safe-buffer": {
       "version": "5.2.1",
       "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@@ -3706,18 +4972,162 @@
       ],
       "license": "MIT"
     },
+    "node_modules/safer-buffer": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
+      "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
+      "license": "MIT"
+    },
+    "node_modules/send": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/send/-/send-1.2.1.tgz",
+      "integrity": "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==",
+      "license": "MIT",
+      "dependencies": {
+        "debug": "^4.4.3",
+        "encodeurl": "^2.0.0",
+        "escape-html": "^1.0.3",
+        "etag": "^1.8.1",
+        "fresh": "^2.0.0",
+        "http-errors": "^2.0.1",
+        "mime-types": "^3.0.2",
+        "ms": "^2.1.3",
+        "on-finished": "^2.4.1",
+        "range-parser": "^1.2.1",
+        "statuses": "^2.0.2"
+      },
+      "engines": {
+        "node": ">= 18"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
+    "node_modules/serve-static": {
+      "version": "2.2.1",
+      "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-2.2.1.tgz",
+      "integrity": "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==",
+      "license": "MIT",
+      "dependencies": {
+        "encodeurl": "^2.0.0",
+        "escape-html": "^1.0.3",
+        "parseurl": "^1.3.3",
+        "send": "^1.2.0"
+      },
+      "engines": {
+        "node": ">= 18"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
+      }
+    },
+    "node_modules/setprototypeof": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz",
+      "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==",
+      "license": "ISC"
+    },
+    "node_modules/shebang-command": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
+      "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==",
+      "license": "MIT",
+      "dependencies": {
+        "shebang-regex": "^3.0.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/shebang-regex": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz",
+      "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/side-channel": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz",
+      "integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==",
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0",
+        "object-inspect": "^1.13.3",
+        "side-channel-list": "^1.0.0",
+        "side-channel-map": "^1.0.1",
+        "side-channel-weakmap": "^1.0.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/side-channel-list": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.1.tgz",
+      "integrity": "sha512-mjn/0bi/oUURjc5Xl7IaWi/OJJJumuoJFQJfDDyO46+hBWsfaVM65TBHq2eoZBhzl9EchxOijpkbRC8SVBQU0w==",
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0",
+        "object-inspect": "^1.13.4"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/side-channel-map": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz",
+      "integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==",
+      "license": "MIT",
+      "dependencies": {
+        "call-bound": "^1.0.2",
+        "es-errors": "^1.3.0",
+        "get-intrinsic": "^1.2.5",
+        "object-inspect": "^1.13.3"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/side-channel-weakmap": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz",
+      "integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==",
+      "license": "MIT",
+      "dependencies": {
+        "call-bound": "^1.0.2",
+        "es-errors": "^1.3.0",
+        "get-intrinsic": "^1.2.5",
+        "object-inspect": "^1.13.3",
+        "side-channel-map": "^1.0.1"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
     "node_modules/signal-exit": {
       "version": "3.0.7",
       "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
       "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==",
       "license": "ISC"
     },
-    "node_modules/sisteransi": {
-      "version": "1.0.5",
-      "resolved": "https://registry.npmjs.org/sisteransi/-/sisteransi-1.0.5.tgz",
-      "integrity": "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==",
-      "license": "MIT"
-    },
     "node_modules/smart-buffer": {
       "version": "4.2.0",
       "resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz",
@@ -3766,12 +5176,33 @@
         "node": ">=0.10.0"
       }
     },
+    "node_modules/statuses": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz",
+      "integrity": "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
     "node_modules/std-env": {
       "version": "3.10.0",
       "resolved": "https://registry.npmjs.org/std-env/-/std-env-3.10.0.tgz",
       "integrity": "sha512-5GS12FdOZNliM5mAOxFRg7Ir0pWz8MdpYm6AY6VPkGpbA7ZzmbzNcBJQ0GPvvyWgcY7QAhCgf9Uy89I03faLkg==",
       "license": "MIT"
     },
+    "node_modules/stdin-discarder": {
+      "version": "0.3.2",
+      "resolved": "https://registry.npmjs.org/stdin-discarder/-/stdin-discarder-0.3.2.tgz",
+      "integrity": "sha512-eCPu1qRxPVkl5605OTWF8Wz40b4Mf45NY5LQmVPQ599knfs5QhASUm9GbJ5BDMDOXgrnh0wyEdvzmL//YMlw0A==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
     "node_modules/string-width": {
       "version": "4.2.3",
       "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
@@ -3883,6 +5314,15 @@
         "node": ">=0.8"
       }
     },
+    "node_modules/toidentifier": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz",
+      "integrity": "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.6"
+      }
+    },
     "node_modules/token-types": {
       "version": "6.1.2",
       "resolved": "https://registry.npmjs.org/token-types/-/token-types-6.1.2.tgz",
@@ -3901,15 +5341,6 @@
         "url": "https://github.com/sponsors/Borewit"
       }
     },
-    "node_modules/tree-sitter-wasms": {
-      "version": "0.1.13",
-      "resolved": "https://registry.npmjs.org/tree-sitter-wasms/-/tree-sitter-wasms-0.1.13.tgz",
-      "integrity": "sha512-wT+cR6DwaIz80/vho3AvSF0N4txuNx/5bcRKoXouOfClpxh/qqrF4URNLQXbbt8MaAxeksZcZd1j8gcGjc+QxQ==",
-      "license": "Unlicense",
-      "dependencies": {
-        "tree-sitter-wasms": "^0.1.11"
-      }
-    },
     "node_modules/ts-algebra": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz",
@@ -3942,6 +5373,20 @@
         "fsevents": "~2.3.3"
       }
     },
+    "node_modules/type-is": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz",
+      "integrity": "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==",
+      "license": "MIT",
+      "dependencies": {
+        "content-type": "^1.0.5",
+        "media-typer": "^1.1.0",
+        "mime-types": "^3.0.0"
+      },
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
     "node_modules/typescript": {
       "version": "5.9.3",
       "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz",
@@ -3983,6 +5428,24 @@
       "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
       "license": "MIT"
     },
+    "node_modules/unpipe": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
+      "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
+    "node_modules/vary": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
+      "integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
     "node_modules/web-streams-polyfill": {
       "version": "3.3.3",
       "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz",
@@ -3992,11 +5455,20 @@
         "node": ">= 8"
       }
     },
-    "node_modules/web-tree-sitter": {
-      "version": "0.24.7",
-      "resolved": "https://registry.npmjs.org/web-tree-sitter/-/web-tree-sitter-0.24.7.tgz",
-      "integrity": "sha512-CdC/TqVFbXqR+C51v38hv6wOPatKEUGxa39scAeFSm98wIhZxAYonhRQPSMmfZ2w7JDI0zQDdzdmgtNk06/krQ==",
-      "license": "MIT"
+    "node_modules/which": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
+      "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
+      "license": "ISC",
+      "dependencies": {
+        "isexe": "^2.0.0"
+      },
+      "bin": {
+        "node-which": "bin/node-which"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
     },
     "node_modules/wrap-ansi": {
       "version": "7.0.0",
diff --git a/package.json b/package.json
index 8422dea..1f1be4c 100644
--- a/package.json
+++ b/package.json
@@ -1,9 +1,9 @@
 {
   "name": "skill-optimizer",
-  "version": "1.1.0",
-  "description": "Benchmark and optimizer for evaluating SDK, CLI, and MCP guidance with static action matching.",
+  "version": "2.0.0",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
   "license": "MIT",
-  "author": "Pi2 Labs",
+  "author": "Fast",
   "homepage": "https://github.com/fastxyz/skill-optimizer#readme",
   "bugs": {
     "url": "https://github.com/fastxyz/skill-optimizer/issues"
@@ -13,21 +13,52 @@
     "url": "git+https://github.com/fastxyz/skill-optimizer.git"
   },
   "keywords": [
-    "benchmark",
-    "optimizer",
-    "mcp",
-    "sdk",
-    "cli",
-    "llm",
-    "tool-calling",
-    "evaluation"
+    "agent",
+    "skills",
+    "agent-skills",
+    "evals",
+    "skill-testing",
+    "model-evaluation"
   ],
   "type": "module",
+  "main": "./dist/index.js",
   "files": [
     "dist/",
+    "docker/",
+    "src/",
+    "scripts/",
+    "docs/",
+    "docs/README.codex.md",
+    "docs/README.opencode.md",
+    "examples/workbench/README.md",
+    "examples/workbench/pdf/README.md",
+    "examples/workbench/pdf/suite.yml",
+    "examples/workbench/pdf/checks/",
+    "examples/workbench/pdf/references/",
+    "examples/workbench/mcp/README.md",
+    "examples/workbench/mcp/suite.yml",
+    "examples/workbench/mcp/checks/",
+    "examples/workbench/mcp/mcp/",
+    "examples/workbench/mcp/references/",
+    "skills/",
+    ".agents/plugins/marketplace.json",
+    ".claude-plugin/",
+    ".codex-plugin/",
+    ".cursor-plugin/",
+    ".opencode/plugins/skill-optimizer.js",
+    ".opencode/INSTALL.md",
+    ".codex/INSTALL.md",
+    ".cursor/INSTALL.md",
+    "AGENTS.md",
+    "CLAUDE.md",
+    "CONTRIBUTING.md",
     "README.md",
+    "GEMINI.md",
+    "gemini-extension.json",
     "LICENSE",
-    "CHANGELOG.md"
+    "CHANGELOG.md",
+    "package-lock.json",
+    "tsconfig.json"
   ],
   "bin": {
     "skill-optimizer": "./dist/cli.js"
@@ -36,31 +67,27 @@
     ".": {
       "import": "./dist/index.js",
       "types": "./dist/index.d.ts"
+    },
+    "./server": {
+      "import": "./.opencode/plugins/skill-optimizer.js"
     }
   },
   "scripts": {
-    "benchmark": "tsx src/cli.ts run",
     "prepack": "npm run build",
     "clean": "node --eval \"import { rmSync } from 'node:fs'; rmSync('dist', { recursive: true, force: true });\"",
     "dev": "tsx src/cli.ts",
-    "optimize": "tsx src/cli.ts optimize",
-    "materialize:mock": "tsx src/optimizer/materialize-mock-repo.ts",
-    "gen-docs": "tsx scripts/gen-docs.ts",
-    "build": "tsc && npm run gen-docs && chmod +x dist/cli.js",
+    "build": "tsc && chmod +x dist/cli.js",
     "typecheck": "tsc --noEmit",
     "lint": "tsc --noUnusedLocals --noEmit",
-    "test": "tsx tests/smoke-code.ts && tsx tests/smoke-sdk-python.ts && tsx tests/smoke-sdk-rust.ts && tsx tests/smoke-cli.ts && tsx tests/smoke-cli-entry.ts && tsx tests/smoke-mcp.ts && tsx tests/smoke-llm.ts && tsx tests/smoke-discovery-sdk.ts && tsx tests/smoke-discovery-cli.ts && tsx tests/smoke-discovery-mcp.ts && tsx tests/smoke-prompt-evaluator.ts && tsx tests/smoke-prompt-criteria.ts && tsx tests/smoke-snapshot-prompt.ts && tsx tests/smoke-generation.ts && tsx tests/smoke-optimize.ts && tsx tests/smoke-mock-repos.ts && tsx tests/smoke-release.ts && tsx tests/smoke-changelog-coverage.ts && tsx tests/smoke-scoring.ts && tsx tests/smoke-scope.ts && tsx tests/smoke-coverage.ts && tsx tests/smoke-feedback.ts && tsx tests/smoke-verdict.ts && tsx tests/smoke-verdict-prompt.ts && tsx tests/smoke-dry-run.ts && tsx tests/smoke-errors.ts && tsx tests/smoke-model-ids.ts && tsx tests/smoke-e2e.ts && tsx tests/smoke-import.ts && tsx tests/smoke-init.ts && tsx tests/smoke-gen-docs.ts && tsx tests/smoke-actions.ts"
+    "test": "tsx tests/smoke-workbench-case.ts && tsx tests/smoke-workbench-checks.ts && tsx tests/smoke-workbench-trace.ts && tsx tests/smoke-workbench-container.ts && tsx tests/smoke-workbench-docker-runner.ts && tsx tests/smoke-workbench-pi-agent.ts && tsx tests/smoke-workbench-run-case.ts && tsx tests/smoke-workbench-models.ts && tsx tests/smoke-workbench-suite.ts && tsx tests/smoke-workbench-trials.ts && tsx tests/smoke-workbench-metrics.ts && tsx tests/smoke-skill-distribution.ts"
   },
   "dependencies": {
-    "@clack/prompts": "^1.2.0",
     "@mariozechner/pi-agent-core": "^0.66.1",
     "@mariozechner/pi-ai": "^0.66.1",
     "@mariozechner/pi-coding-agent": "^0.66.1",
     "dotenv": "^17.4.1",
-    "tree-sitter-wasms": "^0.1.13",
-    "web-tree-sitter": "^0.24.7",
-    "zod": "^4.3.6",
-    "zod-to-json-schema": "^3.25.2"
+    "mcporter": "^0.9.0",
+    "yaml": "^2.8.2"
   },
   "devDependencies": {
     "@types/node": "^22.12.0",
diff --git a/scripts/gen-docs.ts b/scripts/gen-docs.ts
deleted file mode 100644
index adfd6eb..0000000
--- a/scripts/gen-docs.ts
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/usr/bin/env tsx
-// scripts/gen-docs.ts — auto-generates docs/reference/ from code artifacts.
-// Run via: npm run gen-docs
-// Hooked into: npm run build
-
-import { writeFileSync, mkdirSync } from 'node:fs';
-import { resolve, dirname } from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { zodToJsonSchema } from 'zod-to-json-schema';
-import { ERRORS } from '../src/errors.js';
-import { ProjectConfigSchema } from '../src/project/schema.js';
-
-const __dirname = dirname(fileURLToPath(import.meta.url));
-const refDir = resolve(__dirname, '../docs/reference');
-mkdirSync(refDir, { recursive: true });
-
-const GENERATED_HEADER = '<!-- AUTO-GENERATED — do not edit. Run `npm run gen-docs` to regenerate. -->\n\n';
-
-// ── errors.md ─────────────────────────────────────────────────────────────────
-
-function generateErrorsMd(): string {
-  const entries = Object.values(ERRORS);
-  const lines: string[] = [
-    GENERATED_HEADER,
-    '# Error Reference',
-    '',
-    'Every `skill-optimizer` error has a code, a short message, and a fix list.',
-    'The catch-all `E_UNEXPECTED` appears if an error slips past the known list.',
-    '',
-    '## Summary',
-    '',
-    '| Code | Description | Quick fix |',
-    '|---|---|---|',
-  ];
-
-  for (const def of entries) {
-    const msg = def.message.replace(/\|/g, '\\|');
-    const quickFix = (def.fix[0] ?? '').replace(/\|/g, '\\|');
-    lines.push(`| \`${def.code}\` | ${msg} | ${quickFix} |`);
-  }
-
-  lines.push('', '## Details', '');
-
-  for (const def of entries) {
-    lines.push(`### \`${def.code}\``);
-    lines.push('');
-    lines.push(`**${def.message}**`);
-    lines.push('');
-    lines.push('**How to fix:**');
-    for (const step of def.fix) {
-      lines.push(`- ${step}`);
-    }
-    lines.push('');
-  }
-
-  return lines.join('\n');
-}
-
-// ── config-schema.md ──────────────────────────────────────────────────────────
-
-interface JsonSchemaNode {
-  type?: string;
-  description?: string;
-  default?: unknown;
-  enum?: unknown[];
-  properties?: Record<string, JsonSchemaNode>;
-  items?: JsonSchemaNode;
-  anyOf?: JsonSchemaNode[];
-  $ref?: string;
-  $defs?: Record<string, JsonSchemaNode>;
-  definitions?: Record<string, JsonSchemaNode>;
-}
-
-function typeLabel(node: JsonSchemaNode): string {
-  if (node.enum) return node.enum.map(v => `"${v}"`).join(' | ');
-  if (node.anyOf) return node.anyOf.map(typeLabel).filter(Boolean).join(' | ');
-  if (node.type === 'array') {
-    const itemLabel = node.items ? typeLabel(node.items) : 'any';
-    return `${itemLabel}[]`;
-  }
-  return node.type ?? '';
-}
-
-function resolveRef(node: JsonSchemaNode, defs: Record<string, JsonSchemaNode>): JsonSchemaNode {
-  if (!node.$ref) return node;
-  const refKey = node.$ref.replace(/^#\/\$defs\//, '').replace(/^#\/definitions\//, '');
-  return defs[refKey] ?? node;
-}
-
-function flattenSchema(
-  node: JsonSchemaNode,
-  prefix: string,
-  rows: Array<{ path: string; type: string; default: string; description: string }>,
-  defs: Record<string, JsonSchemaNode>,
-): void {
-  if (!node.properties) return;
-
-  for (const [key, child] of Object.entries(node.properties)) {
-    const path = prefix ? `${prefix}.${key}` : key;
-    const resolved = resolveRef(child, defs);
-
-    // If it has nested properties, recurse without adding a row for the parent
-    if (resolved.properties) {
-      flattenSchema(resolved, path, rows, defs);
-    } else {
-      rows.push({
-        path,
-        type: typeLabel(resolved),
-        default: resolved.default !== undefined ? JSON.stringify(resolved.default) : '—',
-        description: resolved.description ?? '',
-      });
-    }
-  }
-}
-
-function generateConfigSchemaMd(): string {
-  const jsonSchema = zodToJsonSchema(ProjectConfigSchema, {
-    name: 'ProjectConfig',
-    $refStrategy: 'none',
-  }) as JsonSchemaNode;
-
-  const defs: Record<string, JsonSchemaNode> = {
-    ...(jsonSchema.$defs ?? {}),
-    ...(jsonSchema.definitions ?? {}),
-  };
-
-  // Resolve a top-level $ref if the named schema strategy wrapped everything in one
-  const root = jsonSchema.properties ? jsonSchema : resolveRef(jsonSchema, defs);
-
-  const rows: Array<{ path: string; type: string; default: string; description: string }> = [];
-  flattenSchema(root, '', rows, defs);
-
-  const lines: string[] = [
-    GENERATED_HEADER,
-    '# Config Schema Reference',
-    '',
-    'All configuration lives in a single `skill-optimizer.json` file.',
-    'Paths in the config are relative to the config file location.',
-    '',
-    '| Field | Type | Default | Description |',
-    '|---|---|---|---|',
-  ];
-
-  for (const row of rows) {
-    const desc = row.description.replace(/\|/g, '\\|');
-    lines.push(`| \`${row.path}\` | \`${row.type}\` | ${row.default} | ${desc} |`);
-  }
-
-  lines.push('');
-  return lines.join('\n');
-}
-
-// ── Write files ────────────────────────────────────────────────────────────────
-
-const errorsPath = resolve(refDir, 'errors.md');
-writeFileSync(errorsPath, generateErrorsMd(), 'utf-8');
-console.log(`[gen-docs] Written: ${errorsPath}`);
-
-const schemaPath = resolve(refDir, 'config-schema.md');
-writeFileSync(schemaPath, generateConfigSchemaMd(), 'utf-8');
-console.log(`[gen-docs] Written: ${schemaPath}`);
diff --git a/skills/skill-optimizer/SKILL.md b/skills/skill-optimizer/SKILL.md
new file mode 100644
index 0000000..b376e89
--- /dev/null
+++ b/skills/skill-optimizer/SKILL.md
@@ -0,0 +1,210 @@
+---
+name: skill-optimizer
+description: Use when creating, running, debugging, or documenting skill-optimizer workbench evals; working with agent skill cases, suites, graders, traces, Docker workspaces, OpenRouter model matrices, or the skill-optimizer SDK/CLI.
+---
+
+# skill-optimizer
+
+`skill-optimizer` is an eval workbench for agent skills. It runs a model in an isolated Docker `/work` directory, provides skills/references as normal workspace files, captures an agent trace, and grades deterministic local outcomes.
+
+Use this skill as the source of truth for authoring eval suites in this repo. Detailed schema and patterns are in `references/workbench.md`.
+
+## Core Model
+
+- A case is one user-like task plus one or more deterministic graders.
+- A suite is a set of cases and OpenRouter models to run as a matrix.
+- `references` are copied into `/work` before the agent starts; this is where eval skills live.
+- The agent phase sees `/work` only. It cannot see `/case`, `/results`, graders, hidden answers, or hidden metadata.
+- Cases can define `mcpServers`; these are exposed through a workbench `mcp` command during the agent phase.
+- Graders run after the agent with `/case`, `/work`, and `/results` mounted.
+- `trace.jsonl` is the debugging source for what the agent saw, said, and did.
+
+## Commands
+
+| Goal | Command |
+|------|---------|
+| Install deps | `npm install` |
+| Build CLI | `npm run build` |
+| Run one case | `npx tsx src/cli.ts run-case <case.yml>` |
+| Run one case across models | `npx tsx src/cli.ts run-case <case.yml> --models openrouter/google/gemini-2.5-flash,openrouter/openai/gpt-5.4` |
+| Run a suite | `npx tsx src/cli.ts run-suite <suite.yml>` |
+| CLI help | `npx tsx src/cli.ts --help` |
+
+Rules:
+
+- Use only `openrouter/...` model refs.
+- `OPENROUTER_API_KEY` is required for real model runs.
+- `run-suite` uses `models:` from `suite.yml`; it has no model override flag.
+- `run-case` can use its case `model:` or `--model` / `--models`.
+- Docker image default is `skill-optimizer-workbench:local`.
+
+## Install This Skill
+
+This repository ships one canonical skill at `skills/skill-optimizer/SKILL.md` plus plugin metadata for Claude Code, OpenCode, Codex, Cursor, and Gemini.
+
+Install the skill for common agents with:
+
+```bash
+npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a claude-code -a opencode -a codex -a cursor
+```
+
+Plugin entrypoints:
+
+- Claude Code: `.claude-plugin/plugin.json` and `.claude-plugin/marketplace.json`
+- OpenCode: `.opencode/plugins/skill-optimizer.js`
+- Codex: `.codex-plugin/plugin.json`
+- Cursor: `.cursor-plugin/plugin.json`
+- Gemini: `gemini-extension.json` and `GEMINI.md`
+
+## Authoring Workflow
+
+1. Create `suite.yml` with `models`, shared defaults, and inline cases or case paths.
+2. Put the skill/reference material under `references/`; it will be copied into `/work`.
+3. Write natural user tasks. Do not mention graders, hidden answers, `/case`, or eval internals.
+4. Put setup helpers and grader helpers under `checks/`; put fake CLIs or command shims under `bin/` when the agent should call them.
+5. Add one or more `graders` per case. Prefer small deterministic graders over one broad grader.
+6. Run `run-suite --trials <n>` and inspect `suite-result.json`, failing `result.json`, `summary.json`, and `trace.jsonl`.
+
+Variables listed in `env` are forwarded unchanged into setup, agent, grading, and cleanup containers. For live integration evals, use dedicated test accounts and scoped credentials because the agent can access those values through shell tools. Treat `trace.jsonl`, `result.json`, grader evidence, stdout/stderr, and preserved `workspace/` directories as potentially sensitive if an agent or grader prints or writes secret values.
+
+Use `mcpServers` when the task should interact with MCP tools. For local servers whose source should stay hidden from the agent, put server files under the case `mcp/` support directory and define `mcpServices`; Docker starts those as separate service containers and the agent only sees their HTTP MCP URL. Direct stdio `mcpServers.command` entries run inside the agent container and are only appropriate when the server implementation is intentionally agent-visible. Remote HTTP/SSE servers must be reachable from Docker. The workbench generates `/work/mcporter.json` with `imports: []`, so host/user MCP configs are not imported. OAuth/browser auth is not supported; use env/header credentials listed in `env`.
+
+Prefer the real CLI/API/service when you do not know its internal behavior well enough to mock it faithfully. Mock only when you are sure the mock matches the real command surface, validation, outputs, and failure modes; otherwise the eval will measure the mock, not the skill. For command skills, include cases for the basic command, important flags/options, a no-tool-needed control, and unsafe-instruction resistance.
+
+## Minimal Suite
+
+```yaml
+name: pdf-skill-eval
+references: ./references
+models:
+  - openrouter/google/gemini-2.5-flash
+env:
+  - OPENROUTER_API_KEY
+timeoutSeconds: 600
+setup:
+  - node $CASE/checks/create-inputs.mjs
+appendSystemPrompt: |
+  Keep task outputs at the top level of /work unless the user asks otherwise.
+cases:
+  - name: extract-pdf-facts
+    task: |
+      Read statement.pdf and write answer.json with the account, quarter, approval code, and risk flags.
+    graders:
+      - name: answer-json
+        command: node $CASE/checks/extract-pdf-facts.mjs
+```
+
+## Directory Layout
+
+```text
+my-eval/
+  suite.yml
+  references/
+    my-skill/SKILL.md
+  checks/
+    create-inputs.mjs
+    extract-pdf-facts.mjs
+  bin/
+    fake-cli
+  workspace/
+    starter-app/
+```
+
+Support directories are optional. `checks/` is mounted read-only at `/case/checks` for setup/grading. `bin/` is copied into `/work/bin` for the agent and is also available as `/case/bin` during setup/grading. `workspace/` is copied into `/work` after `references/`.
+
+## Grader Contract
+
+Graders are shell commands. They run with:
+
+- `$CASE`: read-only case directory mounted at `/case`
+- `$WORK`: mutable workspace the agent used
+- `$RESULTS`: result directory containing `trace.jsonl`
+
+Preferred grader output:
+
+```json
+{ "pass": true, "score": 1, "evidence": ["answer matched"] }
+```
+
+If no JSON object is printed, exit code `0` passes and non-zero fails. Keep graders deterministic and local; do not use an LLM judge unless the eval explicitly requires one.
+
+Graders are the acceptance contract. They should evaluate evidence in `/work`, generated artifacts, `answer.json`, `trace.jsonl`, and any relevant result-state files under `$RESULTS`.
+
+## Outputs
+
+```text
+.results/<run-id>/
+  suite-result.json                  # run-suite aggregate
+  run-result.json                    # run-case matrix aggregate
+  trials/<case>--<model>--001/
+    trace.jsonl                      # agent messages and tool calls
+    result.json                      # pass, score, evidence, graders, metrics
+    summary.json                     # final text, failed graders, commands
+    workspace/                       # failures or --keep-workspace
+```
+
+Use `trace.jsonl` to debug failures and to grade negative behavior, such as whether the agent read an irrelevant skill file.
+
+## Optimization Loop
+
+After a run, inspect failing `result.json`, `summary.json`, `trace.jsonl`, and preserved `workspace/` evidence. Classify each failure before changing anything: unclear skill guidance, missing reference material, brittle grader, unrealistic input data, task ambiguity, or product/code bug. Update the target skill, references, inputs, graders, or code according to that diagnosis, then re-run the same case or suite to verify the change. Repeat until the grader evidence shows the intended behavior across the target models/trials.
+
+For live CLI/API evals, use scoped test credentials and avoid printing secrets. Grade durable evidence: command traces, arguments, generated files, response summaries, and safety behavior. Keep service-specific setup facts in the suite prompt or setup commands, not in the portable skill under test.
+
+## Programmatic SDK
+
+The package exports workbench APIs from `skill-optimizer` after build:
+
+```ts
+import {
+  loadWorkbenchCase,
+  loadWorkbenchSuite,
+  runWorkbenchCase,
+  runWorkbenchSuite,
+  runGraderCommands,
+  parseModelList,
+} from 'skill-optimizer';
+```
+
+The CLI is the stable path for normal eval runs. Use SDK functions for tests, wrappers, and internal automation.
+
+## Examples
+
+Tracked demos live in `examples/` (the same repo path users may refer to as `@examples/`). Read these alongside the skill docs when building or debugging evals:
+
+| Path | Why It Matters |
+|------|----------------|
+| `examples/workbench/README.md` | Short command walkthrough for demos |
+| `examples/workbench/pdf/README.md` | Explains the PDF demo cases and expected outputs |
+| `examples/workbench/pdf/suite.yml` | Concrete suite using models, setup, env, graders, and append prompt |
+| `examples/workbench/pdf/references/pdf-skill/SKILL.md` | Example skill copied into `/work` for the agent |
+| `examples/workbench/pdf/checks/*.mjs` | Deterministic grader and setup helper patterns |
+| `examples/workbench/mcp/suite.yml` | Hidden-service MCP calculator example |
+| `examples/workbench/mcp/mcp/calculator-server.mjs` | Example MCP server with add/subtract/multiply/divide tools |
+
+```bash
+npx tsx src/cli.ts run-suite examples/workbench/pdf/suite.yml --trials 1
+npx tsx src/cli.ts run-suite examples/workbench/mcp/suite.yml --trials 1
+```
+
+The PDF demo covers setup, suite models, positive output grading, and trace-based negative grading.
+
+## Development Checks
+
+After code or doc changes that affect behavior:
+
+```bash
+npm run typecheck
+npm test
+npm run build
+npx tsx src/cli.ts --help
+node dist/cli.js --help
+```
+
+After Dockerfile/container-runner changes:
+
+```bash
+docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .
+```
+
+Do not commit `.skill-eval/`; it is local ignored eval data.
diff --git a/skills/skill-optimizer/references/workbench.md b/skills/skill-optimizer/references/workbench.md
new file mode 100644
index 0000000..e0c5a0f
--- /dev/null
+++ b/skills/skill-optimizer/references/workbench.md
@@ -0,0 +1,533 @@
+# Workbench Reference
+
+This reference is for humans and agents authoring evals with the `skill-optimizer` CLI or SDK.
+
+## What The Workbench Evaluates
+
+The workbench is for tasks that can be graded from local evidence:
+
+- Files the agent creates or edits in `/work`
+- Command invocations recorded by fake CLIs
+- Generated files such as PDF, DOCX, PPTX, XLSX, images, JSON, or code
+- Static SQL, shell scripts, config, or source code
+- Agent behavior captured in `trace.jsonl`
+
+Avoid evals that require running model-produced arbitrary production code outside the container or using a second LLM as the default judge.
+
+## CLI Surface
+
+```bash
+npx tsx src/cli.ts run-case <case.yml>
+npx tsx src/cli.ts run-case <case.yml> --model openrouter/google/gemini-2.5-flash
+npx tsx src/cli.ts run-case <case.yml> --models openrouter/google/gemini-2.5-flash,openrouter/openai/gpt-5.4 --trials 3 --concurrency 2
+npx tsx src/cli.ts run-suite <suite.yml> --trials 3 --concurrency 2
+```
+
+Options:
+
+| Command | Option | Meaning |
+|---------|--------|---------|
+| `run-case` | `--out <path>` | Results root, default `<case-dir>/.results` |
+| `run-case` | `--model <model>` | Single OpenRouter model override |
+| `run-case` | `--models <csv>` | Comma-separated OpenRouter model refs |
+| `run-case` | `--trials <n>` | Independent trials per model |
+| `run-suite` | `--out <path>` | Results root, default `<suite-dir>/.results` |
+| `run-suite` | `--trials <n>` | Independent trials per case/model |
+| both | `--concurrency <n>` | Maximum concurrent trial containers |
+| both | `--image <image>` | Docker image, default `skill-optimizer-workbench:local` |
+| both | `--keep-workspace` | Preserve successful workspaces too; failures are always preserved |
+
+Only `openrouter/...` model refs are accepted. `run-suite` uses the `models:` array in the suite file.
+
+## Case Schema
+
+Case files may be `.yml`, `.yaml`, or `.json`.
+
+```yaml
+name: extract-pdf-facts
+references: ./references
+task: |
+  Read statement.pdf and write answer.json with the account, quarter, approval code, and risk flags.
+graders:
+  - name: answer-json
+    command: node $CASE/checks/extract-pdf-facts.mjs
+setup:
+  - node $CASE/checks/create-inputs.mjs
+cleanup: []
+env:
+  - OPENROUTER_API_KEY
+mcpServers:
+  calculator:
+    baseUrl: http://calculator:3000/mcp
+mcpServices:
+  calculator:
+    command: node
+    args:
+      - calculator-server.mjs
+model: openrouter/google/gemini-2.5-flash
+timeoutSeconds: 600
+```
+
+Required fields:
+
+| Field | Type | Meaning |
+|-------|------|---------|
+| `name` | string | Human-readable case name; inline suite cases slugify it to name result directories |
+| `references` | string | Directory copied into `/work` before the agent starts |
+| `task` | string | User-like task sent to the agent |
+| `graders` | array | Non-empty list of `{ name, command }` grader commands |
+
+Optional fields:
+
+| Field | Type | Meaning |
+|-------|------|---------|
+| `setup` | string[] | Commands run in `/work` before the agent phase |
+| `cleanup` | string[] | Commands run after grading |
+| `env` | string[] | Host environment variable names forwarded into setup, agent, grading, and cleanup containers |
+| `mcpServers` | object | MCP servers exposed to the agent through the workbench `mcp` command |
+| `mcpServices` | object | Hidden local MCP services started as separate Docker containers |
+| `model` | string | Default model for `run-case`; defaults to `openrouter/google/gemini-2.5-flash` |
+| `timeoutSeconds` | number | Agent timeout; defaults to `600` |
+
+All relative paths resolve from the case file directory.
+
+## Suite Schema
+
+Suites may contain inline case objects or paths to external case files.
+
+```yaml
+name: pdf-workbench-example
+references: ./references
+models:
+  - openrouter/google/gemini-2.5-flash
+env:
+  - OPENROUTER_API_KEY
+timeoutSeconds: 600
+setup:
+  - node $CASE/checks/_pdf.mjs write-inputs input
+appendSystemPrompt: |
+  Keep task outputs at the top level of /work unless the user asks otherwise.
+cases:
+  - name: extract-pdf-facts
+    task: |
+      Read statement.pdf and write answer.json with the account, quarter, approval code, and risk flags.
+    graders:
+      - name: answer-json
+        command: node $CASE/checks/extract-pdf-facts.mjs
+  - cases/external-case/case.yml
+```
+
+Suite fields:
+
+| Field | Required | Meaning |
+|-------|----------|---------|
+| `name` | yes | Suite name in aggregate output |
+| `models` | yes | OpenRouter model refs for the case/model matrix |
+| `cases` | yes | Inline case objects or paths to case files |
+| `references` | no | Default references dir for inline cases; defaults to `./references` |
+| `env` | no | Default env allowlist for inline cases |
+| `setup` | no | Default setup commands for inline cases |
+| `cleanup` | no | Default cleanup commands for inline cases |
+| `mcpServers` | no | Default MCP servers for inline cases, merged by server name |
+| `mcpServices` | no | Default hidden MCP service containers for inline cases, merged by service name |
+| `timeoutSeconds` | no | Default agent timeout for inline cases |
+| `appendSystemPrompt` | no | Extra suite-wide system prompt appended after the workbench prompt |
+
+Inline case fields override suite defaults. External case files are loaded from their own file directory and do not inherit suite defaults.
+
+Environment variables listed in `env` are forwarded unchanged. This intentionally supports live integration evals such as authenticated CLI calls, but it also means the agent can read or print those values through shell tools. Use dedicated test accounts, least-privilege credentials, and cleanup routines for live systems. Treat `trace.jsonl`, `result.json`, grader evidence, stdout/stderr, and preserved `workspace/` directories as potentially sensitive if an agent or grader prints or writes secret values.
+
+## MCP Servers
+
+`mcpServers` uses mcporter-compatible server entries. During each Docker trial, the workbench writes `/work/mcporter.json` with `imports: []` and exposes an `mcp` command on `PATH`.
+
+The `mcp` command delegates to `mcporter`:
+
+```bash
+mcp list calculator
+mcp call calculator.add a=17 b=25
+```
+
+Example suite default:
+
+```yaml
+mcpServers:
+  calculator:
+    baseUrl: http://calculator:3000/mcp
+  context7:
+    baseUrl: https://mcp.context7.com/mcp
+    headers:
+      Authorization: "Bearer ${CONTEXT7_API_KEY}"
+env:
+  - OPENROUTER_API_KEY
+  - CONTEXT7_API_KEY
+mcpServices:
+  calculator:
+    command: node
+    args:
+      - calculator-server.mjs
+```
+
+Suite-level `mcpServers` apply only to inline cases. Inline cases merge by server name and win on conflicts. External case files define their own MCP servers and do not inherit suite defaults.
+
+Use `mcpServices` for local MCP servers whose source should not be visible to the agent. Service files live under the case `mcp/` support directory. During Docker runs, the workbench mounts that directory read-only into separate service containers at `/mcp`, joins those containers to a private Docker network, and joins the agent container to the same network. The agent sees only the configured `mcpServers` URL such as `http://calculator:3000/mcp`; it does not mount `/case` or the `mcp/` source directory. Set service ports in the matching `mcpServers` URL rather than in `mcpServices`.
+
+Remote HTTP/SSE servers must be reachable from Docker. `localhost` means the container, not the host, so use `host.docker.internal` or Docker networking for host-local services. Direct stdio `mcpServers.command` entries run inside the agent container and are only appropriate when the server implementation is intentionally agent-visible.
+
+OAuth/browser auth is not supported. Use non-interactive headers, bearer tokens, or env placeholders. Only variables listed in `env` are forwarded.
+
+## Directory Layout
+
+```text
+eval-root/
+  suite.yml
+  references/
+    product-skill/SKILL.md
+    product-skill/references/api.md
+  checks/
+    create-inputs.mjs
+    grade-output.mjs
+    trace-guards.mjs
+  bin/
+    fake-product-cli
+  workspace/
+    starter-repo/
+```
+
+Directory behavior:
+
+| Directory | Visible To Agent | Purpose |
+|-----------|------------------|---------|
+| `references/` | yes, copied into `/work` | Skills, docs, examples, starter reference files |
+| `workspace/` | yes, copied into `/work` | Seed app repo or starter files the agent may edit |
+| `checks/` | no during agent phase | Graders and setup helpers under `/case/checks` |
+| `bin/` | yes, copied into `/work/bin` and mounted as `/case/bin` during setup and grading | Fake CLIs and command shims on `PATH` |
+
+## Execution Phases
+
+`run-case` and `run-suite` use Docker for model attempts. Each trial is prepared on the host, then mounted into phase containers.
+
+| Phase | Docker Mounts | Working Dir | What Happens |
+|-------|---------------|-------------|--------------|
+| setup | `/case:ro`, `/work:rw` | `/work` | Run `setup` commands and prepare inputs |
+| agent | `/work:rw` only | `/work` | Pi agent receives task and uses tools |
+| grade | `/case:ro`, `/work:rw`, `/results:rw` | `/work` | Run grader commands and write result files |
+| cleanup | `/case:ro`, `/work:rw`, `/results:rw` | `/work` | Run optional cleanup commands |
+
+Agent phase constraints:
+
+- No `/case` mount
+- No `/results` mount
+- No Docker socket
+- No global/user Pi skills
+- Additional skills are discovered from `/work`
+- Configured MCP servers are exposed through the `mcp` command using `/work/mcporter.json`
+- Python installs should use `/work/.venv`
+- Internet is available unless the Docker environment blocks it
+- `env` allowlisted credentials are available unchanged to agent shell commands
+
+## Task Writing Rules
+
+Write tasks like normal user requests:
+
+- Ask for the actual deliverable and path.
+- Include enough business detail to complete the task.
+- Keep hidden expected answers in graders or hidden case support files, not in the task.
+- Do not mention graders, answer keys, trace checks, `/case`, `/results`, or benchmark metadata.
+- Do not instruct the agent to read or not read a skill unless that is the real user behavior being evaluated.
+
+Good task:
+
+```text
+Read statement.pdf and write answer.json with the account, quarter, approval code, and risk flags.
+```
+
+Poor task:
+
+```text
+Use the PDF skill and satisfy the grader in /case/checks/extract-pdf-facts.mjs.
+```
+
+## Grader Contract
+
+Each grader is a shell command run in `/work`.
+
+Environment variables:
+
+| Var | Meaning |
+|-----|---------|
+| `$CASE` | Read-only case directory mounted at `/case` |
+| `$WORK` | Mutable workspace from the agent run |
+| `$RESULTS` | Trial result directory with `trace.jsonl` |
+
+Preferred output is one JSON object on stdout:
+
+```json
+{ "pass": false, "score": 0, "evidence": ["answer.json missing approvalCode"] }
+```
+
+Accepted fields:
+
+| Field | Type | Meaning |
+|-------|------|---------|
+| `pass` | boolean | Whether the grader passed |
+| `score` | number | Optional score clamped to 0..1; defaults to 1 for pass and 0 for fail |
+| `evidence` | string or string[] | Human-readable details surfaced in result files |
+
+If stdout does not contain a JSON object, exit code `0` passes and non-zero fails. JSON can be surrounded by logs; the runner parses the first object-shaped span from stdout.
+
+Grader principles:
+
+- Check one concept per grader when practical.
+- Prefer exact structural checks over brittle prose matching.
+- Print useful evidence for failure triage.
+- Keep all grading deterministic and local.
+- Graders should inspect `/work`, command logs, generated outputs, or `trace.jsonl`.
+
+## Grader Examples
+
+JSON output grader:
+
+```js
+import { existsSync, readFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+const path = join(process.env.WORK, 'answer.json');
+const failures = [];
+
+if (!existsSync(path)) {
+  failures.push('answer.json was not created');
+} else {
+  const answer = JSON.parse(readFileSync(path, 'utf-8'));
+  if (answer.approvalCode !== 'PDF-7429') failures.push('approvalCode mismatch');
+}
+
+console.log(JSON.stringify({
+  pass: failures.length === 0,
+  score: failures.length === 0 ? 1 : 0,
+  evidence: failures.length === 0 ? ['answer.json matched'] : failures,
+}));
+process.exit(failures.length === 0 ? 0 : 1);
+```
+
+Trace guard grader:
+
+```js
+import { existsSync, readFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+const tracePath = join(process.env.RESULTS, 'trace.jsonl');
+const lines = existsSync(tracePath) ? readFileSync(tracePath, 'utf-8').trim().split(/\r?\n/) : [];
+const readForbiddenSkill = lines.some((line) => {
+  try {
+    const entry = JSON.parse(line);
+    const path = entry?.arguments?.path ?? entry?.arguments?.filePath;
+    return entry.type === 'tool_call' && entry.name === 'read' && /\/pdf-skill\/SKILL\.md$/.test(path);
+  } catch {
+    return false;
+  }
+});
+
+console.log(JSON.stringify({
+  pass: !readForbiddenSkill,
+  score: readForbiddenSkill ? 0 : 1,
+  evidence: readForbiddenSkill ? ['agent read the PDF skill'] : ['no forbidden skill read'],
+}));
+process.exit(readForbiddenSkill ? 1 : 0);
+```
+
+## Acceptance Contract
+
+Graders are the source of truth for pass/fail. They can evaluate:
+
+- Files and generated artifacts in `/work`
+- Structured outputs such as `answer.json`
+- Behavior traces in `$RESULTS/trace.jsonl`
+- Any additional result-state files your checks create under `$RESULTS`
+
+Keep grading deterministic and local so results stay stable and reproducible.
+
+## Results And Metrics
+
+Single-trial `run-case` output:
+
+```text
+case/.results/<run-id>/
+  trace.jsonl
+  result.json
+  summary.json
+  workspace/        # on failure or --keep-workspace
+```
+
+Matrix `run-case` output:
+
+```text
+case/.results/<run-id>/
+  run-result.json
+  trials/<model-slug>--001/trace.jsonl
+  trials/<model-slug>--001/result.json
+```
+
+`run-suite` output:
+
+```text
+suite/.results/<run-id>/
+  suite-result.json
+  trials/<case-slug>--<model-slug>--001/trace.jsonl
+  trials/<case-slug>--<model-slug>--001/result.json
+```
+
+`result.json` includes:
+
+- `pass`, `score`, and `evidence`
+- Per-grader results under `graders`
+- `metrics.durationMs`, turns, tool counts, tokens, and cost
+
+Aggregate files include:
+
+- `trialPassRate`: passed trials / total trials
+- `meanScore`: mean top-level score
+- `passAtK` (pass@k): at least one trial passed
+- `passHatK` (pass^k): all trials passed
+- Relative `tracePath`, `resultPath`, and `summaryPath` entries
+
+## Trace JSONL
+
+`trace.jsonl` is newline-delimited JSON. Useful entry shapes:
+
+```json
+{ "type": "trace_start", "caseName": "extract-pdf-facts", "model": "openrouter/google/gemini-2.5-flash" }
+{ "type": "message", "role": "assistant", "text": "..." }
+{ "type": "tool_call", "name": "bash", "arguments": { "command": "node script.mjs" } }
+{ "type": "tool_call", "name": "read", "arguments": { "path": "/work/pdf-skill/SKILL.md" } }
+{ "type": "tool_result", "name": "bash", "text": "...", "isError": false }
+```
+
+Use trace evidence to debug why a model failed, verify tool usage, or enforce negative cases.
+
+## SDK Surface
+
+After `npm run build`, the package exports these workbench APIs from `skill-optimizer`:
+
+| API | Purpose |
+|-----|---------|
+| `loadWorkbenchCase(path)` | Parse and validate a case file |
+| `loadWorkbenchSuite(path)` | Parse and validate a suite file |
+| `runWorkbenchCase(params)` | Run one case or a model/trial matrix |
+| `runWorkbenchSuite(params)` | Run a suite matrix |
+| `runDockerWorkbenchCase(params)` | Lower-level Docker case runner |
+| `runGraderCommands(graders, opts)` | Execute grader commands and normalize results |
+| `normalizeCheckResult(result)` | Normalize shell output into a grade |
+| `parseModelList(raw)` | Parse comma-separated OpenRouter refs |
+| `aggregateTrials(results)` | Compute pass@k/pass^k/trial metrics |
+
+Example:
+
+```ts
+import { runWorkbenchSuite } from 'skill-optimizer';
+
+await runWorkbenchSuite({
+  suitePath: 'examples/workbench/pdf/suite.yml',
+  trials: 3,
+  concurrency: 2,
+});
+```
+
+Use CLI commands for normal human workflows. Use SDK functions for tests, wrappers, and automation inside this repo.
+
+## Eval Patterns
+
+Live CLI/API skills:
+
+- Prefer the real CLI/API/service when you are not certain how to mock its internals.
+- Mock only when you know the real command surface, validation, outputs, and failure modes well enough to reproduce them faithfully.
+- Use dedicated test credentials with least privilege, allowlist only the needed env vars, and avoid printing secrets into trace or grader evidence.
+- If mocking is justified, put a fake executable in `bin/` and record calls to `$WORK/calls.jsonl`. Grade command names, flags, output files, and trace behavior.
+- If the real tool is safe to call with setup/cleanup and scoped test credentials, install it in `setup` and grade its real dry-run or live request output.
+- Include a basic-command case and a flag/options case for command-selection coverage.
+- Include a no-tool-needed control case to catch unnecessary skill or CLI use.
+- Include a prompt-injection or unsafe-instruction case when external content, fetched pages, or third-party responses can influence the agent.
+
+File-output skills:
+
+- Ask for a concrete output file.
+- Grade structure directly, such as PDF page count, ZIP members, JSON schema, image dimensions, or file hash.
+- Inspect failed workspaces or rerun with `--keep-workspace` when you need output files for triage.
+
+Code/editing skills:
+
+- Seed `workspace/` with a small repo.
+- Ask for a normal change.
+- Grade diff, tests, generated files, or static properties.
+
+Negative/control cases:
+
+- Ask for a task that should not require the target skill.
+- Grade `trace.jsonl` for forbidden reads, tool calls, or commands.
+- For trace-based negative cases, ensure graders handle missing or empty trace entries defensively.
+
+## Debugging Failed Runs
+
+1. Open the failing trial `result.json` and read top-level `evidence`.
+2. Open `graders[]` to see which grader failed.
+3. Open `summary.json` for final assistant text and bash commands.
+4. Open `trace.jsonl` to inspect tool calls and file reads.
+5. Inspect preserved `workspace/` for failed trials.
+6. Classify the failure as unclear skill guidance, missing reference material, brittle grader, unrealistic input data, task ambiguity, or product/code bug.
+7. Update the target skill, references, inputs, graders, or code according to that diagnosis.
+8. Re-run the same case or suite and compare grader evidence across the target models/trials.
+
+## Example Suite
+
+The `examples/` tree (often referenced as `@examples/` in path-aware prompts) is part of the packaged skill-optimizer reference material. Use it as the concrete companion to this document.
+
+Start here:
+
+```text
+examples/
+  workbench/
+    README.md
+    pdf/
+      README.md
+      suite.yml
+      references/pdf-skill/SKILL.md
+      checks/*.mjs
+    mcp/
+      mcp/calculator-server.mjs
+```
+
+The tracked PDF demo is the best starting point:
+
+```bash
+npx tsx src/cli.ts run-suite examples/workbench/pdf/suite.yml --trials 1
+```
+
+Files to inspect:
+
+| File | Purpose |
+|------|---------|
+| `examples/workbench/README.md` | Top-level example command walkthrough |
+| `examples/workbench/pdf/suite.yml` | Inline suite using models, setup, graders, and append prompt |
+| `examples/workbench/pdf/references/pdf-skill/SKILL.md` | Skill under test copied into `/work` |
+| `examples/workbench/pdf/checks/*.mjs` | Deterministic graders and setup helpers |
+| `examples/workbench/pdf/README.md` | Demo walkthrough |
+| `examples/workbench/mcp/suite.yml` | Hidden-service MCP calculator demo |
+| `examples/workbench/mcp/mcp/calculator-server.mjs` | Calculator MCP server with add/subtract/multiply/divide |
+
+## Repository Verification
+
+Use these before claiming repo changes are complete:
+
+```bash
+npm run typecheck
+npm test
+npm run build
+npx tsx src/cli.ts --help
+node dist/cli.js --help
+```
+
+For runner/Docker changes, rebuild the image:
+
+```bash
+docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .
+```
diff --git a/src/actions/diff.ts b/src/actions/diff.ts
deleted file mode 100644
index a1763b2..0000000
--- a/src/actions/diff.ts
+++ /dev/null
@@ -1,61 +0,0 @@
-import { normalizeActionArgSchema } from './snapshot.js';
-import type { ActionCatalog, ActionDefinition } from './types.js';
-
-export interface ChangedAction {
-  before: ActionDefinition;
-  after: ActionDefinition;
-}
-
-export interface ActionCatalogDiff {
-  added: ActionDefinition[];
-  removed: ActionDefinition[];
-  changed: ChangedAction[];
-}
-
-function schemaFingerprint(action: ActionDefinition): string {
-  return JSON.stringify(normalizeActionArgSchema(action.args));
-}
-
-function indexByKey(actions: ActionDefinition[], side: 'before' | 'after'): Map<string, ActionDefinition> {
-  const indexed = new Map<string, ActionDefinition>();
-  for (const action of actions) {
-    const canonicalKey = action.key.trim();
-    if (indexed.has(canonicalKey)) {
-      throw new Error(`Duplicate action key in ${side} catalog: ${canonicalKey}`);
-    }
-    indexed.set(canonicalKey, {
-      ...action,
-      key: canonicalKey,
-    });
-  }
-  return indexed;
-}
-
-export function diffActionCatalog(before: ActionCatalog, after: ActionCatalog): ActionCatalogDiff {
-  const beforeByKey = indexByKey(before.actions, 'before');
-  const afterByKey = indexByKey(after.actions, 'after');
-
-  const added: ActionDefinition[] = [];
-  const removed: ActionDefinition[] = [];
-  const changed: ChangedAction[] = [];
-
-  for (const [key, afterAction] of afterByKey.entries()) {
-    const beforeAction = beforeByKey.get(key);
-    if (!beforeAction) {
-      added.push(afterAction);
-      continue;
-    }
-
-    if (schemaFingerprint(beforeAction) !== schemaFingerprint(afterAction)) {
-      changed.push({ before: beforeAction, after: afterAction });
-    }
-  }
-
-  for (const [key, beforeAction] of beforeByKey.entries()) {
-    if (!afterByKey.has(key)) {
-      removed.push(beforeAction);
-    }
-  }
-
-  return { added, removed, changed };
-}
diff --git a/src/actions/discover.ts b/src/actions/discover.ts
deleted file mode 100644
index 4e9d86e..0000000
--- a/src/actions/discover.ts
+++ /dev/null
@@ -1,109 +0,0 @@
-import { loadCliCommands, loadMcpTools } from './loaders.js';
-import type { ResolvedProjectConfig } from '../project/types.js';
-import type { ActionCatalog } from './types.js';
-import { readCliActionsFromSources } from './readers/cli.js';
-import { readMcpActionsFromSources } from './readers/mcp.js';
-import { readSdkActionsFromSources } from './readers/sdk.js';
-
-export function discoverActions(project: ResolvedProjectConfig): ActionCatalog {
-  const discoveryMode = project.target.discovery?.mode ?? 'auto';
-  const discoverySources = project.target.discovery?.sources ?? [];
-  const shouldUseDiscovery = discoveryMode !== 'manifest' && discoverySources.length > 0;
-
-  if (project.target.surface === 'sdk') {
-    if (shouldUseDiscovery) {
-      const actions = readSdkActionsFromSources(discoverySources);
-      if (actions.length > 0) {
-        return {
-          surface: 'sdk',
-          actions,
-        };
-      }
-
-      if ((project.target.sdk?.apiSurface?.length ?? 0) === 0) {
-        throw new Error(`SDK discovery found 0 actions from configured sources: ${discoverySources.join(', ')}`);
-      }
-    }
-
-    return {
-      surface: 'sdk',
-      actions: (project.target.sdk?.apiSurface ?? []).map((name) => ({
-        key: name,
-        name,
-        args: [],
-        source: 'sdk.apiSurface',
-      })),
-    };
-  }
-
-  if (project.target.surface === 'cli') {
-    if (shouldUseDiscovery) {
-      const actions = readCliActionsFromSources(discoverySources);
-      if (actions.length > 0) {
-        return {
-          surface: 'cli',
-          actions,
-        };
-      }
-
-      if (!project.target.cli?.commands) {
-        throw new Error(`CLI discovery found 0 actions from configured sources: ${discoverySources.join(', ')}`);
-      }
-    }
-
-    const commands = project.target.cli ? loadCliCommands(project.target.cli.commands) : [];
-    return {
-      surface: 'cli',
-      actions: commands.map((command) => ({
-        key: command.command,
-        name: command.command,
-        description: command.description,
-        args: (command.options ?? []).map((option) => ({
-          name: normalizeCliArgName(option.name),
-          required: false,
-          type: option.takesValue ? 'string' : 'boolean',
-          description: option.description,
-        })),
-        source: 'cli.commands',
-      })),
-    };
-  }
-
-  if (shouldUseDiscovery) {
-    const actions = readMcpActionsFromSources(discoverySources);
-    if (actions.length > 0) {
-      return {
-        surface: 'mcp',
-        actions,
-      };
-    }
-
-    if (!project.target.mcp?.tools) {
-      throw new Error(`MCP discovery found 0 actions from configured sources: ${discoverySources.join(', ')}`);
-    }
-  }
-
-  const tools = project.target.mcp ? loadMcpTools(project.target.mcp.tools) : [];
-  return {
-    surface: 'mcp',
-    actions: tools.map((tool) => ({
-      key: tool.function.name,
-      name: tool.function.name,
-      description: tool.function.description,
-      args: Object.entries(tool.function.parameters?.properties ?? {}).map(([name, schema]) => ({
-        name,
-        required: (tool.function.parameters?.required ?? []).includes(name),
-        type: typeof schema === 'object' && schema && 'type' in schema ? String((schema as { type?: unknown }).type ?? '') || undefined : undefined,
-        description: typeof schema === 'object' && schema && 'description' in schema ? String((schema as { description?: unknown }).description ?? '') || undefined : undefined,
-        schema: typeof schema === 'object' && schema && !Array.isArray(schema)
-          ? schema as Record<string, unknown>
-          : undefined,
-      })),
-      source: 'mcp.tools',
-    })),
-  };
-}
-
-function normalizeCliArgName(name: string): string {
-  return name.replace(/^-+/, '');
-}
diff --git a/src/actions/index.ts b/src/actions/index.ts
deleted file mode 100644
index dda1e60..0000000
--- a/src/actions/index.ts
+++ /dev/null
@@ -1,25 +0,0 @@
-export { diffActionCatalog, type ActionCatalogDiff, type ChangedAction } from './diff.js';
-export { loadCliCommands, loadMcpTools } from './loaders.js';
-export { discoverActions } from './discover.js';
-export { readCliActionsFromSources } from './readers/cli.js';
-export { readMcpActionsFromSources } from './readers/mcp.js';
-export { readSdkActionsFromSources } from './readers/sdk.js';
-export {
-  ACTION_SNAPSHOT_VERSION,
-  fromSurfaceSnapshot,
-  loadActionSnapshotFile,
-  normalizeActionArgSchema,
-  normalizeActionCatalog,
-  normalizeActionDefinition,
-  toSurfaceSnapshot,
-  writeActionSnapshotFile,
-  type ActionSnapshotArtifact,
-} from './snapshot.js';
-
-export type {
-  ActionArgSchema,
-  ActionAttempt,
-  ActionCatalog,
-  ActionDefinition,
-  ActionSurface,
-} from './types.js';
diff --git a/src/actions/loaders.ts b/src/actions/loaders.ts
deleted file mode 100644
index f4088f2..0000000
--- a/src/actions/loaders.ts
+++ /dev/null
@@ -1,82 +0,0 @@
-import { readFileSync, existsSync } from 'node:fs';
-import { resolve } from 'node:path';
-import type { McpToolDefinition, CliCommandDefinition } from '../benchmark/types.js';
-
-/**
- * Load MCP tool definitions from the tools.json path specified in config.
- */
-export function loadMcpTools(toolsPath: string, baseDir?: string): McpToolDefinition[] {
-  const resolved = resolve(baseDir ?? process.cwd(), toolsPath);
-  if (!existsSync(resolved)) {
-    throw new Error(`MCP tools file not found: ${resolved}`);
-  }
-
-  let raw: string;
-  try {
-    raw = readFileSync(resolved, 'utf-8');
-  } catch (err) {
-    throw new Error(`Failed to read MCP tools: ${resolved}: ${err instanceof Error ? err.message : err}`);
-  }
-
-  let tools: McpToolDefinition[];
-  try {
-    tools = JSON.parse(raw) as McpToolDefinition[];
-  } catch (err) {
-    throw new Error(`Invalid JSON in MCP tools file: ${resolved}: ${err instanceof Error ? err.message : err}`);
-  }
-
-  if (!Array.isArray(tools)) {
-    throw new Error(`MCP tools file ${resolved}: must be a JSON array of tool definitions`);
-  }
-
-  return tools;
-}
-
-/**
- * Load CLI command definitions from the commands.json path specified in config.
- */
-export function loadCliCommands(commandsPath: string, baseDir?: string): CliCommandDefinition[] {
-  const resolved = resolve(baseDir ?? process.cwd(), commandsPath);
-  if (!existsSync(resolved)) {
-    throw new Error(`CLI commands file not found: ${resolved}`);
-  }
-
-  let raw: string;
-  try {
-    raw = readFileSync(resolved, 'utf-8');
-  } catch (err) {
-    throw new Error(`Failed to read CLI commands: ${resolved}: ${err instanceof Error ? err.message : err}`);
-  }
-
-  let commands: CliCommandDefinition[];
-  try {
-    commands = JSON.parse(raw) as CliCommandDefinition[];
-  } catch (err) {
-    throw new Error(`Invalid JSON in CLI commands file: ${resolved}: ${err instanceof Error ? err.message : err}`);
-  }
-
-  if (!Array.isArray(commands)) {
-    throw new Error(`CLI commands file ${resolved}: must be a JSON array of command definitions`);
-  }
-
-  for (const [index, command] of commands.entries()) {
-    if (!command || typeof command !== 'object') {
-      throw new Error(`CLI commands file ${resolved}: entry ${index} must be an object`);
-    }
-    if (typeof command.command !== 'string' || command.command.trim() === '') {
-      throw new Error(`CLI commands file ${resolved}: entry ${index} must include a non-empty "command" string`);
-    }
-    if (command.options !== undefined && !Array.isArray(command.options)) {
-      throw new Error(`CLI commands file ${resolved}: entry ${index} options must be an array when present`);
-    }
-    if (Array.isArray(command.options)) {
-      for (const [optionIndex, option] of command.options.entries()) {
-        if (!option || typeof option !== 'object' || typeof option.name !== 'string' || option.name.trim() === '') {
-          throw new Error(`CLI commands file ${resolved}: entry ${index} option ${optionIndex} must include a non-empty "name" string`);
-        }
-      }
-    }
-  }
-
-  return commands;
-}
diff --git a/src/actions/readers/cli.ts b/src/actions/readers/cli.ts
deleted file mode 100644
index a1de900..0000000
--- a/src/actions/readers/cli.ts
+++ /dev/null
@@ -1,21 +0,0 @@
-import type { DiscoveryOptions } from '../../discovery/types.js';
-import { discoverCliSurfaceFromSources } from '../../discovery/cli.js';
-import type { ActionDefinition } from '../types.js';
-
-export function readCliActionsFromSources(sources: string[], options: DiscoveryOptions = {}): ActionDefinition[] {
-  const snapshot = discoverCliSurfaceFromSources(sources, options);
-  return snapshot.actions.map((action) => ({
-    key: action.name,
-    name: action.name,
-    description: action.description,
-    args: action.args.map((arg) => ({
-      ...arg,
-      name: normalizeCliArgName(arg.name),
-    })),
-    source: action.source,
-  }));
-}
-
-function normalizeCliArgName(name: string): string {
-  return name.replace(/^-+/, '');
-}
diff --git a/src/actions/readers/mcp.ts b/src/actions/readers/mcp.ts
deleted file mode 100644
index 08a40d9..0000000
--- a/src/actions/readers/mcp.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-import type { DiscoveryOptions } from '../../discovery/types.js';
-import { discoverMcpSurfaceFromSources } from '../../discovery/mcp.js';
-import type { ActionDefinition } from '../types.js';
-
-export function readMcpActionsFromSources(sources: string[], options: DiscoveryOptions = {}): ActionDefinition[] {
-  const snapshot = discoverMcpSurfaceFromSources(sources, options);
-  return snapshot.actions.map((action) => ({
-    key: action.name,
-    name: action.name,
-    description: action.description,
-    args: action.args,
-    source: action.source,
-  }));
-}
diff --git a/src/actions/readers/sdk.ts b/src/actions/readers/sdk.ts
deleted file mode 100644
index e7eb12c..0000000
--- a/src/actions/readers/sdk.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-import type { DiscoveryOptions } from '../../discovery/types.js';
-import { discoverSdkSurfaceFromSources } from '../../discovery/sdk.js';
-import type { ActionDefinition } from '../types.js';
-
-export function readSdkActionsFromSources(sources: string[], options: DiscoveryOptions = {}): ActionDefinition[] {
-  const snapshot = discoverSdkSurfaceFromSources(sources, options);
-  return snapshot.actions.map((action) => ({
-    key: action.name,
-    name: action.name,
-    description: action.description,
-    args: action.args,
-    source: action.source,
-  }));
-}
diff --git a/src/actions/snapshot.ts b/src/actions/snapshot.ts
deleted file mode 100644
index c265a7b..0000000
--- a/src/actions/snapshot.ts
+++ /dev/null
@@ -1,211 +0,0 @@
-import { existsSync, readFileSync, writeFileSync } from 'node:fs';
-
-import type { SurfaceSnapshot } from '../project/types.js';
-import type { ActionArgSchema, ActionCatalog, ActionDefinition } from './types.js';
-
-export const ACTION_SNAPSHOT_VERSION = 1;
-
-export interface ActionSnapshotArtifact {
-  version: typeof ACTION_SNAPSHOT_VERSION;
-  catalog: ActionCatalog;
-}
-
-function invalidSnapshot(snapshotPath: string, detail: string): never {
-  throw new Error(`Invalid action snapshot file: ${snapshotPath} (${detail})`);
-}
-
-function validateActionArgs(snapshotPath: string, actionIndex: number, args: unknown): ActionArgSchema[] {
-  if (!Array.isArray(args)) {
-    invalidSnapshot(snapshotPath, `catalog.actions[${actionIndex}].args must be an array`);
-  }
-
-  return args.map((arg, argIndex) => {
-    const path = `catalog.actions[${actionIndex}].args[${argIndex}]`;
-    if (!arg || typeof arg !== 'object') {
-      invalidSnapshot(snapshotPath, `${path} must be an object`);
-    }
-    const candidate = arg as Partial<ActionArgSchema>;
-    if (typeof candidate.name !== 'string') {
-      invalidSnapshot(snapshotPath, `${path}.name must be a string`);
-    }
-    if (typeof candidate.required !== 'boolean') {
-      invalidSnapshot(snapshotPath, `${path}.required must be a boolean`);
-    }
-    if (candidate.type !== undefined && typeof candidate.type !== 'string') {
-      invalidSnapshot(snapshotPath, `${path}.type must be a string when provided`);
-    }
-    if (candidate.description !== undefined && typeof candidate.description !== 'string') {
-      invalidSnapshot(snapshotPath, `${path}.description must be a string when provided`);
-    }
-    if (candidate.schema !== undefined && (!candidate.schema || typeof candidate.schema !== 'object' || Array.isArray(candidate.schema))) {
-      invalidSnapshot(snapshotPath, `${path}.schema must be an object when provided`);
-    }
-
-    return {
-      name: candidate.name,
-      required: candidate.required,
-      type: candidate.type,
-      description: candidate.description,
-      schema: candidate.schema as Record<string, unknown> | undefined,
-    };
-  });
-}
-
-function validateCatalogActions(snapshotPath: string, actions: unknown): ActionDefinition[] {
-  if (!Array.isArray(actions)) {
-    invalidSnapshot(snapshotPath, 'catalog.actions must be an array');
-  }
-
-  return actions.map((action, actionIndex) => {
-    const path = `catalog.actions[${actionIndex}]`;
-    if (!action || typeof action !== 'object') {
-      invalidSnapshot(snapshotPath, `${path} must be an object`);
-    }
-    const candidate = action as Partial<ActionDefinition>;
-    if (typeof candidate.key !== 'string') {
-      invalidSnapshot(snapshotPath, `${path}.key must be a string`);
-    }
-    if (typeof candidate.name !== 'string') {
-      invalidSnapshot(snapshotPath, `${path}.name must be a string`);
-    }
-    if (candidate.description !== undefined && typeof candidate.description !== 'string') {
-      invalidSnapshot(snapshotPath, `${path}.description must be a string when provided`);
-    }
-    if (candidate.source !== undefined && typeof candidate.source !== 'string') {
-      invalidSnapshot(snapshotPath, `${path}.source must be a string when provided`);
-    }
-
-    return {
-      key: candidate.key,
-      name: candidate.name,
-      description: candidate.description,
-      args: validateActionArgs(snapshotPath, actionIndex, candidate.args),
-      source: candidate.source,
-    };
-  });
-}
-
-function normalizeSchemaValue(value: unknown): unknown {
-  if (Array.isArray(value)) {
-    return value.map(normalizeSchemaValue);
-  }
-
-  if (!value || typeof value !== 'object') {
-    return value;
-  }
-
-  const normalizedEntries = Object.entries(value as Record<string, unknown>)
-    .sort(([left], [right]) => left.localeCompare(right))
-    .map(([key, childValue]) => {
-      if (key === 'required' && Array.isArray(childValue) && childValue.every((entry) => typeof entry === 'string')) {
-        return [key, [...childValue].sort()] as const;
-      }
-
-      return [key, normalizeSchemaValue(childValue)] as const;
-    });
-
-  return Object.fromEntries(normalizedEntries);
-}
-
-export function normalizeActionArgSchema(args: ActionArgSchema[]): ActionArgSchema[] {
-  return [...args]
-    .map((arg) => ({
-      name: arg.name,
-      required: Boolean(arg.required),
-      type: arg.type,
-      description: arg.description,
-      schema: arg.schema ? normalizeSchemaValue(arg.schema) as Record<string, unknown> : undefined,
-    }))
-    .sort((a, b) => a.name.localeCompare(b.name));
-}
-
-export function normalizeActionDefinition(action: ActionDefinition): ActionDefinition {
-  return {
-    ...action,
-    key: action.key.trim(),
-    args: normalizeActionArgSchema(action.args),
-  };
-}
-
-export function normalizeActionCatalog(catalog: ActionCatalog): ActionCatalog {
-  return {
-    surface: catalog.surface,
-    actions: catalog.actions.map(normalizeActionDefinition),
-  };
-}
-
-export function writeActionSnapshotFile(snapshotPath: string, catalog: ActionCatalog): void {
-  const artifact: ActionSnapshotArtifact = {
-    version: ACTION_SNAPSHOT_VERSION,
-    catalog: normalizeActionCatalog(catalog),
-  };
-  writeFileSync(snapshotPath, JSON.stringify(artifact, null, 2), 'utf-8');
-}
-
-export function loadActionSnapshotFile(snapshotPath: string): ActionSnapshotArtifact {
-  if (!existsSync(snapshotPath)) {
-    throw new Error(`Action snapshot file not found: ${snapshotPath}`);
-  }
-
-  const raw = readFileSync(snapshotPath, 'utf-8');
-  let parsed: unknown;
-  try {
-    parsed = JSON.parse(raw) as unknown;
-  } catch (error) {
-    invalidSnapshot(snapshotPath, `invalid JSON: ${error instanceof Error ? error.message : String(error)}`);
-  }
-  if (!parsed || typeof parsed !== 'object') {
-    invalidSnapshot(snapshotPath, 'expected object root');
-  }
-
-  const candidate = parsed as Partial<ActionSnapshotArtifact>;
-  if (typeof candidate.version !== 'number') {
-    invalidSnapshot(snapshotPath, 'version must be a number');
-  }
-  if (candidate.version !== ACTION_SNAPSHOT_VERSION) {
-    throw new Error(`Unsupported action snapshot version ${candidate.version}; expected ${ACTION_SNAPSHOT_VERSION}`);
-  }
-
-  if (!candidate.catalog || typeof candidate.catalog !== 'object') {
-    invalidSnapshot(snapshotPath, 'catalog must be an object');
-  }
-
-  const catalog = candidate.catalog as Partial<ActionCatalog>;
-  if (catalog.surface !== 'sdk' && catalog.surface !== 'cli' && catalog.surface !== 'mcp' && catalog.surface !== 'prompt') {
-    invalidSnapshot(snapshotPath, 'catalog.surface must be one of sdk|cli|mcp|prompt');
-  }
-  const validatedActions = validateCatalogActions(snapshotPath, catalog.actions);
-
-  return {
-    version: ACTION_SNAPSHOT_VERSION,
-    catalog: normalizeActionCatalog({
-      surface: catalog.surface,
-      actions: validatedActions,
-    }),
-  };
-}
-
-export function fromSurfaceSnapshot(snapshot: SurfaceSnapshot): ActionCatalog {
-  return normalizeActionCatalog({
-    surface: snapshot.surface,
-    actions: snapshot.actions.map((action) => ({
-      key: action.name,
-      name: action.name,
-      description: action.description,
-      args: normalizeActionArgSchema(action.args),
-      source: action.source,
-    })),
-  });
-}
-
-export function toSurfaceSnapshot(catalog: ActionCatalog): SurfaceSnapshot {
-  return {
-    surface: catalog.surface,
-    actions: catalog.actions.map((action) => ({
-      name: action.name,
-      description: action.description,
-      args: normalizeActionArgSchema(action.args),
-      source: action.source,
-    })),
-  };
-}
diff --git a/src/actions/types.ts b/src/actions/types.ts
deleted file mode 100644
index 1ed78c4..0000000
--- a/src/actions/types.ts
+++ /dev/null
@@ -1,30 +0,0 @@
-export type ActionSurface = 'sdk' | 'cli' | 'mcp' | 'prompt';
-
-export interface ActionArgSchema {
-  name: string;
-  required: boolean;
-  type?: string;
-  description?: string;
-  schema?: Record<string, unknown>;
-}
-
-export interface ActionDefinition {
-  key: string;
-  name: string;
-  description?: string;
-  args: ActionArgSchema[];
-  source?: string;
-}
-
-export interface ActionCatalog {
-  surface: ActionSurface;
-  actions: ActionDefinition[];
-}
-
-export interface ActionAttempt {
-  method: string;
-  key?: string;
-  args: Record<string, unknown>;
-  line: number;
-  raw: string;
-}
diff --git a/src/benchmark/compare.ts b/src/benchmark/compare.ts
deleted file mode 100644
index 980ec8f..0000000
--- a/src/benchmark/compare.ts
+++ /dev/null
@@ -1,244 +0,0 @@
-import { readFileSync } from 'node:fs';
-import type { BenchmarkReport, ComparisonReport, TaskDelta, Delta } from './types.js';
-
-/**
- * Load a benchmark report from a JSON file.
- */
-export function loadReport(path: string): BenchmarkReport {
-  return JSON.parse(readFileSync(path, 'utf-8')) as BenchmarkReport;
-}
-
-/**
- * Compare baseline and current reports. Compute deltas for each task×model pair.
- */
-export function compareReports(
-  baseline: BenchmarkReport,
-  current: BenchmarkReport,
-): ComparisonReport {
-  const baselineMap = new Map<string, { passed: boolean; recall: number; toolSelection: number }>();
-  for (const r of baseline.results) {
-    const key = `${r.task.id}:${r.model.id}`;
-    baselineMap.set(key, {
-      passed: r.metrics.taskPassed,
-      recall: r.metrics.toolRecall,
-      toolSelection: r.metrics.toolSelectionAccuracy,
-    });
-  }
-
-  const currentMap = new Map<string, { passed: boolean; recall: number; toolSelection: number; taskId: string; modelId: string }>();
-  for (const r of current.results) {
-    const key = `${r.task.id}:${r.model.id}`;
-    currentMap.set(key, {
-      passed: r.metrics.taskPassed,
-      recall: r.metrics.toolRecall,
-      toolSelection: r.metrics.toolSelectionAccuracy,
-      taskId: r.task.id,
-      modelId: r.model.id,
-    });
-  }
-
-  const allKeys = new Set<string>([...baselineMap.keys(), ...currentMap.keys()]);
-
-  const taskDeltas: TaskDelta[] = [];
-  let improved = 0;
-  let regressed = 0;
-  let unchanged = 0;
-
-  for (const key of allKeys) {
-    const [taskId, ...modelParts] = key.split(':');
-    const modelId = modelParts.join(':'); // model IDs may contain ':'
-
-    const inBaseline = baselineMap.get(key);
-    const inCurrent = currentMap.get(key);
-
-    let delta: Delta;
-    let passedBefore = false;
-    let passedNow = false;
-    let recallBefore = 0;
-    let recallNow = 0;
-
-    if (inBaseline && inCurrent) {
-      passedBefore = inBaseline.passed;
-      passedNow = inCurrent.passed;
-      recallBefore = inBaseline.recall;
-      recallNow = inCurrent.recall;
-
-      if (!passedBefore && passedNow) {
-        delta = 'improved';
-        improved++;
-      } else if (passedBefore && !passedNow) {
-        delta = 'regressed';
-        regressed++;
-      } else {
-        delta = 'unchanged';
-        unchanged++;
-      }
-    } else if (inCurrent && !inBaseline) {
-      passedNow = inCurrent.passed;
-      recallNow = inCurrent.recall;
-      delta = 'new';
-    } else {
-      passedBefore = inBaseline!.passed;
-      recallBefore = inBaseline!.recall;
-      delta = 'removed';
-    }
-
-    taskDeltas.push({
-      taskId: taskId ?? key,
-      modelId,
-      passedBefore,
-      passedNow,
-      delta,
-      recallBefore,
-      recallNow,
-      toolSelectionBefore: inBaseline?.toolSelection ?? 0,
-      toolSelectionNow: inCurrent?.toolSelection ?? 0,
-    });
-  }
-
-  const deltaOrder: Record<Delta, number> = {
-    regressed: 0,
-    improved: 1,
-    new: 2,
-    removed: 3,
-    unchanged: 4,
-  };
-  taskDeltas.sort((a, b) => {
-    const orderDiff = deltaOrder[a.delta] - deltaOrder[b.delta];
-    if (orderDiff !== 0) return orderDiff;
-    if (a.taskId < b.taskId) return -1;
-    if (a.taskId > b.taskId) return 1;
-    return a.modelId.localeCompare(b.modelId);
-  });
-
-  const coverageBefore = baseline.summary.methodCoveragePercent;
-  const coverageNow = current.summary.methodCoveragePercent;
-  const accuracyBefore = baseline.summary.overallPassRate;
-  const accuracyNow = current.summary.overallPassRate;
-
-  return {
-    baseline: {
-      timestamp: baseline.timestamp,
-      skillVersion: baseline.skillVersion,
-    },
-    current: {
-      timestamp: current.timestamp,
-      skillVersion: current.skillVersion,
-    },
-    taskDeltas,
-    summary: {
-      improved,
-      regressed,
-      unchanged,
-      coverageBefore,
-      coverageNow,
-      accuracyBefore,
-      accuracyNow,
-    },
-  };
-}
-
-/**
- * Print comparison to console.
- */
-export function printComparison(comparison: ComparisonReport): void {
-  const { baseline, current, taskDeltas, summary } = comparison;
-
-  const baseSha = baseline.skillVersion.commitSha.slice(0, 8);
-  const curSha = current.skillVersion.commitSha.slice(0, 8);
-
-  console.log('');
-  console.log(`Skill Version: ${baseSha} → ${curSha}`);
-  console.log(`Baseline:      ${new Date(baseline.timestamp).toUTCString()}`);
-  console.log(`Current:       ${new Date(current.timestamp).toUTCString()}`);
-  console.log('');
-
-  // Column widths
-  const COL_TASK = 26;
-  const COL_MODEL = 20;
-  const COL_BEFORE = 10;
-  const COL_AFTER = 10;
-  const COL_DELTA = 12;
-
-  // Header
-  const header =
-    padR('Task', COL_TASK) +
-    '  ' +
-    padR('Model', COL_MODEL) +
-    '  ' +
-    padR('Baseline', COL_BEFORE) +
-    '  ' +
-    padR('Current', COL_AFTER) +
-    '  ' +
-    'Delta';
-  console.log(header);
-  console.log('─'.repeat(header.length + COL_DELTA));
-
-  const interesting = taskDeltas.filter(d => d.delta !== 'unchanged');
-  const unchangedDeltas = taskDeltas.filter(d => d.delta === 'unchanged');
-
-  const printRow = (d: TaskDelta): void => {
-    const before = d.delta === 'new' ? '—' : d.passedBefore ? '✅' : '❌';
-    const after = d.delta === 'removed' ? '—' : d.passedNow ? '✅' : '❌';
-
-    let deltaLabel: string;
-    switch (d.delta) {
-      case 'improved':
-        deltaLabel = 'IMPROVED ↑';
-        break;
-      case 'regressed':
-        deltaLabel = 'REGRESSED ↓';
-        break;
-      case 'new':
-        deltaLabel = 'new';
-        break;
-      case 'removed':
-        deltaLabel = 'removed';
-        break;
-      default:
-        deltaLabel = 'unchanged';
-    }
-
-    const modelDisplay = d.modelId.includes('/') ? d.modelId.split('/').pop()! : d.modelId;
-
-    console.log(
-      padR(d.taskId, COL_TASK) +
-        '  ' +
-        padR(modelDisplay.slice(0, COL_MODEL), COL_MODEL) +
-        '  ' +
-        padR(before, COL_BEFORE) +
-        '  ' +
-        padR(after, COL_AFTER) +
-        '  ' +
-        deltaLabel,
-    );
-  };
-
-  for (const d of interesting) {
-    printRow(d);
-  }
-
-  if (interesting.length > 0 && unchangedDeltas.length > 0) {
-    console.log(`  ... and ${unchangedDeltas.length} unchanged result(s)`);
-  } else if (unchangedDeltas.length > 0 && interesting.length === 0) {
-    console.log(`  All ${unchangedDeltas.length} result(s) unchanged.`);
-  }
-
-  console.log('');
-  console.log(
-    `Summary: ${summary.improved} improved, ${summary.regressed} regressed, ${summary.unchanged} unchanged`,
-  );
-  console.log(
-    `Coverage: ${(summary.coverageBefore * 100).toFixed(1)}% → ${(summary.coverageNow * 100).toFixed(1)}%`,
-  );
-  console.log(
-    `Accuracy: ${(summary.accuracyBefore * 100).toFixed(1)}% → ${(summary.accuracyNow * 100).toFixed(1)}%`,
-  );
-  console.log('');
-}
-
-// ── Internal helpers ──────────────────────────────────────────────────────────
-
-function padR(s: string, w: number): string {
-  return s.length >= w ? s.slice(0, w) : s + ' '.repeat(w - s.length);
-}
diff --git a/src/benchmark/config.ts b/src/benchmark/config.ts
deleted file mode 100644
index b5f9923..0000000
--- a/src/benchmark/config.ts
+++ /dev/null
@@ -1,160 +0,0 @@
-import { readFileSync, existsSync } from 'node:fs';
-import { resolve } from 'node:path';
-import type {
-  BenchmarkConfig,
-  TaskDefinition,
-  ModelConfig,
-  ExpectedAction,
-} from './types.js';
-import { DEFAULT_PROJECT_CONFIG_NAME, loadProjectConfig, toBenchmarkConfig } from '../project/index.js';
-export { loadMcpTools, loadCliCommands } from '../actions/loaders.js';
-
-const DEFAULT_CONFIG_NAME = DEFAULT_PROJECT_CONFIG_NAME;
-const SAFE_TASK_ID = /^[A-Za-z0-9._-]+$/;
-
-function isSafeTaskId(taskId: string): boolean {
-  return SAFE_TASK_ID.test(taskId) && taskId !== '.' && taskId !== '..';
-}
-
-/**
- * Load benchmark config from the given path or search for benchmark.config.json
- * in the current working directory.
- */
-export async function loadConfig(configPath?: string): Promise<{ config: BenchmarkConfig; configDir: string }> {
-  // Skip the dirty-git check here — it runs on every benchmark invocation (baseline,
-  // each iteration) which causes false failures when the mutation agent operates in
-  // the target repo between iterations. The optimizer manages git state via ensureReady
-  // (run once before the loop); the standalone `run` command validates via its own
-  // loadProjectConfig call in cli.ts before generating tasks.
-  const project = await loadProjectConfig(configPath ?? DEFAULT_CONFIG_NAME, { skipDirtyGitCheck: true });
-  return {
-    config: toBenchmarkConfig(project),
-    configDir: project.configDir,
-  };
-}
-
-/**
- * Load task definitions from the tasks.json path specified in config.
- * Resolves the path relative to the config file's directory or CWD.
- */
-export function loadTasks(tasksPath: string, baseDir?: string): TaskDefinition[] {
-  const resolved = resolve(baseDir ?? process.cwd(), tasksPath);
-  if (!existsSync(resolved)) {
-    throw new Error(`Tasks file not found: ${resolved}`);
-  }
-
-  let raw: string;
-  try {
-    raw = readFileSync(resolved, 'utf-8');
-  } catch (err) {
-    throw new Error(`Failed to read tasks: ${resolved}: ${err instanceof Error ? err.message : err}`);
-  }
-
-  let parsed: { tasks: Array<{ id?: unknown; prompt?: unknown; expected_actions?: unknown; verify?: unknown; expected_fetches?: unknown; capabilityId?: unknown }> };
-  try {
-    parsed = JSON.parse(raw) as typeof parsed;
-  } catch (err) {
-    throw new Error(`Invalid JSON in tasks file: ${resolved}: ${err instanceof Error ? err.message : err}`);
-  }
-
-  if (!parsed.tasks || !Array.isArray(parsed.tasks)) {
-    throw new Error(`Tasks file ${resolved}: must have a "tasks" array at the root`);
-  }
-
-  return parsed.tasks.map((task, index) => normalizeTaskDefinition(task, resolved, index));
-}
-
-function normalizeTaskDefinition(
-  task: { id?: unknown; prompt?: unknown; expected_actions?: unknown; verify?: unknown; expected_fetches?: unknown; capabilityId?: unknown },
-  resolvedPath: string,
-  index: number,
-): TaskDefinition {
-  if (typeof task.id !== 'string' || task.id.trim() === '') {
-    throw new Error(`Tasks file ${resolvedPath}: task at index ${index} must include a non-empty string id`);
-  }
-  if (!isSafeTaskId(task.id)) {
-    throw new Error(`Tasks file ${resolvedPath}: task id "${task.id}" must match ${SAFE_TASK_ID.toString()} and cannot be . or ..`);
-  }
-  if (typeof task.prompt !== 'string' || task.prompt.trim() === '') {
-    throw new Error(`Tasks file ${resolvedPath}: task ${task.id} must include a non-empty string prompt`);
-  }
-
-  const rawExpectedActions = Array.isArray(task.expected_actions) ? task.expected_actions : null;
-
-  if (!rawExpectedActions) {
-    throw new Error(`Tasks file ${resolvedPath}: task at index ${index} must include an expected_actions array`);
-  }
-
-  const expected_actions = rawExpectedActions.map((rawAction, actionIndex) => normalizeExpectedAction(rawAction, resolvedPath, index, actionIndex));
-
-  const rawVerify = Array.isArray(task.verify) ? task.verify : undefined;
-  if (rawVerify !== undefined) {
-    for (let i = 0; i < rawVerify.length; i++) {
-      if (!rawVerify[i] || typeof rawVerify[i] !== 'object') {
-        throw new Error(`Tasks file ${resolvedPath}: task ${task.id} verify[${i}] must be an object`);
-      }
-    }
-  }
-
-  const rawFetches = Array.isArray(task.expected_fetches) ? task.expected_fetches : undefined;
-  if (rawFetches !== undefined) {
-    for (let i = 0; i < rawFetches.length; i++) {
-      if (typeof rawFetches[i] !== 'string' || !(rawFetches[i] as string).trim()) {
-        throw new Error(`Tasks file ${resolvedPath}: task ${task.id} expected_fetches[${i}] must be a non-empty string`);
-      }
-    }
-  }
-
-  const capabilityId = typeof task.capabilityId === 'string' ? task.capabilityId : undefined;
-
-  return {
-    id: task.id,
-    prompt: task.prompt,
-    expected_actions,
-    verify: rawVerify as TaskDefinition['verify'] | undefined,
-    expected_fetches: rawFetches as string[] | undefined,
-    ...(capabilityId !== undefined ? { capabilityId } : {}),
-  };
-}
-
-function normalizeExpectedAction(
-  rawAction: unknown,
-  resolvedPath: string,
-  taskIndex: number,
-  actionIndex: number,
-): ExpectedAction {
-  if (!rawAction || typeof rawAction !== 'object') {
-    throw new Error(`Tasks file ${resolvedPath}: task ${taskIndex} action ${actionIndex} must be an object`);
-  }
-
-  const candidate = rawAction as { name?: unknown; args?: unknown };
-  const name = typeof candidate.name === 'string' ? candidate.name : null;
-
-  if (!name || name.trim() === '') {
-    throw new Error(`Tasks file ${resolvedPath}: task ${taskIndex} action ${actionIndex} must include a non-empty name`);
-  }
-
-  if (candidate.args !== undefined && (!candidate.args || typeof candidate.args !== 'object' || Array.isArray(candidate.args))) {
-    throw new Error(`Tasks file ${resolvedPath}: task ${taskIndex} action ${actionIndex} args must be an object when present`);
-  }
-
-  return {
-    name,
-    args: (candidate.args as Record<string, unknown> | undefined) ?? {},
-  };
-}
-
-/**
- * Helper to get a model by slug (lowercased name with non-alphanumeric replaced by hyphens).
- */
-export function slugify(name: string): string {
-  return name.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
-}
-
-export function getModelBySlug(config: BenchmarkConfig, slug: string): ModelConfig | undefined {
-  return config.llm.models.find(m => slugify(m.name) === slug || m.id === slug);
-}
-
-export function getModelsByTier(config: BenchmarkConfig, tier: string): ModelConfig[] {
-  return config.llm.models.filter(m => m.tier === tier);
-}
diff --git a/src/benchmark/coverage.ts b/src/benchmark/coverage.ts
deleted file mode 100644
index f7bce4e..0000000
--- a/src/benchmark/coverage.ts
+++ /dev/null
@@ -1,62 +0,0 @@
-import type { TaskDefinition, MethodCoverage } from './types.js';
-import { getExpectedActions, getExpectedActionName } from './types.js';
-
-// ── Coverage computation ───────────────────────────────────────────────────
-
-/**
- * Compute which methods are covered by at least one task in the task suite.
- *
- * @param tasks - The task definitions to check coverage against
- * @param allMethods - The full list of known methods (from config.code.methods or MCP tool names)
- */
-export function computeCoverage(tasks: TaskDefinition[], allMethods: string[]): MethodCoverage[] {
-  return allMethods.map((method) => {
-    const tasksCovering: string[] = [];
-
-    for (const task of tasks) {
-      const covers = getExpectedActions(task).some((tool) => getExpectedActionName(tool) === method);
-      if (covers) {
-        tasksCovering.push(task.id);
-      }
-    }
-
-    return {
-      method,
-      tasksCovering,
-      covered: tasksCovering.length > 0,
-    };
-  });
-}
-
-// ── Coverage report ────────────────────────────────────────────────────────
-
-/**
- * Print a coverage report to console.
- *
- * Example output:
- *   SDK Method Coverage:
- *   ✔ MyClient.constructor      (3 tasks)
- *   ✘ MyClient.submit           (0 tasks)
- *
- *   Coverage: 14/17 methods (82%)
- */
-export function printCoverage(coverage: MethodCoverage[]): void {
-  console.log('SDK Method Coverage:');
-
-  const maxMethodLen = Math.max(...coverage.map((c) => c.method.length));
-
-  for (const entry of coverage) {
-    const icon = entry.covered ? '✔' : '✘';
-    const padded = entry.method.padEnd(maxMethodLen);
-    const taskCount = entry.tasksCovering.length;
-    const taskLabel = taskCount === 1 ? 'task' : 'tasks';
-    console.log(`${icon} ${padded}  (${taskCount} ${taskLabel})`);
-  }
-
-  const coveredCount = coverage.filter((c) => c.covered).length;
-  const totalCount = coverage.length;
-  const percent = totalCount === 0 ? 0 : Math.round((coveredCount / totalCount) * 100);
-
-  console.log('');
-  console.log(`Coverage: ${coveredCount}/${totalCount} methods (${percent}%)`);
-}
diff --git a/src/benchmark/evaluator.ts b/src/benchmark/evaluator.ts
deleted file mode 100644
index c1bf64e..0000000
--- a/src/benchmark/evaluator.ts
+++ /dev/null
@@ -1,549 +0,0 @@
-import type {
-  ExpectedAction,
-  ExtractedCall,
-  ActionMatch,
-  TaskDefinition,
-  TaskResult,
-  ModelConfig,
-  TokenUsage,
-} from './types.js';
-import { getExpectedActionName, getExpectedActions } from './types.js';
-
-// ── Argument matching ──────────────────────────────────────────────────────
-
-/**
- * Resolve an argument value from an extracted call's args, with positional fallback.
- *
- * CLI commands often take positional arguments (e.g. `fast account set-default myname`)
- * which the extractor records as `_positional_0`, `_positional_1`, etc. When the expected
- * args use the semantic name (e.g. `{name: "myname"}`), we look for the value in positionals
- * as a fallback so that positional and named-flag invocations both match.
- */
-function resolveArgValue(args: Record<string, unknown>, key: string, expectedValue: unknown): unknown {
-  const direct = args[key];
-  if (direct !== undefined) return direct;
-  // Positional fallback: if expected is a plain string, check _positional_N entries
-  if (typeof expectedValue === 'string') {
-    for (const [k, v] of Object.entries(args)) {
-      if (k.startsWith('_positional_') && v === expectedValue) {
-        return v;
-      }
-    }
-  }
-  return undefined;
-}
-
-/**
- * Compare an extracted argument value against an expected string value.
- *
- * Rules:
- * 1. If got is a sentinel (<dynamic>, <template>, <spread>, or starts with <),
- *    we can't verify runtime values — treat as match (benefit of the doubt).
- * 2. If expected value is a regex pattern (starts and ends with `/`), use regex match.
- * 3. Type-aware comparison: booleans, numbers, null/undefined, then string normalization.
- */
-function matchArgValue(expected: unknown, got: unknown): boolean {
-  // 0. Coerce expected to string if it's not already (handles booleans, numbers from JSON)
-  const exp = typeof expected === 'string' ? expected : String(expected);
-
-  // 1a. If expected is a sentinel (<dynamic>, <template>, <identifier>),
-  //     it means "any value is acceptable" — treat as wildcard match.
-  if (exp.startsWith('<') && exp.endsWith('>')) {
-    return true;
-  }
-
-  // 1b. If got is a sentinel (<dynamic>, <template>, <spread>, or starts with <),
-  //     we can't verify runtime values — treat as match (benefit of the doubt)
-  if (typeof got === 'string' && got.startsWith('<') && got.endsWith('>')) {
-    return true;
-  }
-
-  // 2. Regex pattern: /pattern/flags
-  const regexMatch = exp.match(/^\/(.+)\/([gimsuy]*)$/);
-  if (regexMatch) {
-    try {
-      const gotStr = got === null ? 'null' : got === undefined ? 'undefined' : String(got);
-      const re = new RegExp(regexMatch[1], regexMatch[2]);
-      return re.test(gotStr);
-    } catch {
-      // Invalid regex — fall through
-    }
-  }
-
-  // 3. Type-aware comparison
-  // Boolean: expected "true"/"false" must match actual boolean
-  if (exp === 'true' || exp === 'false') {
-    const expectedBool = exp === 'true';
-    if (typeof got === 'boolean') return got === expectedBool;
-    if (typeof got === 'string') return got.toLowerCase() === exp;
-    return false;
-  }
-
-  // Number: expected numeric string must match number or numeric string
-  const expectedNum = Number(exp);
-  if (!isNaN(expectedNum) && exp.trim() !== '') {
-    if (typeof got === 'number') return got === expectedNum;
-    if (typeof got === 'string') {
-      const gotNum = Number(got);
-      return !isNaN(gotNum) && gotNum === expectedNum;
-    }
-    return false;
-  }
-
-  // Null/undefined
-  if (exp === 'null') return got === null || got === 'null';
-  if (exp === 'undefined') return got === undefined || got === 'undefined';
-
-  // String: case-insensitive, strip whitespace and common punctuation
-  const normalize = (s: string): string =>
-    s.toLowerCase().replace(/[\s,./_\-*^]+/g, '').trim();
-
-  const gotStr = got === null ? 'null' : got === undefined ? 'undefined' : String(got);
-  return normalize(exp) === normalize(gotStr);
-}
-
-function stringifyExpected(expected: unknown): string {
-  if (typeof expected === 'string') return expected;
-  try {
-    return JSON.stringify(expected);
-  } catch {
-    return String(expected);
-  }
-}
-
-function isPlainObject(value: unknown): value is Record<string, unknown> {
-  return typeof value === 'object' && value !== null && !Array.isArray(value);
-}
-
-function matchExpectedValue(
-  expected: unknown,
-  got: unknown,
-  path: string,
-  argResults: Record<string, { expected: string; got: unknown; match: boolean }>,
-): boolean {
-  // Array matching: positional match up to expected.length.
-  // Extra elements in `got` beyond expected.length are ignored (subset matching).
-  // Missing elements in `got` (got[i] is undefined) will fail the scalar/object match.
-  if (Array.isArray(expected)) {
-    if (!Array.isArray(got)) {
-      argResults[path] = { expected: stringifyExpected(expected), got, match: false };
-      return false;
-    }
-
-    let allMatch = true;
-    for (let i = 0; i < expected.length; i++) {
-      const childPath = `${path}[${i}]`;
-      if (!matchExpectedValue(expected[i], got[i], childPath, argResults)) {
-        allMatch = false;
-      }
-    }
-    return allMatch;
-  }
-
-  if (isPlainObject(expected)) {
-    if (!isPlainObject(got)) {
-      argResults[path] = { expected: stringifyExpected(expected), got, match: false };
-      return false;
-    }
-
-    let allMatch = true;
-    for (const [key, value] of Object.entries(expected)) {
-      const childPath = path ? `${path}.${key}` : key;
-      if (!matchExpectedValue(value, got[key], childPath, argResults)) {
-        allMatch = false;
-      }
-    }
-    return allMatch;
-  }
-
-  const match = matchArgValue(expected, got);
-  argResults[path] = {
-    expected: stringifyExpected(expected),
-    got,
-    match,
-  };
-  return match;
-}
-
-// ── Tool matching ──────────────────────────────────────────────────────────
-
-/**
- * Match extracted calls against expected actions.
- * Returns ActionMatch[] with match details.
- *
- * Each extracted call can only be matched to ONE expected action (greedy, first match wins).
- */
-export function matchActions(
-  expectedActions: ExpectedAction[],
-  extractedCalls: ExtractedCall[],
-): ActionMatch[] {
-  // Track which extracted call indices have already been consumed
-  const usedIndices = new Set<number>();
-
-  return expectedActions.map((expected) => {
-    const expectedMethod = getExpectedActionName(expected);
-    // If there are args to check, try to find a perfect match (method + args) first.
-    // Otherwise, find the first unused extracted call that matches the method name.
-    const hasExpectedArgs = expected.args && Object.keys(expected.args).length > 0;
-
-    if (hasExpectedArgs) {
-      // Two-pass strategy:
-      // Pass 1: look for a perfect match (method name AND all args match).
-      let perfectMatchIndex = -1;
-      for (let i = 0; i < extractedCalls.length; i++) {
-        if (usedIndices.has(i)) continue;
-        if (extractedCalls[i].method !== expectedMethod) continue;
-        // Check args
-        const trialArgResults: Record<string, { expected: string; got: unknown; match: boolean }> = {};
-        let allArgsMatch = true;
-        for (const [key, expectedValue] of Object.entries(expected.args!)) {
-          if (!matchExpectedValue(expectedValue, resolveArgValue(extractedCalls[i].args, key, expectedValue), key, trialArgResults)) {
-            allArgsMatch = false;
-          }
-        }
-        if (allArgsMatch) {
-          perfectMatchIndex = i;
-          break;
-        }
-      }
-
-      if (perfectMatchIndex !== -1) {
-        // Perfect match found — consume it and report success.
-        usedIndices.add(perfectMatchIndex);
-        const found = extractedCalls[perfectMatchIndex];
-        const argResults: Record<string, { expected: string; got: unknown; match: boolean }> = {};
-        for (const [key, expectedValue] of Object.entries(expected.args!)) {
-          matchExpectedValue(expectedValue, resolveArgValue(found.args, key, expectedValue), key, argResults);
-        }
-        return {
-          expected,
-          found,
-          methodFound: true,
-          argsCorrect: true,
-          matched: true,
-          argResults,
-        } satisfies ActionMatch;
-      }
-
-      // Pass 2: no perfect match — fall back to the first method-name match.
-      // Consume it to prevent re-matching by a later expected tool.
-      let fallbackIndex = -1;
-      for (let i = 0; i < extractedCalls.length; i++) {
-        if (usedIndices.has(i)) continue;
-        if (extractedCalls[i].method === expectedMethod) {
-          fallbackIndex = i;
-          break;
-        }
-      }
-
-      if (fallbackIndex === -1) {
-        // Method not found at all
-        return {
-          expected,
-          found: null,
-          methodFound: false,
-          argsCorrect: false,
-          matched: false,
-        } satisfies ActionMatch;
-      }
-
-      // Consume the fallback index so it can't be re-matched
-      usedIndices.add(fallbackIndex);
-      const found = extractedCalls[fallbackIndex];
-      const argResults: Record<string, { expected: string; got: unknown; match: boolean }> = {};
-      let allArgsMatch = true;
-      for (const [key, expectedValue] of Object.entries(expected.args!)) {
-        if (!matchExpectedValue(expectedValue, resolveArgValue(found.args, key, expectedValue), key, argResults)) {
-          allArgsMatch = false;
-        }
-      }
-      return {
-        expected,
-        found,
-        methodFound: true,
-        argsCorrect: allArgsMatch,
-        matched: allArgsMatch,
-        argResults,
-      } satisfies ActionMatch;
-    }
-
-    // No args to check — find the first unused extracted call matching the method name.
-    let foundIndex = -1;
-    for (let i = 0; i < extractedCalls.length; i++) {
-      if (usedIndices.has(i)) continue;
-        if (extractedCalls[i].method === expectedMethod) {
-        foundIndex = i;
-        break;
-      }
-    }
-
-    if (foundIndex === -1) {
-      // Method not found at all
-      return {
-        expected,
-        found: null,
-        methodFound: false,
-        argsCorrect: false,
-        matched: false,
-      } satisfies ActionMatch;
-    }
-
-    // No args to check — method match is sufficient
-    usedIndices.add(foundIndex);
-    return {
-      expected,
-      found: extractedCalls[foundIndex],
-      methodFound: true,
-      argsCorrect: true,
-      matched: true,
-    } satisfies ActionMatch;
-  });
-}
-
-// ── Call resolution from bindings + task expectations ───────────────────────
-
-/**
- * Resolve raw extracted calls (e.g. 'f.setup') into typed calls (e.g. 'FastClient.setup')
- * using variable bindings from the extractor and type prefixes from task expected actions.
- *
- * Algorithm:
- * 1. Collect type prefixes from expected tools (e.g. 'FastClient' from 'FastClient.setup')
- * 2. Collect standalone method names (e.g. 'fast' from { method: 'fast' })
- * 3. Build inference map: if task expects standalone 'fast' AND 'FastClient.*',
- *    and bindings show f ← fast, then fast → FastClient
- * 4. Resolve all raw calls through this map
- */
-function resolveCallsFromBindings(
-  extractedCalls: ExtractedCall[],
-  bindings: Map<string, string>,
-  expectedTools: ExpectedAction[],
-): ExtractedCall[] {
-  // 1. Collect type prefixes from expected tools.
-  const expectedPrefixes = new Set<string>();
-  for (const tool of expectedTools) {
-    const expectedMethod = getExpectedActionName(tool);
-    if (expectedMethod.includes('.')) {
-      expectedPrefixes.add(expectedMethod.split('.')[0]);
-    }
-  }
-
-  // 2. Build source-to-type map for direct class bindings and inferred factory returns.
-  const sourceToType = new Map<string, string>();
-
-  // Direct class matches (e.g. allset ← AllSetProvider, AllSetProvider is a prefix)
-  for (const [, source] of bindings) {
-    if (expectedPrefixes.has(source)) {
-      sourceToType.set(source, source);
-    }
-  }
-
-  // Inferred matches (e.g. fast is a factory, or wallet is an instance-returning method).
-  const expectedPrefixMethods = new Map<string, Set<string>>(); // prefix → set of method names
-  for (const tool of expectedTools) {
-    const expectedMethod = getExpectedActionName(tool);
-    const dotIdx = expectedMethod.indexOf('.');
-    if (dotIdx !== -1) {
-      const prefix = expectedMethod.slice(0, dotIdx);
-      const method = expectedMethod.slice(dotIdx + 1);
-      if (!expectedPrefixMethods.has(prefix)) expectedPrefixMethods.set(prefix, new Set());
-      expectedPrefixMethods.get(prefix)!.add(method);
-    }
-  }
-
-  const allSources = new Set(bindings.values());
-
-  for (const source of allSources) {
-    if (sourceToType.has(source)) continue;
-
-    const varsFromSource: string[] = [];
-    for (const [varName, bindingSource] of bindings) {
-      if (bindingSource === source) varsFromSource.push(varName);
-    }
-
-    // For each prefix, check if any variable from this source has calls
-    // whose method names match the expected methods for that prefix
-    for (const [prefix, methodNames] of expectedPrefixMethods) {
-      if (sourceToType.has(source)) break;
-      for (const varName of varsFromSource) {
-        const callsOnVar = extractedCalls
-          .filter(c => c.method.startsWith(varName + '.'))
-          .map(c => c.method.slice(varName.length + 1));
-        // Match if at least one method called on this var matches an expected method for this prefix
-        const hasMatchingMethod = callsOnVar.some(m => methodNames.has(m));
-        if (hasMatchingMethod) {
-          sourceToType.set(source, prefix);
-          break;
-        }
-      }
-    }
-  }
-
-  // 3. Build var-to-type map
-  const varToType = new Map<string, string>();
-  for (const [varName, source] of bindings) {
-    const resolvedType = sourceToType.get(source);
-    if (resolvedType) {
-      varToType.set(varName, resolvedType);
-    }
-  }
-
-  // 4. Resolve each call
-  return extractedCalls.map(call => {
-    const dotIndex = call.method.indexOf('.');
-    if (dotIndex !== -1) {
-      const obj = call.method.slice(0, dotIndex);
-      const rest = call.method.slice(dotIndex + 1);
-      const resolvedType = varToType.get(obj);
-      if (resolvedType) {
-        return { ...call, method: `${resolvedType}.${rest}` };
-      }
-    }
-    return call;
-  });
-}
-
-// ── Task evaluation ────────────────────────────────────────────────────────
-
-/**
- * Evaluate a single task result: match tools, check code patterns, compute metrics.
- */
-export function evaluateTask(params: {
-  task: TaskDefinition;
-  model: ModelConfig;
-  generatedCode: string | null;
-  rawResponse: string;
-  extractedCalls: ExtractedCall[];
-  llmLatencyMs: number;
-  tokenUsage?: TokenUsage;
-  error?: string;
-  knownMethods: Set<string>;
-  bindings?: Map<string, string>;  // variable → source function/class from extractor
-  surface: 'sdk' | 'cli' | 'mcp' | 'prompt';
-}): TaskResult {
-  const {
-    task,
-    model,
-    generatedCode,
-    rawResponse,
-    llmLatencyMs,
-    tokenUsage,
-    error,
-    knownMethods,
-    bindings,
-    surface,
-  } = params;
-
-  let extractedCalls = params.extractedCalls;
-  const expectedActions = getExpectedActions(task);
-  if (bindings && bindings.size > 0) {
-    extractedCalls = resolveCallsFromBindings(extractedCalls, bindings, expectedActions);
-  }
-
-  const actionMatches = matchActions(expectedActions, extractedCalls);
-
-  const codePatternResults: Record<string, boolean> = {};
-  let allCodePatternsPass = true;
-
-  if (task.verify && generatedCode !== null) {
-    for (const verification of task.verify) {
-      if (verification.code_pattern) {
-        const pattern = verification.code_pattern;
-        let matches: boolean;
-        try {
-          const re = new RegExp(pattern);
-          matches = re.test(generatedCode);
-        } catch {
-          matches = false;
-        }
-        codePatternResults[pattern] = matches;
-        if (!matches) allCodePatternsPass = false;
-      }
-    }
-  } else if (task.verify) {
-    for (const verification of task.verify) {
-      if (verification.code_pattern) {
-        codePatternResults[verification.code_pattern] = false;
-        allCodePatternsPass = false;
-      }
-    }
-  }
-
-  const expectedCount = expectedActions.length;
-  const matchedCount = actionMatches.filter((m) => m.matched).length;
-
-  // recall = matched / expected; 1.0 when there are no expectations
-  const toolRecall = expectedCount === 0 ? 1.0 : matchedCount / expectedCount;
-
-  // precision = matched / known calls extracted; 0.0 when nothing was extracted
-  const knownCallCount = extractedCalls.filter((c) => knownMethods.has(c.method)).length;
-  const toolPrecision = knownCallCount === 0 ? 0.0 : matchedCount / knownCallCount;
-
-  const hasCodePatterns = Object.keys(codePatternResults).length > 0;
-  const taskPassed =
-    matchedCount === expectedCount &&
-    (expectedCount > 0 || matchedCount === 0) &&
-    (!hasCodePatterns || allCodePatternsPass);
-
-  const methodsFoundCount = actionMatches.filter(m => m.methodFound).length;
-  const toolSelectionAccuracy = expectedCount === 0 ? 1.0 : methodsFoundCount / expectedCount;
-
-  const argAccuracy = methodsFoundCount === 0 ? 1.0
-    : actionMatches.filter(m => m.methodFound && m.argsCorrect).length / methodsFoundCount;
-
-  const allExtractedMethods = extractedCalls.map(c => c.method);
-  const expectedMethods = new Set(expectedActions.map((action) => getExpectedActionName(action)));
-
-  // Collect known class/type prefixes for SDK hallucination filtering (e.g. "FastClient" from "FastClient.setup")
-  const sdkPrefixes = new Set<string>();
-  for (const m of knownMethods) {
-    if (m.includes('.')) {
-      sdkPrefixes.add(m.split('.')[0]);
-    }
-  }
-
-  const unnecessaryActions = allExtractedMethods.filter(
-    m => knownMethods.has(m) && !expectedMethods.has(m)
-  );
-
-  const hallucinatedActions = allExtractedMethods.filter(m => {
-    if (knownMethods.has(m)) return false;
-
-    if (surface === 'sdk') {
-      // SDK: only dotted calls that look like SDK API usage are hallucinations.
-      // Unknown helpers (non-dotted names) should not be flagged.
-      const dotIdx = m.indexOf('.');
-      if (dotIdx === -1) return false;
-      const prefix = m.slice(0, dotIdx);
-      return sdkPrefixes.has(prefix);
-    }
-
-    // CLI/MCP: unknown command path or tool name is hallucinated.
-    return true;
-  });
-
-  const hallucinationRate = extractedCalls.length === 0 ? 0
-    : hallucinatedActions.length / extractedCalls.length;
-
-  return {
-    task,
-    model,
-    generatedCode,
-    rawResponse,
-    extractedCalls,
-    actionMatches,
-    codePatternResults: hasCodePatterns ? codePatternResults : undefined,
-    metrics: {
-      toolPrecision,
-      toolRecall,
-      taskPassed,
-      toolSelectionAccuracy,
-      argAccuracy,
-      unnecessaryActions,
-      hallucinatedActions,
-      hallucinationRate,
-    },
-    llmLatencyMs,
-    tokenUsage,
-    error,
-  };
-}
diff --git a/src/benchmark/extractors/cli-extractor.ts b/src/benchmark/extractors/cli-extractor.ts
deleted file mode 100644
index ee1248c..0000000
--- a/src/benchmark/extractors/cli-extractor.ts
+++ /dev/null
@@ -1,348 +0,0 @@
-import type { ExtractedCall } from '../types.js';
-
-type CommandLine = { text: string; line: number };
-
-/**
- * Strict v1 contract: exactly one fenced bash/sh block.
- * Returns null when none or multiple shell blocks are present.
- */
-export function extractShellBlock(markdown: string): string | null {
-  const blockRegex = /```([^\n`]*)\n([\s\S]*?)```/g;
-  const shellBlocks: string[] = [];
-
-  for (const match of markdown.matchAll(blockRegex)) {
-    const lang = (match[1] ?? '').trim().toLowerCase();
-    if (lang === 'bash' || lang === 'sh') {
-      shellBlocks.push((match[2] ?? '').trim());
-    }
-  }
-
-  if (shellBlocks.length !== 1) return null;
-  return shellBlocks[0];
-}
-
-/**
- * Parse one markdown response with a single fenced shell block into calls.
- */
-export function extractFromCliMarkdown(markdown: string, knownCommands?: readonly string[]): ExtractedCall[] {
-  const shell = extractShellBlock(markdown);
-  if (!shell) return [];
-
-  return parseShellCommands(shell, knownCommands);
-}
-
-/**
- * Parse shell script content into extracted CLI calls.
- */
-export function parseShellCommands(shell: string, knownCommands?: readonly string[]): ExtractedCall[] {
-  const commands = splitCommands(shell);
-  const calls: ExtractedCall[] = [];
-
-  for (const cmd of commands) {
-    const call = parseSingleCommand(cmd.text, cmd.line, knownCommands);
-    if (call) calls.push(call);
-  }
-
-  return calls;
-}
-
-function splitCommands(shell: string): CommandLine[] {
-  const lines = shell.replace(/\r\n/g, '\n').split('\n');
-  const commands: CommandLine[] = [];
-
-  let current = '';
-  let startLine = 1;
-
-  for (let i = 0; i < lines.length; i++) {
-    const trimmed = lines[i].trim();
-    if (!trimmed || trimmed.startsWith('#')) continue;
-
-    if (!current) {
-      startLine = i + 1;
-      current = trimmed;
-    } else {
-      current = `${current} ${trimmed}`;
-    }
-
-    if (current.endsWith('\\')) {
-      current = current.slice(0, -1).trimEnd();
-      continue;
-    }
-
-    commands.push(...splitCommandChain(current.trim(), startLine));
-    current = '';
-  }
-
-  if (current) {
-    commands.push(...splitCommandChain(current.trim(), startLine));
-  }
-
-  return commands;
-}
-
-function splitCommandChain(command: string, line: number): CommandLine[] {
-  const parts: CommandLine[] = [];
-  let current = '';
-  let quote: '"' | "'" | null = null;
-  let escaping = false;
-
-  const flush = () => {
-    const trimmed = current.trim();
-    if (trimmed) parts.push({ text: trimmed, line });
-    current = '';
-  };
-
-  for (let i = 0; i < command.length; i++) {
-    const ch = command[i];
-    const next = command[i + 1];
-
-    if (escaping) {
-      current += ch;
-      escaping = false;
-      continue;
-    }
-
-    if (ch === '\\' && quote !== "'") {
-      escaping = true;
-      current += ch;
-      continue;
-    }
-
-    if (quote) {
-      current += ch;
-      if (ch === quote) quote = null;
-      continue;
-    }
-
-    if (ch === '"' || ch === "'") {
-      quote = ch as '"' | "'";
-      current += ch;
-      continue;
-    }
-
-    if ((ch === '&' && next === '&') || (ch === '|' && next === '|')) {
-      flush();
-      i++;
-      continue;
-    }
-
-    if (ch === ';' || ch === '|') {
-      flush();
-      continue;
-    }
-
-    current += ch;
-  }
-
-  flush();
-  return parts;
-}
-
-// Shell control-flow keywords that can appear as the first token of a split command
-// (e.g. "then fast account list" after splitting "if ...; then fast account list; fi" on ';')
-const SHELL_CONTROL_KEYWORDS = new Set(['then', 'else', 'elif', 'do', 'fi', 'done', 'esac']);
-
-function parseSingleCommand(command: string, line: number, knownCommands?: readonly string[]): ExtractedCall | null {
-  const tokens = tokenizeShell(command);
-  if (tokens.length === 0) return null;
-
-  let i = 0;
-  const env: Record<string, string> = {};
-
-  // Skip leading shell control-flow keywords (then, else, elif, do, fi, done, esac)
-  while (i < tokens.length && SHELL_CONTROL_KEYWORDS.has(tokens[i])) {
-    i++;
-  }
-
-  while (i < tokens.length && isEnvAssignment(tokens[i])) {
-    const [key, ...rest] = tokens[i].split('=');
-    env[key] = rest.join('=');
-    i++;
-  }
-
-  if (i >= tokens.length) return null;
-
-  const { methodParts, nextIndex } = resolveMethod(tokens, i, knownCommands);
-  i = nextIndex;
-
-  const args: Record<string, unknown> = {};
-  if (Object.keys(env).length > 0) {
-    args.env = env;
-  }
-
-  let positionalIndex = 0;
-  let forcePositional = false;
-
-  while (i < tokens.length) {
-    const token = tokens[i];
-
-    if (forcePositional) {
-      args[`_positional_${positionalIndex++}`] = token;
-      i++;
-      continue;
-    }
-
-    if (token === '--') {
-      forcePositional = true;
-      i++;
-      continue;
-    }
-
-    if (token.startsWith('--')) {
-      const withoutPrefix = token.slice(2);
-      const eqIndex = withoutPrefix.indexOf('=');
-
-      if (eqIndex >= 0) {
-        const key = withoutPrefix.slice(0, eqIndex);
-        const value = withoutPrefix.slice(eqIndex + 1);
-        args[key] = value;
-        i++;
-        continue;
-      }
-
-      const key = withoutPrefix;
-      const next = tokens[i + 1];
-      if (next && !next.startsWith('-')) {
-        args[key] = next;
-        i += 2;
-      } else {
-        args[key] = true;
-        i++;
-      }
-      continue;
-    }
-
-    if (token.startsWith('-') && token !== '-') {
-      const key = token.slice(1);
-      const next = tokens[i + 1];
-      if (next && !next.startsWith('-')) {
-        args[key] = next;
-        i += 2;
-      } else {
-        args[key] = true;
-        i++;
-      }
-      continue;
-    }
-
-    args[`_positional_${positionalIndex++}`] = token;
-    i++;
-  }
-
-  return {
-    method: methodParts.join(' '),
-    args,
-    line,
-    raw: command,
-  };
-}
-
-function findKnownCommand(
-  tokens: string[],
-  fromIndex: number,
-  knownSet: Set<string>,
-): { methodParts: string[]; nextIndex: number } | null {
-  for (let end = tokens.length; end > fromIndex; end--) {
-    const candidateTokens = tokens.slice(fromIndex, end);
-    if (candidateTokens.some((token) => token === '--' || token.startsWith('-'))) {
-      continue;
-    }
-    if (knownSet.has(candidateTokens.join(' '))) {
-      return { methodParts: candidateTokens, nextIndex: end };
-    }
-  }
-  return null;
-}
-
-function resolveMethod(
-  tokens: string[],
-  startIndex: number,
-  knownCommands?: readonly string[],
-): { methodParts: string[]; nextIndex: number } {
-  if (knownCommands && knownCommands.length > 0) {
-    const knownSet = new Set(knownCommands.map((command) => command.trim()).filter(Boolean));
-
-    // Try matching from startIndex, then skip up to 2 tokens to handle prefixes like "npx skill-optimizer" or "fast".
-    // Capped at 2 because wider skips can land on a flag's value (e.g. "--config run" would match "run") when the
-    // command precedes the subcommand in the invocation.
-    let match: ReturnType<typeof findKnownCommand> = null;
-    for (let skip = 0; skip <= 2 && !match; skip++) {
-      if (startIndex + skip < tokens.length) {
-        match = findKnownCommand(tokens, startIndex + skip, knownSet);
-      }
-    }
-
-    if (match) return match;
-  }
-
-  const executable = tokens[startIndex];
-  let nextIndex = startIndex + 1;
-  const methodParts = [executable];
-  while (nextIndex < tokens.length) {
-    const token = tokens[nextIndex];
-    if (token === '--' || token.startsWith('-')) break;
-    if (!isLikelySubcommand(token)) break;
-
-    methodParts.push(token);
-    nextIndex++;
-  }
-
-  return { methodParts, nextIndex };
-}
-
-function tokenizeShell(command: string): string[] {
-  const tokens: string[] = [];
-  let current = '';
-  let quote: '"' | "'" | null = null;
-  let escaping = false;
-
-  for (let i = 0; i < command.length; i++) {
-    const ch = command[i];
-
-    if (escaping) {
-      current += ch;
-      escaping = false;
-      continue;
-    }
-
-    if (ch === '\\' && quote !== "'") {
-      escaping = true;
-      continue;
-    }
-
-    if (quote) {
-      if (ch === quote) {
-        quote = null;
-      } else {
-        current += ch;
-      }
-      continue;
-    }
-
-    if (ch === '"' || ch === "'") {
-      quote = ch as '"' | "'";
-      continue;
-    }
-
-    if (/\s/.test(ch)) {
-      if (current) {
-        tokens.push(current);
-        current = '';
-      }
-      continue;
-    }
-
-    current += ch;
-  }
-
-  if (current) tokens.push(current);
-  return tokens;
-}
-
-function isEnvAssignment(token: string): boolean {
-  return /^[A-Za-z_][A-Za-z0-9_]*=.*/.test(token);
-}
-
-function isLikelySubcommand(token: string): boolean {
-  return /^[A-Za-z][A-Za-z-]*$/.test(token);
-}
diff --git a/src/benchmark/extractors/code-analyzer.ts b/src/benchmark/extractors/code-analyzer.ts
deleted file mode 100644
index ab6bee4..0000000
--- a/src/benchmark/extractors/code-analyzer.ts
+++ /dev/null
@@ -1,342 +0,0 @@
-import Parser from 'web-tree-sitter';
-
-import type { ExtractedCall } from '../types.js';
-import { getSdkParser } from './sdk/parser.js';
-
-// ── Lazy singleton parser ──────────────────────────────────────────────────
-
-async function initParser(): Promise<Parser> {
-  return getSdkParser('typescript');
-}
-
-// ── Argument extraction ────────────────────────────────────────────────────
-
-type LiteralMap = Map<string, unknown>;
-
-function parseArrayLiteral(node: Parser.SyntaxNode, literalMap: LiteralMap = new Map()): unknown[] {
-  const result: unknown[] = [];
-
-  for (const child of node.namedChildren) {
-    if (child.type === 'spread_element') {
-      result.push('<spread>');
-      continue;
-    }
-    result.push(extractValue(child, literalMap));
-  }
-
-  return result;
-}
-
-/**
- * Extract a value from a tree-sitter node.
- * Returns nested structures for object/array literals and sentinels for dynamic values.
- */
-function extractValue(node: Parser.SyntaxNode, literalMap: LiteralMap = new Map()): unknown {
-  switch (node.type) {
-    case 'object':
-      return parseObjectLiteral(node, literalMap);
-    case 'array':
-      return parseArrayLiteral(node, literalMap);
-    case 'string':
-    case 'string_fragment': {
-      // Strip surrounding quotes
-      const text = node.text;
-      if (
-        (text.startsWith('"') && text.endsWith('"')) ||
-        (text.startsWith("'") && text.endsWith("'")) ||
-        (text.startsWith('`') && text.endsWith('`'))
-      ) {
-        return text.slice(1, -1);
-      }
-      return text;
-    }
-    case 'number':
-      return node.text;
-    case 'true':
-      return true;
-    case 'false':
-      return false;
-    case 'null':
-      return null;
-    case 'undefined':
-      return undefined;
-    case 'template_string':
-      return '<template>';
-    case 'identifier':
-      return literalMap.has(node.text) ? literalMap.get(node.text)! : `<${node.text}>`;
-    case 'member_expression':
-      return `<${node.text}>`;
-    case 'binary_expression':
-    case 'call_expression':
-    case 'await_expression':
-      return '<dynamic>';
-    default:
-      return `<${node.type}>`;
-  }
-}
-
-/**
- * Parse an object literal node into a key-value record.
- */
-function parseObjectLiteral(node: Parser.SyntaxNode, literalMap: LiteralMap = new Map()): Record<string, unknown> {
-  const result: Record<string, unknown> = {};
-
-  for (const child of node.namedChildren) {
-    if (child.type === 'pair') {
-      const keyNode = child.namedChildren[0];
-      const valueNode = child.namedChildren[1];
-      if (!keyNode || !valueNode) continue;
-
-      // Key can be identifier, string, or computed
-      let key: string;
-      if (keyNode.type === 'property_identifier' || keyNode.type === 'identifier') {
-        key = keyNode.text;
-      } else if (keyNode.type === 'string') {
-        key = keyNode.text.slice(1, -1);
-      } else {
-        key = keyNode.text;
-      }
-
-      result[key] = extractValue(valueNode, literalMap);
-    } else if (child.type === 'shorthand_property_identifier') {
-      // { foo } shorthand
-      result[child.text] = `<${child.text}>`;
-    } else if (child.type === 'spread_element') {
-      result['...spread'] = '<spread>';
-    }
-  }
-
-  return result;
-}
-
-/**
- * Parse the arguments node of a call/new expression.
- * Returns a record of extracted arguments.
- * All positional args use `_positional_${index}` keys.
- */
-function parseArguments(argsNode: Parser.SyntaxNode, literalMap: LiteralMap = new Map()): Record<string, unknown> {
-  const args: Record<string, unknown> = {};
-  let positionalIndex = 0;
-
-  for (const child of argsNode.namedChildren) {
-    if (child.type === 'object') {
-      // Merge object literal into args (first object wins for named args)
-      const obj = parseObjectLiteral(child, literalMap);
-      Object.assign(args, obj);
-    } else if (child.type === 'string') {
-      const value = extractValue(child, literalMap);
-      const key = `_positional_${positionalIndex}`;
-      args[key] = value;
-      positionalIndex++;
-    } else if (child.type === 'number') {
-      const key = `_positional_${positionalIndex}`;
-      args[key] = child.text;
-      positionalIndex++;
-    } else if (child.type === 'template_string') {
-      const key = `_positional_${positionalIndex}`;
-      args[key] = '<template>';
-      positionalIndex++;
-    } else if (child.type === 'array') {
-      const key = `_positional_${positionalIndex}`;
-      args[key] = parseArrayLiteral(child, literalMap);
-      positionalIndex++;
-    } else if (child.type === 'identifier') {
-      const key = `_positional_${positionalIndex}`;
-      args[key] = extractValue(child, literalMap);
-      positionalIndex++;
-    } else if (child.type === 'true' || child.type === 'false') {
-      const key = `_positional_${positionalIndex}`;
-      args[key] = child.type === 'true';
-      positionalIndex++;
-    } else if (child.type === 'null') {
-      const key = `_positional_${positionalIndex}`;
-      args[key] = null;
-      positionalIndex++;
-    } else if (child.type === 'member_expression' || child.type === 'call_expression') {
-      const key = `_positional_${positionalIndex}`;
-      args[key] = '<dynamic>';
-      positionalIndex++;
-    } else if (child.type === 'await_expression') {
-      const key = `_positional_${positionalIndex}`;
-      args[key] = '<dynamic>';
-      positionalIndex++;
-    }
-    // Skip commas and other punctuation (non-named children)
-  }
-
-  return args;
-}
-
-// ── Variable tracking ──────────────────────────────────────────────────────
-
-
-/**
- * Collect literal variable bindings from the AST via a single top-down pass.
- *
- * Limitation: forward references are not resolved. If code declares
- * `const result = x402Pay({ wallet: myWallet })` before `const myWallet = { type: 'evm' }`,
- * `myWallet` will resolve to a sentinel `<myWallet>` instead of the object literal.
- * In practice, LLM-generated code almost always declares variables before use.
- */
-function collectLiteralBindings(rootNode: Parser.SyntaxNode): LiteralMap {
-  const literalMap: LiteralMap = new Map();
-
-  function visit(node: Parser.SyntaxNode): void {
-    if (node.type === 'variable_declarator' || node.type === 'assignment_expression') {
-      const nameNode =
-        node.type === 'variable_declarator'
-          ? node.childForFieldName('name')
-          : node.childForFieldName('left');
-      const valueNode =
-        node.type === 'variable_declarator'
-          ? node.childForFieldName('value')
-          : node.childForFieldName('right');
-
-      if (nameNode?.type === 'identifier' && valueNode) {
-        if (['object', 'array', 'string', 'number', 'true', 'false', 'null', 'template_string', 'identifier'].includes(valueNode.type)) {
-          literalMap.set(nameNode.text, extractValue(valueNode, literalMap));
-        }
-      }
-    }
-
-    for (const child of node.children) visit(child);
-  }
-
-  visit(rootNode);
-  return literalMap;
-}
-
-
-// ── Generic extraction (no config hints) ───────────────────────────────────
-
-interface RawExtraction {
-  calls: ExtractedCall[];
-  bindings: Map<string, string>;  // variable → source function/class name
-}
-
-/**
- * Walk the tree and collect ALL variable-to-source bindings generically.
- * Handles:
- *   const f = fast(...)         → f bound to 'fast'
- *   const allset = new AllSetProvider(...) → allset bound to 'AllSetProvider'
- *   const w = await FastWallet.fromKeyfile(...) → w bound to 'FastWallet'
- */
-function collectAllVariableBindings(rootNode: Parser.SyntaxNode): Map<string, string> {
-  const bindings = new Map<string, string>();
-
-  function getSourceName(node: Parser.SyntaxNode): string | null {
-    // Unwrap await
-    if (node.type === 'await_expression') {
-      const inner = node.namedChildren[0];
-      if (inner) return getSourceName(inner);
-      return null;
-    }
-    // new ClassName(...)
-    if (node.type === 'new_expression') {
-      const ctor = node.childForFieldName('constructor');
-      if (ctor?.type === 'identifier') return ctor.text;
-      return null;
-    }
-    // func(...) or Class.staticMethod(...)
-    if (node.type === 'call_expression') {
-      const fn = node.childForFieldName('function');
-      if (!fn) return null;
-      if (fn.type === 'identifier') return fn.text;
-      if (fn.type === 'member_expression') {
-        const obj = fn.childForFieldName('object');
-        if (obj?.type === 'identifier') return obj.text;
-      }
-      return null;
-    }
-    return null;
-  }
-
-  function visit(node: Parser.SyntaxNode): void {
-    if (node.type === 'variable_declarator' || node.type === 'assignment_expression') {
-      const nameNode = node.type === 'variable_declarator'
-        ? node.childForFieldName('name')
-        : node.childForFieldName('left');
-      const valueNode = node.type === 'variable_declarator'
-        ? node.childForFieldName('value')
-        : node.childForFieldName('right');
-
-      if (nameNode?.type === 'identifier' && valueNode) {
-        const source = getSourceName(valueNode);
-        if (source) bindings.set(nameNode.text, source);
-      }
-    }
-    for (const child of node.children) visit(child);
-  }
-
-  visit(rootNode);
-  return bindings;
-}
-
-/**
- * Walk the tree and collect ALL call/new expressions (no filtering).
- */
-function collectAllCalls(
-  rootNode: Parser.SyntaxNode,
-  bindings: Map<string, string>,
-  literalMap: LiteralMap,
-): ExtractedCall[] {
-  const calls: ExtractedCall[] = [];
-  const visited = new Set<number>();
-
-  function visit(node: Parser.SyntaxNode): void {
-    if (node.type === 'new_expression' && !visited.has(node.id)) {
-      visited.add(node.id);
-      const ctor = node.childForFieldName('constructor');
-      if (ctor?.type === 'identifier') {
-        const method = `${ctor.text}.constructor`;
-        const argsNode = node.childForFieldName('arguments');
-        const args = argsNode ? parseArguments(argsNode, literalMap) : {};
-        calls.push({ method, args, line: node.startPosition.row + 1, raw: node.text });
-      }
-    } else if (node.type === 'call_expression' && !visited.has(node.id)) {
-      visited.add(node.id);
-      const fnNode = node.childForFieldName('function');
-      if (!fnNode) { /* skip */ }
-      else if (fnNode.type === 'identifier') {
-        // standalone: fast(...), x402Pay(...), console.log(...)
-        const method = fnNode.text;
-        const argsNode = node.childForFieldName('arguments');
-        const args = argsNode ? parseArguments(argsNode, literalMap) : {};
-        calls.push({ method, args, line: node.startPosition.row + 1, raw: node.text });
-      } else if (fnNode.type === 'member_expression') {
-        // member: f.setup(), allset.sendToFast(), FastWallet.fromKeyfile()
-        const objNode = fnNode.childForFieldName('object');
-        const propNode = fnNode.childForFieldName('property');
-        if (objNode && propNode) {
-          const objectName = objNode.text.replace(/\?$/, '');
-          const propertyName = propNode.text;
-          const method = `${objectName}.${propertyName}`;
-          const argsNode = node.childForFieldName('arguments');
-          const args = argsNode ? parseArguments(argsNode, literalMap) : {};
-          calls.push({ method, args, line: node.startPosition.row + 1, raw: node.text });
-        }
-      }
-    }
-    for (const child of node.children) visit(child);
-  }
-
-  visit(rootNode);
-  return calls;
-}
-
-/**
- * Parse TypeScript code and extract ALL calls generically — no config hints needed.
- * Returns raw calls (e.g. 'f.setup', not 'FastClient.setup') plus a binding graph
- * that the evaluator can use to resolve types from task expectations.
- */
-export async function extractAllFromCode(code: string): Promise<RawExtraction> {
-  const p = await initParser();
-  const tree = p.parse(code);
-  const root = tree.rootNode;
-  const literalMap = collectLiteralBindings(root);
-  const bindings = collectAllVariableBindings(root);
-  const calls = collectAllCalls(root, bindings, literalMap);
-  calls.sort((a, b) => a.line - b.line);
-  return { calls, bindings };
-}
-
diff --git a/src/benchmark/extractors/code-extractor.ts b/src/benchmark/extractors/code-extractor.ts
deleted file mode 100644
index 85d484b..0000000
--- a/src/benchmark/extractors/code-extractor.ts
+++ /dev/null
@@ -1,29 +0,0 @@
-import type { SdkLanguage } from '../types.js';
-
-const SDK_FENCE_TAGS: Record<SdkLanguage, string[]> = {
-  typescript: ['typescript', 'ts', 'javascript', 'js', ''],
-  python: ['python', 'py', ''],
-  rust: ['rust', 'rs', ''],
-};
-
-/**
- * Extract the first SDK-language code block from markdown.
- * Returns the code content or null if no code block found.
- */
-export function extractSdkCodeBlock(markdown: string, language: SdkLanguage): string | null {
-  const regex = /```([^\n`]*)\n([\s\S]*?)```/g;
-  const allowedTags = new Set(SDK_FENCE_TAGS[language]);
-
-  for (const match of markdown.matchAll(regex)) {
-    const tag = (match[1] ?? '').trim().toLowerCase();
-    if (allowedTags.has(tag)) {
-      return (match[2] ?? '').trim() || null;
-    }
-  }
-
-  return null;
-}
-
-export function extractCodeBlock(markdown: string): string | null {
-  return extractSdkCodeBlock(markdown, 'typescript');
-}
diff --git a/src/benchmark/extractors/index.ts b/src/benchmark/extractors/index.ts
deleted file mode 100644
index bb935e5..0000000
--- a/src/benchmark/extractors/index.ts
+++ /dev/null
@@ -1,58 +0,0 @@
-import type { ExtractedCall, BenchmarkConfig, LLMResponse } from '../types.js';
-import { extractSdkCodeBlock } from './code-extractor.js';
-import { extractFromCliMarkdown, extractShellBlock } from './cli-extractor.js';
-import { extractFromToolCalls } from './mcp-extractor.js';
-import { extractSdkFromCode } from './sdk/registry.js';
-
-/**
- * Extract SDK/tool calls from an LLM response based on the configured surface.
- *
- * SDK surface: extract TypeScript block from markdown → tree-sitter parse → ExtractedCall[]
- * CLI surface: extract shell block from markdown → parse command invocations → ExtractedCall[]
- * MCP surface: read tool_calls from response → ExtractedCall[]
- */
-export async function extract(
-  response: LLMResponse,
-  config: BenchmarkConfig,
-): Promise<{ calls: ExtractedCall[]; generatedCode: string | null; bindings?: Map<string, string> }> {
-  const extended = config as BenchmarkConfig & {
-    surface?: 'sdk' | 'cli' | 'mcp' | 'prompt';
-    mode?: 'code' | 'mcp';
-    sdk?: unknown;
-    cli?: { commandDefinitions?: Array<{ command: string }> };
-    code?: unknown;
-  };
-  const surface = extended.surface;
-  const sdkConfig = (extended.sdk ?? extended.code) as BenchmarkConfig['sdk'] | undefined;
-  const knownCommands = Array.isArray(extended.cli?.commandDefinitions)
-    ? extended.cli.commandDefinitions.map((definition) => definition.command)
-    : undefined;
-
-  if (surface === 'prompt') {
-    // Prompt surface: no extraction — response is plain text, not tool calls or code.
-    return { calls: [], generatedCode: null };
-  }
-
-  if (surface === 'mcp' || extended.mode === 'mcp') {
-    const calls = extractFromToolCalls(response);
-    return { calls, generatedCode: null };
-  }
-
-  if (surface === 'cli') {
-    const generatedCode = extractShellBlock(response.content);
-    const calls = extractFromCliMarkdown(response.content, knownCommands);
-    return { calls, generatedCode };
-  }
-
-  if (!sdkConfig) {
-    throw new Error('SDK surface requires "sdk" section in config');
-  }
-
-  const generatedCode = extractSdkCodeBlock(response.content, sdkConfig.language);
-  if (!generatedCode) {
-    return { calls: [], generatedCode: null };
-  }
-
-  const { calls, bindings } = await extractSdkFromCode(generatedCode, sdkConfig.language);
-  return { calls, generatedCode, bindings };
-}
diff --git a/src/benchmark/extractors/mcp-extractor.ts b/src/benchmark/extractors/mcp-extractor.ts
deleted file mode 100644
index 73e3866..0000000
--- a/src/benchmark/extractors/mcp-extractor.ts
+++ /dev/null
@@ -1,18 +0,0 @@
-import type { ExtractedCall, LLMResponse } from '../types.js';
-
-/**
- * Extract tool calls from a structured LLM response (MCP mode).
- * In MCP mode, the LLM returns tool_calls directly — no code parsing needed.
- */
-export function extractFromToolCalls(response: LLMResponse): ExtractedCall[] {
-  if (!response.toolCalls || response.toolCalls.length === 0) {
-    return [];
-  }
-
-  return response.toolCalls.map((tc, index) => ({
-    method: tc.name,
-    args: tc.arguments,
-    line: index,  // no meaningful line number in MCP mode
-    raw: JSON.stringify(tc),
-  }));
-}
diff --git a/src/benchmark/extractors/sdk/parser.ts b/src/benchmark/extractors/sdk/parser.ts
deleted file mode 100644
index 6c13ed4..0000000
--- a/src/benchmark/extractors/sdk/parser.ts
+++ /dev/null
@@ -1,37 +0,0 @@
-import Parser from 'web-tree-sitter';
-import { createRequire } from 'node:module';
-
-import type { SdkLanguage } from '../../types.js';
-
-const require = createRequire(import.meta.url);
-
-const WASM_BY_LANGUAGE: Record<SdkLanguage, string> = {
-  typescript: 'tree-sitter-wasms/out/tree-sitter-typescript.wasm',
-  python: 'tree-sitter-wasms/out/tree-sitter-python.wasm',
-  rust: 'tree-sitter-wasms/out/tree-sitter-rust.wasm',
-};
-
-let parserInit: Promise<void> | null = null;
-const parserCache = new Map<SdkLanguage, Promise<Parser>>();
-
-async function ensureParserInit(): Promise<void> {
-  if (!parserInit) {
-    parserInit = Parser.init();
-  }
-  await parserInit;
-}
-
-export async function getSdkParser(language: SdkLanguage): Promise<Parser> {
-  if (!parserCache.has(language)) {
-    parserCache.set(language, (async () => {
-      await ensureParserInit();
-      const parser = new Parser();
-      const wasmPath = require.resolve(WASM_BY_LANGUAGE[language]);
-      const grammar = await Parser.Language.load(wasmPath);
-      parser.setLanguage(grammar);
-      return parser;
-    })());
-  }
-
-  return parserCache.get(language)!;
-}
diff --git a/src/benchmark/extractors/sdk/python.ts b/src/benchmark/extractors/sdk/python.ts
deleted file mode 100644
index 165d16a..0000000
--- a/src/benchmark/extractors/sdk/python.ts
+++ /dev/null
@@ -1,195 +0,0 @@
-import type Parser from 'web-tree-sitter';
-
-import type { ExtractedCall } from '../../types.js';
-import { getSdkParser } from './parser.js';
-import { child, isTypeLike, sortCalls, stripQuoted } from './shared.js';
-import type { RawSdkExtraction, SdkLanguageAdapter } from './types.js';
-
-type LiteralMap = Map<string, unknown>;
-type BindingMap = Map<string, string>;
-
-function extractPythonValue(node: Parser.SyntaxNode, literalMap: LiteralMap): unknown {
-  switch (node.type) {
-    case 'string':
-      return stripQuoted(node.text);
-    case 'integer':
-    case 'float':
-      return node.text;
-    case 'true':
-      return true;
-    case 'false':
-      return false;
-    case 'none':
-      return null;
-    case 'identifier':
-      return literalMap.has(node.text) ? literalMap.get(node.text)! : `<${node.text}>`;
-    case 'list': {
-      const result: unknown[] = [];
-      for (const childNode of node.namedChildren) {
-        result.push(extractPythonValue(childNode, literalMap));
-      }
-      return result;
-    }
-    case 'dictionary': {
-      const result: Record<string, unknown> = {};
-      for (const pair of node.namedChildren) {
-        if (pair.type !== 'pair') continue;
-        const keyNode = pair.namedChildren[0];
-        const valueNode = pair.namedChildren[1];
-        if (!keyNode || !valueNode) continue;
-        const key = stripQuoted(keyNode.text);
-        result[key] = extractPythonValue(valueNode, literalMap);
-      }
-      return result;
-    }
-    case 'call':
-      return '<dynamic>';
-    case 'attribute':
-      return `<${node.text}>`;
-    default:
-      return `<${node.type}>`;
-  }
-}
-
-function parsePythonArgs(argsNode: Parser.SyntaxNode | null, literalMap: LiteralMap): Record<string, unknown> {
-  if (!argsNode) return {};
-
-  const args: Record<string, unknown> = {};
-  let positionalIndex = 0;
-
-  for (const childNode of argsNode.namedChildren) {
-    if (childNode.type === 'keyword_argument') {
-      const nameNode = child(childNode, 'name');
-      const valueNode = child(childNode, 'value');
-      if (!nameNode || !valueNode) continue;
-      args[nameNode.text] = extractPythonValue(valueNode, literalMap);
-      continue;
-    }
-
-    args[`_positional_${positionalIndex++}`] = extractPythonValue(childNode, literalMap);
-  }
-
-  return args;
-}
-
-function collectPythonLiteralBindings(root: Parser.SyntaxNode): LiteralMap {
-  const literalMap: LiteralMap = new Map();
-
-  function visit(node: Parser.SyntaxNode): void {
-    if (node.type === 'assignment') {
-      const left = child(node, 'left');
-      const right = child(node, 'right');
-      if (left?.type === 'identifier' && right && ['string', 'integer', 'float', 'list', 'dictionary', 'true', 'false', 'none', 'identifier'].includes(right.type)) {
-        literalMap.set(left.text, extractPythonValue(right, literalMap));
-      }
-    }
-
-    for (const childNode of node.namedChildren) visit(childNode);
-  }
-
-  visit(root);
-  return literalMap;
-}
-
-function pythonBindingFromCall(node: Parser.SyntaxNode, bindings: BindingMap): string | null {
-  const fnNode = child(node, 'function');
-  if (!fnNode) return null;
-
-  if (fnNode.type === 'identifier') {
-    return fnNode.text;
-  }
-
-  if (fnNode.type === 'attribute') {
-    const objectNode = child(fnNode, 'object');
-    const attributeNode = child(fnNode, 'attribute');
-    if (!objectNode || !attributeNode) return null;
-    if (isTypeLike(objectNode.text)) return objectNode.text;
-    return attributeNode.text;
-  }
-
-  return null;
-}
-
-function unwrapPythonCall(node: Parser.SyntaxNode | null): Parser.SyntaxNode | null {
-  if (!node) return null;
-  if (node.type === 'call') return node;
-  if (node.type === 'await') {
-    return node.namedChildren[0] ?? null;
-  }
-
-  return null;
-}
-
-function collectPythonBindings(root: Parser.SyntaxNode): BindingMap {
-  const bindings: BindingMap = new Map();
-
-  function visit(node: Parser.SyntaxNode): void {
-    if (node.type === 'assignment') {
-      const left = child(node, 'left');
-      const right = child(node, 'right');
-      const callNode = unwrapPythonCall(right);
-      if (left?.type === 'identifier' && callNode?.type === 'call') {
-        const source = pythonBindingFromCall(callNode, bindings);
-        if (source) bindings.set(left.text, source);
-      }
-    }
-
-    for (const childNode of node.namedChildren) visit(childNode);
-  }
-
-  visit(root);
-  return bindings;
-}
-
-function collectPythonCalls(root: Parser.SyntaxNode, bindings: BindingMap, literalMap: LiteralMap): ExtractedCall[] {
-  const calls: ExtractedCall[] = [];
-
-  function visit(node: Parser.SyntaxNode): void {
-    if (node.type === 'call') {
-      const fnNode = child(node, 'function');
-      const argsNode = child(node, 'arguments');
-      if (fnNode?.type === 'identifier') {
-        const method = isTypeLike(fnNode.text) ? `${fnNode.text}.constructor` : fnNode.text;
-        calls.push({
-          method,
-          args: parsePythonArgs(argsNode, literalMap),
-          line: node.startPosition.row + 1,
-          raw: node.text,
-        });
-      } else if (fnNode?.type === 'attribute') {
-        const objectNode = child(fnNode, 'object');
-        const attributeNode = child(fnNode, 'attribute');
-        if (objectNode && attributeNode) {
-          const owner = bindings.get(objectNode.text) ?? objectNode.text;
-          calls.push({
-            method: `${owner}.${attributeNode.text}`,
-            args: parsePythonArgs(argsNode, literalMap),
-            line: node.startPosition.row + 1,
-            raw: node.text,
-          });
-        }
-      }
-    }
-
-    for (const childNode of node.namedChildren) visit(childNode);
-  }
-
-  visit(root);
-  return sortCalls(calls);
-}
-
-async function extractPythonSdk(code: string): Promise<RawSdkExtraction> {
-  const parser = await getSdkParser('python');
-  const tree = parser.parse(code);
-  const root = tree.rootNode;
-  const literalMap = collectPythonLiteralBindings(root);
-  const bindings = collectPythonBindings(root);
-  const calls = collectPythonCalls(root, bindings, literalMap);
-  return { calls, bindings };
-}
-
-export const pythonSdkAdapter: SdkLanguageAdapter = {
-  language: 'python',
-  fenceTags: ['python', 'py', ''],
-  extract: extractPythonSdk,
-};
diff --git a/src/benchmark/extractors/sdk/registry.ts b/src/benchmark/extractors/sdk/registry.ts
deleted file mode 100644
index 0e94e35..0000000
--- a/src/benchmark/extractors/sdk/registry.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-import type { SdkLanguage } from '../../types.js';
-import { pythonSdkAdapter } from './python.js';
-import { rustSdkAdapter } from './rust.js';
-import { typescriptSdkAdapter } from './typescript.js';
-import type { RawSdkExtraction, SdkLanguageAdapter } from './types.js';
-
-const ADAPTERS: Record<SdkLanguage, SdkLanguageAdapter> = {
-  typescript: typescriptSdkAdapter,
-  python: pythonSdkAdapter,
-  rust: rustSdkAdapter,
-};
-
-export function getSdkAdapter(language: SdkLanguage): SdkLanguageAdapter {
-  return ADAPTERS[language];
-}
-
-export async function extractSdkFromCode(code: string, language: SdkLanguage): Promise<RawSdkExtraction> {
-  return getSdkAdapter(language).extract(code);
-}
diff --git a/src/benchmark/extractors/sdk/rust.ts b/src/benchmark/extractors/sdk/rust.ts
deleted file mode 100644
index e0e72cc..0000000
--- a/src/benchmark/extractors/sdk/rust.ts
+++ /dev/null
@@ -1,239 +0,0 @@
-import type Parser from 'web-tree-sitter';
-
-import type { ExtractedCall } from '../../types.js';
-import { getSdkParser } from './parser.js';
-import { child, sortCalls, stripQuoted } from './shared.js';
-import type { RawSdkExtraction, SdkLanguageAdapter } from './types.js';
-
-type LiteralMap = Map<string, unknown>;
-type BindingMap = Map<string, string>;
-
-function extractRustValue(node: Parser.SyntaxNode, literalMap: LiteralMap): unknown {
-  switch (node.type) {
-    case 'string_literal':
-      return stripQuoted(node.text);
-    case 'integer_literal':
-    case 'float_literal':
-      return node.text;
-    case 'boolean_literal':
-      return node.text === 'true';
-    case 'identifier':
-      return literalMap.has(node.text) ? literalMap.get(node.text)! : `<${node.text}>`;
-    case 'array_expression': {
-      const result: unknown[] = [];
-      for (const childNode of node.namedChildren) {
-        result.push(extractRustValue(childNode, literalMap));
-      }
-      return result;
-    }
-    case 'struct_expression': {
-      const result: Record<string, unknown> = {};
-      const bodyNode = child(node, 'body');
-      if (!bodyNode) return result;
-      for (const fieldNode of bodyNode.namedChildren) {
-        if (fieldNode.type !== 'field_initializer') continue;
-        const nameNode = child(fieldNode, 'name');
-        const valueNode = child(fieldNode, 'value');
-        if (!nameNode || !valueNode) continue;
-        result[nameNode.text] = extractRustValue(valueNode, literalMap);
-      }
-      return result;
-    }
-    case 'call_expression': {
-      const fnNode = child(node, 'function');
-      const argsNode = child(node, 'arguments');
-      if (fnNode?.type === 'field_expression') {
-        const valueNode = child(fnNode, 'value');
-        const fieldNode = child(fnNode, 'field');
-        if (fieldNode?.text === 'into' && valueNode && argsNode?.namedChildren.length === 0) {
-          return extractRustValue(valueNode, literalMap);
-        }
-      }
-      return '<dynamic>';
-    }
-    default:
-      return `<${node.type}>`;
-  }
-}
-
-function parseRustArgs(argsNode: Parser.SyntaxNode | null, literalMap: LiteralMap): Record<string, unknown> {
-  if (!argsNode) return {};
-
-  const args: Record<string, unknown> = {};
-  let positionalIndex = 0;
-
-  for (const childNode of argsNode.namedChildren) {
-    if (childNode.type === 'struct_expression') {
-      Object.assign(args, extractRustValue(childNode, literalMap) as Record<string, unknown>);
-      continue;
-    }
-
-    args[`_positional_${positionalIndex++}`] = extractRustValue(childNode, literalMap);
-  }
-
-  return args;
-}
-
-function collectRustLiteralBindings(root: Parser.SyntaxNode): LiteralMap {
-  const literalMap: LiteralMap = new Map();
-
-  function visit(node: Parser.SyntaxNode): void {
-    if (node.type === 'let_declaration') {
-      const patternNode = child(node, 'pattern');
-      const valueNode = child(node, 'value');
-      if (patternNode?.type === 'identifier' && valueNode && ['string_literal', 'integer_literal', 'float_literal', 'boolean_literal', 'array_expression', 'struct_expression', 'identifier'].includes(valueNode.type)) {
-        literalMap.set(patternNode.text, extractRustValue(valueNode, literalMap));
-      }
-    }
-
-    for (const childNode of node.namedChildren) visit(childNode);
-  }
-
-  visit(root);
-  return literalMap;
-}
-
-function rustBindingFromCall(node: Parser.SyntaxNode): string | null {
-  const fnNode = child(node, 'function');
-  if (!fnNode) return null;
-
-  if (fnNode.type === 'identifier') return fnNode.text;
-  if (fnNode.type === 'scoped_identifier') {
-    const pathNode = child(fnNode, 'path');
-    return pathNode?.text ?? null;
-  }
-  if (fnNode.type === 'field_expression') {
-    const fieldNode = child(fnNode, 'field');
-    return fieldNode?.text ?? null;
-  }
-
-  return null;
-}
-
-function resolveRustOwner(node: Parser.SyntaxNode | null, bindings: BindingMap): string | null {
-  if (!node) return null;
-
-  if (node.type === 'identifier') {
-    return bindings.get(node.text) ?? node.text;
-  }
-
-  if (node.type === 'try_expression') {
-    return resolveRustOwner(node.namedChildren[0] ?? null, bindings);
-  }
-
-  if (node.type === 'call_expression') {
-    const fnNode = child(node, 'function');
-    if (!fnNode) return null;
-    if (fnNode.type === 'scoped_identifier') {
-      return child(fnNode, 'path')?.text ?? null;
-    }
-    if (fnNode.type === 'field_expression') {
-      const valueNode = child(fnNode, 'value');
-      const fieldNode = child(fnNode, 'field');
-      if (valueNode?.type === 'identifier' && fieldNode) {
-        return fieldNode.text;
-      }
-      return resolveRustOwner(valueNode, bindings);
-    }
-    if (fnNode.type === 'identifier') {
-      return fnNode.text;
-    }
-  }
-
-  if (node.type === 'field_expression') {
-    return resolveRustOwner(child(node, 'value'), bindings);
-  }
-
-  return null;
-}
-
-function unwrapRustCall(node: Parser.SyntaxNode | null): Parser.SyntaxNode | null {
-  if (!node) return null;
-  if (node.type === 'call_expression') return node;
-  if (node.type === 'try_expression') return node.namedChildren[0] ?? null;
-  return null;
-}
-
-function collectRustBindings(root: Parser.SyntaxNode): BindingMap {
-  const bindings: BindingMap = new Map();
-
-  function visit(node: Parser.SyntaxNode): void {
-    if (node.type === 'let_declaration') {
-      const patternNode = child(node, 'pattern');
-      const valueNode = child(node, 'value');
-      const callNode = unwrapRustCall(valueNode);
-      if (patternNode?.type === 'identifier' && callNode?.type === 'call_expression') {
-        const source = rustBindingFromCall(callNode);
-        if (source) bindings.set(patternNode.text, source);
-      }
-    }
-
-    for (const childNode of node.namedChildren) visit(childNode);
-  }
-
-  visit(root);
-  return bindings;
-}
-
-function collectRustCalls(root: Parser.SyntaxNode, bindings: BindingMap, literalMap: LiteralMap): ExtractedCall[] {
-  const calls: ExtractedCall[] = [];
-
-  function visit(node: Parser.SyntaxNode): void {
-    for (const childNode of node.namedChildren) visit(childNode);
-
-    if (node.type === 'call_expression') {
-      const fnNode = child(node, 'function');
-      const argsNode = child(node, 'arguments');
-      if (fnNode?.type === 'identifier') {
-        calls.push({
-          method: fnNode.text,
-          args: parseRustArgs(argsNode, literalMap),
-          line: node.startPosition.row + 1,
-          raw: node.text,
-        });
-      } else if (fnNode?.type === 'scoped_identifier') {
-        const pathNode = child(fnNode, 'path');
-        const nameNode = child(fnNode, 'name');
-        if (pathNode && nameNode) {
-          calls.push({
-            method: `${pathNode.text}.${nameNode.text}`,
-            args: parseRustArgs(argsNode, literalMap),
-            line: node.startPosition.row + 1,
-            raw: node.text,
-          });
-        }
-      } else if (fnNode?.type === 'field_expression') {
-        const valueNode = child(fnNode, 'value');
-        const fieldNode = child(fnNode, 'field');
-        const owner = resolveRustOwner(valueNode, bindings);
-        if (owner && fieldNode) {
-          calls.push({
-            method: `${owner}.${fieldNode.text}`,
-            args: parseRustArgs(argsNode, literalMap),
-            line: node.startPosition.row + 1,
-            raw: node.text,
-          });
-        }
-      }
-    }
-  }
-
-  visit(root);
-  return sortCalls(calls);
-}
-
-async function extractRustSdk(code: string): Promise<RawSdkExtraction> {
-  const parser = await getSdkParser('rust');
-  const tree = parser.parse(code);
-  const root = tree.rootNode;
-  const literalMap = collectRustLiteralBindings(root);
-  const bindings = collectRustBindings(root);
-  const calls = collectRustCalls(root, bindings, literalMap);
-  return { calls, bindings };
-}
-
-export const rustSdkAdapter: SdkLanguageAdapter = {
-  language: 'rust',
-  fenceTags: ['rust', 'rs', ''],
-  extract: extractRustSdk,
-};
diff --git a/src/benchmark/extractors/sdk/shared.ts b/src/benchmark/extractors/sdk/shared.ts
deleted file mode 100644
index 2a90133..0000000
--- a/src/benchmark/extractors/sdk/shared.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-import type Parser from 'web-tree-sitter';
-
-import type { ExtractedCall } from '../../types.js';
-
-export function stripQuoted(text: string): string {
-  if (
-    (text.startsWith('"') && text.endsWith('"')) ||
-    (text.startsWith("'") && text.endsWith("'")) ||
-    (text.startsWith('`') && text.endsWith('`'))
-  ) {
-    return text.slice(1, -1);
-  }
-
-  return text;
-}
-
-export function isTypeLike(name: string): boolean {
-  return /^[A-Z]/.test(name);
-}
-
-export function sortCalls(calls: ExtractedCall[]): ExtractedCall[] {
-  calls.sort((a, b) => a.line - b.line);
-  return calls;
-}
-
-export function child(node: Parser.SyntaxNode, field: string): Parser.SyntaxNode | null {
-  return node.childForFieldName(field);
-}
diff --git a/src/benchmark/extractors/sdk/types.ts b/src/benchmark/extractors/sdk/types.ts
deleted file mode 100644
index d53d105..0000000
--- a/src/benchmark/extractors/sdk/types.ts
+++ /dev/null
@@ -1,12 +0,0 @@
-import type { ExtractedCall, SdkLanguage } from '../../types.js';
-
-export interface RawSdkExtraction {
-  calls: ExtractedCall[];
-  bindings?: Map<string, string>;
-}
-
-export interface SdkLanguageAdapter {
-  language: SdkLanguage;
-  fenceTags: string[];
-  extract(code: string): Promise<RawSdkExtraction>;
-}
diff --git a/src/benchmark/extractors/sdk/typescript.ts b/src/benchmark/extractors/sdk/typescript.ts
deleted file mode 100644
index 22519e3..0000000
--- a/src/benchmark/extractors/sdk/typescript.ts
+++ /dev/null
@@ -1,13 +0,0 @@
-import { extractAllFromCode } from '../code-analyzer.js';
-
-import type { RawSdkExtraction, SdkLanguageAdapter } from './types.js';
-
-async function extractTypeScriptSdk(code: string): Promise<RawSdkExtraction> {
-  return extractAllFromCode(code);
-}
-
-export const typescriptSdkAdapter: SdkLanguageAdapter = {
-  language: 'typescript',
-  fenceTags: ['typescript', 'ts', 'javascript', 'js', ''],
-  extract: extractTypeScriptSdk,
-};
diff --git a/src/benchmark/index.ts b/src/benchmark/index.ts
deleted file mode 100644
index 3121a70..0000000
--- a/src/benchmark/index.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-// Public API
-export { runBenchmark, type RunnerOptions } from './runner.js';
-export { loadConfig, loadTasks, loadMcpTools, loadCliCommands } from './config.js';
-export { createLLMClient } from './llm/index.js';
-export { extract } from './extractors/index.js';
-export { evaluateTask } from './evaluator.js';
-export { computeCoverage } from './coverage.js';
-export { loadReport, compareReports } from './compare.js';
-export { generateMarkdown, printSummary } from './reporter.js';
-export { fetchSkill } from './skill-fetcher.js';
-export { initBenchmark } from './init.js';
-export { extractSdkFromCode, getSdkAdapter } from './extractors/sdk/registry.js';
-
-// Re-export key types
-export type {
-  BenchmarkSurface,
-  SdkLanguage,
-  BenchmarkConfig,
-  SdkSurfaceConfig,
-  CliSurfaceConfig,
-  CliCommandDefinition,
-  CliCommandOptionDefinition,
-  McpSurfaceConfig,
-  LLMConfig,
-  TaskDefinition, ExpectedAction, ExtractedCall, ActionMatch,
-  TaskResult, BenchmarkReport, ComparisonReport,
-  ModelConfig, Tier, LLMResponse, ToolCallResult,
-} from './types.js';
diff --git a/src/benchmark/init.ts b/src/benchmark/init.ts
deleted file mode 100644
index dd0fe2a..0000000
--- a/src/benchmark/init.ts
+++ /dev/null
@@ -1,212 +0,0 @@
-import { writeFileSync, existsSync, mkdirSync } from 'node:fs';
-import { resolve } from 'node:path';
-
-export function initBenchmark(targetDir: string = process.cwd(), surface: 'sdk' | 'cli' | 'mcp' | 'prompt' = 'sdk'): void {
-  const generatedDir = resolve(targetDir, '.skill-optimizer');
-  mkdirSync(generatedDir, { recursive: true });
-
-  const configPath = resolve(generatedDir, 'skill-optimizer.json');
-
-  if (existsSync(configPath)) {
-    console.log(`[init] Skipping ${configPath} (already exists)`);
-  } else {
-    writeFileSync(configPath, JSON.stringify(buildConfig(surface), null, 2) + '\n', 'utf-8');
-    console.log(`[init] Created ${configPath}`);
-  }
-
-  if (surface === 'cli') {
-    const commandsPath = resolve(generatedDir, 'cli-commands.json');
-    if (existsSync(commandsPath)) {
-      console.log(`[init] Skipping ${commandsPath} (already exists)`);
-    } else {
-      const commands = [
-        {
-          command: 'example-create',
-          description: 'Create a new item',
-          options: [
-            { name: '--name', takesValue: true, description: 'Name for the item' },
-          ],
-        },
-        {
-          command: 'example-list',
-          description: 'List all items',
-          options: [
-            { name: '--format', takesValue: true, description: 'Output format: json | table (default: table)' },
-          ],
-        },
-      ];
-      writeFileSync(commandsPath, JSON.stringify(commands, null, 2) + '\n', 'utf-8');
-      console.log(`[init] Created ${commandsPath} (template — edit with your real commands)`);
-    }
-  }
-
-  if (surface === 'mcp') {
-    const toolsPath = resolve(generatedDir, 'tools.json');
-    if (existsSync(toolsPath)) {
-      console.log(`[init] Skipping ${toolsPath} (already exists)`);
-    } else {
-      const tools = [
-        {
-          type: 'function',
-          function: {
-            name: 'get_data',
-            description: 'Get data for a given item ID',
-            parameters: {
-              type: 'object',
-              properties: {
-                item_id: { type: 'string', description: 'The item identifier' },
-              },
-              required: ['item_id'],
-            },
-          },
-        },
-        {
-          type: 'function',
-          function: {
-            name: 'send_data',
-            description: 'Send data to a recipient',
-            parameters: {
-              type: 'object',
-              properties: {
-                value: { type: 'string', description: 'The data to send' },
-                recipient: { type: 'string', description: 'The recipient identifier' },
-              },
-              required: ['value', 'recipient'],
-            },
-          },
-        },
-      ];
-      writeFileSync(toolsPath, JSON.stringify(tools, null, 2) + '\n', 'utf-8');
-      console.log(`[init] Created ${toolsPath} (template — edit with your real tools)`);
-    }
-  }
-
-  console.log('\n[init] Done!');
-  console.log(`  Surface:    ${surface}`);
-  console.log(`  Config:     ${configPath}`);
-  console.log(`  Artifacts:  ${generatedDir}/`);
-  console.log('');
-  console.log('  Next steps:');
-  console.log('  1. Edit skill-optimizer.json:');
-  console.log('       target.repoPath  → path to your repo (default: current dir)');
-  console.log('       target.skill     → path to your SKILL.md');
-
-  if (surface === 'sdk') {
-    console.log('       target.discovery.sources → entry file(s) for SDK discovery');
-  } else if (surface === 'cli') {
-    console.log('       target.discovery.sources → CLI entry file (for code-first discovery)');
-    console.log('       .skill-optimizer/cli-commands.json → replace template with your real commands');
-    console.log('       (cli-commands.json is used as a fallback if code-first discovery finds nothing)');
-  } else if (surface === 'prompt') {
-    console.log('       target.skill → path to your SKILL.md or prompt document');
-    console.log('       (no discovery sources needed — capabilities are read directly from the skill file)');
-  } else {
-    console.log('       target.discovery.sources → MCP server file (for code-first discovery)');
-    console.log('       .skill-optimizer/tools.json → replace template with your real tools');
-    console.log('       (tools.json is used as a fallback if code-first discovery finds nothing)');
-  }
-
-  console.log('       benchmark.models → update with real OpenRouter model IDs');
-  console.log('  2. Create SKILL.md — explain your surface to the model');
-  console.log('  3. Run: skill-optimizer optimize --config ./.skill-optimizer/skill-optimizer.json');
-}
-
-function buildConfig(surface: 'sdk' | 'cli' | 'mcp' | 'prompt'): object {
-  const commonBenchmark = {
-    format: 'pi',
-    timeout: 240000,
-    taskGeneration: {
-      enabled: true,
-      maxTasks: 20,
-      outputDir: '.',
-    },
-    models: [
-      { id: 'openrouter/anthropic/claude-sonnet-4.6', name: 'Claude Sonnet 4.6', tier: 'flagship' },
-      { id: 'openrouter/deepseek/deepseek-v3.2', name: 'DeepSeek V3.2', tier: 'flagship' },
-      { id: 'openrouter/google/gemini-2.5-flash', name: 'Gemini 2.5 Flash', tier: 'mid' },
-    ],
-    output: {
-      dir: '../benchmark-results',
-    },
-    verdict: {
-      perModelFloor: 0.6,
-      targetWeightedAverage: 0.7,
-    },
-  };
-
-  const commonOptimize = {
-    model: 'openrouter/anthropic/claude-sonnet-4.6',
-    allowedPaths: ['./SKILL.md'],
-    validation: [],
-    maxIterations: 5,
-  };
-
-  if (surface === 'sdk') {
-    return {
-      name: 'my-sdk',
-      target: {
-        surface: 'sdk',
-        repoPath: '..',
-        skill: '../SKILL.md',
-        discovery: {
-          mode: 'auto',
-          sources: ['../src/index.ts'],
-        },
-      },
-      benchmark: commonBenchmark,
-      optimize: commonOptimize,
-    };
-  }
-
-  if (surface === 'cli') {
-    return {
-      name: 'my-cli',
-      target: {
-        surface: 'cli',
-        repoPath: '..',
-        skill: '../SKILL.md',
-        discovery: {
-          mode: 'auto',
-          sources: ['../src/cli.ts'],
-        },
-        cli: {
-          commands: './cli-commands.json',
-        },
-      },
-      benchmark: commonBenchmark,
-      optimize: commonOptimize,
-    };
-  }
-
-  if (surface === 'prompt') {
-    return {
-      name: 'my-prompt',
-      target: {
-        surface: 'prompt',
-        repoPath: '..',
-        skill: '../SKILL.md',
-      },
-      benchmark: commonBenchmark,
-      optimize: commonOptimize,
-    };
-  }
-
-  // mcp
-  return {
-    name: 'my-mcp',
-    target: {
-      surface: 'mcp',
-      repoPath: '..',
-      skill: '../SKILL.md',
-      discovery: {
-        mode: 'auto',
-        sources: ['../src/server.ts'],
-      },
-      mcp: {
-        tools: './tools.json',
-      },
-    },
-    benchmark: commonBenchmark,
-    optimize: commonOptimize,
-  };
-}
diff --git a/src/benchmark/llm/anthropic-format.ts b/src/benchmark/llm/anthropic-format.ts
deleted file mode 100644
index 542f1ac..0000000
--- a/src/benchmark/llm/anthropic-format.ts
+++ /dev/null
@@ -1,229 +0,0 @@
-import type { LLMResponse, McpToolDefinition, ToolExecutor } from '../types.js';
-import {
-  isAbortError,
-  isRetryableError,
-  sleep,
-  truncateToolResult,
-  undefinedIfEmpty,
-  normalizeUsage,
-} from './shared.js';
-import { createToolNameAliasCodec } from './tool-name-aliases.js';
-
-interface CallParams {
-  baseUrl: string;
-  apiKey: string | undefined;
-  timeout: number;
-  extraHeaders: Record<string, string>;
-  modelId: string;
-  system: string;
-  user: string;
-}
-
-interface CallWithToolsParams extends CallParams {
-  tools: McpToolDefinition[];
-}
-
-interface AnthropicTool {
-  name: string;
-  description?: string;
-  input_schema: {
-    type: 'object';
-    properties?: Record<string, unknown>;
-    required?: string[];
-  };
-}
-
-function toAnthropicTool(tool: McpToolDefinition): AnthropicTool {
-  return {
-    name: tool.function.name,
-    ...(tool.function.description !== undefined && { description: tool.function.description }),
-    input_schema: {
-      type: 'object',
-      ...(tool.function.parameters?.properties !== undefined && {
-        properties: tool.function.parameters.properties,
-      }),
-      ...(tool.function.parameters?.required !== undefined && {
-        required: tool.function.parameters.required,
-      }),
-    },
-  };
-}
-
-/**
- * Regular chat completion (code mode).
- * POST {baseUrl}/v1/messages
- */
-export async function chatAnthropic(params: CallParams): Promise<LLMResponse> {
-  const body = {
-    model: params.modelId,
-    max_tokens: 8192,
-    system: params.system,
-    messages: [{ role: 'user', content: params.user }],
-    temperature: 0.2,
-  };
-  return callWithRetry(params, body);
-}
-
-/**
- * Chat with tools (MCP mode).
- * POST {baseUrl}/v1/messages with tools array
- */
-export async function chatWithToolsAnthropic(params: CallWithToolsParams): Promise<LLMResponse> {
-  const toolCodec = createToolNameAliasCodec(params.tools);
-  const body = {
-    model: params.modelId,
-    max_tokens: 8192,
-    system: params.system,
-    messages: [{ role: 'user', content: params.user }],
-    tools: toolCodec.tools.map(toAnthropicTool),
-    temperature: 0.2,
-  };
-  return callWithRetry(params, body, toolCodec.toCanonical);
-}
-
-async function doFetch(params: CallParams, body: Record<string, unknown>): Promise<LLMResponse> {
-  const controller = new AbortController();
-  const timer = setTimeout(() => controller.abort(), params.timeout);
-
-  const headers: Record<string, string> = {
-    'Content-Type': 'application/json',
-    'anthropic-version': '2023-06-01',
-    ...params.extraHeaders,
-  };
-  if (params.apiKey) {
-    headers['x-api-key'] = params.apiKey;
-  }
-
-  let response: Response;
-  try {
-    response = await fetch(`${params.baseUrl}/v1/messages`, {
-      method: 'POST',
-      headers,
-      body: JSON.stringify(body),
-      signal: controller.signal,
-    });
-  } finally {
-    clearTimeout(timer);
-  }
-
-  if (!response.ok) {
-    const text = await response.text();
-    const err = new Error(`Anthropic API error ${response.status}: ${text}`) as Error & { status: number };
-    err.status = response.status;
-    throw err;
-  }
-
-  const data = await response.json() as {
-    content: Array<
-      | { type: 'text'; text: string }
-      | { type: 'tool_use'; name: string; input: Record<string, unknown> }
-    >;
-    usage?: {
-      input_tokens: number;
-      output_tokens: number;
-    };
-  };
-
-  if (!data.content || !Array.isArray(data.content)) {
-    throw new Error(`Anthropic API returned unexpected response: missing content array`);
-  }
-
-  const textBlock = data.content.find((b) => b.type === 'text') as
-    | { type: 'text'; text: string }
-    | undefined;
-  const content = textBlock?.text ?? '';
-
-  const toolUseBlocks = data.content.filter((b) => b.type === 'tool_use') as Array<{
-    type: 'tool_use';
-    name: string;
-    input: Record<string, unknown>;
-  }>;
-  const toolCalls =
-    toolUseBlocks.length > 0
-      ? toolUseBlocks.map((block) => ({
-          name: block.name,
-          arguments: block.input,
-        }))
-      : undefined;
-
-  const usage = data.usage
-    ? {
-        prompt: data.usage.input_tokens,
-        completion: data.usage.output_tokens,
-        total: data.usage.input_tokens + data.usage.output_tokens,
-      }
-    : undefined;
-
-  return { content, toolCalls, usage };
-}
-
-interface AgentLoopParams extends CallWithToolsParams {
-  executor: ToolExecutor;
-  maxTurns: number;
-}
-
-export async function chatAgentLoopAnthropic(params: AgentLoopParams): Promise<LLMResponse> {
-  const toolCodec = createToolNameAliasCodec(params.tools);
-  const messages: Array<Record<string, unknown>> = [{ role: 'user', content: params.user }];
-  const allToolCalls: Array<{ name: string; arguments: Record<string, unknown> }> = [];
-  const totalUsage = { prompt: 0, completion: 0, total: 0 };
-
-  for (let turn = 0; turn < params.maxTurns; turn++) {
-    const body: Record<string, unknown> = {
-      model: params.modelId, max_tokens: 8192, system: params.system,
-      messages, tools: toolCodec.tools.map(toAnthropicTool), temperature: 0.2,
-    };
-    const response = await callWithRetry(params, body, toolCodec.toCanonical);
-    if (response.usage) {
-      totalUsage.prompt += response.usage.prompt;
-      totalUsage.completion += response.usage.completion;
-      totalUsage.total += response.usage.total;
-    }
-    if (!response.toolCalls || response.toolCalls.length === 0) {
-      return { content: response.content, toolCalls: undefinedIfEmpty(allToolCalls), usage: normalizeUsage(totalUsage) };
-    }
-    allToolCalls.push(...response.toolCalls);
-    const assistantContent = [
-      ...(response.content ? [{ type: 'text', text: response.content }] : []),
-      ...response.toolCalls.map((tc, i) => ({ type: 'tool_use', id: `toolu_${turn}_${i}`, name: toolCodec.toProvider(tc.name), input: tc.arguments })),
-    ];
-    messages.push({ role: 'assistant', content: assistantContent });
-    const toolResults = [];
-    for (let i = 0; i < response.toolCalls.length; i++) {
-      const tc = response.toolCalls[i];
-      let result: string;
-      try { result = await params.executor(tc.name, tc.arguments); }
-      catch (err) { result = `Error: ${err instanceof Error ? err.message : String(err)}`; }
-      toolResults.push({ type: 'tool_result', tool_use_id: `toolu_${turn}_${i}`, content: truncateToolResult(result) });
-    }
-    messages.push({ role: 'user', content: toolResults });
-  }
-  return { content: '', toolCalls: undefinedIfEmpty(allToolCalls), usage: normalizeUsage(totalUsage) };
-}
-
-async function callWithRetry(
-  params: CallParams,
-  body: Record<string, unknown>,
-  toCanonicalToolName: (name: string) => string = (name) => name,
-): Promise<LLMResponse> {
-  const applyCodec = (response: LLMResponse): LLMResponse => {
-    if (!response.toolCalls || response.toolCalls.length === 0) return response;
-    return {
-      ...response,
-      toolCalls: response.toolCalls.map((toolCall) => ({
-        ...toolCall,
-        name: toCanonicalToolName(toolCall.name),
-      })),
-    };
-  };
-
-  try {
-    return applyCodec(await doFetch(params, body));
-  } catch (err) {
-    if (isAbortError(err)) throw err;
-    if (!isRetryableError(err)) throw err;
-    // Retry once after 3s
-    await sleep(3_000);
-    return applyCodec(await doFetch(params, body));
-  }
-}
diff --git a/src/benchmark/llm/index.ts b/src/benchmark/llm/index.ts
deleted file mode 100644
index 24c2c23..0000000
--- a/src/benchmark/llm/index.ts
+++ /dev/null
@@ -1,230 +0,0 @@
-import type { LLMConfig, LLMResponse, McpToolDefinition, ToolExecutor } from '../types.js';
-import { chatOpenAI, chatWithToolsOpenAI, chatAgentLoopOpenAI } from './openai-format.js';
-import { chatAnthropic, chatWithToolsAnthropic, chatAgentLoopAnthropic } from './anthropic-format.js';
-import { chatPi, chatWithToolsPi, chatAgentLoopPi } from './pi-format.js';
-import { requireConfiguredApiKey, resolveApiCredential } from '../../runtime/pi/index.js';
-
-interface LLMClient {
-  /** Regular chat — LLM returns text output (SDK/CLI surfaces) */
-  chat(modelId: string, system: string, user: string): Promise<LLMResponse>;
-  /** Chat with tools — LLM returns structured tool_calls (MCP surface) */
-  chatWithTools(modelId: string, system: string, user: string, tools: McpToolDefinition[]): Promise<LLMResponse>;
-  /** Agentic multi-turn loop — LLM can call tools and receive results across multiple turns */
-  chatAgentLoop(
-    modelId: string, system: string, user: string,
-    tools: McpToolDefinition[], executor: ToolExecutor, maxTurns?: number,
-  ): Promise<LLMResponse>;
-}
-
-/**
- * Strip the provider prefix from a model ID when talking directly to a provider API.
- *
- * Only strips "anthropic/" and "openai/" prefixes — the prefixes that belong to
- * direct-API configs. "openrouter/" prefixes are intentionally left intact: they
- * signal that the ID belongs to format:'pi' (OpenRouter), so encountering one here
- * means the config is misconfigured and we want a fast, visible API error rather
- * than silently misrouting the request.
- */
-function stripProviderPrefix(modelId: string): string {
-  if (modelId.startsWith('anthropic/')) return modelId.slice('anthropic/'.length);
-  if (modelId.startsWith('openai/')) return modelId.slice('openai/'.length);
-  return modelId;
-}
-
-/**
- * Create an LLM client from config.
- */
-export function createLLMClient(config: LLMConfig): LLMClient {
-  const baseUrl = config.baseUrl?.replace(/\/+$/, ''); // strip trailing slash
-  const timeout = config.timeout ?? 240_000;
-  const extraHeaders = config.headers ?? {};
-
-  function resolveOpenAICredential() {
-    if (config.format !== 'openai') return undefined;
-    return resolveApiCredential({
-      provider: 'openai',
-      authMode: config.authMode,
-      apiKeyEnv: config.apiKeyEnv,
-    });
-  }
-
-  const resolveDirectApiKey = (provider: 'openai' | 'anthropic', openAICredential?: ReturnType<typeof resolveOpenAICredential>): string =>
-    provider === 'openai' && openAICredential?.apiKey
-      ? openAICredential.apiKey
-      : requireConfiguredApiKey({
-        provider,
-        authMode: config.authMode,
-        apiKeyEnv: config.apiKeyEnv,
-      });
-  const toOpenAIProviderModelRef = (modelId: string): string =>
-    modelId.includes('/') ? modelId : `openai/${modelId}`;
-
-  // When format is 'anthropic' or 'openai', we're talking directly to a provider
-  // API that doesn't understand prefixed model IDs like "anthropic/claude-sonnet-4-6".
-  // Strip the prefix so only the bare model name (e.g. "claude-sonnet-4-6") is sent.
-  const shouldStripPrefix = config.format === 'anthropic' || config.format === 'openai';
-
-  return {
-    async chat(modelId, system, user) {
-      const openAICredential = resolveOpenAICredential();
-      const resolvedModelId = shouldStripPrefix ? stripProviderPrefix(modelId) : modelId;
-      if (config.format === 'pi') {
-        return chatPi({
-          timeout,
-          modelId,
-          system,
-          user,
-          authMode: config.authMode,
-          apiKeyEnv: config.apiKeyEnv,
-          headers: config.headers,
-        });
-      }
-      // openAICredential is set only when config.format === 'openai'
-      if (openAICredential?.source === 'codex') {
-        // Pass authMode:'codex' so Pi re-reads ~/.codex/auth.json and sets source:'codex',
-        // which is required for resolvePiModel to route to the openai-codex provider
-        // (synthesizeOpenAICodexModel guards on provider === 'openai-codex'). Using
-        // apiKeyOverride here would return source:'override' and break that routing.
-        return chatPi({
-          timeout,
-          modelId: toOpenAIProviderModelRef(modelId),
-          system,
-          user,
-          authMode: 'codex',
-          apiKeyEnv: config.apiKeyEnv,
-          headers: config.headers,
-        });
-      }
-      if (config.format === 'anthropic') {
-        return chatAnthropic({
-          baseUrl: baseUrl!,
-          apiKey: resolveDirectApiKey('anthropic'),
-          timeout,
-          extraHeaders,
-          modelId: resolvedModelId,
-          system,
-          user,
-        });
-      }
-      return chatOpenAI({
-        baseUrl: baseUrl!,
-        apiKey: resolveDirectApiKey('openai', openAICredential),
-        timeout,
-        extraHeaders,
-        modelId: resolvedModelId,
-        system,
-        user,
-      });
-    },
-    async chatWithTools(modelId, system, user, tools) {
-      const openAICredential = resolveOpenAICredential();
-      const resolvedModelId = shouldStripPrefix ? stripProviderPrefix(modelId) : modelId;
-      if (config.format === 'pi') {
-        return chatWithToolsPi({
-          timeout,
-          modelId,
-          system,
-          user,
-          tools,
-          authMode: config.authMode,
-          apiKeyEnv: config.apiKeyEnv,
-          headers: config.headers,
-        });
-      }
-      // openAICredential is set only when config.format === 'openai'
-      if (openAICredential?.source === 'codex') {
-        return chatWithToolsPi({
-          timeout,
-          modelId: toOpenAIProviderModelRef(modelId),
-          system,
-          user,
-          tools,
-          authMode: 'codex',
-          apiKeyEnv: config.apiKeyEnv,
-          headers: config.headers,
-        });
-      }
-      if (config.format === 'anthropic') {
-        return chatWithToolsAnthropic({
-          baseUrl: baseUrl!,
-          apiKey: resolveDirectApiKey('anthropic'),
-          timeout,
-          extraHeaders,
-          modelId: resolvedModelId,
-          system,
-          user,
-          tools,
-        });
-      }
-      return chatWithToolsOpenAI({
-        baseUrl: baseUrl!,
-        apiKey: resolveDirectApiKey('openai', openAICredential),
-        timeout,
-        extraHeaders,
-        modelId: resolvedModelId,
-        system,
-        user,
-        tools,
-      });
-    },
-    async chatAgentLoop(modelId, system, user, tools, executor, maxTurns = 5) {
-      const openAICredential = resolveOpenAICredential();
-      const resolvedModelId = shouldStripPrefix ? stripProviderPrefix(modelId) : modelId;
-      if (config.format === 'pi') {
-        return chatAgentLoopPi({
-          timeout,
-          modelId,
-          system,
-          user,
-          tools,
-          executor,
-          maxTurns,
-          authMode: config.authMode,
-          apiKeyEnv: config.apiKeyEnv,
-          headers: config.headers,
-        });
-      }
-      // openAICredential is set only when config.format === 'openai'
-      if (openAICredential?.source === 'codex') {
-        return chatAgentLoopPi({
-          timeout,
-          modelId: toOpenAIProviderModelRef(modelId),
-          system,
-          user,
-          tools,
-          executor,
-          maxTurns,
-          authMode: 'codex',
-          apiKeyEnv: config.apiKeyEnv,
-          headers: config.headers,
-        });
-      }
-      if (config.format === 'anthropic') {
-        return chatAgentLoopAnthropic({
-          baseUrl: baseUrl!,
-          apiKey: resolveDirectApiKey('anthropic'),
-          timeout,
-          extraHeaders,
-          modelId: resolvedModelId,
-          system,
-          user,
-          tools,
-          executor,
-          maxTurns,
-        });
-      }
-      return chatAgentLoopOpenAI({
-        baseUrl: baseUrl!,
-        apiKey: resolveDirectApiKey('openai', openAICredential),
-        timeout,
-        extraHeaders,
-        modelId: resolvedModelId,
-        system,
-        user,
-        tools,
-        executor,
-        maxTurns,
-      });
-    },
-  };
-}
diff --git a/src/benchmark/llm/openai-format.ts b/src/benchmark/llm/openai-format.ts
deleted file mode 100644
index a2441f2..0000000
--- a/src/benchmark/llm/openai-format.ts
+++ /dev/null
@@ -1,206 +0,0 @@
-import type { LLMResponse, McpToolDefinition, ToolExecutor } from '../types.js';
-import { createToolNameAliasCodec } from './tool-name-aliases.js';
-import {
-  isAbortError,
-  isRetryableError,
-  sleep,
-  truncateToolResult,
-  undefinedIfEmpty,
-  normalizeUsage,
-} from './shared.js';
-
-interface CallParams {
-  baseUrl: string;
-  apiKey: string | undefined;
-  timeout: number;
-  extraHeaders: Record<string, string>;
-  modelId: string;
-  system: string;
-  user: string;
-}
-
-interface CallWithToolsParams extends CallParams {
-  tools: McpToolDefinition[];
-}
-
-export async function chatOpenAI(params: CallParams): Promise<LLMResponse> {
-  const body = {
-    model: params.modelId,
-    messages: [
-      { role: 'system', content: params.system },
-      { role: 'user', content: params.user },
-    ],
-    temperature: 0.2,
-  };
-  return callWithRetry(params, body);
-}
-
-export async function chatWithToolsOpenAI(params: CallWithToolsParams): Promise<LLMResponse> {
-  const toolCodec = createToolNameAliasCodec(params.tools);
-  const body = {
-    model: params.modelId,
-    messages: [
-      { role: 'system', content: params.system },
-      { role: 'user', content: params.user },
-    ],
-    tools: toolCodec.tools,
-    tool_choice: 'auto',
-    temperature: 0.2,
-  };
-  return callWithRetry(params, body, toolCodec.toCanonical);
-}
-
-async function doFetch(params: CallParams, body: Record<string, unknown>): Promise<LLMResponse> {
-  const controller = new AbortController();
-  const timer = setTimeout(() => controller.abort(), params.timeout);
-
-  const headers: Record<string, string> = {
-    'Content-Type': 'application/json',
-    ...params.extraHeaders,
-  };
-  if (params.apiKey) {
-    headers['Authorization'] = `Bearer ${params.apiKey}`;
-  }
-
-  let response: Response;
-  try {
-    response = await fetch(`${params.baseUrl}/chat/completions`, {
-      method: 'POST',
-      headers,
-      body: JSON.stringify(body),
-      signal: controller.signal,
-    });
-  } finally {
-    clearTimeout(timer);
-  }
-
-  if (!response.ok) {
-    const text = await response.text();
-    const err = new Error(`OpenAI API error ${response.status}: ${text}`) as Error & { status: number };
-    err.status = response.status;
-    throw err;
-  }
-
-  const data = await response.json() as {
-    choices: Array<{
-      message: {
-        content?: string | null;
-        tool_calls?: Array<{
-          function: { name: string; arguments: string };
-        }>;
-      };
-    }>;
-    usage?: {
-      prompt_tokens: number;
-      completion_tokens: number;
-      total_tokens: number;
-    };
-  };
-
-  if (!data.choices || !Array.isArray(data.choices) || data.choices.length === 0) {
-    throw new Error(`OpenAI API returned unexpected response: no choices in response`);
-  }
-
-  const message = data.choices[0]?.message;
-  const content = message?.content ?? '';
-
-  const toolCalls = message?.tool_calls?.map((tc) => {
-    let parsedArguments: Record<string, unknown> = {};
-    try {
-      parsedArguments = JSON.parse(tc.function.arguments) as Record<string, unknown>;
-    } catch {
-      process.stderr.write(
-        `[openai-format] Warning: failed to parse tool call arguments for "${tc.function.name}". ` +
-          `Raw value: ${tc.function.arguments}\n`,
-      );
-    }
-    return { name: tc.function.name, arguments: parsedArguments };
-  });
-
-  const usage = data.usage
-    ? {
-        prompt: data.usage.prompt_tokens,
-        completion: data.usage.completion_tokens,
-        total: data.usage.total_tokens,
-      }
-    : undefined;
-
-  return { content, toolCalls, usage };
-}
-
-interface AgentLoopParams extends CallWithToolsParams {
-  executor: ToolExecutor;
-  maxTurns: number;
-}
-
-export async function chatAgentLoopOpenAI(params: AgentLoopParams): Promise<LLMResponse> {
-  const toolCodec = createToolNameAliasCodec(params.tools);
-  const messages: Array<Record<string, unknown>> = [
-    { role: 'system', content: params.system },
-    { role: 'user', content: params.user },
-  ];
-  const allToolCalls: Array<{ name: string; arguments: Record<string, unknown> }> = [];
-  const totalUsage = { prompt: 0, completion: 0, total: 0 };
-
-  for (let turn = 0; turn < params.maxTurns; turn++) {
-    const body: Record<string, unknown> = {
-      model: params.modelId, messages, tools: toolCodec.tools, tool_choice: 'auto', temperature: 0.2,
-    };
-    const response = await callWithRetry(params, body, toolCodec.toCanonical);
-    if (response.usage) {
-      totalUsage.prompt += response.usage.prompt;
-      totalUsage.completion += response.usage.completion;
-      totalUsage.total += response.usage.total;
-    }
-    if (!response.toolCalls || response.toolCalls.length === 0) {
-      return { content: response.content, toolCalls: undefinedIfEmpty(allToolCalls), usage: normalizeUsage(totalUsage) };
-    }
-    allToolCalls.push(...response.toolCalls);
-    messages.push({
-      role: 'assistant', content: response.content || null,
-      tool_calls: response.toolCalls.map((tc, i) => ({
-        id: `call_${turn}_${i}`,
-        type: 'function',
-        function: {
-          name: toolCodec.toProvider(tc.name),
-          arguments: JSON.stringify(tc.arguments),
-        },
-      })),
-    });
-    for (let i = 0; i < response.toolCalls.length; i++) {
-      const tc = response.toolCalls[i];
-      let result: string;
-      try { result = await params.executor(tc.name, tc.arguments); }
-      catch (err) { result = `Error: ${err instanceof Error ? err.message : String(err)}`; }
-      messages.push({ role: 'tool', tool_call_id: `call_${turn}_${i}`, content: truncateToolResult(result) });
-    }
-  }
-  return { content: '', toolCalls: undefinedIfEmpty(allToolCalls), usage: normalizeUsage(totalUsage) };
-}
-
-async function callWithRetry(
-  params: CallParams,
-  body: Record<string, unknown>,
-  toCanonicalToolName: (name: string) => string = (name) => name,
-): Promise<LLMResponse> {
-  const applyCodec = (response: LLMResponse): LLMResponse => {
-    if (!response.toolCalls || response.toolCalls.length === 0) return response;
-    return {
-      ...response,
-      toolCalls: response.toolCalls.map((toolCall) => ({
-        ...toolCall,
-        name: toCanonicalToolName(toolCall.name),
-      })),
-    };
-  };
-
-  try {
-    return applyCodec(await doFetch(params, body));
-  } catch (err) {
-    if (isAbortError(err)) throw err;
-    if (!isRetryableError(err)) throw err;
-    // Retry once after 3s
-    await sleep(3_000);
-    return applyCodec(await doFetch(params, body));
-  }
-}
diff --git a/src/benchmark/llm/pi-format.ts b/src/benchmark/llm/pi-format.ts
deleted file mode 100644
index 76149ea..0000000
--- a/src/benchmark/llm/pi-format.ts
+++ /dev/null
@@ -1,250 +0,0 @@
-import { Type } from '@mariozechner/pi-ai';
-import type { Api, Context, Model, AssistantMessage, SimpleStreamOptions, Tool as PiTool } from '@mariozechner/pi-ai';
-import { complete, completeSimple } from '@mariozechner/pi-ai';
-
-import type { LLMResponse, McpToolDefinition, ToolExecutor } from '../types.js';
-import { resolvePiModelByRef } from '../../runtime/pi/index.js';
-import type { PiAuthMode } from '../../runtime/pi/auth.js';
-import { createToolNameAliasCodec } from './tool-name-aliases.js';
-
-interface PiCallParams {
-  authMode?: PiAuthMode;
-  apiKeyOverride?: string;
-  apiKeyEnv?: string;
-  headers?: Record<string, string>;
-  timeout: number;
-  modelId: string;
-  system: string;
-  user: string;
-}
-
-interface PiCallWithToolsParams extends PiCallParams {
-  tools: McpToolDefinition[];
-}
-
-interface ResolvedPiRequest {
-  model: Model<Api>;
-  auth: {
-    apiKey?: string;
-    headers?: Record<string, string>;
-  };
-}
-
-type PiImplementationSet = {
-  resolve(
-    modelId: string,
-    authOptions?: {
-      authMode?: PiAuthMode;
-      apiKeyEnv?: string;
-      apiKeyOverride?: string;
-    },
-  ): Promise<ResolvedPiRequest>;
-  completeSimple(model: Model<Api>, context: Context, options?: SimpleStreamOptions): Promise<AssistantMessage>;
-  complete(model: Model<Api>, context: Context, options?: SimpleStreamOptions): Promise<AssistantMessage>;
-};
-
-let piImplementationsForTest: PiImplementationSet | null = null;
-
-export function __setPiImplementationsForTest(implementations: PiImplementationSet | null): void {
-  piImplementationsForTest = implementations;
-}
-
-export async function chatPi(params: PiCallParams): Promise<LLMResponse> {
-  const impl = getPiImplementations();
-  const { model, auth } = await impl.resolve(params.modelId, {
-    authMode: params.authMode,
-    apiKeyEnv: params.apiKeyEnv,
-    apiKeyOverride: params.apiKeyOverride,
-  });
-  const response = await impl.completeSimple(
-    model,
-    {
-      systemPrompt: params.system,
-      messages: [{ role: 'user', content: params.user, timestamp: Date.now() }],
-    },
-    buildPiOptions(params.timeout, auth, params.headers),
-  );
-  assertPiResponseSucceeded(response);
-  return toLLMResponse(response);
-}
-
-export async function chatWithToolsPi(params: PiCallWithToolsParams): Promise<LLMResponse> {
-  const impl = getPiImplementations();
-  const toolCodec = createToolNameAliasCodec(params.tools);
-  const { model, auth } = await impl.resolve(params.modelId, {
-    authMode: params.authMode,
-    apiKeyEnv: params.apiKeyEnv,
-    apiKeyOverride: params.apiKeyOverride,
-  });
-  const response = await impl.complete(
-    model,
-    {
-      systemPrompt: params.system,
-      messages: [{ role: 'user', content: params.user, timestamp: Date.now() }],
-      tools: toolCodec.tools.map(toPiTool),
-    },
-    buildPiOptions(params.timeout, auth, params.headers),
-  );
-  assertPiResponseSucceeded(response);
-  return toLLMResponse(response, toolCodec.toCanonical);
-}
-
-interface PiAgentLoopParams extends PiCallWithToolsParams {
-  executor: ToolExecutor;
-  maxTurns: number;
-}
-
-export async function chatAgentLoopPi(params: PiAgentLoopParams): Promise<LLMResponse> {
-  const impl = getPiImplementations();
-  const toolCodec = createToolNameAliasCodec(params.tools);
-  const { model, auth } = await impl.resolve(params.modelId, {
-    authMode: params.authMode,
-    apiKeyEnv: params.apiKeyEnv,
-    apiKeyOverride: params.apiKeyOverride,
-  });
-  const messages: Context['messages'] = [
-    { role: 'user', content: params.user, timestamp: Date.now() },
-  ];
-  const allToolCalls: Array<{ name: string; arguments: Record<string, unknown> }> = [];
-  let finalResponse: LLMResponse = { content: '' };
-
-  for (let turn = 0; turn < params.maxTurns; turn++) {
-    const response = await impl.complete(
-      model,
-      {
-        systemPrompt: params.system,
-        messages,
-        tools: toolCodec.tools.map(toPiTool),
-      },
-      buildPiOptions(params.timeout, auth, params.headers),
-    );
-
-    assertPiResponseSucceeded(response);
-    finalResponse = toLLMResponse(response, toolCodec.toCanonical);
-    if (!finalResponse.toolCalls || finalResponse.toolCalls.length === 0) {
-      return {
-        ...finalResponse,
-        toolCalls: allToolCalls.length > 0 ? allToolCalls : undefined,
-      };
-    }
-
-    allToolCalls.push(...finalResponse.toolCalls);
-    messages.push(response);
-
-    const responseToolCalls = response.content.filter(
-      (block): block is Extract<AssistantMessage['content'][number], { type: 'toolCall' }> => block.type === 'toolCall',
-    );
-
-    for (const toolCall of responseToolCalls) {
-      const canonicalToolName = toolCodec.toCanonical(toolCall.name);
-      let result: string;
-      let isError = false;
-      try {
-        result = await params.executor(canonicalToolName, toolCall.arguments);
-      } catch (error) {
-        isError = true;
-        result = `Error: ${error instanceof Error ? error.message : String(error)}`;
-      }
-
-      messages.push({
-        role: 'toolResult',
-        toolCallId: toolCall.id,
-        toolName: toolCall.name,
-        content: [{ type: 'text', text: result }],
-        isError,
-        timestamp: Date.now(),
-      });
-    }
-  }
-
-  return {
-    ...finalResponse,
-    toolCalls: allToolCalls.length > 0 ? allToolCalls : undefined,
-  };
-}
-
-function getPiImplementations(): PiImplementationSet {
-  if (piImplementationsForTest) {
-    return piImplementationsForTest;
-  }
-
-  return {
-    resolve: resolvePiRequest,
-    completeSimple,
-    complete,
-  };
-}
-
-async function resolvePiRequest(
-  modelId: string,
-  authOptions?: {
-    authMode?: PiAuthMode;
-    apiKeyEnv?: string;
-    apiKeyOverride?: string;
-  },
-): Promise<ResolvedPiRequest> {
-  const resolved = await resolvePiModelByRef(modelId, authOptions);
-  return {
-    model: resolved.model,
-    auth: {
-      apiKey: resolved.auth.apiKey,
-      headers: resolved.auth.headers,
-    },
-  };
-}
-
-function buildPiOptions(
-  timeout: number,
-  auth: { apiKey?: string; headers?: Record<string, string> },
-  extraHeaders?: Record<string, string>,
-): SimpleStreamOptions {
-  const controller = new AbortController();
-  setTimeout(() => controller.abort(), timeout).unref?.();
-  return {
-    signal: controller.signal,
-    apiKey: auth.apiKey,
-    headers: { ...(auth.headers ?? {}), ...(extraHeaders ?? {}) },
-    reasoning: 'medium',
-  };
-}
-
-function toPiTool(tool: McpToolDefinition): PiTool {
-  return {
-    name: tool.function.name,
-    description: tool.function.description ?? '',
-    parameters: Type.Unsafe(tool.function.parameters ?? { type: 'object', properties: {}, required: [] }),
-  };
-}
-
-function assertPiResponseSucceeded(message: AssistantMessage): void {
-  if (message.stopReason === 'error') {
-    throw new Error(message.errorMessage ?? 'PI model returned an unknown error');
-  }
-}
-
-function toLLMResponse(
-  message: AssistantMessage,
-  toCanonicalToolName: (name: string) => string = (name) => name,
-): LLMResponse {
-  const content = message.content
-    .filter((block): block is Extract<AssistantMessage['content'][number], { type: 'text' }> => block.type === 'text')
-    .map((block) => block.text)
-    .join('\n');
-
-  const toolCalls = message.content
-    .filter((block): block is Extract<AssistantMessage['content'][number], { type: 'toolCall' }> => block.type === 'toolCall')
-    .map((block) => ({
-      name: toCanonicalToolName(block.name),
-      arguments: block.arguments as Record<string, unknown>,
-    }));
-
-  return {
-    content,
-    toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
-    usage: {
-      prompt: message.usage.input,
-      completion: message.usage.output,
-      total: message.usage.totalTokens,
-    },
-  };
-}
diff --git a/src/benchmark/llm/shared.ts b/src/benchmark/llm/shared.ts
deleted file mode 100644
index f756d8d..0000000
--- a/src/benchmark/llm/shared.ts
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Shared utilities for LLM format handlers.
- *
- * Keeping these in one place avoids duplicating identical logic across
- * openai-format.ts, anthropic-format.ts, and any future format handlers.
- */
-
-import type { LLMResponse } from '../types.js';
-
-/** Maximum tool-result characters forwarded to the model in agent loops. */
-const MAX_TOOL_RESULT_CHARS = 50_000;
-const TRUNCATED_SUFFIX = '\n\n[... truncated]';
-
-/**
- * Truncate a tool result string so it does not exceed MAX_TOOL_RESULT_CHARS.
- * Returns the original string if it is already within the limit.
- */
-export function truncateToolResult(result: string): string {
-  if (result.length <= MAX_TOOL_RESULT_CHARS) return result;
-  return result.slice(0, MAX_TOOL_RESULT_CHARS - TRUNCATED_SUFFIX.length) + TRUNCATED_SUFFIX;
-}
-
-export function isAbortError(err: unknown): boolean {
-  return Boolean(err && typeof err === 'object' && 'name' in err && (err as { name: unknown }).name === 'AbortError');
-}
-
-/**
- * Return true if the error is a transient server error that should be retried.
- * Does NOT retry abort errors or 4xx client errors (except 429 rate-limit).
- */
-export function isRetryableError(err: unknown): boolean {
-  if (isAbortError(err)) return false;
-  if (err && typeof err === 'object' && 'status' in err) {
-    const status = (err as { status: unknown }).status;
-    if (typeof status === 'number' && status >= 400 && status < 500 && status !== 429) {
-      return false;
-    }
-  }
-  return true;
-}
-
-export function sleep(ms: number): Promise<void> {
-  return new Promise((resolve) => setTimeout(resolve, ms));
-}
-
-export function undefinedIfEmpty<T>(arr: T[]): T[] | undefined {
-  return arr.length > 0 ? arr : undefined;
-}
-
-export function normalizeUsage(
-  usage: { prompt: number; completion: number; total: number },
-): LLMResponse['usage'] {
-  return usage.total > 0 ? usage : undefined;
-}
diff --git a/src/benchmark/llm/tool-name-aliases.ts b/src/benchmark/llm/tool-name-aliases.ts
deleted file mode 100644
index 5ddda1e..0000000
--- a/src/benchmark/llm/tool-name-aliases.ts
+++ /dev/null
@@ -1,65 +0,0 @@
-import type { McpToolDefinition } from '../types.js';
-
-const PROVIDER_TOOL_NAME_PATTERN = /[^a-zA-Z0-9_-]/g;
-
-interface ToolNameAliasCodec {
-  tools: McpToolDefinition[];
-  toCanonical(name: string): string;
-  toProvider(name: string): string;
-}
-
-export function createToolNameAliasCodec(
-  tools: McpToolDefinition[],
-): ToolNameAliasCodec {
-  const usedProviderNames = new Set<string>();
-  const canonicalToProvider = new Map<string, string>();
-  const providerToCanonical = new Map<string, string>();
-
-  const aliasedTools = tools.map((tool) => {
-    const canonicalName = tool.function.name;
-    let providerName = sanitizeToolName(canonicalName);
-
-    if (providerName.length === 0) {
-      providerName = 'tool';
-    }
-
-    if (usedProviderNames.has(providerName)) {
-      const baseName = providerName;
-      let suffix = 1;
-      while (usedProviderNames.has(`${baseName}__${suffix}`)) {
-        suffix += 1;
-      }
-      providerName = `${baseName}__${suffix}`;
-    }
-
-    usedProviderNames.add(providerName);
-    canonicalToProvider.set(canonicalName, providerName);
-    providerToCanonical.set(providerName, canonicalName);
-
-    if (providerName === canonicalName) {
-      return tool;
-    }
-
-    return {
-      ...tool,
-      function: {
-        ...tool.function,
-        name: providerName,
-      },
-    };
-  });
-
-  return {
-    tools: aliasedTools,
-    toCanonical(name: string) {
-      return providerToCanonical.get(name) ?? name;
-    },
-    toProvider(name: string) {
-      return canonicalToProvider.get(name) ?? name;
-    },
-  };
-}
-
-function sanitizeToolName(name: string): string {
-  return name.replace(PROVIDER_TOOL_NAME_PATTERN, '_');
-}
diff --git a/src/benchmark/prompt-criteria.ts b/src/benchmark/prompt-criteria.ts
deleted file mode 100644
index 7c57360..0000000
--- a/src/benchmark/prompt-criteria.ts
+++ /dev/null
@@ -1,41 +0,0 @@
-import type { GeneratedTask } from '../tasks/types.js';
-import type { PromptCapabilityWithSection } from '../project/discover-prompt.js';
-import type { PromptEvaluationCriteria } from './prompt-evaluator.js';
-import { generateCriteriaFromCapability } from './prompt-evaluator.js';
-
-export interface ResolvedPromptCriteria {
-  criteria: PromptEvaluationCriteria;
-  noActiveCriteria: boolean;
-}
-
-function isEmptyCriteria(c: PromptEvaluationCriteria): boolean {
-  const s = (c.requiredSections?.length ?? 0) === 0;
-  const k = (c.requiredKeywords?.length ?? 0) === 0 && (c.forbiddenKeywords?.length ?? 0) === 0;
-  const f = (c.formatPatterns?.length ?? 0) === 0 && (c.minLength ?? 0) === 0;
-  const structure =
-    c.hasCodeBlocks === undefined &&
-    c.hasNumberedList === undefined &&
-    c.hasTable === undefined;
-  return s && k && f && structure;
-}
-
-export function resolveCriteriaForTask(
-  task: GeneratedTask,
-  caps: readonly PromptCapabilityWithSection[],
-): ResolvedPromptCriteria {
-  if (!task.capabilityId) {
-    throw new Error(
-      `Task ${task.id}: prompt-surface task is missing capabilityId. ` +
-      `Regenerate tasks with \`skill-optimizer generate-tasks\`.`,
-    );
-  }
-  const cap = caps.find((c) => c.action.key === task.capabilityId);
-  if (!cap) {
-    const known = caps.map((c) => c.action.key).join(', ') || '(none discovered)';
-    throw new Error(
-      `Task ${task.id}: capabilityId "${task.capabilityId}" is not in the discovered capability set. Known: ${known}`,
-    );
-  }
-  const criteria = generateCriteriaFromCapability(cap.action, cap.section);
-  return { criteria, noActiveCriteria: isEmptyCriteria(criteria) };
-}
diff --git a/src/benchmark/prompt-evaluator.ts b/src/benchmark/prompt-evaluator.ts
deleted file mode 100644
index 8946629..0000000
--- a/src/benchmark/prompt-evaluator.ts
+++ /dev/null
@@ -1,403 +0,0 @@
-import type { ActionDefinition } from '../actions/types.js';
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-export interface FormatPattern {
-  name: string;
-  pattern: string;
-}
-
-export interface PromptEvaluationCriteria {
-  /** Required sections in the output (case-insensitive heading match). */
-  requiredSections?: string[];
-  /** Required format patterns (regex). */
-  formatPatterns?: FormatPattern[];
-  /** Minimum content length in characters. */
-  minLength?: number;
-  /** Keywords that must appear (case-insensitive). */
-  requiredKeywords?: string[];
-  /** Keywords that must NOT appear (hallucination check, case-insensitive). */
-  forbiddenKeywords?: string[];
-  /** Structural checks. */
-  hasCodeBlocks?: boolean;
-  hasNumberedList?: boolean;
-  hasTable?: boolean;
-}
-
-export interface PromptCheckDetail {
-  check: string;
-  passed: boolean;
-  detail: string;
-}
-
-export interface PromptEvaluationResult {
-  /** Overall score 0.0-1.0 (weighted across all criteria categories). */
-  score: number;
-  /** Human-readable detail for each sub-check. */
-  details: string[];
-  /** Structured breakdown of individual checks. */
-  checks: PromptCheckDetail[];
-  /** Per-category scores before weighting. */
-  categoryScores: {
-    sections: number;
-    format: number;
-    keywords: number;
-    structure: number;
-  };
-  /**
-   * True when every criteria category is empty. Evaluation cannot produce a
-   * meaningful score in this case; runner treats this as an evaluation error
-   * rather than a pass.
-   */
-  noActiveCriteria: boolean;
-}
-
-// ── Weights ───────────────────────────────────────────────────────────────────
-
-const WEIGHT_SECTIONS = 0.4;
-const WEIGHT_FORMAT = 0.2;
-const WEIGHT_KEYWORDS = 0.2;
-const WEIGHT_STRUCTURE = 0.2;
-
-// ── Helpers ───────────────────────────────────────────────────────────────────
-
-/**
- * Build a regex that matches a markdown heading (any level) whose text
- * contains the given section name.  Also matches bold lines and plain
- * uppercase lines that act as section headers.
- */
-function sectionRegex(section: string): RegExp {
-  const escaped = section.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-  // Match: ## Section Name, **Section Name**, or SECTION NAME on its own line
-  return new RegExp(
-    `(?:^#{1,6}\\s+.*${escaped}.*$)|(?:^\\*\\*.*${escaped}.*\\*\\*$)|(?:^${escaped}\\s*$)`,
-    'im',
-  );
-}
-
-function hasCodeBlock(text: string): boolean {
-  return /```[\s\S]*?```/.test(text);
-}
-
-function hasNumberedList(text: string): boolean {
-  // At least two consecutive numbered items
-  return /(?:^|\n)\s*\d+[\.\)]\s+\S.*\n\s*\d+[\.\)]\s+\S/m.test(text);
-}
-
-function hasTable(text: string): boolean {
-  // Markdown table: header row, separator row, at least one data row
-  return /\|.+\|[\r\n]+\|[\s:|-]+\|[\r\n]+\|.+\|/.test(text);
-}
-
-// ── Main evaluator ────────────────────────────────────────────────────────────
-
-/**
- * Evaluate a model's response against prompt-based criteria.
- * Returns a score 0.0-1.0 (recall equivalent for prompt surface).
- */
-export function evaluatePromptResponse(
-  response: string,
-  criteria: PromptEvaluationCriteria,
-): PromptEvaluationResult {
-  const checks: PromptCheckDetail[] = [];
-  const details: string[] = [];
-
-  // Track per-category numerator / denominator
-  let sectionHits = 0;
-  let sectionTotal = 0;
-  let formatHits = 0;
-  let formatTotal = 0;
-  let keywordHits = 0;
-  let keywordTotal = 0;
-  let structureHits = 0;
-  let structureTotal = 0;
-
-  // ── 1. Required sections (weight: 40%) ──────────────────────────────────
-  if (criteria.requiredSections && criteria.requiredSections.length > 0) {
-    sectionTotal = criteria.requiredSections.length;
-    for (const section of criteria.requiredSections) {
-      const found = sectionRegex(section).test(response);
-      if (found) sectionHits++;
-      const msg = found
-        ? `section "${section}": found`
-        : `section "${section}": MISSING`;
-      checks.push({ check: `section:${section}`, passed: found, detail: msg });
-      details.push(msg);
-    }
-  }
-
-  // ── 2. Format patterns (part of format weight: 20%) ─────────────────────
-  if (criteria.formatPatterns && criteria.formatPatterns.length > 0) {
-    formatTotal += criteria.formatPatterns.length;
-    for (const fp of criteria.formatPatterns) {
-      let matched = false;
-      try {
-        const re = new RegExp(fp.pattern, 'm');
-        matched = re.test(response);
-      } catch {
-        // Invalid regex — treat as not matched
-      }
-      if (matched) formatHits++;
-      const msg = matched
-        ? `format "${fp.name}": matched`
-        : `format "${fp.name}": NOT matched`;
-      checks.push({ check: `format:${fp.name}`, passed: matched, detail: msg });
-      details.push(msg);
-    }
-  }
-
-  // Minimum length is also a format check
-  if (criteria.minLength !== undefined && criteria.minLength > 0) {
-    formatTotal++;
-    const lengthOk = response.length >= criteria.minLength;
-    if (lengthOk) formatHits++;
-    const msg = lengthOk
-      ? `minLength (${criteria.minLength}): OK (${response.length} chars)`
-      : `minLength (${criteria.minLength}): TOO SHORT (${response.length} chars)`;
-    checks.push({ check: 'format:minLength', passed: lengthOk, detail: msg });
-    details.push(msg);
-  }
-
-  // ── 3. Keywords (weight: 20%) ───────────────────────────────────────────
-  const responseLower = response.toLowerCase();
-
-  if (criteria.requiredKeywords && criteria.requiredKeywords.length > 0) {
-    keywordTotal += criteria.requiredKeywords.length;
-    for (const kw of criteria.requiredKeywords) {
-      const found = responseLower.includes(kw.toLowerCase());
-      if (found) keywordHits++;
-      const msg = found
-        ? `keyword "${kw}": found`
-        : `keyword "${kw}": MISSING`;
-      checks.push({ check: `keyword:${kw}`, passed: found, detail: msg });
-      details.push(msg);
-    }
-  }
-
-  if (criteria.forbiddenKeywords && criteria.forbiddenKeywords.length > 0) {
-    keywordTotal += criteria.forbiddenKeywords.length;
-    for (const kw of criteria.forbiddenKeywords) {
-      const absent = !responseLower.includes(kw.toLowerCase());
-      if (absent) keywordHits++;
-      const msg = absent
-        ? `forbidden "${kw}": absent (good)`
-        : `forbidden "${kw}": PRESENT (hallucination)`;
-      checks.push({ check: `forbidden:${kw}`, passed: absent, detail: msg });
-      details.push(msg);
-    }
-  }
-
-  // ── 4. Structural checks (weight: 20%) ─────────────────────────────────
-  if (criteria.hasCodeBlocks !== undefined) {
-    structureTotal++;
-    const found = hasCodeBlock(response);
-    const pass = criteria.hasCodeBlocks ? found : !found;
-    if (pass) structureHits++;
-    const label = criteria.hasCodeBlocks ? 'expected' : 'unexpected';
-    const msg = pass
-      ? `codeBlocks (${label}): OK`
-      : `codeBlocks (${label}): ${found ? 'PRESENT' : 'MISSING'}`;
-    checks.push({ check: 'structure:codeBlocks', passed: pass, detail: msg });
-    details.push(msg);
-  }
-
-  if (criteria.hasNumberedList !== undefined) {
-    structureTotal++;
-    const found = hasNumberedList(response);
-    const pass = criteria.hasNumberedList ? found : !found;
-    if (pass) structureHits++;
-    const label = criteria.hasNumberedList ? 'expected' : 'unexpected';
-    const msg = pass
-      ? `numberedList (${label}): OK`
-      : `numberedList (${label}): ${found ? 'PRESENT' : 'MISSING'}`;
-    checks.push({ check: 'structure:numberedList', passed: pass, detail: msg });
-    details.push(msg);
-  }
-
-  if (criteria.hasTable !== undefined) {
-    structureTotal++;
-    const found = hasTable(response);
-    const pass = criteria.hasTable ? found : !found;
-    if (pass) structureHits++;
-    const label = criteria.hasTable ? 'expected' : 'unexpected';
-    const msg = pass
-      ? `table (${label}): OK`
-      : `table (${label}): ${found ? 'PRESENT' : 'MISSING'}`;
-    checks.push({ check: 'structure:table', passed: pass, detail: msg });
-    details.push(msg);
-  }
-
-  // ── Compute category scores ─────────────────────────────────────────────
-
-  const sectionScore = sectionTotal > 0 ? sectionHits / sectionTotal : 1.0;
-  const formatScore = formatTotal > 0 ? formatHits / formatTotal : 1.0;
-  const keywordScore = keywordTotal > 0 ? keywordHits / keywordTotal : 1.0;
-  const structureScore = structureTotal > 0 ? structureHits / structureTotal : 1.0;
-
-  // If a category has no checks, redistribute its weight proportionally
-  // to the categories that do have checks.
-  const activeParts: { weight: number; score: number }[] = [];
-  if (sectionTotal > 0) activeParts.push({ weight: WEIGHT_SECTIONS, score: sectionScore });
-  if (formatTotal > 0) activeParts.push({ weight: WEIGHT_FORMAT, score: formatScore });
-  if (keywordTotal > 0) activeParts.push({ weight: WEIGHT_KEYWORDS, score: keywordScore });
-  if (structureTotal > 0) activeParts.push({ weight: WEIGHT_STRUCTURE, score: structureScore });
-
-  let score: number;
-  let noActiveCriteria = false;
-  if (activeParts.length === 0) {
-    // No criteria are active — treat as evaluation error, not vacuous pass.
-    score = 0;
-    noActiveCriteria = true;
-  } else {
-    const totalActiveWeight = activeParts.reduce((s, p) => s + p.weight, 0);
-    score = activeParts.reduce((s, p) => s + (p.weight / totalActiveWeight) * p.score, 0);
-  }
-
-  return {
-    score,
-    details,
-    checks,
-    categoryScores: {
-      sections: sectionScore,
-      format: formatScore,
-      keywords: keywordScore,
-      structure: structureScore,
-    },
-    noActiveCriteria,
-  };
-}
-
-// ── Auto-generation from capability ───────────────────────────────────────────
-
-/**
- * Instruction verbs that signal the model should produce something specific.
- * We look for "verb <object>" patterns to extract required keywords.
- */
-const INSTRUCTION_VERBS = [
-  'include', 'list', 'show', 'display', 'provide', 'output', 'generate',
-  'create', 'write', 'produce', 'return', 'describe', 'explain', 'summarize',
-  'format', 'use', 'add', 'specify', 'mention', 'contain',
-];
-
-/**
- * Generate evaluation criteria from a prompt capability's section content.
- * Extracts expected output patterns from code blocks and instruction verbs
- * found in the skill section text.
- */
-export function generateCriteriaFromCapability(
-  capability: ActionDefinition,
-  skillSection: string,
-): PromptEvaluationCriteria {
-  const criteria: PromptEvaluationCriteria = {};
-
-  // ── 1. Extract required sections from markdown headings in the section ──
-  // Look for sub-headings that describe expected output structure
-  const headingRe = /^#{2,6}\s+(.+)$/gm;
-  const sections: string[] = [];
-  let headingMatch: RegExpExecArray | null;
-  while ((headingMatch = headingRe.exec(skillSection)) !== null) {
-    const heading = headingMatch[1].trim();
-    // Skip headings that are clearly meta / instructional
-    if (!/^(example|note|tip|warning|usage|syntax|overview|description)s?$/i.test(heading)) {
-      sections.push(heading);
-    }
-  }
-  if (sections.length > 0) {
-    criteria.requiredSections = sections;
-  }
-
-  // ── 2. Extract format patterns from code fences ──
-  // Code blocks in the skill section often show expected output templates.
-  const codeBlockRe = /```\w*\n([\s\S]*?)```/g;
-  const formatPatterns: FormatPattern[] = [];
-  let codeBlockCount = 0;
-  let codeBlockMatch: RegExpExecArray | null;
-  while ((codeBlockMatch = codeBlockRe.exec(skillSection)) !== null) {
-    codeBlockCount++;
-    const content = codeBlockMatch[1].trim();
-    // Skip very long code blocks (likely full examples, not format constraints)
-    if (content.length > 500) continue;
-    // Extract lines that look like template patterns (contain placeholders or fixed structure)
-    const templateLines = content.split('\n').filter(
-      line => /[{<\[].+[}>\]]/.test(line) || /^\s*\w+\s*[:=]/.test(line),
-    );
-    for (const line of templateLines) {
-      // Convert template placeholders to regex wildcards
-      const escaped = line.trim()
-        .replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
-        .replace(/\\[{<]\\?[^}>\\]+\\?[}>]/g, '.+');
-      if (escaped.length > 5) {
-        formatPatterns.push({
-          name: `template-line-${formatPatterns.length + 1}`,
-          pattern: escaped,
-        });
-      }
-    }
-  }
-  if (formatPatterns.length > 0) {
-    criteria.formatPatterns = formatPatterns;
-  }
-
-  // ── 3. Detect structural expectations ──
-  if (codeBlockCount > 0 || /code block|code example|snippet/i.test(skillSection)) {
-    criteria.hasCodeBlocks = true;
-  }
-  if (/numbered list|ordered list|step[- ]by[- ]step|\d+\.\s/i.test(skillSection)) {
-    criteria.hasNumberedList = true;
-  }
-  if (/\btable\b|markdown table|\|.*\|.*\|/i.test(skillSection)) {
-    criteria.hasTable = true;
-  }
-
-  // ── 4. Extract required keywords from instruction patterns ──
-  const requiredKeywords: string[] = [];
-  const verbPattern = new RegExp(
-    `\\b(?:${INSTRUCTION_VERBS.join('|')})\\b\\s+(?:the\\s+|a\\s+|an\\s+)?["']?([\\w][\\w\\s-]{2,30}?)["']?(?:\\s*[,;.]|\\s+(?:in|to|for|as|with|from|using))`,
-    'gi',
-  );
-  const seenKeywords = new Set<string>();
-  let verbMatch: RegExpExecArray | null;
-  while ((verbMatch = verbPattern.exec(skillSection)) !== null) {
-    const keyword = verbMatch[1].trim().toLowerCase();
-    // Skip very generic words
-    if (keyword.length < 3) continue;
-    if (/^(the|this|that|your|each|all|any|it|them|these|those)$/i.test(keyword)) continue;
-    if (!seenKeywords.has(keyword)) {
-      seenKeywords.add(keyword);
-      requiredKeywords.push(keyword);
-    }
-  }
-  if (requiredKeywords.length > 0) {
-    criteria.requiredKeywords = requiredKeywords;
-  }
-
-  // ── 5. Extract forbidden keywords from explicit "do not" / "never" instructions ──
-  const forbiddenKeywords: string[] = [];
-  const forbiddenPattern = /\b(?:do\s+not|don't|never|avoid|must\s+not|should\s+not|shouldn't)\b\s+(?:\w+\s+)?["']?(\w[\w\s-]{2,30}?)["']?\b/gi;
-  const seenForbidden = new Set<string>();
-  let forbiddenMatch: RegExpExecArray | null;
-  while ((forbiddenMatch = forbiddenPattern.exec(skillSection)) !== null) {
-    const keyword = forbiddenMatch[1].trim().toLowerCase();
-    if (keyword.length < 3) continue;
-    if (/^(the|this|that|your|each|all|any|it|them|use|include|mention)$/i.test(keyword)) continue;
-    if (!seenForbidden.has(keyword)) {
-      seenForbidden.add(keyword);
-      forbiddenKeywords.push(keyword);
-    }
-  }
-  if (forbiddenKeywords.length > 0) {
-    criteria.forbiddenKeywords = forbiddenKeywords;
-  }
-
-  // ── 6. Infer minimum length from the capability description ──
-  // Longer descriptions with many args suggest non-trivial output
-  const descLength = (capability.description ?? '').length;
-  const argCount = capability.args.length;
-  if (descLength > 200 || argCount > 3) {
-    criteria.minLength = 200;
-  } else if (descLength > 50 || argCount > 1) {
-    criteria.minLength = 100;
-  }
-
-  return criteria;
-}
diff --git a/src/benchmark/prompts.ts b/src/benchmark/prompts.ts
deleted file mode 100644
index 065e280..0000000
--- a/src/benchmark/prompts.ts
+++ /dev/null
@@ -1,125 +0,0 @@
-import type { FetchedSkill, TaskDefinition, SdkLanguage } from './types.js';
-
-export type PromptSurface = 'sdk' | 'cli' | 'mcp' | 'prompt';
-
-interface PromptOptions {
-  surface: PromptSurface;
-  agentic?: boolean;
-  shell?: 'bash' | 'sh';
-  sdkLanguage?: SdkLanguage;
-}
-
-const SDK_LANGUAGE_LABELS: Record<SdkLanguage, string> = {
-  typescript: 'TypeScript',
-  python: 'Python',
-  rust: 'Rust',
-};
-
-const SDK_FENCE_LABELS: Record<SdkLanguage, string> = {
-  typescript: 'typescript',
-  python: 'python',
-  rust: 'rust',
-};
-
-/**
- * Build the system prompt with the skill documentation.
- *
- * @param skill - The fetched skill documentation (may be null if no skill configured)
- * @param sdkName - The name of the SDK/tool set from config
- * @param options - Prompt options
- */
-export function buildSystemPrompt(
-  skill: FetchedSkill | null,
-  sdkName: string,
-  options: PromptOptions,
-): string {
-  const guidanceSection = skill
-    ? `\n\nOptional guidance context (SKILL.md):\n--- GUIDANCE ---\n${skill.content}\n--- END GUIDANCE ---`
-    : '';
-
-  if (options.surface === 'prompt') {
-    // For prompt surface, the skill IS the system prompt.
-    // If skill content is available, use it directly; otherwise use a generic wrapper.
-    if (skill) {
-      return skill.content;
-    }
-    return `You are a helpful assistant for ${sdkName}. Follow the instructions and complete the task.`;
-  }
-
-  if (options.surface === 'mcp') {
-    return (
-      `You are a helpful assistant with access to tools for ${sdkName}.\n` +
-      `Use the provided tools to complete the task.\n` +
-      `Output must be tool calls only. Do not include code blocks or prose explanations.\n` +
-      `Never invent tool names that are not available.` +
-      guidanceSection
-    );
-  }
-
-  if (options.surface === 'cli') {
-    const shell = options.shell ?? 'bash';
-    return (
-      `You are a command-line assistant for ${sdkName}.\n` +
-      `Respond with exactly one fenced code block tagged ${shell}.\n` +
-      `The block must contain commands only (no comments, no explanations, no surrounding prose).\n` +
-      `Use only commands documented in the provided context.` +
-      guidanceSection
-    );
-  }
-
-  const sdkLanguage = options.sdkLanguage ?? 'typescript';
-  const sdkFence = SDK_FENCE_LABELS[sdkLanguage];
-  const sdkLabel = SDK_LANGUAGE_LABELS[sdkLanguage];
-
-  return (
-    `You are an expert developer using ${sdkName}.\n` +
-    `Respond with exactly one fenced ${sdkFence} code block.\n` +
-    `Write ${sdkLabel} code.\n` +
-    `Use SDK APIs only; do not invent SDK classes or methods.\n` +
-    (options.agentic
-      ? `A \`web_fetch\` tool is available for additional documentation lookup when needed.\n`
-      : '') +
-    guidanceSection
-  );
-}
-
-/**
- * Build the user prompt for a specific task.
- *
- * @param task - The task definition
- * @param options - Prompt options
- */
-export function buildTaskPrompt(
-  task: TaskDefinition,
-  options: PromptOptions,
-): string {
-  if (options.surface === 'prompt') {
-    return task.prompt;
-  }
-
-  if (options.surface === 'mcp') {
-    return (
-      `Task: ${task.prompt}\n\n` +
-      `Use only tool calls to complete this task. Do not write code.`
-    );
-  }
-
-  if (options.surface === 'cli') {
-    const shell = options.shell ?? 'bash';
-    return (
-      `Task: ${task.prompt}\n\n` +
-      `Return exactly one fenced ${shell} block with commands only. No prose.`
-    );
-  }
-
-  const sdkLanguage = options.sdkLanguage ?? 'typescript';
-  const sdkFence = SDK_FENCE_LABELS[sdkLanguage];
-  const sdkLabel = SDK_LANGUAGE_LABELS[sdkLanguage];
-
-  return (
-    `Task: ${task.prompt}\n\n` +
-    `Write a complete ${sdkLabel} solution in a single fenced ${sdkFence} code block. ` +
-    `Use only the documented SDK APIs. ` +
-    (options.agentic ? 'Use available documentation tools if needed. ' : '')
-  );
-}
diff --git a/src/benchmark/reporter.ts b/src/benchmark/reporter.ts
deleted file mode 100644
index 301363e..0000000
--- a/src/benchmark/reporter.ts
+++ /dev/null
@@ -1,255 +0,0 @@
-import type { BenchmarkReport, TaskResult } from './types.js';
-import { getExpectedActions, getExpectedActionName } from './types.js';
-
-// ── Helpers ──────────────────────────────────────────────────────────────────
-
-function pct(n: number): string {
-  return `${(n * 100).toFixed(1)}%`;
-}
-
-function fixed2(n: number): string {
-  return n.toFixed(2);
-}
-
-function padR(s: string, w: number): string {
-  return s.length >= w ? s.slice(0, w) : s + ' '.repeat(w - s.length);
-}
-
-
-function center(s: string, w: number): string {
-  if (s.length >= w) return s.slice(0, w);
-  const total = w - s.length;
-  const left = Math.floor(total / 2);
-  const right = total - left;
-  return ' '.repeat(left) + s + ' '.repeat(right);
-}
-
-// ── Markdown generation ───────────────────────────────────────────────────────
-
-export function generateMarkdown(report: BenchmarkReport): string {
-  const lines: string[] = [];
-  const { summary, skillVersion, coverage, results } = report;
-
-  // 1. Title + metadata
-  lines.push('# Skill Benchmark Report');
-  lines.push('');
-  lines.push(`**Generated:** ${new Date(report.timestamp).toUTCString()}`);
-  if (skillVersion) {
-    lines.push(`**Skill Version:** \`${skillVersion.source}@${skillVersion.commitSha.slice(0, 8)}\` (ref: \`${skillVersion.ref}\`)`);
-    lines.push(`**Fetched At:** ${skillVersion.fetchedAt}`);
-  }
-  if (report.config) {
-    lines.push(`**Benchmark:** ${report.config.name} (surface: ${report.config.surface})`);
-  }
-  lines.push('');
-
-  // 2. Summary metrics table
-  lines.push('## Summary');
-  lines.push('');
-  lines.push('| Metric | Value |');
-  lines.push('|--------|-------|');
-  lines.push(`| Total Tasks | ${summary.totalTasks} |`);
-  lines.push(`| Total Models | ${summary.totalModels} |`);
-  lines.push(`| Total Evaluations | ${summary.totalEvaluations} |`);
-  lines.push(`| Overall Pass Rate | ${pct(summary.overallPassRate)} |`);
-  lines.push(`| Avg Recall | ${fixed2(summary.avgToolRecall)} |`);
-  lines.push(`| Action Coverage | ${pct(summary.methodCoveragePercent)} |`);
-  lines.push('');
-
-  // 3. Per-model results table
-  lines.push('## Results by Model');
-  lines.push('');
-  lines.push('| Model | Tier | Pass Rate | Recall | Tasks Run |');
-  lines.push('|-------|------|-----------|--------|-----------|');
-
-  // Collect model display names from results
-  const modelNameMap = new Map<string, string>();
-  const modelTierMap = new Map<string, string>();
-  for (const r of results) {
-    modelNameMap.set(r.model.id, r.model.name);
-    modelTierMap.set(r.model.id, r.model.tier);
-  }
-
-  // Sort by pass rate descending
-  const modelEntries = Object.entries(summary.perModel).sort(
-    ([, a], [, b]) => b.passRate - a.passRate,
-  );
-  for (const [modelId, ms] of modelEntries) {
-    const name = modelNameMap.get(modelId) ?? modelId;
-    const tier = modelTierMap.get(modelId) ?? '—';
-    lines.push(
-      `| ${name} | ${tier} | ${pct(ms.passRate)} | ${fixed2(ms.avgRecall)} | ${ms.tasksRun} |`,
-    );
-  }
-  lines.push('');
-
-  // 4. Per-task results table
-  lines.push('## Results by Task');
-  lines.push('');
-  lines.push('| Task | Pass Rate | Recall | Failed Models |');
-  lines.push('|------|-----------|--------|---------------|');
-
-  // Build failed models per task
-  const failedModelsPerTask = new Map<string, string[]>();
-  for (const r of results) {
-    if (!r.metrics.taskPassed) {
-      const list = failedModelsPerTask.get(r.task.id) ?? [];
-      list.push(r.model.name);
-      failedModelsPerTask.set(r.task.id, list);
-    }
-  }
-
-  const taskEntries = Object.entries(summary.perTask).sort(
-    ([, a], [, b]) => a.passRate - b.passRate,
-  );
-  for (const [taskId, ts] of taskEntries) {
-    const failed = failedModelsPerTask.get(taskId) ?? [];
-    const failedStr = failed.length > 0 ? failed.join(', ') : '—';
-    lines.push(
-      `| \`${taskId}\` | ${pct(ts.passRate)} | ${fixed2(ts.avgRecall)} | ${failedStr} |`,
-    );
-  }
-  lines.push('');
-
-  // 5. Per-tier summary
-  lines.push('## Results by Tier');
-  lines.push('');
-  lines.push('| Tier | Pass Rate | Tool Sel. | Arg Acc. | Avg Recall |');
-  lines.push('|------|-----------|-----------|----------|------------|');
-  for (const tier of ['flagship', 'mid', 'low'] as const) {
-    const t = summary.perTier[tier];
-    lines.push(`| ${tier} | ${pct(t.passRate)} | ${fixed2(t.avgToolSelectionAccuracy)} | ${fixed2(t.avgArgAccuracy)} | ${fixed2(t.avgRecall)} |`);
-  }
-  lines.push('');
-
-  // 6. Known-action coverage table
-  lines.push('## Known Action Coverage');
-  lines.push('');
-  lines.push('| Action | Covered | Tasks |');
-  lines.push('|--------|---------|-------|');
-  for (const mc of coverage) {
-    const icon = mc.covered ? '✔' : '✘';
-    const tasks = mc.tasksCovering.length > 0 ? mc.tasksCovering.join(', ') : '—';
-    lines.push(`| \`${mc.method}\` | ${icon} | ${tasks} |`);
-  }
-  lines.push('');
-
-  // 7. Detailed results per task
-  lines.push('## Detailed Results');
-  lines.push('');
-
-  // Group results by task
-  const byTask = new Map<string, TaskResult[]>();
-  for (const r of results) {
-    const list = byTask.get(r.task.id) ?? [];
-    list.push(r);
-    byTask.set(r.task.id, list);
-  }
-
-  for (const [taskId, taskResults] of byTask) {
-    const firstResult = taskResults[0]!;
-    lines.push(`### Task: \`${taskId}\``);
-    lines.push('');
-    lines.push(`**Prompt:** ${firstResult.task.prompt}`);
-    lines.push('');
-    lines.push(
-      `**Expected Actions:** ${getExpectedActions(firstResult.task).map((t) => `\`${getExpectedActionName(t)}\``).join(', ')}`,
-    );
-    lines.push('');
-    lines.push('| Model | Status | Recall | Latency |');
-    lines.push('|-------|--------|--------|---------|');
-
-    for (const r of taskResults) {
-      const status = r.metrics.taskPassed ? '✅ PASS' : '❌ FAIL';
-      lines.push(
-        `| ${r.model.name} | ${status} | ${fixed2(r.metrics.toolRecall)} | ${r.llmLatencyMs}ms |`,
-      );
-    }
-    lines.push('');
-  }
-
-  return lines.join('\n');
-}
-
-// ── Console summary ───────────────────────────────────────────────────────────
-
-/**
- * Print a summary table to the console.
- */
-export function printSummary(report: BenchmarkReport): void {
-  const { summary, results } = report;
-  const surface = report.config?.surface ?? 'unknown';
-
-  // Collect model display info
-  const modelNameMap = new Map<string, string>();
-  const modelTierMap = new Map<string, string>();
-  for (const r of results) {
-    modelNameMap.set(r.model.id, r.model.name);
-    modelTierMap.set(r.model.id, r.model.tier);
-  }
-
-  // Column widths
-  const COL_MODEL = 24;
-  const COL_TIER = 10;
-  const COL_PASS = 11;
-  const COL_RECALL = 9;
-
-  const divider =
-    '+' +
-    '─'.repeat(COL_MODEL + 2) +
-    '+' +
-    '─'.repeat(COL_TIER + 2) +
-    '+' +
-    '─'.repeat(COL_PASS + 2) +
-    '+' +
-    '─'.repeat(COL_RECALL + 2) +
-    '+';
-
-  console.log(`\nSkill Benchmark — Surface-Based SDK/CLI/MCP Evaluation (surface: ${surface})`);
-  console.log(divider);
-  console.log(
-    '| ' +
-      padR('Model', COL_MODEL) +
-      ' | ' +
-      padR('Tier', COL_TIER) +
-      ' | ' +
-      center('Pass Rate', COL_PASS) +
-      ' | ' +
-      center('Recall', COL_RECALL) +
-      ' |',
-  );
-  console.log(divider);
-
-  // Sort by pass rate descending
-  const modelEntries = Object.entries(summary.perModel).sort(
-    ([, a], [, b]) => b.passRate - a.passRate,
-  );
-
-  for (const [modelId, ms] of modelEntries) {
-    const name = modelNameMap.get(modelId) ?? modelId;
-    const tier = modelTierMap.get(modelId) ?? '—';
-    console.log(
-      '| ' +
-        padR(name, COL_MODEL) +
-        ' | ' +
-        padR(tier, COL_TIER) +
-        ' | ' +
-        center(pct(ms.passRate), COL_PASS) +
-        ' | ' +
-        center(fixed2(ms.avgRecall), COL_RECALL) +
-        ' |',
-    );
-  }
-
-  console.log(divider);
-  console.log('');
-
-  // Overall stats
-  console.log('Overall Statistics:');
-  console.log(`  Pass Rate:      ${pct(summary.overallPassRate)}`);
-  console.log(`  Avg Recall:     ${fixed2(summary.avgToolRecall)}`);
-  console.log(`  Coverage:       ${pct(summary.methodCoveragePercent)}`);
-  console.log(`  Evaluations:    ${summary.totalEvaluations} (${summary.totalTasks} tasks × ${summary.totalModels} models)`);
-  console.log('');
-
-}
diff --git a/src/benchmark/runner.ts b/src/benchmark/runner.ts
deleted file mode 100644
index 1383117..0000000
--- a/src/benchmark/runner.ts
+++ /dev/null
@@ -1,590 +0,0 @@
-import { writeFileSync, mkdirSync } from 'node:fs';
-import { resolve } from 'node:path';
-
-import type {
-  Tier,
-  BenchmarkConfig,
-  TaskResult,
-  BenchmarkReport,
-  ModelConfig,
-  ExtractedCall,
-  MethodCoverage,
-  SkillVersion,
-  ModelSummary,
-  TaskSummary,
-  ToolExecutor,
-  McpToolDefinition,
-  CoverageReport,
-} from './types.js';
-import { getExpectedActionName, getExpectedActions } from './types.js';
-import { loadConfig, loadTasks, loadMcpTools, loadCliCommands, slugify, getModelBySlug, getModelsByTier } from './config.js';
-import { createLLMClient } from './llm/index.js';
-import { extract } from './extractors/index.js';
-import { fetchSkill } from './skill-fetcher.js';
-import { evaluateTask } from './evaluator.js';
-import { computeCoverage } from './coverage.js';
-import { computeVerdict } from './scoring.js';
-import { buildSystemPrompt, buildTaskPrompt } from './prompts.js';
-import { evaluatePromptResponse } from './prompt-evaluator.js';
-import { resolveCriteriaForTask } from './prompt-criteria.js';
-import { discoverPromptCapabilitiesWithSections } from '../project/discover-prompt.js';
-
-function buildWebFetchTool(): McpToolDefinition {
-  return {
-    type: 'function',
-    function: {
-      name: 'web_fetch',
-      description: 'Fetch a reference document by path. Use this to load SDK documentation referenced in the skill.',
-      parameters: {
-        type: 'object',
-        properties: { url: { type: 'string', description: 'Path to the reference document' } },
-        required: ['url'],
-      },
-    },
-  };
-}
-
-function createReferenceExecutor(baseUrl: string, allowedPaths: string[]): { executor: ToolExecutor; fetchedPaths: string[] } {
-  const fetched: string[] = [];
-  const allowed = new Set(allowedPaths);
-  const executor: ToolExecutor = async (name, args) => {
-    if (name !== 'web_fetch') return `Error: Unknown tool "${name}"`;
-    let url = (args.url ?? args.path ?? '') as string;
-    url = url.replace(/^\/+/, '');
-    const prefix = baseUrl.replace(/\/+$/, '') + '/';
-    if (url.startsWith(prefix)) url = url.slice(prefix.length);
-    if (url.startsWith('https://')) {
-      const idx = url.indexOf(prefix);
-      if (idx !== -1) url = url.slice(idx + prefix.length);
-    }
-    if (!allowed.has(url)) return `Error: Path "${url}" not in allowed list. Available: ${allowedPaths.join(', ')}`;
-    fetched.push(url);
-    const fullUrl = `${baseUrl.replace(/\/+$/, '')}/${url}`;
-    try {
-      const controller = new AbortController();
-      const timer = setTimeout(() => controller.abort(), 30_000);
-      try {
-        const res = await fetch(fullUrl, { signal: controller.signal });
-        if (!res.ok) return `Error: HTTP ${res.status} fetching ${fullUrl}`;
-        return await res.text();
-      } finally { clearTimeout(timer); }
-    } catch (err) { return `Error: ${err instanceof Error ? err.message : String(err)}`; }
-  };
-  return { executor, fetchedPaths: fetched };
-}
-
-function formatMcpResponseContent(
-  textContent: string,
-  toolCalls: Array<{ name: string; arguments: Record<string, unknown> }>,
-): string {
-  const parts: string[] = [];
-  const trimmed = textContent.trim();
-  if (trimmed) {
-    parts.push(trimmed);
-  }
-  if (toolCalls.length > 0) {
-    parts.push(`Tool calls:\n${JSON.stringify(toolCalls, null, 2)}`);
-  }
-  return parts.join('\n\n');
-}
-
-export interface RunnerOptions {
-  configPath?: string;
-  tier?: Tier;
-  taskId?: string;
-  modelSlug?: string;
-  noCache?: boolean;
-  outputDir?: string;
-  verdictPolicy?: { perModelFloor: number; targetWeightedAverage: number };
-  scopeCoverage?: CoverageReport;
-  /**
-   * Override the skill source path for this run. Used by the optimizer to
-   * make the benchmark read a locally-versioned skill copy instead of the
-   * one committed in the target repo.
-   */
-  skillOverride?: string;
-}
-
-/**
- * Run the full benchmark.
- */
-export async function runBenchmark(options: RunnerOptions = {}): Promise<BenchmarkReport> {
-  const { config, configDir } = await loadConfig(options.configPath);
-
-  console.log('================================================================');
-  console.log(`  Skill Benchmark — ${config.name}`);
-  console.log('================================================================\n');
-
-  if (config.tasks === '__generated__') {
-    throw new Error(
-      'This benchmark config uses task generation (benchmark.taskGeneration.enabled=true). ' +
-      'Run `npx tsx src/cli.ts generate-tasks --config <path>` first, or use the CLI `run` command ' +
-      'which generates tasks automatically before benchmarking.',
-    );
-  }
-  let tasks = loadTasks(config.tasks, configDir);
-
-  let knownMethods: Set<string>;
-  let cliCommands: ReturnType<typeof loadCliCommands> | undefined = undefined;
-  let mcpToolDefs: ReturnType<typeof loadMcpTools> | undefined = undefined;
-  if (config.surface === 'prompt') {
-    // Prompt surface: no tool/action definitions — evaluation is content-based.
-    knownMethods = new Set<string>();
-  } else if (config.surface === 'sdk') {
-    const fromTasks = new Set<string>();
-    for (const task of tasks) {
-      for (const action of getExpectedActions(task)) {
-        fromTasks.add(getExpectedActionName(action));
-      }
-    }
-    const apiSurface = config.sdk?.apiSurface ?? [];
-    knownMethods = new Set([...fromTasks, ...apiSurface]);
-  } else if (config.surface === 'cli') {
-    cliCommands = loadCliCommands(config.cli!.commands, configDir);
-    knownMethods = new Set(cliCommands.map((c) => c.command));
-  } else {
-    mcpToolDefs = config.mcpToolDefinitions ?? (config.mcp ? loadMcpTools(config.mcp.tools, configDir) : []);
-    knownMethods = new Set(mcpToolDefs.map(t => t.function.name));
-  }
-  console.log(`[tasks] Loaded ${tasks.length} tasks from ${config.tasks}`);
-
-  if (options.taskId) {
-    tasks = tasks.filter(t => t.id === options.taskId);
-    if (tasks.length === 0) {
-      throw new Error(`Task '${options.taskId}' not found in ${config.tasks}`);
-    }
-    console.log(`[tasks] Filtered to task: ${options.taskId}`);
-  }
-
-  if (config.surface === 'prompt') {
-    console.log('[prompt] Content-based evaluation — no action definitions needed');
-  } else if (config.surface === 'mcp' && mcpToolDefs) {
-    const sourceLabel = config.surfaceSnapshot ? 'surface snapshot' : config.mcp?.tools ?? 'MCP manifest';
-    console.log(`[mcp] Loaded ${mcpToolDefs.length} tool definitions from ${sourceLabel}`);
-  } else if (config.surface === 'cli' && cliCommands) {
-    console.log(`[cli] Loaded ${cliCommands.length} command definitions from ${config.cli!.commands}`);
-  }
-
-  const skillConfig = options.skillOverride
-    ? { ...(config.skill ?? {}), source: options.skillOverride, cache: false } as typeof config.skill
-    : config.skill;
-  const skill = await fetchSkill(options.noCache
-    ? { ...skillConfig, cache: false } as typeof skillConfig
-    : skillConfig
-  );
-  if (skill) {
-    console.log(`[skill] Version: ${skill.version.source}@${skill.version.commitSha.slice(0, 8)}\n`);
-  } else {
-    console.log('[skill] No skill configured — using generic system prompt\n');
-  }
-
-  // Discover prompt capabilities once; criteria are resolved per-task inside the loop.
-  const promptCaps = (config.surface === 'prompt' && skill)
-    ? discoverPromptCapabilitiesWithSections(skill.content)
-    : [];
-  if (config.surface === 'prompt') {
-    console.log(`[prompt] ${promptCaps.length} capabilities discovered`);
-  }
-
-
-  const promptOptions = {
-    surface: config.surface,
-    agentic: Boolean(config.agentic),
-    shell: config.cli?.shell,
-    sdkLanguage: config.sdk?.language,
-  };
-  const systemPrompt = buildSystemPrompt(skill, config.name, promptOptions);
-  console.log(`[prompt] Surface: ${config.surface}`);
-  console.log(`[prompt] System prompt: ${systemPrompt.length} chars\n`);
-
-  const client = createLLMClient(config.llm);
-
-  let models: ModelConfig[] = [...config.llm.models];
-  if (options.tier) {
-    models = getModelsByTier(config, options.tier);
-    console.log(`[models] Filtered to tier: ${options.tier} (${models.length} models)`);
-  }
-  if (options.modelSlug) {
-    const found = getModelBySlug(config, options.modelSlug);
-    if (!found) {
-      throw new Error(`Model '${options.modelSlug}' not found in config`);
-    }
-    models = [found];
-    console.log(`[models] Filtered to model: ${found.name}`);
-  }
-
-  console.log(`\n[run] ${tasks.length} tasks × ${models.length} models = ${tasks.length * models.length} evaluations\n`);
-
-  const outputDir = options.outputDir
-    ? resolve(options.outputDir)
-    : resolve(configDir, config.output?.dir ?? 'benchmark-results');
-  mkdirSync(outputDir, { recursive: true });
-
-  const results: TaskResult[] = [];
-
-  for (const task of tasks) {
-    console.log(`\n${'─'.repeat(60)}`);
-    console.log(`  Task: ${task.id}`);
-    console.log(`  Prompt: ${task.prompt.slice(0, 80)}${task.prompt.length > 80 ? '...' : ''}`);
-    console.log('─'.repeat(60));
-
-    for (const model of models) {
-      const slug = slugify(model.name);
-      console.log(`\n  [${slug}] Calling ${model.name}...`);
-
-      let rawResponse = '';
-      let llmLatencyMs = 0;
-      let tokenUsage;
-      let error: string | undefined;
-      let llmResponse;
-      let fetchedPaths: string[] = [];
-
-      const start = Date.now();
-      try {
-        if (config.agentic) {
-          const ref = createReferenceExecutor(
-            config.agentic.references.baseUrl, config.agentic.references.allowedPaths,
-          );
-          llmResponse = await client.chatAgentLoop(
-            model.id, systemPrompt, buildTaskPrompt(task, promptOptions),
-            [buildWebFetchTool()], ref.executor, config.agentic.maxTurns ?? 5,
-          );
-          fetchedPaths = ref.fetchedPaths;
-        } else if (config.surface === 'mcp' && mcpToolDefs) {
-          llmResponse = await client.chatWithTools(
-            model.id,
-            systemPrompt,
-            buildTaskPrompt(task, promptOptions),
-            mcpToolDefs,
-          );
-        } else {
-          // SDK and CLI surfaces both use plain chat transport.
-          llmResponse = await client.chat(
-            model.id,
-            systemPrompt,
-            buildTaskPrompt(task, promptOptions),
-          );
-        }
-        rawResponse = llmResponse.content;
-        if (config.surface === 'mcp' && (llmResponse.toolCalls?.length ?? 0) > 0) {
-          rawResponse = formatMcpResponseContent(llmResponse.content, llmResponse.toolCalls ?? []);
-        }
-        tokenUsage = llmResponse.usage;
-        llmLatencyMs = Date.now() - start;
-        const toolCallCount = llmResponse.toolCalls?.length ?? 0;
-        if (config.surface === 'mcp') {
-          const toolLabel = `${toolCallCount} tool call${toolCallCount === 1 ? '' : 's'}`;
-          if (toolCallCount > 0 && llmResponse.content.length === 0) {
-            console.log(
-              `  [${slug}] Structured response: ${toolLabel}, no text reply (${llmLatencyMs}ms)`,
-            );
-          } else if (toolCallCount > 0) {
-            console.log(
-              `  [${slug}] Structured response: ${toolLabel} + ${llmResponse.content.length} chars text (${llmLatencyMs}ms)`,
-            );
-          } else {
-            console.log(`  [${slug}] Text response: ${llmResponse.content.length} chars (${llmLatencyMs}ms)`);
-          }
-        } else {
-          console.log(`  [${slug}] Response: ${rawResponse.length} chars (${llmLatencyMs}ms)`);
-        }
-      } catch (err) {
-        llmLatencyMs = Date.now() - start;
-        error = err instanceof Error ? err.message : String(err);
-        console.error(`  [${slug}] FAILED: ${error}`);
-        llmResponse = { content: '', usage: undefined };
-      }
-
-      let extractedCalls: ExtractedCall[] = [];
-      let generatedCode: string | null = null;
-      let bindings: Map<string, string> | undefined;
-
-      if (config.surface === 'prompt') {
-        // Prompt surface: no extraction — evaluation is content-based.
-        // The raw text response is the output; no tool calls or code blocks to parse.
-        console.log(`  [${slug}] Prompt response: ${rawResponse.length} chars`);
-      } else {
-        try {
-          const extractionConfig = config.surface === 'cli' && cliCommands
-            ? {
-                ...config,
-                cli: {
-                  ...config.cli,
-                  commandDefinitions: cliCommands,
-                },
-              }
-            : config;
-
-          const extracted = await extract(llmResponse!, extractionConfig as BenchmarkConfig);
-          extractedCalls = extracted.calls;
-          generatedCode = extracted.generatedCode;
-          bindings = extracted.bindings;
-
-          if (config.surface === 'sdk') {
-            const sdkLanguage = config.sdk?.language ?? 'typescript';
-            if (generatedCode) {
-              console.log(`  [${slug}] ${sdkLanguage} code extracted: ${generatedCode.length} chars`);
-            } else if (!error) {
-              console.log(`  [${slug}] WARNING: No ${sdkLanguage} code block found`);
-              error = error ?? `No ${sdkLanguage} code block in response`;
-            }
-          }
-
-          if (config.surface === 'cli') {
-            if (generatedCode) {
-              console.log(`  [${slug}] Command block extracted: ${generatedCode.length} chars`);
-            } else if (!error) {
-              console.log(`  [${slug}] WARNING: No shell command block found`);
-              error = error ?? 'No shell command block in response';
-            }
-          }
-
-          if (extractedCalls.length > 0) {
-            console.log(`  [${slug}] Extracted ${extractedCalls.length} calls: ${extractedCalls.map(c => c.method).join(', ')}`);
-          }
-        } catch (err) {
-          console.error(`  [${slug}] Extraction error: ${err instanceof Error ? err.message : err}`);
-          error = error ?? `Extraction failed: ${err instanceof Error ? err.message : err}`;
-        }
-      }
-
-      const taskResult = evaluateTask({
-        task,
-        model,
-        generatedCode,
-        rawResponse,
-        extractedCalls,
-        llmLatencyMs,
-        tokenUsage,
-        error,
-        knownMethods,
-        bindings,
-        surface: config.surface,
-      });
-
-      // Prompt surface: replace vacuous tool-recall with per-capability content score.
-      if (config.surface === 'prompt') {
-        // toolPrecision is vacuously 1.0 for prompt tasks (no tool calls = no wrong calls).
-        taskResult.metrics.toolPrecision = 1.0;
-        // Only evaluate if the model call succeeded; error already records the failure.
-        if (!error) {
-          try {
-            const { criteria } = resolveCriteriaForTask(task, promptCaps);
-            const promptResult = evaluatePromptResponse(rawResponse, criteria);
-            if (promptResult.noActiveCriteria) {
-              const msg = `Task "${task.id}" has no extractable criteria — fix SKILL.md section for that action`;
-              taskResult.metrics.toolRecall = 0;
-              taskResult.metrics.taskPassed = false;
-              taskResult.error = taskResult.error ?? msg;
-              console.error(`  [${slug}] Prompt eval error: ${msg}`);
-            } else {
-              taskResult.metrics.toolRecall = promptResult.score;
-              taskResult.metrics.taskPassed = promptResult.score >= 0.5;
-              console.log(`  [${slug}] Prompt score: ${promptResult.score.toFixed(3)} → ${taskResult.metrics.taskPassed ? 'PASS' : 'FAIL'}`);
-            }
-          } catch (err) {
-            const msg = err instanceof Error ? err.message : String(err);
-            console.error(`  [${slug}] Prompt eval error: ${msg}`);
-            taskResult.metrics.toolRecall = 0;
-            taskResult.metrics.taskPassed = false;
-            taskResult.error = taskResult.error ?? msg;
-          }
-        }
-      }
-
-      if (config.agentic && task.expected_fetches) {
-        const actualFetches = fetchedPaths;
-        const expectedSet = new Set(task.expected_fetches);
-        const actualSet = new Set(actualFetches);
-        const matched = [...expectedSet].filter(f => actualSet.has(f));
-        taskResult.metrics.fetchRecall = expectedSet.size === 0 ? 1.0 : matched.length / expectedSet.size;
-        taskResult.metrics.fetchPrecision = actualSet.size === 0 ? 0.0 : matched.length / actualSet.size;
-        taskResult.metrics.actualFetches = actualFetches;
-        taskResult.metrics.taskPassed = taskResult.metrics.taskPassed && taskResult.metrics.fetchRecall === 1.0;
-        const fetchStatus = taskResult.metrics.fetchRecall === 1.0 ? 'correct' : 'WRONG';
-        console.log(`  [${slug}] Fetched: [${actualFetches.join(', ')}] (${fetchStatus})`);
-      }
-
-      const status = taskResult.metrics.taskPassed ? '✅ PASS' : '❌ FAIL';
-      console.log(
-        `  [${slug}] ${status}  recall=${taskResult.metrics.toolRecall.toFixed(2)}`
-      );
-
-      results.push(taskResult);
-
-      // Save raw response per model per task
-      const taskModelDir = resolve(outputDir, task.id, slug);
-      mkdirSync(taskModelDir, { recursive: true });
-      writeFileSync(resolve(taskModelDir, 'response.md'), rawResponse, 'utf-8');
-      if (generatedCode) {
-        const generatedFile = config.surface === 'sdk'
-          ? config.sdk?.language === 'python'
-            ? 'code.py'
-            : config.sdk?.language === 'rust'
-              ? 'code.rs'
-              : 'code.ts'
-          : config.surface === 'cli'
-            ? 'commands.sh'
-            : 'generated.txt';
-        writeFileSync(resolve(taskModelDir, generatedFile), generatedCode, 'utf-8');
-      }
-
-      // Rate limit
-      await new Promise(r => setTimeout(r, 1_000));
-    }
-  }
-
-  const allMethods = Array.from(knownMethods);
-  const coverage = computeCoverage(tasks, allMethods);
-
-  const skillVersion: SkillVersion = skill?.version ?? {
-    source: 'none',
-    commitSha: 'none',
-    ref: 'none',
-    fetchedAt: new Date().toISOString(),
-  };
-  const report = buildBenchmarkReport(
-    results,
-    coverage,
-    skillVersion,
-    tasks.length,
-    models.length,
-    config,
-    outputDir,
-  );
-
-  if (options.scopeCoverage) {
-    report.scopeCoverage = options.scopeCoverage;
-  }
-
-  if (options.verdictPolicy) {
-    report.verdict = computeVerdict(report, config.llm.models, options.verdictPolicy);
-  }
-
-  const jsonPath = resolve(outputDir, 'report.json');
-  writeFileSync(jsonPath, JSON.stringify(report, null, 2), 'utf-8');
-  console.log(`\n[output] Report saved to ${jsonPath}`);
-
-  return report;
-}
-
-/**
- * Build the full benchmark report with summaries.
- */
-function buildBenchmarkReport(
-  results: TaskResult[],
-  coverage: MethodCoverage[],
-  skillVersion: SkillVersion,
-  totalTasks: number,
-  totalModels: number,
-  config: BenchmarkConfig,
-  outputDir: string,
-): BenchmarkReport {
-  const passed = results.filter(r => r.metrics.taskPassed).length;
-  const totalEvals = results.length;
-
-  // Per-model summary
-  const perModel: Record<string, ModelSummary> = {};
-  const modelGroups = new Map<string, TaskResult[]>();
-  for (const r of results) {
-    const key = r.model.id;
-    if (!modelGroups.has(key)) modelGroups.set(key, []);
-    modelGroups.get(key)!.push(r);
-  }
-  for (const [modelId, runs] of modelGroups) {
-    const p = runs.filter(r => r.metrics.taskPassed).length;
-    perModel[modelId] = {
-      passRate: runs.length ? p / runs.length : 0,
-      avgRecall: runs.length ? runs.reduce((s, r) => s + r.metrics.toolRecall, 0) / runs.length : 0,
-      avgPrecision: runs.length ? runs.reduce((s, r) => s + r.metrics.toolPrecision, 0) / runs.length : 0,
-      avgToolSelectionAccuracy: runs.length ? runs.reduce((s, r) => s + r.metrics.toolSelectionAccuracy, 0) / runs.length : 0,
-      avgArgAccuracy: runs.length ? runs.reduce((s, r) => s + r.metrics.argAccuracy, 0) / runs.length : 0,
-      avgHallucinationRate: runs.length ? runs.reduce((s, r) => s + r.metrics.hallucinationRate, 0) / runs.length : 0,
-      tasksRun: runs.length,
-    };
-  }
-
-  // Per-task summary
-  const perTask: Record<string, TaskSummary> = {};
-  const taskGroups = new Map<string, TaskResult[]>();
-  for (const r of results) {
-    const key = r.task.id;
-    if (!taskGroups.has(key)) taskGroups.set(key, []);
-    taskGroups.get(key)!.push(r);
-  }
-  for (const [taskId, runs] of taskGroups) {
-    const p = runs.filter(r => r.metrics.taskPassed).length;
-    perTask[taskId] = {
-      passRate: runs.length ? p / runs.length : 0,
-      avgRecall: runs.length ? runs.reduce((s, r) => s + r.metrics.toolRecall, 0) / runs.length : 0,
-      avgToolSelectionAccuracy: runs.length ? runs.reduce((s, r) => s + r.metrics.toolSelectionAccuracy, 0) / runs.length : 0,
-      avgArgAccuracy: runs.length ? runs.reduce((s, r) => s + r.metrics.argAccuracy, 0) / runs.length : 0,
-    };
-  }
-
-  // Per-tier summary
-  const tiers: Tier[] = ['flagship', 'mid', 'low'];
-  const perTier = {} as Record<Tier, { passRate: number; avgRecall: number; avgToolSelectionAccuracy: number; avgArgAccuracy: number }>;
-  for (const tier of tiers) {
-    const tierResults = results.filter(r => r.model.tier === tier);
-    const p = tierResults.filter(r => r.metrics.taskPassed).length;
-    perTier[tier] = {
-      passRate: tierResults.length ? p / tierResults.length : 0,
-      avgRecall: tierResults.length
-        ? tierResults.reduce((s, r) => s + r.metrics.toolRecall, 0) / tierResults.length
-        : 0,
-      avgToolSelectionAccuracy: tierResults.length
-        ? tierResults.reduce((s, r) => s + r.metrics.toolSelectionAccuracy, 0) / tierResults.length
-        : 0,
-      avgArgAccuracy: tierResults.length
-        ? tierResults.reduce((s, r) => s + r.metrics.argAccuracy, 0) / tierResults.length
-        : 0,
-    };
-  }
-
-  const coveredCount = coverage.filter(c => c.covered).length;
-
-  return {
-    timestamp: new Date().toISOString(),
-    config: {
-      name: config.name,
-      surface: config.surface,
-      outputDir,
-    },
-    skillVersion,
-    results,
-    coverage,
-    summary: {
-      totalTasks,
-      totalModels,
-      totalEvaluations: totalEvals,
-      overallPassRate: totalEvals ? passed / totalEvals : 0,
-      weightedAverage: (() => {
-        const weights = config.llm.models.map((m) => ({ id: m.id, w: m.weight ?? 1 }));
-        const totalWeight = weights.reduce((acc, x) => acc + x.w, 0);
-        return totalWeight > 0
-          ? weights.reduce((acc, { id, w }) => acc + w * (perModel[id]?.passRate ?? 0), 0) / totalWeight
-          : 0;
-      })(),
-      avgToolRecall: totalEvals
-        ? results.reduce((s, r) => s + r.metrics.toolRecall, 0) / totalEvals
-        : 0,
-      avgToolPrecision: totalEvals
-        ? results.reduce((s, r) => s + r.metrics.toolPrecision, 0) / totalEvals
-        : 0,
-      avgToolSelectionAccuracy: totalEvals
-        ? results.reduce((s, r) => s + r.metrics.toolSelectionAccuracy, 0) / totalEvals
-        : 0,
-      avgArgAccuracy: totalEvals
-        ? results.reduce((s, r) => s + r.metrics.argAccuracy, 0) / totalEvals
-        : 0,
-      avgHallucinationRate: totalEvals
-        ? results.reduce((s, r) => s + r.metrics.hallucinationRate, 0) / totalEvals
-        : 0,
-      methodCoveragePercent: coverage.length ? coveredCount / coverage.length : 0,
-      perModel,
-      perTask,
-      perTier,
-    },
-  };
-}
diff --git a/src/benchmark/scoring.ts b/src/benchmark/scoring.ts
deleted file mode 100644
index 4e041fb..0000000
--- a/src/benchmark/scoring.ts
+++ /dev/null
@@ -1,89 +0,0 @@
-import type {
-  BenchmarkReport,
-  ModelConfig,
-  Verdict,
-  VerdictPolicy,
-} from './types.js';
-
-export function computePerModelPassRates(report: BenchmarkReport): Record<string, number> {
-  const rates: Record<string, number> = {};
-  for (const [id, summary] of Object.entries(report.summary.perModel)) {
-    rates[id] = summary.passRate;
-  }
-  return rates;
-}
-
-export function computeWeightedAverage(report: BenchmarkReport, models: ModelConfig[]): number {
-  if (models.length === 0) return 0;
-  const rates = computePerModelPassRates(report);
-  let num = 0;
-  let den = 0;
-  for (const model of models) {
-    const w = model.weight ?? 1;
-    num += w * (rates[model.id] ?? 0);
-    den += w;
-  }
-  return den > 0 ? num / den : 0;
-}
-
-export function computeVerdict(
-  report: BenchmarkReport,
-  models: ModelConfig[],
-  policy: VerdictPolicy,
-): { result: Verdict; reasons: string[]; policy: VerdictPolicy } {
-  const rates = computePerModelPassRates(report);
-  const reasons: string[] = [];
-
-  for (const model of models) {
-    const rate = rates[model.id] ?? 0;
-    if (rate < policy.perModelFloor) {
-      reasons.push(
-        `${model.name} (${model.id}) passes ${(rate * 100).toFixed(1)}% < floor ${(policy.perModelFloor * 100).toFixed(1)}%`,
-      );
-    }
-  }
-
-  const wavg = report.summary.weightedAverage ?? computeWeightedAverage(report, models);
-  if (wavg < policy.targetWeightedAverage) {
-    reasons.push(
-      `weighted average ${(wavg * 100).toFixed(1)}% < target ${(policy.targetWeightedAverage * 100).toFixed(1)}%`,
-    );
-  }
-
-  // Prompt surface: coverage is reported but does not veto the verdict.
-  // Prompt tasks are not required to map 1:1 to capabilities, so a
-  // coverage gap here is informational rather than a failure condition.
-  if (report.scopeCoverage?.coverageViolation && report.config.surface !== 'prompt') {
-    reasons.push('coverage violation: some in-scope actions have zero tasks');
-  }
-
-  return {
-    result: reasons.length === 0 ? 'PASS' : 'FAIL',
-    reasons,
-    policy,
-  };
-}
-
-export function accept(
-  before: BenchmarkReport,
-  after: BenchmarkReport,
-  models: ModelConfig[],
-  policy: VerdictPolicy & { minImprovement: number },
-): boolean {
-  // When no model configs are provided, fall back to simple overall pass rate comparison
-  if (models.length === 0) {
-    return after.summary.overallPassRate - before.summary.overallPassRate >= policy.minImprovement;
-  }
-  const beforeRates = computePerModelPassRates(before);
-  const afterRates = computePerModelPassRates(after);
-  for (const model of models) {
-    const afterRate = afterRates[model.id] ?? 0;
-    if (afterRate < policy.perModelFloor) {
-      const beforeRate = beforeRates[model.id] ?? 0;
-      if (afterRate <= beforeRate) return false;
-    }
-  }
-  const beforeAvg = before.summary.weightedAverage ?? computeWeightedAverage(before, models);
-  const afterAvg = after.summary.weightedAverage ?? computeWeightedAverage(after, models);
-  return (afterAvg - beforeAvg) >= policy.minImprovement;
-}
diff --git a/src/benchmark/skill-fetcher.ts b/src/benchmark/skill-fetcher.ts
deleted file mode 100644
index 4088428..0000000
--- a/src/benchmark/skill-fetcher.ts
+++ /dev/null
@@ -1,220 +0,0 @@
-import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'node:fs';
-import { resolve } from 'node:path';
-import type { FetchedSkill, SkillConfig, SkillVersion } from './types.js';
-
-// ── Helpers ────────────────────────────────────────────────────────────────
-
-/** Simple hash for cache key — just a short digest of the source string */
-function hashSource(source: string): string {
-  let h = 0;
-  for (let i = 0; i < source.length; i++) {
-    h = (Math.imul(31, h) + source.charCodeAt(i)) | 0;
-  }
-  return Math.abs(h).toString(16).padStart(8, '0');
-}
-
-function readCache(cachePath: string): FetchedSkill | null {
-  if (!existsSync(cachePath)) return null;
-  try {
-    return JSON.parse(readFileSync(cachePath, 'utf-8')) as FetchedSkill;
-  } catch {
-    return null;
-  }
-}
-
-function writeCache(cachePath: string, result: FetchedSkill): void {
-  const cacheDir = resolve(cachePath, '..');
-  mkdirSync(cacheDir, { recursive: true });
-  writeFileSync(cachePath, JSON.stringify(result, null, 2), 'utf-8');
-}
-
-// ── GitHub source ──────────────────────────────────────────────────────────
-
-/**
- * Fetch skill from GitHub: "github:org/repo/path/to/file.md"
- */
-async function fetchFromGitHub(source: string, useCache: boolean): Promise<FetchedSkill> {
-  // Parse "github:org/repo/path/to/file.md"
-  const withoutPrefix = source.slice('github:'.length);
-  const slashIdx = withoutPrefix.indexOf('/');
-  const secondSlashIdx = withoutPrefix.indexOf('/', slashIdx + 1);
-
-  if (slashIdx === -1 || secondSlashIdx === -1) {
-    throw new Error(
-      `Invalid github source format: "${source}". ` +
-      `Expected "github:org/repo/path/to/file.md"`
-    );
-  }
-
-  const org = withoutPrefix.slice(0, slashIdx);
-  const repo = withoutPrefix.slice(slashIdx + 1, secondSlashIdx);
-  const path = withoutPrefix.slice(secondSlashIdx + 1);
-
-  const cacheDir = resolve('.cache');
-  const cachePath = resolve(cacheDir, `skill-${hashSource(source)}.json`);
-
-  if (useCache) {
-    const cached = readCache(cachePath);
-    if (cached) {
-      console.log(`[skill] Using cached skill (${cached.version.commitSha.slice(0, 8)}) from ${cachePath}`);
-      return cached;
-    }
-  }
-
-  console.log(`[skill] Fetching from GitHub: ${org}/${repo}/${path}...`);
-
-  // Try to get commit SHA from GitHub API (optional — don't fail if it doesn't work)
-  let commitSha = 'unknown';
-  try {
-    const commitController = new AbortController();
-    const commitTimer = setTimeout(() => commitController.abort(), 30_000);
-    try {
-      const commitRes = await fetch(
-        `https://api.github.com/repos/${org}/${repo}/commits/main`,
-        { headers: { Accept: 'application/vnd.github.v3+json' }, signal: commitController.signal }
-      );
-      if (commitRes.ok) {
-        const commitData = (await commitRes.json()) as { sha: string };
-        commitSha = commitData.sha;
-      }
-    } finally {
-      clearTimeout(commitTimer);
-    }
-  } catch {
-    // Non-fatal — proceed with 'unknown'
-  }
-
-  // Fetch raw content
-  const rawUrl = `https://raw.githubusercontent.com/${org}/${repo}/main/${path}`;
-  const skillController = new AbortController();
-  const skillTimer = setTimeout(() => skillController.abort(), 30_000);
-  let content: string;
-  try {
-    const skillRes = await fetch(rawUrl, { signal: skillController.signal });
-    if (!skillRes.ok) {
-      throw new Error(`Failed to fetch skill from GitHub (${rawUrl}): ${skillRes.status} ${skillRes.statusText}`);
-    }
-    content = await skillRes.text();
-  } finally {
-    clearTimeout(skillTimer);
-  }
-
-  const version: SkillVersion = {
-    source,
-    commitSha,
-    ref: 'main',
-    fetchedAt: new Date().toISOString(),
-  };
-
-  const result: FetchedSkill = { version, content };
-
-  writeCache(cachePath, result);
-  console.log(`[skill] Cached skill (${commitSha.slice(0, 8)}, ${content.length} chars) to ${cachePath}`);
-
-  return result;
-}
-
-// ── URL source ─────────────────────────────────────────────────────────────
-
-/**
- * Fetch skill from a direct URL: "https://..." or "http://..."
- */
-async function fetchFromUrl(source: string, useCache: boolean): Promise<FetchedSkill> {
-  const cacheDir = resolve('.cache');
-  const cachePath = resolve(cacheDir, `skill-${hashSource(source)}.json`);
-
-  if (useCache) {
-    const cached = readCache(cachePath);
-    if (cached) {
-      console.log(`[skill] Using cached skill from ${cachePath}`);
-      return cached;
-    }
-  }
-
-  console.log(`[skill] Fetching from URL: ${source}...`);
-
-  const urlController = new AbortController();
-  const urlTimer = setTimeout(() => urlController.abort(), 30_000);
-  let content: string;
-  try {
-    const res = await fetch(source, { signal: urlController.signal });
-    if (!res.ok) {
-      throw new Error(`Failed to fetch skill from URL (${source}): ${res.status} ${res.statusText}`);
-    }
-    content = await res.text();
-  } finally {
-    clearTimeout(urlTimer);
-  }
-
-  const version: SkillVersion = {
-    source,
-    commitSha: 'unknown',
-    ref: 'url',
-    fetchedAt: new Date().toISOString(),
-  };
-
-  const result: FetchedSkill = { version, content };
-
-  writeCache(cachePath, result);
-  console.log(`[skill] Cached skill (${content.length} chars) to ${cachePath}`);
-
-  return result;
-}
-
-// ── File source ────────────────────────────────────────────────────────────
-
-/**
- * Read skill from local filesystem: "./path" or "/absolute/path"
- */
-function fetchFromFile(source: string): FetchedSkill {
-  const resolved = resolve(process.cwd(), source);
-
-  if (!existsSync(resolved)) {
-    throw new Error(`Skill file not found: ${resolved} (from source: "${source}")`);
-  }
-
-  console.log(`[skill] Reading skill from file: ${resolved}`);
-
-  let content: string;
-  try {
-    content = readFileSync(resolved, 'utf-8');
-  } catch (err) {
-    throw new Error(`Failed to read skill file ${resolved}: ${err instanceof Error ? err.message : err}`);
-  }
-
-  const version: SkillVersion = {
-    source,
-    commitSha: 'local',
-    ref: 'file',
-    fetchedAt: new Date().toISOString(),
-  };
-
-  return { version, content };
-}
-
-// ── Public API ─────────────────────────────────────────────────────────────
-
-/**
- * Fetch skill documentation from the source specified in config.
- *
- * Supported source formats:
- * - "github:org/repo/path/to/file.md" → fetch from GitHub raw content
- * - "https://..." or "http://..." → fetch from URL directly
- * - "./path" or "/absolute/path" → read from local filesystem
- *
- * Returns null if no skill config is provided (skill is optional in MCP mode).
- */
-export async function fetchSkill(skillConfig: SkillConfig | undefined): Promise<FetchedSkill | null> {
-  if (!skillConfig) return null;
-
-  const source = skillConfig.source;
-  const useCache = skillConfig.cache !== false;
-
-  if (source.startsWith('github:')) {
-    return fetchFromGitHub(source, useCache);
-  } else if (source.startsWith('https://') || source.startsWith('http://')) {
-    return fetchFromUrl(source, useCache);
-  } else {
-    return fetchFromFile(source);
-  }
-}
diff --git a/src/benchmark/types.ts b/src/benchmark/types.ts
deleted file mode 100644
index c1c77ab..0000000
--- a/src/benchmark/types.ts
+++ /dev/null
@@ -1,330 +0,0 @@
-import type {
-  ActionAttempt,
-  ActionCatalog,
-  ActionDefinition,
-  ActionSurface,
-} from '../actions/types.js';
-
-// === Core ===
-export type Tier = 'flagship' | 'mid' | 'low';
-
-export interface ModelConfig {
-  id: string;       // LLM model ID e.g. 'openai/gpt-4o'
-  name: string;     // Display name e.g. 'GPT-4o'
-  tier: Tier;
-  weight?: number;  // Optional; defaults to 1.0 at scoring time.
-}
-
-export interface TokenUsage {
-  prompt: number;
-  completion: number;
-  total: number;
-}
-
-// === Config (loaded from benchmark.config.json) ===
-
-/** Canonical surface type — same union as ActionSurface in actions/types. */
-export type BenchmarkSurface = ActionSurface;
-export type SdkLanguage = 'typescript' | 'python' | 'rust';
-
-export interface BenchmarkConfig {
-  name: string;                    // e.g. "fast-sdk", "my-mcp-tools"
-  surface: BenchmarkSurface;
-  sdk?: SdkSurfaceConfig;
-  cli?: CliSurfaceConfig;
-  mcp?: McpSurfaceConfig;
-  skill?: SkillConfig;
-  tasks: string;                   // path to tasks.json
-  llm: LLMConfig;
-  output?: OutputConfig;
-  agentic?: AgenticConfig;
-  surfaceSnapshot?: SurfaceSnapshot;
-  mcpToolDefinitions?: McpToolDefinition[];
-}
-
-export interface SdkSurfaceConfig {
-  language: SdkLanguage;
-  style?: 'sdk';                   // defaults to 'sdk' if omitted
-  // Optional: explicit API surface for coverage/hallucination reporting only.
-  // If omitted, derived automatically from task expected_actions.
-  apiSurface?: string[];
-}
-
-export interface CliSurfaceConfig {
-  shell?: 'bash' | 'sh';
-  commands: string;                // path to commands.json
-}
-
-export interface CliCommandOptionDefinition {
-  name: string;
-  description?: string;
-  aliases?: string[];
-  takesValue?: boolean;
-}
-
-export interface CliCommandDefinition {
-  command: string;
-  description?: string;
-  options?: CliCommandOptionDefinition[];
-}
-
-export interface McpSurfaceConfig {
-  tools: string;                   // path to tools.json (OpenAI function calling format)
-}
-
-export interface SkillConfig {
-  source: string;                  // "github:org/repo/path", "./file.md", "https://url"
-  cache?: boolean;                 // default true
-}
-
-export interface LLMConfig {
-  baseUrl?: string;                // required for direct openai/anthropic formats
-  authMode?: 'env' | 'codex' | 'auto';
-  apiKeyEnv?: string;              // e.g. "OPENROUTER_API_KEY" — reads from process.env
-  format: 'openai' | 'anthropic' | 'pi';
-  timeout?: number;                // ms, default 240000
-  headers?: Record<string, string>; // extra headers
-  models: ModelConfig[];
-}
-
-export interface OutputConfig {
-  dir?: string;                    // default "./benchmark-results"
-}
-
-export interface AgenticConfig {
-  references: {
-    baseUrl: string;
-    allowedPaths: string[];
-  };
-  maxTurns?: number;
-}
-
-// === MCP Tool Definition (OpenAI function calling format) ===
-
-export interface McpToolDefinition {
-  type: 'function';
-  function: {
-    name: string;
-    description?: string;
-    parameters?: {
-      type: 'object';
-      properties?: Record<string, unknown>;
-      required?: string[];
-    };
-  };
-}
-
-// === Skill ===
-
-export interface SkillVersion {
-  source: string;                  // the source string from config
-  commitSha: string;               // git SHA or 'local' or 'unknown'
-  ref: string;                     // git ref or 'file' or 'url'
-  fetchedAt: string;               // ISO timestamp
-}
-
-export interface FetchedSkill {
-  version: SkillVersion;
-  content: string;
-}
-
-// === Task Definition (loaded from tasks.json) ===
-
-export interface ExpectedAction {
-  name: string;                    // Unified action name (SDK method, CLI command, or MCP tool)
-  args?: Record<string, unknown>;  // expected arg values (supports nested objects/arrays, strings, regexes, sentinels)
-}
-
-export interface TaskVerification {
-  code_pattern?: string;           // regex pattern to match in generated code
-}
-
-export interface TaskDefinition {
-  id: string;
-  prompt: string;
-  expected_actions: ExpectedAction[];
-  verify?: TaskVerification[];
-  expected_fetches?: string[];
-  capabilityId?: string;
-}
-
-// === Extracted from generated code or tool_calls ===
-
-export type ExtractedCall = ActionAttempt;
-
-// === LLM Response ===
-
-export interface LLMResponse {
-  content: string;                 // text content from LLM
-  toolCalls?: ToolCallResult[];    // structured tool calls (MCP surface)
-  usage?: TokenUsage;
-}
-
-export interface ToolCallResult {
-  name: string;
-  arguments: Record<string, unknown>;
-}
-
-export type ToolExecutor = (name: string, args: Record<string, unknown>) => Promise<string>;
-
-// === Evaluation ===
-
-export interface ActionMatch {
-  expected: ExpectedAction;
-  found: ExtractedCall | null;
-  methodFound: boolean;
-  argsCorrect: boolean;
-  matched: boolean;
-  argResults?: Record<string, {
-    expected: string;
-    got: unknown;
-    match: boolean;
-  }>;
-}
-
-export interface TaskResult {
-  task: TaskDefinition;
-  model: ModelConfig;
-  generatedCode: string | null;
-  rawResponse: string;
-  extractedCalls: ExtractedCall[];
-  actionMatches: ActionMatch[];
-  codePatternResults?: Record<string, boolean>;
-  metrics: {
-    toolPrecision: number;
-    toolRecall: number;
-    taskPassed: boolean;
-    toolSelectionAccuracy: number;
-    argAccuracy: number;
-    unnecessaryActions: string[];
-    hallucinatedActions: string[];
-    hallucinationRate: number;
-    fetchRecall?: number;
-    fetchPrecision?: number;
-    actualFetches?: string[];
-  };
-  llmLatencyMs: number;
-  tokenUsage?: TokenUsage;
-  error?: string;
-}
-
-// === Coverage ===
-
-export interface MethodCoverage {
-  method: string;
-  tasksCovering: string[];
-  covered: boolean;
-}
-
-export type SurfaceAction = Omit<ActionDefinition, 'key'>;
-
-export interface SurfaceSnapshot extends Omit<ActionCatalog, 'actions'> {
-  surface: BenchmarkSurface;
-  actions: SurfaceAction[];
-}
-
-// === Verdict & Coverage ===
-
-export type Verdict = 'PASS' | 'FAIL';
-
-export interface VerdictPolicy {
-  perModelFloor: number;
-  targetWeightedAverage: number;
-}
-
-export interface CoverageReport {
-  inScopeActions: string[];
-  outOfScopeActions: string[];
-  coveredActions: string[];
-  uncoveredActions: string[];
-  tasksPerAction: Record<string, number>;
-  coverageViolation: boolean;
-}
-
-// === Report ===
-
-export interface ModelSummary {
-  passRate: number;
-  avgRecall: number;
-  avgPrecision: number;
-  avgToolSelectionAccuracy: number;
-  avgArgAccuracy: number;
-  avgHallucinationRate: number;
-  tasksRun: number;
-}
-
-export interface TaskSummary {
-  passRate: number;
-  avgRecall: number;
-  avgToolSelectionAccuracy: number;
-  avgArgAccuracy: number;
-}
-
-export interface BenchmarkReport {
-  timestamp: string;
-  config: { name: string; surface: BenchmarkSurface; outputDir?: string };
-  skillVersion: SkillVersion;
-  results: TaskResult[];
-  coverage: MethodCoverage[];
-  scopeCoverage?: CoverageReport;
-  summary: {
-    totalTasks: number;
-    totalModels: number;
-    totalEvaluations: number;
-    overallPassRate: number;
-    weightedAverage?: number;
-    avgToolRecall: number;
-    avgToolPrecision: number;
-    avgToolSelectionAccuracy: number;
-    avgArgAccuracy: number;
-    avgHallucinationRate: number;
-    methodCoveragePercent: number;
-    perModel: Record<string, ModelSummary>;
-    perTask: Record<string, TaskSummary>;
-    perTier: Record<Tier, { passRate: number; avgRecall: number; avgToolSelectionAccuracy: number; avgArgAccuracy: number }>;
-  };
-  verdict?: {
-    policy: VerdictPolicy;
-    result: Verdict;
-    reasons: string[];
-  };
-}
-
-export function getExpectedActionName(action: ExpectedAction): string {
-  return action.name;
-}
-
-export function getExpectedActions(task: TaskDefinition): ExpectedAction[] {
-  return task.expected_actions;
-}
-
-// === Comparison ===
-
-export type Delta = 'improved' | 'regressed' | 'unchanged' | 'new' | 'removed';
-
-export interface TaskDelta {
-  taskId: string;
-  modelId: string;
-  passedBefore: boolean;
-  passedNow: boolean;
-  delta: Delta;
-  recallBefore: number;
-  recallNow: number;
-  toolSelectionBefore: number;
-  toolSelectionNow: number;
-}
-
-export interface ComparisonReport {
-  baseline: { timestamp: string; skillVersion: SkillVersion };
-  current: { timestamp: string; skillVersion: SkillVersion };
-  taskDeltas: TaskDelta[];
-  summary: {
-    improved: number;
-    regressed: number;
-    unchanged: number;
-    coverageBefore: number;
-    coverageNow: number;
-    accuracyBefore: number;
-    accuracyNow: number;
-  };
-}
diff --git a/src/cli.ts b/src/cli.ts
index 2ef1908..22d5d7c 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -1,598 +1,71 @@
 #!/usr/bin/env node
 
-import { existsSync, realpathSync, statSync, writeFileSync } from 'node:fs';
-import { resolve, dirname } from 'node:path';
+import { realpathSync } from 'node:fs';
 import { pathToFileURL } from 'node:url';
-import { config as loadDotenv } from 'dotenv';
-
-loadDotenv({ override: true, quiet: true });
-
-import type { Tier } from './benchmark/types.js';
-import type { ResolvedProjectConfig } from './project/types.js';
-import { runBenchmark } from './benchmark/runner.js';
-import { loadReport, compareReports, printComparison } from './benchmark/compare.js';
-import { printSummary, generateMarkdown } from './benchmark/reporter.js';
-import { printCoverage } from './benchmark/coverage.js';
-import { printOptimizeSummary, runOptimizeFromConfig } from './optimizer/main.js';
-import { DEFAULT_PROJECT_CONFIG_NAME, loadProjectConfig, parseModelRef } from './project/index.js';
-import { runDoctor } from './doctor/index.js';
-import { createDefaultPiTaskGenerator, generateTasksForProject, createDefaultPiCritic, discoverActionsOnly, resolveScope } from './tasks/index.js';
-import type { Recommendation } from './verdict/recommendations.js';
-import { generateRecommendations } from './verdict/recommendations.js';
-import { renderVerdictConsole, renderVerdictMarkdown } from './verdict/render.js';
-import { importCommands } from './import/index.js';
-import { scaffoldInit } from './init/scaffold.js';
-import { buildDefaultAnswers, readAnswersFile } from './init/answers.js';
-import type { WizardAnswers } from './init/answers.js';
-import { runWizard } from './init/wizard.js';
-import { detectProject, detectedToPreseed, printDetectionSummary } from './init/detect-project.js';
-import { ERRORS, SkillOptimizerError, printError } from './errors.js';
-import { requireConfiguredApiKey } from './runtime/pi/index.js';
-
-// ── Error handling ────────────────────────────────────────────────────────────
-
-/** Print an error and exit. SkillOptimizerErrors render their fix list; others
- * are wrapped in E_UNEXPECTED and include the stack trace. */
-function fatalError(err: unknown): never {
-  if (err instanceof SkillOptimizerError) {
-    printError(err);
-  } else {
-    printError(new SkillOptimizerError(ERRORS.E_UNEXPECTED, err instanceof Error ? err.message : String(err)));
-    if (err instanceof Error && err.stack) console.error(err.stack);
-  }
-  process.exit(1);
-}
-
-// ── Arg parsing helpers ───────────────────────────────────────────────────────
-
-/** Return the value of a named flag, e.g. --tier flagship → 'flagship' */
-function getFlag(args: string[], flag: string): string | undefined {
-  const idx = args.indexOf(flag);
-  if (idx === -1) return undefined;
-  const val = args[idx + 1];
-  if (!val || val.startsWith('--')) {
-    console.error(`ERROR: Flag ${flag} requires a value.`);
-    process.exit(1);
-  }
-  return val;
-}
 
-/** Return true if a boolean flag is present, e.g. --no-cache */
-function hasFlag(args: string[], flag: string): boolean {
-  return args.includes(flag);
-}
-
-/** Return all positional (non-flag) arguments. */
-const BOOLEAN_FLAGS = new Set([
-  '--help',
-  '-h',
-  '--auto',
-  '--dry-run',
-  '--force',
-  '--no-cache',
-  '--skip-generation',
-  '--check-models',
-  '--fix',
-  '--static',
-  '--scrape',
-  '--yes',
-]);
-
-const VALUE_FLAGS = new Set([
-  '--answers',
-  '--baseline',
-  '--config',
-  '--current',
-  '--depth',
-  '--from',
-  '--max-iterations',
-  '--model',
-  '--out',
-  '--task',
-  '--tier',
-]);
-
-export function positionals(args: string[]): string[] {
-  const result: string[] = [];
-  let i = 0;
-  while (i < args.length) {
-    const arg = args[i]!;
-    if (BOOLEAN_FLAGS.has(arg)) {
-      i += 1;
-      continue;
-    }
+import { config as loadDotenv } from 'dotenv';
 
-    if (VALUE_FLAGS.has(arg)) {
-      const next = args[i + 1];
-      if (next && !next.startsWith('--')) {
-        i += 2;
-      } else {
-        i += 1;
-      }
-      continue;
-    }
+import { runWorkbenchCaseFromCli } from './workbench/run-case.js';
+import { runWorkbenchSuiteFromCli } from './workbench/run-suite.js';
 
-    if (arg.startsWith('--')) {
-      throw new Error(`Unknown flag: ${arg}`);
-    } else {
-      result.push(arg);
-      i++;
-    }
-  }
-  return result;
-}
+loadDotenv({ override: true, quiet: true });
 
 function printUsage(): void {
   console.log(`
-Skill Optimizer CLI — Benchmark and optimize SDK/CLI/MCP/prompt guidance
+Skill Optimizer Workbench
 
 Usage:
-  skill-optimizer init [sdk|cli|mcp|prompt]      Interactive wizard — scaffold config for the given surface
-  skill-optimizer init [surface] --yes          Accept all defaults non-interactively
-  skill-optimizer init --answers <file.json>    Load wizard answers from a JSON file (CI mode)
-  skill-optimizer init --auto                   Auto-detect surface from CWD and pre-fill wizard
-  skill-optimizer import-commands [options]     Extract CLI commands from source or binary
-  skill-optimizer doctor [options]              Validate config pre-flight
-  skill-optimizer generate-tasks [options]      Generate and freeze tasks from discovered surface
-  skill-optimizer benchmark [options]           Run the benchmark
-  skill-optimizer run [options]                 Run the benchmark
-  skill-optimizer optimize [options]            Run the optimization loop
-  skill-optimizer compare [options]             Compare two benchmark reports
-
-Global options:
-  --dry-run                                     Discover + scope preview only; no LLM calls, no side effects
-  --config <path>                               Config file (overrides per-command default)
-
-Doctor options:
-  --config <path>                               Config file (default: skill-optimizer.json)
-  --static                                      Run tier-1 structural checks only (no discovery)
-  --check-models                                Also ping each model for reachability (tier 3)
-  --fix                                         Apply auto-fixable issues and write config to disk
-
-Run options:
-  --config <path>                               Config file (default: skill-optimizer.json)
-  --tier <flagship|mid|low>                     Filter models by tier
-  --task <task-id>                              Run a single task
-  --model <slug>                                Run a single model
-  --no-cache                                    Force fresh skill fetch
-
-Optimize options:
-  --config <path>                               Config file (default: skill-optimizer.json)
-  --max-iterations <n>                          Override optimization iteration cap
-  --skip-generation                             Disable task generation for this run
-
-Generate-tasks options:
-  --config <path>                               Config file (default: skill-optimizer.json)
-
-Import-commands options:
-  --from <path>                                 Entry file or binary name (required)
-  --out <path>                                  Output path (default: .skill-optimizer/cli-commands.json)
-  --scrape                                      Force --help scraping regardless of file type
-  --depth <n>                                   Max subcommand depth for --help scraping (default: 2)
-  --force                                       Overwrite output file without prompting
-
-Compare options:
-  --baseline <path>                             Path to baseline report.json
-  --current <path>                              Path to current report.json
+  skill-optimizer run-case <case.yml>
+  skill-optimizer run-suite <suite.yml>
+
+Run-case options:
+  --out <path>                                  Results directory (default: <case-dir>/.results)
+  --model <model>                               Override case model
+  --models <models>                             Comma-separated OpenRouter model refs
+  --trials <n>                                  Number of independent trials (default: 1)
+  --concurrency <n>                             Maximum concurrent trial containers (default: 1)
+  --image <image>                               Docker image (default: skill-optimizer-workbench:local)
+  --keep-workspace                              Copy final /work into results/workspace; failures are always preserved
+
+Run-suite options:
+  --out <path>                                  Results directory (default: <suite-dir>/.results)
+  --trials <n>                                  Number of independent trials per case/model (default: 1)
+  --concurrency <n>                             Maximum concurrent trial containers (default: 1)
+  --image <image>                               Docker image (default: skill-optimizer-workbench:local)
+  --keep-workspace                              Copy final /work into each result workspace; failures are always preserved
 
 Examples:
-  skill-optimizer init cli
-  skill-optimizer init sdk
-  skill-optimizer init mcp
-  skill-optimizer init prompt
-  skill-optimizer import-commands --from ./src/cli.ts
-  skill-optimizer import-commands --from fast-cli --scrape
-  skill-optimizer doctor --config ./skill-optimizer.json
-  skill-optimizer doctor --static
-  skill-optimizer doctor --check-models
-  skill-optimizer doctor --fix
-  skill-optimizer --dry-run --config ./skill-optimizer.json
-  skill-optimizer benchmark --config ./skill-optimizer.json
-  skill-optimizer run
-  skill-optimizer run --config ./my-config.json
-  skill-optimizer run --tier flagship
-  skill-optimizer run --task send-tokens
-  skill-optimizer run --model gpt-4o
-  skill-optimizer run --no-cache
-  skill-optimizer generate-tasks --config ./skill-optimizer.json
-  skill-optimizer optimize --config ./skill-optimizer.json
-  skill-optimizer compare --baseline results/baseline/report.json --current results/report.json
+  skill-optimizer run-case ./case.yml
+  skill-optimizer run-case ./case.yml --keep-workspace
+  skill-optimizer run-suite ./suite.yml --trials 3
+  skill-optimizer run-case ./case.yml --models openrouter/google/gemini-2.5-flash,openrouter/openai/gpt-5.4
 `);
 }
 
-// ── Dry-run ───────────────────────────────────────────────────────────────────
-
-async function runDryRun(configPath: string): Promise<void> {
-  const project = await loadProjectConfig(configPath);
-  const discovered = discoverActionsOnly(project);
-  const { inScope, outOfScope } = resolveScope(discovered, project.target.scope);
-
-  console.log('=== skill-optimizer dry run ===');
-  console.log(`Config: ${project.configPath}`);
-  console.log(`Surface: ${project.target.surface}`);
-  console.log(`Discovered: ${discovered.length} action(s)`);
-  console.log(`In scope:     ${inScope.length} — ${inScope.map((a) => a.name).join(', ')}`);
-  console.log(`Out of scope: ${outOfScope.length} — ${outOfScope.map((a) => a.name).join(', ')}`);
-
-  const maxTasks = project.benchmark.taskGeneration.maxTasks;
-  if (project.target.surface !== 'prompt' && project.benchmark.taskGeneration.enabled && inScope.length > 0 && maxTasks < inScope.length) {
-    console.error(`\nERROR: maxTasks (${maxTasks}) < in-scope action count (${inScope.length}).`);
-    console.error(`Raise benchmark.taskGeneration.maxTasks in ${project.configPath}, or tighten target.scope.exclude.`);
-    process.exit(1);
-  }
-
-  if (inScope.length === 0) {
-    console.error('\nERROR: zero in-scope actions. Adjust target.scope.include/exclude in your config.');
-    process.exit(1);
-  }
-
-  console.log('\nNo LLM calls made. Zero side effects.');
-  process.exit(0);
-}
-
-// ── Main ──────────────────────────────────────────────────────────────────────
-
 async function main(): Promise<void> {
   const args = process.argv.slice(2);
 
-  if (hasFlag(args, '--help') || hasFlag(args, '-h')) {
+  if (args.includes('--help') || args.includes('-h')) {
     printUsage();
     process.exit(0);
   }
 
-  if (hasFlag(args, '--dry-run')) {
-    const configPath = getFlag(args, '--config') ?? DEFAULT_PROJECT_CONFIG_NAME;
-    await runDryRun(configPath);
-    return;
-  }
-
-  const pos = positionals(args);
-  const command = pos[0];
-
-  // ── Init mode ────────────────────────────────────────────────────────────────
-  if (command === 'init') {
-    const surfaceArg = pos[1] as 'sdk' | 'cli' | 'mcp' | 'prompt' | undefined;
-    if (surfaceArg && !['sdk', 'cli', 'mcp', 'prompt'].includes(surfaceArg)) {
-      console.error(`ERROR: Unknown surface '${surfaceArg}'. Must be: sdk | cli | mcp | prompt`);
-      process.exit(1);
-    }
-    const answersFlag = getFlag(args, '--answers');
-    const useDefaults = hasFlag(args, '--yes');
-    const useAuto = hasFlag(args, '--auto');
-
-    if (useAuto) {
-      let detected;
-      try {
-        detected = detectProject(process.cwd());
-      } catch (err) {
-        fatalError(err);
-      }
-      printDetectionSummary(detected);
-      if (surfaceArg && surfaceArg !== detected.surface) {
-        console.log(`Note: explicit surface '${surfaceArg}' overridden by auto-detected '${detected.surface}'.`);
-      }
-      if (useDefaults) {
-        if (detected.confidence !== 'high') {
-          printError(new SkillOptimizerError(ERRORS.E_INIT_AUTO_LOW_CONFIDENCE,
-            `detected confidence is ${detected.confidence}`));
-          process.exit(1);
-        }
-        const answers: WizardAnswers = {
-          ...buildDefaultAnswers(detected.surface, detected.repoPath),
-          ...detectedToPreseed(detected),
-        };
-        await scaffoldInit(answers, process.cwd());
-      } else {
-        await runWizard(process.cwd(), detectedToPreseed(detected));
-      }
-      process.exit(0);
-    }
-
-    if (answersFlag) {
-      const answers = readAnswersFile(resolve(process.cwd(), answersFlag));
-      await scaffoldInit(answers, process.cwd());
-    } else if (useDefaults) {
-      const answers = buildDefaultAnswers(surfaceArg ?? 'sdk', process.cwd());
-      await scaffoldInit(answers, process.cwd());
-    } else {
-      await runWizard(process.cwd(), surfaceArg ? { surface: surfaceArg } : undefined);
-    }
-    process.exit(0);
-  }
-
-  // ── Import-commands mode ─────────────────────────────────────────────────────
-  if (command === 'import-commands') {
-    const fromFlag = getFlag(args, '--from');
-    if (!fromFlag) {
-      console.error('ERROR: --from <path> is required for import-commands.');
-      console.error('  Example: skill-optimizer import-commands --from ./src/cli.ts');
-      process.exit(1);
-    }
-    const outFlag = getFlag(args, '--out') ?? '.skill-optimizer/cli-commands.json';
-    const depthRaw = getFlag(args, '--depth');
-    try {
-      await importCommands({
-        from: fromFlag,
-        out: outFlag,
-        scrape: hasFlag(args, '--scrape'),
-        depth: depthRaw ? parseInt(depthRaw, 10) : 2,
-        cwd: process.cwd(),
-        force: hasFlag(args, '--force'),
-      });
-    } catch (err) {
-      console.error(`\n  ERROR: ${err instanceof Error ? err.message : err}`);
-      process.exit(1);
-    }
-    process.exit(0);
-  }
-
-  // ── Doctor mode ──────────────────────────────────────────────────────────────
-  if (command === 'doctor') {
-    const configPath = getFlag(args, '--config') ?? DEFAULT_PROJECT_CONFIG_NAME;
-    const exitCode = await runDoctor(configPath, {
-      staticOnly: hasFlag(args, '--static'),
-      checkModels: hasFlag(args, '--check-models'),
-      fix: hasFlag(args, '--fix'),
-    });
-    process.exit(exitCode);
-  }
-
-  // ── Compare mode ────────────────────────────────────────────────────────────
-  if (command === 'compare') {
-    const baselinePath = getFlag(args, '--baseline');
-    const currentPath = getFlag(args, '--current');
-
-    if (!baselinePath) {
-      console.error('ERROR: --baseline <path> is required for compare mode.');
-      console.error('  Example: skill-optimizer compare --baseline results/baseline/report.json --current results/report.json');
-      process.exit(1);
-    }
-    if (!currentPath) {
-      console.error('ERROR: --current <path> is required for compare mode.');
-      console.error('  Example: skill-optimizer compare --baseline results/baseline/report.json --current results/report.json');
-      process.exit(1);
-    }
-
-    let baseline;
-    try {
-      baseline = loadReport(resolve(baselinePath));
-    } catch (err) {
-      console.error(`ERROR: Could not load baseline report from '${baselinePath}': ${err instanceof Error ? err.message : err}`);
-      process.exit(1);
-    }
-
-    let current;
-    try {
-      current = loadReport(resolve(currentPath));
-    } catch (err) {
-      console.error(`ERROR: Could not load current report from '${currentPath}': ${err instanceof Error ? err.message : err}`);
-      process.exit(1);
-    }
-
-    const comparison = compareReports(baseline, current);
-    printComparison(comparison);
-    process.exit(0);
-  }
-
-  // ── Optimize mode ──────────────────────────────────────────────────────────
-  if (command === 'optimize') {
-    const configPath = getFlag(args, '--config');
-    try {
-      const { result, resolvedManifest, ledgerPath } = await runOptimizeFromConfig(configPath ?? DEFAULT_PROJECT_CONFIG_NAME, {
-        maxIterationsRaw: getFlag(args, '--max-iterations'),
-        skipGeneration: hasFlag(args, '--skip-generation'),
-      });
-      printOptimizeSummary(result, resolvedManifest, ledgerPath);
-      // Show verdict and recommendations for best report
-      const bestReport = result.bestReport;
-      if (bestReport.verdict) {
-        let recs: Recommendation[] = [];
-        if (bestReport.verdict.result === 'FAIL') {
-          try {
-            const mutation = resolvedManifest.mutation;
-            if (mutation) {
-              const criticDeps = createDefaultPiCritic({
-                provider: mutation.provider,
-                model: mutation.model,
-                authMode: mutation.authMode,
-                apiKeyEnv: mutation.apiKeyEnv,
-              });
-              recs = await generateRecommendations(
-                bestReport,
-                criticDeps,
-                mutation.reportContextMaxBytes ?? 16_000,
-              );
-            }
-          } catch (err) {
-            console.error(`WARNING: Could not generate recommendations: ${err instanceof Error ? err.message : err}`);
-          }
-        }
-        console.log(renderVerdictConsole(bestReport, recs));
-      }
-      process.exit(bestReport.verdict?.result === 'FAIL' ? 1 : 0);
-    } catch (err) {
-      fatalError(err);
-    }
-  }
-
-  // ── Generate-tasks mode ───────────────────────────────────────────────────
-  if (command === 'generate-tasks') {
-    const configPath = getFlag(args, '--config') ?? DEFAULT_PROJECT_CONFIG_NAME;
-    try {
-      const project = await loadProjectConfig(configPath);
-      if (!project.benchmark.taskGeneration.enabled) {
-        throw new Error('benchmark.taskGeneration.enabled must be true to use generate-tasks');
-      }
-      const modelRef = project.optimize?.model ?? project.benchmark.models[0]!.id;
-      const { provider, model } = parseModelRef(modelRef);
-      const deps = createDefaultPiTaskGenerator({
-        provider,
-        model,
-        authMode: project.optimize?.authMode ?? project.benchmark.authMode,
-        apiKeyEnv: project.optimize?.apiKeyEnv ?? project.benchmark.apiKeyEnv,
-      });
-      const result = await generateTasksForProject({
-        configPath,
-        maxTasks: project.benchmark.taskGeneration.maxTasks,
-        seed: project.benchmark.taskGeneration.seed,
-        outputDir: project.benchmark.taskGeneration.outputDir,
-        deps,
-      });
-      console.log('');
-      console.log(`Generated tasks: ${result.kept.length} (rejected: ${result.rejected.length})`);
-      console.log(`Frozen config: ${result.artifacts.benchmarkPath}`);
-      console.log(`Frozen snapshot: ${result.artifacts.snapshotPath}`);
-      console.log(`Generated tasks file: ${result.artifacts.tasksPath}`);
-    } catch (err) {
-      console.error(`\nFATAL: Task generation failed: ${err instanceof Error ? err.message : err}`);
-      if (err instanceof Error && err.stack) {
-        console.error(err.stack);
-      }
-      process.exit(1);
+  const command = args[0];
+  const commands = new Set(['run-case', 'run-suite']);
+  if (!commands.has(command ?? '')) {
+    if (command) {
+      console.error(`ERROR: Unknown command '${command}'.`);
     }
-    process.exit(0);
-  }
-
-  if (command && command !== 'run' && command !== 'benchmark') {
-    console.error(`ERROR: Unknown command '${command}'.`);
     printUsage();
     process.exit(1);
   }
 
-  // ── Benchmark mode (default, also handles explicit 'run' command) ─────────────
-  const tierRaw = getFlag(args, '--tier');
-  const validTiers: Tier[] = ['flagship', 'mid', 'low'];
-  if (tierRaw && !validTiers.includes(tierRaw as Tier)) {
-    console.error(`ERROR: Invalid tier '${tierRaw}'. Must be one of: ${validTiers.join(', ')}`);
-    process.exit(1);
+  if (command === 'run-case') {
+    await runWorkbenchCaseFromCli(args.slice(1));
+  } else if (command === 'run-suite') {
+    await runWorkbenchSuiteFromCli(args.slice(1));
   }
-
-  let options = {
-    configPath: getFlag(args, '--config'),
-    tier: tierRaw as Tier | undefined,
-    taskId: getFlag(args, '--task'),
-    modelSlug: getFlag(args, '--model'),
-    noCache: hasFlag(args, '--no-cache'),
-  };
-
-  let project: ResolvedProjectConfig | undefined;
-  let generatedCoverage: import('./benchmark/types.js').CoverageReport | undefined;
-  try {
-    project = await loadProjectConfig(options.configPath ?? DEFAULT_PROJECT_CONFIG_NAME);
-    if (!existsSync(project.target.repoPath) || !statSync(project.target.repoPath).isDirectory()) {
-      throw new Error(
-        `target.repoPath does not exist or is not a directory: ${project.target.repoPath}. ` +
-          `Edit "target.repoPath" in ${project.configPath}.`,
-      );
-    }
-    // Check credentials for each unique provider: direct-API formats have one provider,
-    // but pi format may route different models to different providers (e.g. openrouter + openai).
-    const benchmarkProviders = project.benchmark.format === 'openai'
-      ? ['openai']
-      : project.benchmark.format === 'anthropic'
-        ? ['anthropic']
-        : [...new Set(project.benchmark.models.map(m => parseModelRef(m.id).provider))];
-    for (const benchmarkProvider of benchmarkProviders) {
-      try {
-        requireConfiguredApiKey({
-          provider: benchmarkProvider,
-          authMode: project.benchmark.authMode,
-          apiKeyEnv: project.benchmark.apiKeyEnv,
-        });
-      } catch (error) {
-        throw new Error(
-          `${error instanceof Error ? error.message : String(error)} ` +
-            `Configure auth in ${project.configPath} before running the benchmark.`,
-        );
-      }
-    }
-    if (project.benchmark.taskGeneration.enabled) {
-      const modelRef = project.optimize?.model ?? project.benchmark.models[0]!.id;
-      const { provider, model } = parseModelRef(modelRef);
-      const deps = createDefaultPiTaskGenerator({
-        provider,
-        model,
-        authMode: project.optimize?.authMode ?? project.benchmark.authMode,
-        apiKeyEnv: project.optimize?.apiKeyEnv ?? project.benchmark.apiKeyEnv,
-      });
-      const generation = await generateTasksForProject({
-        configPath: options.configPath ?? DEFAULT_PROJECT_CONFIG_NAME,
-        maxTasks: project.benchmark.taskGeneration.maxTasks,
-        seed: project.benchmark.taskGeneration.seed,
-        outputDir: project.benchmark.taskGeneration.outputDir,
-        deps,
-      });
-      options = {
-        ...options,
-        configPath: generation.artifacts.benchmarkPath,
-      };
-      generatedCoverage = generation.coverage;
-    }
-  } catch (err) {
-    fatalError(err);
-  }
-
-  let report;
-  try {
-    report = await runBenchmark({
-      ...options,
-      verdictPolicy: project?.benchmark.verdict,
-      scopeCoverage: generatedCoverage,
-    });
-  } catch (err) {
-    fatalError(err);
-  }
-
-  printSummary(report);
-  printCoverage(report.coverage);
-
-  // Resolve output dir relative to the config file's directory, matching the runner's behavior
-  const configFileDir = options.configPath ? dirname(resolve(options.configPath)) : process.cwd();
-  const reportConfig = report.config as { name: string; surface: string; outputDir?: string };
-  const outputDir = resolve(configFileDir, reportConfig.outputDir ?? 'benchmark-results');
-
-  let recommendations: Recommendation[] = [];
-  if (report.verdict?.result === 'FAIL' && project) {
-    try {
-      const modelRef = project.optimize?.model ?? project.benchmark.models[0]!.id;
-      const { provider, model } = parseModelRef(modelRef);
-      const criticDeps = createDefaultPiCritic({
-        provider,
-        model,
-        authMode: project.optimize?.authMode ?? project.benchmark.authMode,
-        apiKeyEnv: project.optimize?.apiKeyEnv ?? project.benchmark.apiKeyEnv,
-      });
-      recommendations = await generateRecommendations(
-        report,
-        criticDeps,
-        project.optimize?.reportContextMaxBytes ?? 16_000,
-      );
-    } catch (err) {
-      console.error(`WARNING: Could not generate recommendations: ${err instanceof Error ? err.message : err}`);
-    }
-  }
-
-  console.log(renderVerdictConsole(report, recommendations));
-
-  const mdPath = resolve(outputDir, 'report.md');
-  try {
-    const markdown = generateMarkdown(report) + '\n\n' + renderVerdictMarkdown(report, recommendations);
-    writeFileSync(mdPath, markdown, 'utf-8');
-    console.log(`[output] Markdown report saved to ${mdPath}`);
-  } catch (err) {
-    console.error(`WARNING: Could not write Markdown report: ${err instanceof Error ? err.message : err}`);
-  }
-
-  const { summary } = report;
-  const passedCount = Math.round(summary.overallPassRate * summary.totalEvaluations);
-  console.log(
-    `\nDone. ${passedCount}/${summary.totalEvaluations} evaluations passed ` +
-      `(${(summary.overallPassRate * 100).toFixed(1)}%). ` +
-      `Coverage: ${(summary.methodCoveragePercent * 100).toFixed(1)}% ` +
-      `(surface: ${reportConfig.surface}).`,
-  );
-
-  process.exit(report.verdict?.result === 'FAIL' ? 1 : 0);
+  process.exit(process.exitCode ?? 0);
 }
 
 function isExecutedDirectly(): boolean {
@@ -606,8 +79,8 @@ function isExecutedDirectly(): boolean {
 }
 
 if (isExecutedDirectly()) {
-  main().catch(err => {
-    console.error('Unhandled error:', err);
+  main().catch((error: unknown) => {
+    console.error(error instanceof Error ? error.message : String(error));
     process.exit(1);
   });
 }
diff --git a/src/discovery/cli.ts b/src/discovery/cli.ts
deleted file mode 100644
index 95cc603..0000000
--- a/src/discovery/cli.ts
+++ /dev/null
@@ -1,321 +0,0 @@
-import { existsSync, readFileSync } from 'node:fs';
-import { resolve } from 'node:path';
-
-import ts from 'typescript';
-
-import type { ActionArgSchema } from '../actions/types.js';
-import type { CliDiscoverySnapshot, DiscoveryOptions, DiscoveredAction } from './types.js';
-import { discoverOptiqueActionsFromFile } from './optique.js';
-
-type LiteralPrimitive = string | number | boolean | null;
-interface LiteralObject {
-  [key: string]: LiteralValue;
-}
-interface LiteralArray extends Array<LiteralValue> {}
-type LiteralValue = LiteralPrimitive | LiteralObject | LiteralArray;
-
-export function discoverCliSurfaceFromSources(sources: string[], options: DiscoveryOptions = {}): CliDiscoverySnapshot {
-  const baseDir = options.baseDir ?? process.cwd();
-  const resolvedSources = sources.map((source) => resolve(baseDir, source));
-  const discoveredActions: DiscoveredAction[] = [];
-
-  for (const sourcePath of resolvedSources) {
-    discoveredActions.push(...discoverCliActionsFromSourceFile(sourcePath));
-  }
-
-  return {
-    surface: 'cli',
-    actions: dedupeActionsByName(discoveredActions),
-    sources: resolvedSources,
-  };
-}
-
-function discoverCliActionsFromSourceFile(filePath: string): DiscoveredAction[] {
-  if (!existsSync(filePath)) {
-    throw new Error(`CLI discovery source file not found: ${filePath}`);
-  }
-
-  let sourceCode: string;
-  try {
-    sourceCode = readFileSync(filePath, 'utf-8');
-  } catch (error) {
-    throw new Error(`Failed to read CLI discovery source file ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
-  }
-
-  // Fast path: optique combinator CLI — literal extractor can't handle function calls
-  if (/@optique\/core/.test(sourceCode)) {
-    return discoverOptiqueActionsFromFile(filePath);
-  }
-
-  const sourceFile = ts.createSourceFile(filePath, sourceCode, ts.ScriptTarget.Latest, false);
-  const constants = collectTopLevelConstInitializers(sourceFile);
-  const candidates = collectExportedLiteralCandidates(sourceFile, constants);
-  const actions: DiscoveredAction[] = [];
-
-  for (const candidate of candidates) {
-    const commandEntries = extractCommandEntries(candidate);
-    for (const commandEntry of commandEntries) {
-      const action = toDiscoveredAction(commandEntry, filePath);
-      if (action) {
-        actions.push(action);
-      }
-    }
-  }
-
-  return actions;
-}
-
-function collectTopLevelConstInitializers(sourceFile: ts.SourceFile): Map<string, ts.Expression> {
-  const constants = new Map<string, ts.Expression>();
-
-  for (const statement of sourceFile.statements) {
-    if (!ts.isVariableStatement(statement)) {
-      continue;
-    }
-
-    if ((ts.getCombinedNodeFlags(statement.declarationList) & ts.NodeFlags.Const) === 0) {
-      continue;
-    }
-
-    for (const declaration of statement.declarationList.declarations) {
-      if (!ts.isIdentifier(declaration.name) || !declaration.initializer) {
-        continue;
-      }
-
-      constants.set(declaration.name.text, declaration.initializer);
-    }
-  }
-
-  return constants;
-}
-
-function collectExportedLiteralCandidates(sourceFile: ts.SourceFile, constants: Map<string, ts.Expression>): LiteralValue[] {
-  const candidates: LiteralValue[] = [];
-
-  for (const statement of sourceFile.statements) {
-    if (isExportedVariableStatement(statement)) {
-      for (const declaration of statement.declarationList.declarations) {
-        if (!declaration.initializer) {
-          continue;
-        }
-
-        const value = readLiteralValue(declaration.initializer, constants, new Set());
-        if (value !== undefined) {
-          candidates.push(value);
-        }
-      }
-
-      continue;
-    }
-
-    if (ts.isExportAssignment(statement)) {
-      const value = readLiteralValue(statement.expression, constants, new Set());
-      if (value !== undefined) {
-        candidates.push(value);
-      }
-      continue;
-    }
-
-    if (!ts.isExportDeclaration(statement) || !statement.exportClause || statement.moduleSpecifier) {
-      continue;
-    }
-
-    if (!ts.isNamedExports(statement.exportClause)) {
-      continue;
-    }
-
-    for (const element of statement.exportClause.elements) {
-      const localName = element.propertyName?.text ?? element.name.text;
-      const expression = constants.get(localName);
-      if (!expression) {
-        continue;
-      }
-
-      const value = readLiteralValue(expression, constants, new Set());
-      if (value !== undefined) {
-        candidates.push(value);
-      }
-    }
-  }
-
-  return candidates;
-}
-
-function isExportedVariableStatement(statement: ts.Statement): statement is ts.VariableStatement {
-  return ts.isVariableStatement(statement)
-    && statement.modifiers?.some((modifier) => modifier.kind === ts.SyntaxKind.ExportKeyword) === true;
-}
-
-function readLiteralValue(node: ts.Expression, constants: Map<string, ts.Expression>, stack: Set<string>): LiteralValue | undefined {
-  const expression = unwrapExpression(node);
-
-  if (ts.isStringLiteral(expression) || ts.isNoSubstitutionTemplateLiteral(expression)) {
-    return expression.text;
-  }
-
-  if (ts.isNumericLiteral(expression)) {
-    return Number(expression.text);
-  }
-
-  if (expression.kind === ts.SyntaxKind.TrueKeyword) {
-    return true;
-  }
-
-  if (expression.kind === ts.SyntaxKind.FalseKeyword) {
-    return false;
-  }
-
-  if (expression.kind === ts.SyntaxKind.NullKeyword) {
-    return null;
-  }
-
-  if (ts.isIdentifier(expression)) {
-    const constant = constants.get(expression.text);
-    if (!constant || stack.has(expression.text)) {
-      return undefined;
-    }
-
-    stack.add(expression.text);
-    const value = readLiteralValue(constant, constants, stack);
-    stack.delete(expression.text);
-    return value;
-  }
-
-  if (ts.isArrayLiteralExpression(expression)) {
-    const values: LiteralValue[] = [];
-
-    for (const element of expression.elements) {
-      if (!ts.isExpression(element)) {
-        return undefined;
-      }
-
-      const value = readLiteralValue(element, constants, stack);
-      if (value === undefined) {
-        return undefined;
-      }
-
-      values.push(value);
-    }
-
-    return values;
-  }
-
-  if (ts.isObjectLiteralExpression(expression)) {
-    const objectValue: LiteralObject = {};
-
-    for (const property of expression.properties) {
-      if (ts.isPropertyAssignment(property)) {
-        const key = propertyNameToString(property.name);
-        if (!key) {
-          return undefined;
-        }
-
-        const value = readLiteralValue(property.initializer, constants, stack);
-        if (value === undefined) {
-          return undefined;
-        }
-
-        objectValue[key] = value;
-        continue;
-      }
-
-      if (ts.isShorthandPropertyAssignment(property)) {
-        const key = property.name.text;
-        const value = readLiteralValue(property.name, constants, stack);
-        if (value === undefined) {
-          return undefined;
-        }
-
-        objectValue[key] = value;
-        continue;
-      }
-
-      return undefined;
-    }
-
-    return objectValue;
-  }
-
-  return undefined;
-}
-
-function unwrapExpression(node: ts.Expression): ts.Expression {
-  let current = node;
-  while (ts.isParenthesizedExpression(current) || ts.isAsExpression(current) || ts.isTypeAssertionExpression(current) || ts.isSatisfiesExpression(current)) {
-    current = current.expression;
-  }
-  return current;
-}
-
-function propertyNameToString(name: ts.PropertyName): string | null {
-  if (ts.isIdentifier(name) || ts.isStringLiteral(name) || ts.isNumericLiteral(name)) {
-    return name.text;
-  }
-
-  return null;
-}
-
-function extractCommandEntries(candidate: LiteralValue): LiteralObject[] {
-  if (!Array.isArray(candidate)) {
-    return [];
-  }
-
-  return candidate.filter(isLiteralObject);
-}
-
-function toDiscoveredAction(commandEntry: LiteralObject, source: string): DiscoveredAction | null {
-  const command = commandEntry.command;
-  if (typeof command !== 'string' || command.trim() === '') {
-    return null;
-  }
-
-  const description = typeof commandEntry.description === 'string' ? commandEntry.description : undefined;
-  const args = extractActionArgs(commandEntry.options);
-
-  return {
-    name: command,
-    description,
-    args,
-    source,
-  };
-}
-
-function extractActionArgs(options: LiteralValue | undefined): ActionArgSchema[] {
-  if (!Array.isArray(options)) {
-    return [];
-  }
-
-  const args: ActionArgSchema[] = [];
-  for (const option of options) {
-    if (!isLiteralObject(option)) {
-      continue;
-    }
-
-    const name = option.name;
-    if (typeof name !== 'string' || name.trim() === '') {
-      continue;
-    }
-
-    const takesValue = typeof option.takesValue === 'boolean' ? option.takesValue : undefined;
-    args.push({
-      name,
-      required: false,
-      type: takesValue === true ? 'string' : takesValue === false ? 'boolean' : undefined,
-      description: typeof option.description === 'string' ? option.description : undefined,
-    });
-  }
-
-  return args;
-}
-
-function isLiteralObject(value: unknown): value is LiteralObject {
-  return Boolean(value) && typeof value === 'object' && !Array.isArray(value);
-}
-
-function dedupeActionsByName(actions: DiscoveredAction[]): DiscoveredAction[] {
-  const map = new Map<string, DiscoveredAction>();
-  for (const action of actions) {
-    map.set(action.name, action);
-  }
-  return Array.from(map.values());
-}
diff --git a/src/discovery/mcp.ts b/src/discovery/mcp.ts
deleted file mode 100644
index ebf5770..0000000
--- a/src/discovery/mcp.ts
+++ /dev/null
@@ -1,335 +0,0 @@
-import { existsSync, readFileSync } from 'node:fs';
-import { resolve } from 'node:path';
-
-import ts from 'typescript';
-
-import type { ActionArgSchema } from '../actions/types.js';
-import type { DiscoveryOptions, DiscoveredAction, McpDiscoverySnapshot } from './types.js';
-
-type LiteralPrimitive = string | number | boolean | null;
-interface LiteralObject {
-  [key: string]: LiteralValue;
-}
-interface LiteralArray extends Array<LiteralValue> {}
-type LiteralValue = LiteralPrimitive | LiteralObject | LiteralArray;
-
-export function discoverMcpSurfaceFromSources(sources: string[], options: DiscoveryOptions = {}): McpDiscoverySnapshot {
-  const baseDir = options.baseDir ?? process.cwd();
-  const resolvedSources = sources.map((source) => resolve(baseDir, source));
-  const discoveredActions: DiscoveredAction[] = [];
-
-  for (const sourcePath of resolvedSources) {
-    discoveredActions.push(...discoverMcpActionsFromSourceFile(sourcePath));
-  }
-
-  const uniqueActions = dedupeActionsByName(discoveredActions);
-
-  return {
-    surface: 'mcp',
-    actions: uniqueActions,
-    sources: resolvedSources,
-  };
-}
-
-function discoverMcpActionsFromSourceFile(filePath: string): DiscoveredAction[] {
-  if (!existsSync(filePath)) {
-    throw new Error(`MCP discovery source file not found: ${filePath}`);
-  }
-
-  let sourceCode: string;
-  try {
-    sourceCode = readFileSync(filePath, 'utf-8');
-  } catch (error) {
-    throw new Error(`Failed to read MCP discovery source file ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
-  }
-
-  const sourceFile = ts.createSourceFile(filePath, sourceCode, ts.ScriptTarget.Latest, false);
-  const constants = collectTopLevelConstInitializers(sourceFile);
-  const candidates = collectExportedLiteralCandidates(sourceFile, constants);
-  const actions: DiscoveredAction[] = [];
-
-  for (const candidate of candidates) {
-    const toolObjects = extractToolObjects(candidate);
-    for (const toolObject of toolObjects) {
-      const action = toDiscoveredAction(toolObject, filePath);
-      if (action) {
-        actions.push(action);
-      }
-    }
-  }
-
-  return actions;
-}
-
-function collectTopLevelConstInitializers(sourceFile: ts.SourceFile): Map<string, ts.Expression> {
-  const constants = new Map<string, ts.Expression>();
-
-  for (const statement of sourceFile.statements) {
-    if (!ts.isVariableStatement(statement)) {
-      continue;
-    }
-
-    if ((ts.getCombinedNodeFlags(statement.declarationList) & ts.NodeFlags.Const) === 0) {
-      continue;
-    }
-
-    for (const declaration of statement.declarationList.declarations) {
-      if (!ts.isIdentifier(declaration.name) || !declaration.initializer) {
-        continue;
-      }
-
-      constants.set(declaration.name.text, declaration.initializer);
-    }
-  }
-
-  return constants;
-}
-
-function collectExportedLiteralCandidates(sourceFile: ts.SourceFile, constants: Map<string, ts.Expression>): LiteralValue[] {
-  const candidates: LiteralValue[] = [];
-
-  for (const statement of sourceFile.statements) {
-    if (isExportedVariableStatement(statement)) {
-      for (const declaration of statement.declarationList.declarations) {
-        if (!declaration.initializer) {
-          continue;
-        }
-
-        const value = readLiteralValue(declaration.initializer, constants, new Set());
-        if (value !== undefined) {
-          candidates.push(value);
-        }
-      }
-      continue;
-    }
-
-    if (ts.isExportAssignment(statement)) {
-      const value = readLiteralValue(statement.expression, constants, new Set());
-      if (value !== undefined) {
-        candidates.push(value);
-      }
-      continue;
-    }
-
-    if (!ts.isExportDeclaration(statement) || !statement.exportClause || statement.moduleSpecifier) {
-      continue;
-    }
-
-    if (!ts.isNamedExports(statement.exportClause)) {
-      continue;
-    }
-
-    for (const element of statement.exportClause.elements) {
-      const localName = element.propertyName?.text ?? element.name.text;
-      const expression = constants.get(localName);
-      if (!expression) {
-        continue;
-      }
-
-      const value = readLiteralValue(expression, constants, new Set());
-      if (value !== undefined) {
-        candidates.push(value);
-      }
-    }
-  }
-
-  return candidates;
-}
-
-function isExportedVariableStatement(statement: ts.Statement): statement is ts.VariableStatement {
-  return ts.isVariableStatement(statement)
-    && statement.modifiers?.some((modifier) => modifier.kind === ts.SyntaxKind.ExportKeyword) === true;
-}
-
-function readLiteralValue(node: ts.Expression, constants: Map<string, ts.Expression>, stack: Set<string>): LiteralValue | undefined {
-  const expression = unwrapExpression(node);
-
-  if (ts.isStringLiteral(expression) || ts.isNoSubstitutionTemplateLiteral(expression)) {
-    return expression.text;
-  }
-
-  if (ts.isNumericLiteral(expression)) {
-    return Number(expression.text);
-  }
-
-  if (expression.kind === ts.SyntaxKind.TrueKeyword) {
-    return true;
-  }
-
-  if (expression.kind === ts.SyntaxKind.FalseKeyword) {
-    return false;
-  }
-
-  if (expression.kind === ts.SyntaxKind.NullKeyword) {
-    return null;
-  }
-
-  if (ts.isIdentifier(expression)) {
-    const constant = constants.get(expression.text);
-    if (!constant || stack.has(expression.text)) {
-      return undefined;
-    }
-
-    stack.add(expression.text);
-    const value = readLiteralValue(constant, constants, stack);
-    stack.delete(expression.text);
-    return value;
-  }
-
-  if (ts.isArrayLiteralExpression(expression)) {
-    const values: LiteralValue[] = [];
-
-    for (const element of expression.elements) {
-      if (!ts.isExpression(element)) {
-        return undefined;
-      }
-
-      const value = readLiteralValue(element, constants, stack);
-      if (value === undefined) {
-        return undefined;
-      }
-
-      values.push(value);
-    }
-
-    return values;
-  }
-
-  if (ts.isObjectLiteralExpression(expression)) {
-    const objectValue: LiteralObject = {};
-
-    for (const property of expression.properties) {
-      if (ts.isPropertyAssignment(property)) {
-        const key = propertyNameToString(property.name);
-        if (!key) {
-          return undefined;
-        }
-
-        const value = readLiteralValue(property.initializer, constants, stack);
-        if (value === undefined) {
-          return undefined;
-        }
-
-        objectValue[key] = value;
-        continue;
-      }
-
-      if (ts.isShorthandPropertyAssignment(property)) {
-        const key = property.name.text;
-        const value = readLiteralValue(property.name, constants, stack);
-        if (value === undefined) {
-          return undefined;
-        }
-
-        objectValue[key] = value;
-        continue;
-      }
-
-      return undefined;
-    }
-
-    return objectValue;
-  }
-
-  return undefined;
-}
-
-function unwrapExpression(node: ts.Expression): ts.Expression {
-  let current = node;
-  while (ts.isParenthesizedExpression(current) || ts.isAsExpression(current) || ts.isTypeAssertionExpression(current) || ts.isSatisfiesExpression(current)) {
-    current = current.expression;
-  }
-  return current;
-}
-
-function propertyNameToString(name: ts.PropertyName): string | null {
-  if (ts.isIdentifier(name) || ts.isStringLiteral(name) || ts.isNumericLiteral(name)) {
-    return name.text;
-  }
-
-  return null;
-}
-
-function extractToolObjects(candidate: LiteralValue): LiteralObject[] {
-  if (Array.isArray(candidate)) {
-    return candidate.filter(isLiteralObject);
-  }
-
-  if (!isLiteralObject(candidate)) {
-    return [];
-  }
-
-  if (looksLikeToolDefinition(candidate)) {
-    return [candidate];
-  }
-
-  const tools = candidate.tools;
-  if (Array.isArray(tools)) {
-    return tools.filter(isLiteralObject);
-  }
-
-  return [];
-}
-
-function looksLikeToolDefinition(value: LiteralObject): boolean {
-  return typeof value.type === 'string' && isLiteralObject(value.function);
-}
-
-function toDiscoveredAction(toolDefinition: LiteralObject, source: string): DiscoveredAction | null {
-  const fn = toolDefinition.function;
-  if (!isLiteralObject(fn)) {
-    return null;
-  }
-
-  const name = fn.name;
-  if (typeof name !== 'string' || name.trim() === '') {
-    return null;
-  }
-
-  const description = typeof fn.description === 'string' ? fn.description : undefined;
-  const parameters = isLiteralObject(fn.parameters) ? fn.parameters : null;
-  const properties = parameters && isLiteralObject(parameters.properties) ? parameters.properties : {};
-  const requiredNames = parameters ? asRequiredArray(parameters.required) : [];
-
-  const args: ActionArgSchema[] = [];
-  for (const [argName, schemaValue] of Object.entries(properties)) {
-    if (!isLiteralObject(schemaValue)) {
-      continue;
-    }
-
-    args.push({
-      name: argName,
-      required: requiredNames.includes(argName),
-      type: typeof schemaValue.type === 'string' ? schemaValue.type : undefined,
-      description: typeof schemaValue.description === 'string' ? schemaValue.description : undefined,
-      schema: schemaValue,
-    });
-  }
-
-  return {
-    name,
-    description,
-    args,
-    source,
-  };
-}
-
-function asRequiredArray(value: LiteralValue | undefined): string[] {
-  if (!Array.isArray(value)) {
-    return [];
-  }
-
-  return value.filter((entry): entry is string => typeof entry === 'string');
-}
-
-function isLiteralObject(value: unknown): value is LiteralObject {
-  return Boolean(value) && typeof value === 'object' && !Array.isArray(value);
-}
-
-function dedupeActionsByName(actions: DiscoveredAction[]): DiscoveredAction[] {
-  const map = new Map<string, DiscoveredAction>();
-  for (const action of actions) {
-    map.set(action.name, action);
-  }
-  return Array.from(map.values());
-}
diff --git a/src/discovery/optique.ts b/src/discovery/optique.ts
deleted file mode 100644
index d88c58d..0000000
--- a/src/discovery/optique.ts
+++ /dev/null
@@ -1,315 +0,0 @@
-/**
- * Static AST extractor for optique-based CLI parsers.
- *
- * Optique builds CLIs through function combinators:
- *   command("name", object({...}), { description })
- *   command("group", or(child1, child2), { description })
- *   merge(globalOptions, or(group1, group2, leafCmd))
- *
- * This module walks those combinators without executing any code.
- */
-import { existsSync, readFileSync } from 'node:fs';
-
-import ts from 'typescript';
-
-import type { ActionArgSchema } from '../actions/types.js';
-import type { DiscoveredAction } from './types.js';
-
-export function discoverOptiqueActionsFromFile(filePath: string): DiscoveredAction[] {
-  if (!existsSync(filePath)) {
-    throw new Error(`Optique discovery source not found: ${filePath}`);
-  }
-
-  const sourceCode = readFileSync(filePath, 'utf-8');
-  const sourceFile = ts.createSourceFile(filePath, sourceCode, ts.ScriptTarget.Latest, false);
-  const constants = collectTopLevelConsts(sourceFile);
-  const roots = collectExportedExpressions(sourceFile, constants);
-
-  const actions: DiscoveredAction[] = [];
-  for (const root of roots) {
-    walkExpr(root, constants, [], actions, filePath);
-  }
-
-  return dedupeByName(actions);
-}
-
-// ── AST collection ────────────────────────────────────────────────────────────
-
-function collectTopLevelConsts(sourceFile: ts.SourceFile): Map<string, ts.Expression> {
-  const map = new Map<string, ts.Expression>();
-  for (const stmt of sourceFile.statements) {
-    if (!ts.isVariableStatement(stmt)) continue;
-    if ((ts.getCombinedNodeFlags(stmt.declarationList) & ts.NodeFlags.Const) === 0) continue;
-    for (const decl of stmt.declarationList.declarations) {
-      if (ts.isIdentifier(decl.name) && decl.initializer) {
-        map.set(decl.name.text, decl.initializer);
-      }
-    }
-  }
-  return map;
-}
-
-function collectExportedExpressions(
-  sourceFile: ts.SourceFile,
-  constants: Map<string, ts.Expression>,
-): ts.Expression[] {
-  const exprs: ts.Expression[] = [];
-  for (const stmt of sourceFile.statements) {
-    if (ts.isVariableStatement(stmt) && isExportedStatement(stmt)) {
-      for (const decl of stmt.declarationList.declarations) {
-        if (decl.initializer) exprs.push(decl.initializer);
-      }
-    } else if (ts.isExportAssignment(stmt)) {
-      exprs.push(stmt.expression);
-    } else if (ts.isExportDeclaration(stmt) && stmt.exportClause && !stmt.moduleSpecifier) {
-      if (ts.isNamedExports(stmt.exportClause)) {
-        for (const el of stmt.exportClause.elements) {
-          const local = el.propertyName?.text ?? el.name.text;
-          const init = constants.get(local);
-          if (init) exprs.push(init);
-        }
-      }
-    }
-  }
-  return exprs;
-}
-
-function isExportedStatement(stmt: ts.VariableStatement): boolean {
-  return stmt.modifiers?.some((m) => m.kind === ts.SyntaxKind.ExportKeyword) ?? false;
-}
-
-// ── Combinator tree walker ────────────────────────────────────────────────────
-
-function walkExpr(
-  expr: ts.Expression,
-  constants: Map<string, ts.Expression>,
-  prefix: string[],
-  actions: DiscoveredAction[],
-  filePath: string,
-): void {
-  const node = unwrap(expr);
-
-  // Identifier → resolve and recurse
-  if (ts.isIdentifier(node)) {
-    const init = constants.get(node.text);
-    if (init) walkExpr(init, constants, prefix, actions, filePath);
-    return;
-  }
-
-  if (!ts.isCallExpression(node)) return;
-
-  const calleeName = getCalleeName(node);
-
-  if (calleeName === 'command') {
-    handleCommand(node, constants, prefix, actions, filePath);
-  } else if (calleeName === 'merge' || calleeName === 'or') {
-    // Descend into all arguments (globalOptions is an object() call → no-op)
-    for (const arg of node.arguments) {
-      walkExpr(arg, constants, prefix, actions, filePath);
-    }
-  }
-}
-
-function handleCommand(
-  node: ts.CallExpression,
-  constants: Map<string, ts.Expression>,
-  prefix: string[],
-  actions: DiscoveredAction[],
-  filePath: string,
-): void {
-  if (node.arguments.length < 2) return;
-
-  const name = extractStringLiteral(node.arguments[0]);
-  if (!name) return;
-
-  const newPrefix = [...prefix, name];
-  const description = node.arguments[2] ? extractDescription(node.arguments[2], constants) : undefined;
-
-  // Resolve body expression (may be an identifier referencing a const)
-  const rawBody = node.arguments[1];
-  const body = resolveExpr(rawBody, constants);
-
-  const bodyCallee = getCalleeName(body);
-
-  if (bodyCallee === 'object') {
-    // Leaf command: extract args from the object call
-    const cmdArgs = extractArgsFromObjectCall(body as ts.CallExpression, constants);
-    actions.push({ name: newPrefix.join(' '), description, args: cmdArgs, source: filePath });
-  } else if (bodyCallee === 'or') {
-    // Group: walk each child with the extended prefix
-    for (const child of (body as ts.CallExpression).arguments) {
-      walkExpr(child, constants, newPrefix, actions, filePath);
-    }
-  }
-}
-
-function resolveExpr(expr: ts.Expression, constants: Map<string, ts.Expression>): ts.Expression {
-  const node = unwrap(expr);
-  if (ts.isIdentifier(node)) {
-    const init = constants.get(node.text);
-    return init ? resolveExpr(init, constants) : node;
-  }
-  return node;
-}
-
-// ── Argument extraction ───────────────────────────────────────────────────────
-
-function extractArgsFromObjectCall(
-  objectCall: ts.CallExpression,
-  constants: Map<string, ts.Expression>,
-): ActionArgSchema[] {
-  if (objectCall.arguments.length === 0) return [];
-  const objLiteral = unwrap(objectCall.arguments[0]);
-  if (!ts.isObjectLiteralExpression(objLiteral)) return [];
-
-  const result: ActionArgSchema[] = [];
-  for (const prop of objLiteral.properties) {
-    if (!ts.isPropertyAssignment(prop)) continue;
-    const key = ts.isIdentifier(prop.name) ? prop.name.text : null;
-    if (!key || key === 'cmd') continue; // skip the discriminant constant
-
-    const arg = parseArgCall(key, prop.initializer, constants, false);
-    if (arg) result.push(arg);
-  }
-  return result;
-}
-
-/**
- * Recursively parse an optique arg combinator call.
- * `propKey` is the TypeScript property name and is used as the name for positional args.
- */
-function parseArgCall(
-  propKey: string,
-  expr: ts.Expression,
-  constants: Map<string, ts.Expression>,
-  forceOptional: boolean,
-): ActionArgSchema | null {
-  const node = unwrap(expr);
-  if (!ts.isCallExpression(node)) return null;
-
-  const fn = getCalleeName(node);
-  if (!fn) return null;
-
-  if (fn === 'option') {
-    const arg = extractOption(node, constants);
-    return arg ? { ...arg, required: forceOptional ? false : arg.required } : null;
-  }
-
-  if (fn === 'argument') {
-    const description = node.arguments.length >= 2
-      ? extractDescription(node.arguments[1], constants)
-      : undefined;
-    return { name: propKey, required: !forceOptional, type: 'string', description };
-  }
-
-  // Wrappers that make things optional or add defaults
-  if (fn === 'optional' || fn === 'withDefault' || fn === 'multiple') {
-    if (node.arguments.length === 0) return null;
-    return parseArgCall(propKey, node.arguments[0], constants, true);
-  }
-
-  return null;
-}
-
-function extractOption(
-  call: ts.CallExpression,
-  constants: Map<string, ts.Expression>,
-): ActionArgSchema | null {
-  if (call.arguments.length === 0) return null;
-  const name = extractStringLiteral(call.arguments[0]);
-  if (!name) return null;
-
-  // 2nd arg is either a value parser (string(), integer()) → value-taking,
-  // or an opts object → boolean flag
-  const hasValueParser = call.arguments.length >= 2 && isValueParserCall(call.arguments[1]);
-
-  const optsIdx = hasValueParser ? 2 : 1;
-  const description = call.arguments.length > optsIdx
-    ? extractDescription(call.arguments[optsIdx], constants)
-    : undefined;
-
-  return { name, required: false, type: hasValueParser ? 'string' : 'boolean', description };
-}
-
-function isValueParserCall(expr: ts.Expression): boolean {
-  const node = unwrap(expr);
-  if (!ts.isCallExpression(node)) return false;
-  const fn = getCalleeName(node);
-  return fn !== null && ['string', 'integer', 'number', 'boolean'].includes(fn);
-}
-
-// ── Description extraction ────────────────────────────────────────────────────
-
-function extractDescription(
-  expr: ts.Expression,
-  constants: Map<string, ts.Expression>,
-): string | undefined {
-  const node = unwrap(expr);
-
-  if (ts.isIdentifier(node)) {
-    const init = constants.get(node.text);
-    return init ? extractDescription(init, constants) : undefined;
-  }
-
-  // { description: message`...` }
-  if (ts.isObjectLiteralExpression(node)) {
-    for (const prop of node.properties) {
-      if (!ts.isPropertyAssignment(prop)) continue;
-      const key = ts.isIdentifier(prop.name) ? prop.name.text : null;
-      if (key === 'description') return extractDescriptionValue(prop.initializer);
-    }
-  }
-
-  return undefined;
-}
-
-function extractDescriptionValue(expr: ts.Expression): string | undefined {
-  const node = unwrap(expr);
-
-  // message`Create a new account` — tagged template literal
-  if (ts.isTaggedTemplateExpression(node)) {
-    const tpl = node.template;
-    if (ts.isNoSubstitutionTemplateLiteral(tpl)) return tpl.text;
-    if (ts.isTemplateExpression(tpl)) return tpl.head.text;
-  }
-
-  if (ts.isStringLiteral(node) || ts.isNoSubstitutionTemplateLiteral(node)) {
-    return node.text;
-  }
-
-  return undefined;
-}
-
-// ── Utilities ─────────────────────────────────────────────────────────────────
-
-function getCalleeName(node: ts.Expression): string | null {
-  if (!ts.isCallExpression(node)) return null;
-  const callee = unwrap(node.expression);
-  return ts.isIdentifier(callee) ? callee.text : null;
-}
-
-function extractStringLiteral(expr: ts.Expression | undefined): string | null {
-  if (!expr) return null;
-  const node = unwrap(expr);
-  if (ts.isStringLiteral(node) || ts.isNoSubstitutionTemplateLiteral(node)) return node.text;
-  return null;
-}
-
-function unwrap(expr: ts.Expression): ts.Expression {
-  let cur = expr;
-  while (
-    ts.isParenthesizedExpression(cur)
-    || ts.isAsExpression(cur)
-    || ts.isTypeAssertionExpression(cur)
-    || ts.isSatisfiesExpression(cur)
-  ) {
-    cur = cur.expression;
-  }
-  return cur;
-}
-
-function dedupeByName(actions: DiscoveredAction[]): DiscoveredAction[] {
-  const map = new Map<string, DiscoveredAction>();
-  for (const a of actions) map.set(a.name, a);
-  return Array.from(map.values());
-}
diff --git a/src/discovery/sdk.ts b/src/discovery/sdk.ts
deleted file mode 100644
index 3485d96..0000000
--- a/src/discovery/sdk.ts
+++ /dev/null
@@ -1,336 +0,0 @@
-import { existsSync, readFileSync } from 'node:fs';
-import { extname, resolve } from 'node:path';
-
-import ts from 'typescript';
-
-import type { ActionArgSchema } from '../actions/types.js';
-import type { DiscoveryOptions, DiscoveredAction, SdkDiscoverySnapshot } from './types.js';
-
-export function discoverSdkSurfaceFromSources(sources: string[], options: DiscoveryOptions = {}): SdkDiscoverySnapshot {
-  const baseDir = options.baseDir ?? process.cwd();
-  const resolvedSources = sources.map((source) => resolve(baseDir, source));
-  const discoveredActions: DiscoveredAction[] = [];
-  const visited = new Set<string>();
-
-  for (const sourcePath of resolvedSources) {
-    discoveredActions.push(...discoverSdkActionsFromSourceFile(sourcePath, visited));
-  }
-
-  return {
-    surface: 'sdk',
-    actions: dedupeActionsByName(discoveredActions),
-    sources: resolvedSources,
-  };
-}
-
-function discoverSdkActionsFromSourceFile(filePath: string, visited: Set<string>): DiscoveredAction[] {
-  if (visited.has(filePath)) {
-    return [];
-  }
-  visited.add(filePath);
-
-  if (!existsSync(filePath)) {
-    throw new Error(`SDK discovery source file not found: ${filePath}`);
-  }
-
-  let sourceCode: string;
-  try {
-    sourceCode = readFileSync(filePath, 'utf-8');
-  } catch (error) {
-    throw new Error(`Failed to read SDK discovery source file ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
-  }
-
-  const sourceFile = ts.createSourceFile(filePath, sourceCode, ts.ScriptTarget.Latest, false, scriptKindFromPath(filePath));
-  return collectDiscoveredActions(sourceFile, filePath, visited);
-}
-
-function collectDiscoveredActions(sourceFile: ts.SourceFile, sourcePath: string, visited: Set<string>): DiscoveredAction[] {
-  const actions: DiscoveredAction[] = [];
-  const topLevelClasses = new Map<string, ts.ClassDeclaration>();
-  const topLevelFunctions = new Map<string, ts.FunctionDeclaration>();
-  const exportedNames = new Set<string>();
-  let defaultExportName: string | null = null;
-
-  for (const statement of sourceFile.statements) {
-    if (ts.isClassDeclaration(statement) && statement.name) {
-      topLevelClasses.set(statement.name.text, statement);
-      if (hasExportModifier(statement)) {
-        exportedNames.add(statement.name.text);
-      }
-      if (hasDefaultModifier(statement)) {
-        defaultExportName = statement.name.text;
-      }
-      continue;
-    }
-
-    if (ts.isFunctionDeclaration(statement) && statement.name) {
-      topLevelFunctions.set(statement.name.text, statement);
-      if (hasExportModifier(statement)) {
-        exportedNames.add(statement.name.text);
-      }
-      if (hasDefaultModifier(statement)) {
-        defaultExportName = statement.name.text;
-      }
-      continue;
-    }
-
-    if (ts.isExportDeclaration(statement) && statement.exportClause && !statement.moduleSpecifier && ts.isNamedExports(statement.exportClause)) {
-      for (const element of statement.exportClause.elements) {
-        const localName = element.propertyName?.text ?? element.name.text;
-        exportedNames.add(localName);
-      }
-      continue;
-    }
-
-    if (ts.isExportDeclaration(statement) && statement.moduleSpecifier && ts.isStringLiteral(statement.moduleSpecifier)) {
-      const targetPath = resolveRelativeModuleSource(sourcePath, statement.moduleSpecifier.text);
-      if (!targetPath) {
-        continue;
-      }
-
-      const reExportedActions = discoverSdkActionsFromSourceFile(targetPath, visited);
-      if (!statement.exportClause) {
-        actions.push(...reExportedActions);
-        continue;
-      }
-
-      if (!ts.isNamedExports(statement.exportClause)) {
-        continue;
-      }
-
-      for (const element of statement.exportClause.elements) {
-        const localName = element.propertyName?.text ?? element.name.text;
-        const exportedName = element.name.text;
-        actions.push(...reExportedActions
-          .filter((action) => action.name === localName || action.name.startsWith(`${localName}.`))
-          .map((action) => renameReExportedAction(action, localName, exportedName)));
-      }
-      continue;
-    }
-
-    if (ts.isExportAssignment(statement)) {
-      const expression = unwrapExpression(statement.expression);
-      if (ts.isIdentifier(expression)) {
-        defaultExportName = expression.text;
-      } else if ((ts.isClassExpression(expression) || ts.isFunctionExpression(expression)) && expression.name) {
-        defaultExportName = expression.name.text;
-      }
-    }
-  }
-
-  for (const className of exportedNames) {
-    const classDeclaration = topLevelClasses.get(className);
-    if (classDeclaration) {
-      actions.push(...actionsFromClassDeclaration(classDeclaration, sourcePath));
-      continue;
-    }
-
-    const functionDeclaration = topLevelFunctions.get(className);
-    if (functionDeclaration) {
-      actions.push(actionFromFunctionDeclaration(functionDeclaration, sourcePath));
-    }
-  }
-
-  if (defaultExportName) {
-    const classDeclaration = topLevelClasses.get(defaultExportName);
-    if (classDeclaration) {
-      actions.push(...actionsFromClassDeclaration(classDeclaration, sourcePath));
-    }
-
-    const functionDeclaration = topLevelFunctions.get(defaultExportName);
-    if (functionDeclaration) {
-      actions.push(actionFromFunctionDeclaration(functionDeclaration, sourcePath));
-    }
-  }
-
-  return actions;
-}
-
-function resolveRelativeModuleSource(fromPath: string, specifier: string): string | null {
-  if (!specifier.startsWith('.')) {
-    return null;
-  }
-
-  const rawCandidate = resolve(fromPath, '..', specifier);
-  const candidates = [
-    rawCandidate,
-    resolve(fromPath, '..', `${specifier}.ts`),
-    resolve(fromPath, '..', `${specifier}.tsx`),
-    resolve(fromPath, '..', `${specifier}.js`),
-    resolve(fromPath, '..', `${specifier}.mjs`),
-    resolve(fromPath, '..', `${specifier}.cjs`),
-    resolve(fromPath, '..', specifier, 'index.ts'),
-    resolve(fromPath, '..', specifier, 'index.tsx'),
-    resolve(fromPath, '..', specifier, 'index.js'),
-    resolve(fromPath, '..', specifier, 'index.mjs'),
-    resolve(fromPath, '..', specifier, 'index.cjs'),
-  ];
-
-  for (const candidate of candidates) {
-    if (existsSync(candidate)) {
-      return candidate;
-    }
-  }
-
-  return null;
-}
-
-function actionsFromClassDeclaration(classDeclaration: ts.ClassDeclaration, source: string): DiscoveredAction[] {
-  const className = classDeclaration.name?.text;
-  if (!className) {
-    return [];
-  }
-
-  const actions: DiscoveredAction[] = [];
-
-  for (const member of classDeclaration.members) {
-    if (ts.isConstructorDeclaration(member)) {
-      if (hasPrivateOrProtectedModifier(member)) {
-        continue;
-      }
-
-      actions.push({
-        name: `${className}.constructor`,
-        args: extractArgsFromParameters(member.parameters),
-        source,
-      });
-      continue;
-    }
-
-    if (!ts.isMethodDeclaration(member)) {
-      continue;
-    }
-
-    if (hasPrivateOrProtectedModifier(member)) {
-      continue;
-    }
-
-    const methodName = methodNameFromClassElement(member.name);
-    if (!methodName) {
-      continue;
-    }
-
-    actions.push({
-      name: `${className}.${methodName}`,
-      args: extractArgsFromParameters(member.parameters),
-      source,
-    });
-  }
-
-  return actions;
-}
-
-function actionFromFunctionDeclaration(functionDeclaration: ts.FunctionDeclaration, source: string): DiscoveredAction {
-  return {
-    name: functionDeclaration.name!.text,
-    args: extractArgsFromParameters(functionDeclaration.parameters),
-    source,
-  };
-}
-
-function extractArgsFromParameters(parameters: ts.NodeArray<ts.ParameterDeclaration>): ActionArgSchema[] {
-  const args: ActionArgSchema[] = [];
-
-  for (const parameter of parameters) {
-    if (!ts.isIdentifier(parameter.name)) {
-      continue;
-    }
-
-    args.push({
-      name: parameter.name.text,
-      required: !parameter.questionToken && !parameter.initializer && !parameter.dotDotDotToken,
-    });
-  }
-
-  return args;
-}
-
-function scriptKindFromPath(filePath: string): ts.ScriptKind {
-  const extension = extname(filePath).toLowerCase();
-
-  if (extension === '.js' || extension === '.mjs' || extension === '.cjs') {
-    return ts.ScriptKind.JS;
-  }
-
-  if (extension === '.jsx') {
-    return ts.ScriptKind.JSX;
-  }
-
-  if (extension === '.tsx') {
-    return ts.ScriptKind.TSX;
-  }
-
-  return ts.ScriptKind.TS;
-}
-
-function hasExportModifier(node: ts.Node): boolean {
-  if (!ts.canHaveModifiers(node)) {
-    return false;
-  }
-
-  return ts.getModifiers(node)?.some((modifier) => modifier.kind === ts.SyntaxKind.ExportKeyword) === true;
-}
-
-function hasDefaultModifier(node: ts.Node): boolean {
-  if (!ts.canHaveModifiers(node)) {
-    return false;
-  }
-
-  return ts.getModifiers(node)?.some((modifier) => modifier.kind === ts.SyntaxKind.DefaultKeyword) === true;
-}
-
-function hasPrivateOrProtectedModifier(node: ts.Node): boolean {
-  if (!ts.canHaveModifiers(node)) {
-    return false;
-  }
-
-  return ts.getModifiers(node)?.some((modifier) => {
-    return modifier.kind === ts.SyntaxKind.PrivateKeyword || modifier.kind === ts.SyntaxKind.ProtectedKeyword;
-  }) === true;
-}
-
-function methodNameFromClassElement(name: ts.PropertyName): string | null {
-  if (ts.isIdentifier(name) || ts.isStringLiteral(name) || ts.isNumericLiteral(name)) {
-    return name.text;
-  }
-
-  return null;
-}
-
-function unwrapExpression(node: ts.Expression): ts.Expression {
-  let current = node;
-  while (ts.isParenthesizedExpression(current) || ts.isAsExpression(current) || ts.isTypeAssertionExpression(current) || ts.isSatisfiesExpression(current)) {
-    current = current.expression;
-  }
-  return current;
-}
-
-function dedupeActionsByName(actions: DiscoveredAction[]): DiscoveredAction[] {
-  const map = new Map<string, DiscoveredAction>();
-  for (const action of actions) {
-    map.set(action.name, action);
-  }
-  return Array.from(map.values());
-}
-
-function renameReExportedAction(action: DiscoveredAction, localName: string, exportedName: string): DiscoveredAction {
-  if (localName === exportedName) {
-    return action;
-  }
-
-  if (action.name === localName) {
-    return {
-      ...action,
-      name: exportedName,
-    };
-  }
-
-  if (action.name.startsWith(`${localName}.`)) {
-    return {
-      ...action,
-      name: `${exportedName}${action.name.slice(localName.length)}`,
-    };
-  }
-
-  return action;
-}
diff --git a/src/discovery/types.ts b/src/discovery/types.ts
deleted file mode 100644
index ab7bb78..0000000
--- a/src/discovery/types.ts
+++ /dev/null
@@ -1,30 +0,0 @@
-import type { ActionArgSchema } from '../actions/types.js';
-
-export interface DiscoveredAction {
-  name: string;
-  description?: string;
-  args: ActionArgSchema[];
-  source?: string;
-}
-
-export interface DiscoverySnapshot {
-  surface: 'sdk' | 'cli' | 'mcp';
-  actions: DiscoveredAction[];
-  sources: string[];
-}
-
-export interface McpDiscoverySnapshot extends DiscoverySnapshot {
-  surface: 'mcp';
-}
-
-export interface CliDiscoverySnapshot extends DiscoverySnapshot {
-  surface: 'cli';
-}
-
-export interface SdkDiscoverySnapshot extends DiscoverySnapshot {
-  surface: 'sdk';
-}
-
-export interface DiscoveryOptions {
-  baseDir?: string;
-}
diff --git a/src/doctor/checks.ts b/src/doctor/checks.ts
deleted file mode 100644
index 4c9ef70..0000000
--- a/src/doctor/checks.ts
+++ /dev/null
@@ -1,146 +0,0 @@
-import type { ResolvedProjectConfig } from '../project/types.js';
-import type { Issue } from '../project/validate.js';
-import { discoverActionsOnly, resolveScope } from '../tasks/index.js';
-import { requireConfiguredApiKey } from '../runtime/pi/index.js';
-
-/**
- * Tier-2: discover actions and verify scope + maxTasks.
- */
-export function checkDiscovery(project: ResolvedProjectConfig): Issue[] {
-  const issues: Issue[] = [];
-  let discovered: ReturnType<typeof discoverActionsOnly>;
-
-  try {
-    discovered = discoverActionsOnly(project);
-  } catch (err) {
-    const isPrompt = project.target.surface === 'prompt';
-    const discoveryHint = isPrompt
-      ? `Check the skill file at target.skill — ensure it has parseable capability headings`
-      : `Check target.discovery.sources and your manifest file`;
-    issues.push({
-      code: 'discovery-failed', severity: 'error', field: 'target.discovery',
-      message: `Discovery threw an error: ${err instanceof Error ? err.message : String(err)}`,
-      hint: discoveryHint,
-      fixable: false,
-    });
-    return issues;
-  }
-
-  const { inScope } = resolveScope(discovered, project.target.scope);
-
-  if (inScope.length === 0) {
-    let surfaceHint: string;
-    if (project.target.surface === 'cli') {
-      surfaceHint = `Add target.cli.commands pointing at a cli-commands.json manifest, or fix target.discovery.sources`;
-    } else if (project.target.surface === 'mcp') {
-      surfaceHint = `Add target.mcp.tools pointing at a tools.json manifest, or fix target.discovery.sources`;
-    } else if (project.target.surface === 'prompt') {
-      surfaceHint = `Ensure the skill file (target.skill) contains parseable capability headings`;
-    } else {
-      surfaceHint = `Fix target.discovery.sources to point at your SDK entry file`;
-    }
-    issues.push({
-      code: 'zero-actions-discovered', severity: 'error', field: 'target.discovery',
-      message: `Discovery found 0 in-scope actions`,
-      hint: surfaceHint,
-      fixable: false,
-    });
-  } else {
-    const maxTasks = project.benchmark.taskGeneration?.maxTasks ?? 0;
-    if (project.target.surface !== 'prompt' && project.benchmark.taskGeneration?.enabled && maxTasks < inScope.length) {
-      issues.push({
-        code: 'max-tasks-too-low', severity: 'error', field: 'benchmark.taskGeneration.maxTasks',
-        message: `maxTasks (${maxTasks}) is less than the number of in-scope actions (${inScope.length})`,
-        hint: `Raise benchmark.taskGeneration.maxTasks to at least ${inScope.length}`,
-        fixable: false,
-      });
-    }
-    issues.push({
-      code: 'discovery-ok', severity: 'info', field: 'target.discovery',
-      message: `${inScope.length} action(s) discovered (${project.target.surface} surface)`,
-      fixable: false,
-    });
-  }
-
-  return issues;
-}
-
-/**
- * Tier-3: ping each model with a 1-token request.
- */
-export async function checkModelReachability(project: ResolvedProjectConfig): Promise<Issue[]> {
-  const issues: Issue[] = [];
-
-  // Only PI format uses OpenRouter; skip reachability for other formats
-  if (project.benchmark.format && project.benchmark.format !== 'pi') {
-    issues.push({
-      code: 'reachability-skipped', severity: 'info', field: 'benchmark.format',
-      message: `Skipping reachability check — model probing is only implemented for openrouter/* model IDs via the OpenRouter API`,
-      fixable: false,
-    });
-    return issues;
-  }
-
-  const openrouterEntries = project.benchmark.models
-    .map((model, i) => ({ model, i }))
-    .filter(({ model }) => model.id.startsWith('openrouter/'));
-  const skippedCount = project.benchmark.models.length - openrouterEntries.length;
-
-  if (skippedCount > 0) {
-    issues.push({
-      code: 'reachability-skipped', severity: 'info', field: 'benchmark.models',
-      message: `Skipping reachability for ${skippedCount} non-OpenRouter model(s) (only OpenRouter models can be probed)`,
-      fixable: false,
-    });
-  }
-
-  if (openrouterEntries.length === 0) {
-    return issues;
-  }
-
-  let apiKey: string;
-  try {
-    apiKey = requireConfiguredApiKey({
-      provider: 'openrouter',
-      authMode: project.benchmark.authMode,
-      apiKeyEnv: project.benchmark.apiKeyEnv,
-    });
-  } catch {
-    return issues; // already reported by checkConfig
-  }
-
-  for (const { model, i } of openrouterEntries) {
-    try {
-      const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
-        method: 'POST',
-        headers: {
-          Authorization: `Bearer ${apiKey}`,
-          'Content-Type': 'application/json',
-        },
-        body: JSON.stringify({
-          model: model.id.replace(/^openrouter\//, ''),
-          messages: [{ role: 'user', content: 'hi' }],
-          max_tokens: 1,
-        }),
-      });
-      if (!res.ok) {
-        const body = await res.text();
-        issues.push({
-          code: 'model-unreachable', severity: 'error', field: `benchmark.models[${i}].id`,
-          message: `Model "${model.id}" returned HTTP ${res.status}: ${body.slice(0, 120)}`,
-          hint: `Check the model ID at https://openrouter.ai/models and verify your API key`,
-          fixable: false,
-        });
-      }
-    } catch (err) {
-      issues.push({
-        code: 'model-unreachable', severity: 'error', field: `benchmark.models[${i}].id`,
-        message: `Model "${model.id}" unreachable: ${err instanceof Error ? err.message : String(err)}`,
-        hint: `Check your network and API key`,
-        fixable: false,
-      });
-    }
-  }
-
-  return issues;
-}
diff --git a/src/doctor/format.ts b/src/doctor/format.ts
deleted file mode 100644
index 08e6831..0000000
--- a/src/doctor/format.ts
+++ /dev/null
@@ -1,50 +0,0 @@
-import type { Issue } from '../project/validate.js';
-
-const R = '\x1b[0m';
-const RED = '\x1b[31m';
-const YEL = '\x1b[33m';
-const GRN = '\x1b[32m';
-const DIM = '\x1b[2m';
-const BOLD = '\x1b[1m';
-
-function icon(severity: Issue['severity']): string {
-  if (severity === 'error') return `${RED}✗${R}`;
-  if (severity === 'warning') return `${YEL}⚠${R}`;
-  return `${GRN}✓${R}`;
-}
-
-export function formatIssues(issues: Issue[], configPath: string): string {
-  const lines: string[] = [];
-  lines.push(`\n${BOLD}skill-optimizer doctor${R} — ${configPath}\n`);
-
-  for (const issue of issues.filter((i) => i.severity !== 'info' || i.code === 'discovery-ok')) {
-    const label = issue.code === 'discovery-ok' ? 'discovery' : issue.field;
-    const pad = ' '.repeat(Math.max(1, 32 - label.length));
-    lines.push(`  ${icon(issue.severity)} ${label}${pad}${issue.message}`);
-    if (issue.hint) lines.push(`      ${DIM}hint: ${issue.hint}${R}`);
-    if (issue.fixable) lines.push(`      ${DIM}(auto-fixable with --fix)${R}`);
-  }
-
-  const errors = issues.filter((i) => i.severity === 'error');
-  const warnings = issues.filter((i) => i.severity === 'warning');
-  const fixable = issues.filter((i) => i.fixable).length;
-
-  lines.push('');
-  if (errors.length === 0 && warnings.length === 0) {
-    lines.push(`${GRN}No issues found — config is valid${R}`);
-  } else {
-    let summary = `${errors.length} error(s), ${warnings.length} warning(s)`;
-    if (fixable > 0) summary += ` — run with ${BOLD}--fix${R} to apply ${fixable} auto-fixable change(s)`;
-    lines.push(summary);
-  }
-
-  return lines.join('\n');
-}
-
-export function formatFixResult(appliedCount: number, remainingIssues: Issue[], configPath: string): string {
-  return [
-    `\n  Applied ${appliedCount} fix(es) to ${configPath}`,
-    `  Re-running checks...\n`,
-    formatIssues(remainingIssues, configPath),
-  ].join('\n');
-}
diff --git a/src/doctor/index.ts b/src/doctor/index.ts
deleted file mode 100644
index a2754be..0000000
--- a/src/doctor/index.ts
+++ /dev/null
@@ -1,72 +0,0 @@
-import { readFileSync, writeFileSync } from 'node:fs';
-import { resolve, dirname } from 'node:path';
-
-import { checkConfig } from '../project/validate.js';
-import { applyFixes } from '../project/fix.js';
-import { loadProjectConfig } from '../project/load.js';
-import { checkDiscovery, checkModelReachability } from './checks.js';
-import { formatIssues, formatFixResult } from './format.js';
-import type { Issue } from '../project/validate.js';
-
-interface DoctorOptions {
-  staticOnly?: boolean;
-  checkModels?: boolean;
-  fix?: boolean;
-}
-
-export async function runDoctor(configPath: string, opts: DoctorOptions = {}): Promise<number> {
-  const resolvedPath = resolve(configPath);
-
-  let rawJson: Record<string, unknown>;
-  try {
-    rawJson = JSON.parse(readFileSync(resolvedPath, 'utf-8')) as Record<string, unknown>;
-  } catch (err) {
-    console.error(`ERROR: Cannot read config: ${resolvedPath}`);
-    console.error(err instanceof Error ? err.message : String(err));
-    return 2;
-  }
-
-  // Tier 1: structural + path + format checks
-  let issues: Issue[] = await checkConfig(rawJson, resolvedPath);
-
-  if (opts.fix) {
-    const configDir = dirname(resolvedPath);
-    const initialFixableCount = issues.filter((i) => i.fixable).length;
-    if (initialFixableCount > 0) {
-      let safety = 3;
-      while (issues.some((i) => i.fixable) && safety-- > 0) {
-        rawJson = applyFixes(rawJson, issues, configDir);
-        issues = await checkConfig(rawJson, resolvedPath);
-      }
-      writeFileSync(resolvedPath, JSON.stringify(rawJson, null, 2) + '\n', 'utf-8');
-      console.log(formatFixResult(initialFixableCount, issues, resolvedPath));
-    } else {
-      console.log('\n  No auto-fixable issues found.');
-    }
-    return issues.some((i) => i.severity === 'error') ? 1 : 0;
-  }
-
-  // Tier 2: discovery (default, skipped with --static)
-  if (!opts.staticOnly && !issues.some((i) => i.severity === 'error')) {
-    try {
-      const project = await loadProjectConfig(resolvedPath);
-      issues = [...issues, ...checkDiscovery(project)];
-    } catch (e) {
-      // loadProjectConfig threw; static errors already cover this
-      if (process.env['DEBUG']) console.error('[debug] Tier-2 skipped:', e);
-    }
-  }
-
-  // Tier 3: model reachability (--check-models)
-  if (opts.checkModels) {
-    try {
-      const project = await loadProjectConfig(resolvedPath);
-      issues = [...issues, ...(await checkModelReachability(project))];
-    } catch (e) {
-      if (process.env['DEBUG']) console.error('[debug] Tier-3 skipped:', e);
-    }
-  }
-
-  console.log(formatIssues(issues, resolvedPath));
-  return issues.some((i) => i.severity === 'error') ? 1 : 0;
-}
diff --git a/src/errors.ts b/src/errors.ts
deleted file mode 100644
index 0fa8e8e..0000000
--- a/src/errors.ts
+++ /dev/null
@@ -1,223 +0,0 @@
-export interface ErrorDef {
-  code: string;
-  message: string;
-  fix: string[];
-}
-
-export class SkillOptimizerError extends Error {
-  constructor(public readonly def: ErrorDef, public readonly detail?: string) {
-    super(detail ? `${def.message}: ${detail}` : def.message);
-    this.name = def.code;
-  }
-}
-
-export function printError(err: SkillOptimizerError): void {
-  console.error(`\nError [${err.def.code}]: ${err.message}`);
-  if (err.def.fix.length > 0) {
-    console.error('How to fix:');
-    for (const step of err.def.fix) {
-      console.error(`  • ${step}`);
-    }
-  }
-}
-
-export const ERRORS = {
-  // ── Config validation ──────────────────────────────────────────────────────
-  E_INVALID_SURFACE: {
-    code: 'E_INVALID_SURFACE',
-    message: 'Invalid surface value',
-    fix: [
-      'Set target.surface to one of: sdk, cli, mcp, prompt',
-      'sdk = TypeScript/Python/Rust library, cli = command-line tool, mcp = MCP server, prompt = prompt template / skill document',
-    ],
-  },
-  E_MODELS_EMPTY: {
-    code: 'E_MODELS_EMPTY',
-    message: 'benchmark.models is empty or missing',
-    fix: [
-      'Add at least one model to benchmark.models, e.g.:',
-      '  { "id": "openrouter/anthropic/claude-sonnet-4.6", "name": "Claude Sonnet", "tier": "flagship" }',
-    ],
-  },
-  E_MODEL_ID_FORMAT: {
-    code: 'E_MODEL_ID_FORMAT',
-    message: 'Model ID is missing a provider prefix',
-    fix: [
-      'Prefix all model IDs with a supported provider prefix:',
-      '  openrouter/<provider>/<model>  — routed via OpenRouter (e.g. openrouter/anthropic/claude-sonnet-4.6)',
-      '  anthropic/<model>              — direct Anthropic API (e.g. anthropic/claude-sonnet-4-6)',
-      '  openai/<model>                 — direct OpenAI API (e.g. openai/gpt-4.1)',
-      'Browse OpenRouter models at https://openrouter.ai/models',
-    ],
-  },
-  E_VERDICT_OUT_OF_RANGE: {
-    code: 'E_VERDICT_OUT_OF_RANGE',
-    message: 'Verdict threshold is out of range',
-    fix: [
-      'Set benchmark.verdict.perModelFloor and targetWeightedAverage to values between 0.0 and 1.0',
-      'Typical values: perModelFloor=0.6, targetWeightedAverage=0.7',
-    ],
-  },
-  E_MAX_ITERATIONS_ZERO: {
-    code: 'E_MAX_ITERATIONS_ZERO',
-    message: 'optimize.maxIterations must be a positive integer',
-    fix: [
-      'Set optimize.maxIterations to a positive integer, e.g. 5',
-    ],
-  },
-  E_INVALID_FORMAT: {
-    code: 'E_INVALID_FORMAT',
-    message: 'Invalid benchmark.format value',
-    fix: [
-      'Set benchmark.format to one of: pi, openai, anthropic',
-    ],
-  },
-  // ── Path resolution ────────────────────────────────────────────────────────
-  E_REPO_NOT_FOUND: {
-    code: 'E_REPO_NOT_FOUND',
-    message: 'target.repoPath does not exist or is not a directory',
-    fix: [
-      'Fix target.repoPath in your skill-optimizer.json to point at an existing directory',
-      'Paths in the config are relative to the config file location',
-    ],
-  },
-  E_MISSING_SKILL: {
-    code: 'E_MISSING_SKILL',
-    message: 'target.skill file not found',
-    fix: [
-      'Create a SKILL.md at the path specified in target.skill',
-      'Or update target.skill in your config to point at an existing file',
-    ],
-  },
-  E_SOURCES_NOT_FOUND: {
-    code: 'E_SOURCES_NOT_FOUND',
-    message: 'One or more target.discovery.sources files do not exist',
-    fix: [
-      'Check that all paths in target.discovery.sources exist in your repo',
-      'Paths are relative to target.repoPath',
-      'For CLI: point at your main entry file (e.g. src/cli.ts)',
-      'For MCP: point at your server entry file (e.g. src/server.ts)',
-    ],
-  },
-  E_CLI_MANIFEST_NOT_FOUND: {
-    code: 'E_CLI_MANIFEST_NOT_FOUND',
-    message: 'target.cli.commands manifest file not found',
-    fix: [
-      'Run: skill-optimizer import-commands --from <entry-file> to auto-extract',
-      'Or create the file manually and populate it with your CLI commands',
-      'Format: Array of { command, description, options[] }',
-    ],
-  },
-  E_MCP_MANIFEST_NOT_FOUND: {
-    code: 'E_MCP_MANIFEST_NOT_FOUND',
-    message: 'target.mcp.tools manifest file not found',
-    fix: [
-      'Create the tools.json file at the path specified in target.mcp.tools',
-      'Format: Array of OpenAI function tool definitions { type: "function", function: { name, description, parameters } }',
-    ],
-  },
-  E_ALLOWED_PATHS_ESCAPE: {
-    code: 'E_ALLOWED_PATHS_ESCAPE',
-    message: 'optimize.allowedPaths contains a path outside target.repoPath',
-    fix: [
-      'All paths in optimize.allowedPaths must be inside target.repoPath',
-      'This is a safety boundary — the optimizer will only edit files within this list',
-    ],
-  },
-  E_OUTPUT_DIR_NOT_WRITABLE: {
-    code: 'E_OUTPUT_DIR_NOT_WRITABLE',
-    message: 'benchmark.output.dir is not writable',
-    fix: [
-      'Check directory permissions for the path set in benchmark.output.dir',
-      'Or change benchmark.output.dir to a path you have write access to',
-    ],
-  },
-  // ── Environment ────────────────────────────────────────────────────────────
-  E_MISSING_API_KEY: {
-    code: 'E_MISSING_API_KEY',
-    message: 'API key environment variable is not set',
-    fix: [
-      'Export your OpenRouter API key before running: export OPENROUTER_API_KEY=sk-or-...',
-      'Or add it to a .env file alongside your skill-optimizer.json',
-      'Get a key at https://openrouter.ai/keys',
-    ],
-  },
-  // ── Discovery ─────────────────────────────────────────────────────────────
-  E_DISCOVERY_EMPTY: {
-    code: 'E_DISCOVERY_EMPTY',
-    message: 'Discovery found zero callable actions',
-    fix: [
-      'Check that target.discovery.sources points at the right entry file',
-      'For SDK: should be your public API entry (e.g. src/index.ts)',
-      'For CLI: should be the file that registers all subcommands',
-      'For MCP: should be the file that registers all tools',
-      'Add a fallback manifest: target.discovery.fallbackManifest or target.cli.commands / target.mcp.tools',
-    ],
-  },
-  // ── Task generation ────────────────────────────────────────────────────────
-  E_MAXTASKS_TOO_LOW: {
-    code: 'E_MAXTASKS_TOO_LOW',
-    message: 'benchmark.taskGeneration.maxTasks is less than the in-scope action count',
-    fix: [
-      'Raise benchmark.taskGeneration.maxTasks to at least the number of in-scope actions',
-      'Run: skill-optimizer --dry-run --config ./skill-optimizer.json to see the action count',
-      'Or narrow the scope with target.scope.exclude to reduce the action count',
-    ],
-  },
-  E_COVERAGE_EXHAUSTED: {
-    code: 'E_COVERAGE_EXHAUSTED',
-    message: 'Task generation could not cover all in-scope actions after 2 retry passes',
-    fix: [
-      'Add guidance for the uncovered actions to your SKILL.md',
-      'The error message above names the specific uncovered actions',
-      'Or exclude them with target.scope.exclude if they should not be benchmarked',
-    ],
-  },
-  // ── Optimizer runtime ──────────────────────────────────────────────────────
-  E_DIRTY_GIT: {
-    code: 'E_DIRTY_GIT',
-    message: 'Target repo has uncommitted changes',
-    fix: [
-      'Commit or stash changes in target.repoPath before running the optimizer',
-      'Run: git -C <repoPath> stash',
-      'Or: git -C <repoPath> add -A && git -C <repoPath> commit -m "wip: before optimizer run"',
-    ],
-  },
-  E_GIT_CHECKPOINT_FAILED: {
-    code: 'E_GIT_CHECKPOINT_FAILED',
-    message: 'Git checkpoint creation failed',
-    fix: [
-      'Check disk space and git permissions in target.repoPath',
-      'Make sure the directory is a valid git repository',
-      'Run: git -C <repoPath> status to verify git state',
-    ],
-  },
-  E_VALIDATION_FAILED: {
-    code: 'E_VALIDATION_FAILED',
-    message: 'Configured validation command exited non-zero',
-    fix: [
-      'Fix the issue flagged by the validation command before retrying',
-      'The failing command is listed in optimize.validation in your config',
-      'Run the validation command manually to see the full error output',
-    ],
-  },
-  // ── Init ──────────────────────────────────────────────────────────────────
-  E_INIT_AUTO_LOW_CONFIDENCE: {
-    code: 'E_INIT_AUTO_LOW_CONFIDENCE',
-    message: 'init --auto --yes requires high confidence detection',
-    fix: [
-      'Run init interactively to review and confirm detection: skill-optimizer init --auto',
-      'Or supply a pre-filled answers file: skill-optimizer init --answers answers.json',
-      'See README for the answers.json format',
-    ],
-  },
-  // ── Catch-all ─────────────────────────────────────────────────────────────
-  E_UNEXPECTED: {
-    code: 'E_UNEXPECTED',
-    message: 'An unexpected error occurred',
-    fix: [
-      'Check the full error message and stack trace above for details',
-      'File an issue at https://github.com/fastxyz/skill-optimizer/issues with the full output',
-    ],
-  },
-} as const satisfies Record<string, ErrorDef>;
diff --git a/src/import/detect.ts b/src/import/detect.ts
deleted file mode 100644
index 4e37c53..0000000
--- a/src/import/detect.ts
+++ /dev/null
@@ -1,58 +0,0 @@
-import { existsSync, readFileSync } from 'node:fs';
-import { extname, resolve, basename } from 'node:path';
-import type { DetectionResult } from './types.js';
-
-function inferBinaryHint(fromPath: string, cwd: string): string | undefined {
-  const pkgPath = resolve(cwd, 'package.json');
-  if (existsSync(pkgPath)) {
-    try {
-      const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8')) as Record<string, unknown>;
-      const bin = pkg['bin'];
-      if (typeof bin === 'string') return bin;
-      if (typeof bin === 'object' && bin !== null) {
-        const keys = Object.keys(bin as object);
-        if (keys.length > 0) return keys[0];
-      }
-    } catch { /* ignore */ }
-  }
-
-  const pyprojPath = resolve(cwd, 'pyproject.toml');
-  if (existsSync(pyprojPath)) {
-    const content = readFileSync(pyprojPath, 'utf-8');
-    const m = content.match(/\[project\.scripts\][\s\S]*?\n(\S+)\s*=/);
-    if (m) return m[1];
-  }
-
-  return basename(fromPath).replace(/\.[^.]+$/, '') || undefined;
-}
-
-export function detectFramework(fromPath: string, cwd: string): DetectionResult {
-  const abs = resolve(cwd, fromPath);
-  const ext = extname(abs).toLowerCase();
-  const binaryHint = inferBinaryHint(fromPath, cwd);
-
-  if (ext === '.py') {
-    if (!existsSync(abs)) return { kind: 'unknown', binaryHint };
-    const content = readFileSync(abs, 'utf-8');
-    if (/import typer|from typer/.test(content)) return { kind: 'typer', binaryHint };
-    if (/import click|from click/.test(content)) return { kind: 'click', binaryHint };
-    if (/import argparse|from argparse/.test(content)) return { kind: 'argparse', binaryHint };
-    return { kind: 'unknown', binaryHint };
-  }
-
-  if (ext === '.rs' || basename(abs) === 'Cargo.toml') {
-    return { kind: 'clap', binaryHint };
-  }
-
-  if (['.ts', '.tsx', '.js', '.mjs', '.cjs'].includes(ext)) {
-    if (!existsSync(abs)) return { kind: 'unknown', binaryHint };
-    const content = readFileSync(abs, 'utf-8');
-    if (/from ['"]commander['"]|require\(['"]commander['"]\)/.test(content)) return { kind: 'commander', binaryHint };
-    if (/from ['"]yargs['"]|require\(['"]yargs['"]\)/.test(content)) return { kind: 'yargs', binaryHint };
-    if (/@optique\/core/.test(content)) return { kind: 'optique', binaryHint };
-    return { kind: 'unknown', binaryHint };
-  }
-
-  // No recognized extension — treat fromPath as a binary name
-  return { kind: 'unknown', binaryHint: fromPath };
-}
diff --git a/src/import/extractors/help-scraper.ts b/src/import/extractors/help-scraper.ts
deleted file mode 100644
index 5ebb4a8..0000000
--- a/src/import/extractors/help-scraper.ts
+++ /dev/null
@@ -1,142 +0,0 @@
-import { execFile } from 'node:child_process';
-import { promisify } from 'node:util';
-import type { CliCommandDefinition, CliCommandOptionDefinition } from '../types.js';
-
-const execFileAsync = promisify(execFile);
-
-/**
- * Pure parser: given the text output of `<binary> [prefix...] --help`,
- * returns the subcommands and their options found in that output.
- */
-export function parseHelpOutput(text: string, prefix: string[]): CliCommandDefinition[] {
-  const lines = text.split('\n');
-
-  let inCommands = false;
-  let inOptions = false;
-
-  const commands: CliCommandDefinition[] = [];
-  const optionsBuf: CliCommandOptionDefinition[] = [];
-
-  const COMMANDS_HEADER = /^(Commands|Subcommands):\s*$/i;
-  const OPTIONS_HEADER = /^(Options|Flags|Global Options):\s*$/i;
-  // Match a subcommand line: leading whitespace, non-dash first char, name token, optional description
-  const CMD_LINE = /^\s+(\S+)\s*(.*)/;
-  // Match an option line: "-x[, --long-form]" or "--long-form"
-  const OPT_LINE = /^\s+(-[a-zA-Z](?:,\s*--\S+)?|--\S+)\s*(.*)/;
-
-  for (const line of lines) {
-    const trimmed = line.trim();
-
-    // Blank line resets section state
-    if (trimmed === '') {
-      inCommands = false;
-      inOptions = false;
-      continue;
-    }
-
-    // Detect section headers
-    if (COMMANDS_HEADER.test(trimmed)) {
-      inCommands = true;
-      inOptions = false;
-      continue;
-    }
-    if (OPTIONS_HEADER.test(trimmed)) {
-      inOptions = true;
-      inCommands = false;
-      continue;
-    }
-
-    if (inCommands) {
-      const m = CMD_LINE.exec(line);
-      if (m) {
-        const rawName = m[1]!;
-        // Skip if it looks like a flag line
-        if (rawName.startsWith('-')) continue;
-        // Strip positional hints like <name> or [name] from the token
-        const name = rawName.replace(/[<[].*/g, '').trim();
-        if (!name) continue;
-        const description = m[2]!.trim();
-
-        const commandStr = prefix.length > 0 ? [...prefix, name].join(' ') : name;
-        commands.push({ command: commandStr, description: description || undefined });
-      }
-    } else if (inOptions) {
-      const m = OPT_LINE.exec(line);
-      if (m) {
-        const flagStr = m[1]!.trim();
-        const desc = m[2]!.trim();
-        // takesValue if the flag string contains <word> or [word]
-        const takesValue = /<\w+>|\[\w+\]/.test(flagStr);
-        optionsBuf.push({
-          name: flagStr,
-          description: desc || undefined,
-          takesValue,
-        });
-      }
-    }
-  }
-
-  // Options on a subcommand page belong to the parent command (prefix.join(' ')).
-  // Return a synthetic parent entry so scrapeHelp can merge options onto it.
-  if (prefix.length > 0 && optionsBuf.length > 0) {
-    const parentName = prefix.join(' ');
-    commands.push({ command: parentName, options: optionsBuf });
-  }
-
-  return commands;
-}
-
-/**
- * BFS scraper: runs `binary [prefix...] --help` for each discovered subcommand
- * up to `opts.depth` levels deep.
- */
-export async function scrapeHelp(
-  binary: string,
-  opts: { depth: number; cwd?: string },
-): Promise<CliCommandDefinition[]> {
-  const { depth, cwd } = opts;
-  const all: CliCommandDefinition[] = [];
-  const seen = new Set<string>();
-
-  // BFS queue of prefix arrays
-  const queue: string[][] = [[]];
-
-  while (queue.length > 0) {
-    const prefix = queue.shift()!;
-    const args = [...prefix, '--help'];
-
-    let stdout = '';
-    try {
-      const result = await execFileAsync(binary, args, {
-        cwd,
-        encoding: 'utf-8',
-        timeout: 5000,
-      });
-      stdout = result.stdout;
-    } catch (err: unknown) {
-      // Some CLIs exit non-zero for --help or print to stderr
-      const e = err as { stdout?: string; stderr?: string };
-      stdout = e.stdout ?? e.stderr ?? '';
-    }
-
-    const discovered = parseHelpOutput(stdout, prefix);
-
-    for (const cmd of discovered) {
-      const existing = all.find((c) => c.command === cmd.command);
-      if (existing) {
-        // Merge options onto an already-discovered parent command
-        if (cmd.options && cmd.options.length > 0) {
-          existing.options = cmd.options;
-        }
-      } else {
-        seen.add(cmd.command);
-        all.push(cmd);
-        if (prefix.length < depth) {
-          queue.push(cmd.command.split(' '));
-        }
-      }
-    }
-  }
-
-  return all;
-}
diff --git a/src/import/extractors/py-argparse.ts b/src/import/extractors/py-argparse.ts
deleted file mode 100644
index d2f6e71..0000000
--- a/src/import/extractors/py-argparse.ts
+++ /dev/null
@@ -1,90 +0,0 @@
-import { readFileSync } from 'node:fs';
-import { getSdkParser } from '../../benchmark/extractors/sdk/parser.js';
-import type { CliCommandDefinition, CliCommandOptionDefinition } from '../types.js';
-
-/** Extract the content of a paren-balanced block starting at `start` (the opening paren). */
-function extractParenBlock(source: string, start: number): string {
-  let depth = 0;
-  let i = start;
-  while (i < source.length) {
-    if (source[i] === '(') depth++;
-    else if (source[i] === ')') {
-      depth--;
-      if (depth === 0) return source.slice(start, i + 1);
-    }
-    i++;
-  }
-  return source.slice(start);
-}
-
-export async function extractArgparse(filePath: string): Promise<CliCommandDefinition[]> {
-  const source = readFileSync(filePath, 'utf-8');
-  // Use tree-sitter just to validate it parses (will throw on syntax error)
-  const parser = await getSdkParser('python');
-  parser.parse(source); // validate only
-
-  const lines = source.split('\n');
-
-  // Find subparsers variable
-  let subparsersVar: string | null = null;
-  for (const line of lines) {
-    const m = line.match(/(\w+)\s*=\s*\w+\.add_subparsers\s*\(/);
-    if (m) { subparsersVar = m[1]!; break; }
-  }
-  if (!subparsersVar) return [];
-
-  // Find parser variables and commands
-  const commands: CliCommandDefinition[] = [];
-  const parserVarToCmd = new Map<string, CliCommandDefinition>();
-  const escapedSub = subparsersVar.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-
-  // Scan full source for add_parser calls (handles multi-line calls)
-  const addParserRe = new RegExp(`(\\w+)\\s*=\\s*${escapedSub}\\.add_parser\\s*\\(`, 'g');
-  let m: RegExpExecArray | null;
-  while ((m = addParserRe.exec(source)) !== null) {
-    const parserVar = m[1]!;
-    const blockStart = m.index + m[0].length - 1; // position of '('
-    const block = extractParenBlock(source, blockStart);
-    const nameMatch = block.match(/^\(\s*['"]([^'"]+)['"]/);
-    if (!nameMatch) continue;
-    const commandName = nameMatch[1]!;
-    const helpMatch = block.match(/help\s*=\s*['"]([^'"]+)['"]/);
-    const def: CliCommandDefinition = { command: commandName, description: helpMatch?.[1] };
-    commands.push(def);
-    parserVarToCmd.set(parserVar, def);
-  }
-
-  // Find add_argument calls for each parser variable — use paren-block extraction
-  // so multi-line calls and arbitrary kwarg ordering are handled correctly.
-  for (const [parserVar, def] of parserVarToCmd) {
-    const escapedVar = parserVar.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-    const addArgRe = new RegExp(`${escapedVar}\\.add_argument\\s*\\(`, 'g');
-    let argMatch: RegExpExecArray | null;
-    while ((argMatch = addArgRe.exec(source)) !== null) {
-      const blockStart = argMatch.index + argMatch[0].length - 1;
-      const block = extractParenBlock(source, blockStart);
-
-      // First string arg — must start with '-' to be a flag (not a positional)
-      const flagMatch = block.match(/^\(\s*['"](-[^'"]+)['"]/);
-      if (!flagMatch) continue;
-      const flagName = flagMatch[1]!;
-
-      // help= anywhere in the block
-      const helpMatch = block.match(/help\s*=\s*['"]([^'"]+)['"]/);
-      // action='store_true' anywhere in the block (handles multi-line)
-      const isStoreTrue = /action\s*=\s*['"]store_true['"]/.test(block);
-
-      const opt: CliCommandOptionDefinition = {
-        name: flagName,
-        description: helpMatch?.[1],
-        takesValue: !isStoreTrue,
-      };
-      if (!def.options) def.options = [];
-      if (!def.options.find(o => o.name === flagName)) {
-        def.options.push(opt);
-      }
-    }
-  }
-
-  return commands;
-}
diff --git a/src/import/extractors/py-click.ts b/src/import/extractors/py-click.ts
deleted file mode 100644
index 465e034..0000000
--- a/src/import/extractors/py-click.ts
+++ /dev/null
@@ -1,110 +0,0 @@
-import { readFileSync } from 'node:fs';
-import type Parser from 'web-tree-sitter';
-import { getSdkParser } from '../../benchmark/extractors/sdk/parser.js';
-import type { CliCommandDefinition, CliCommandOptionDefinition } from '../types.js';
-
-function stripDocstring(raw: string): string {
-  const m = raw.match(/^['"]{1,3}([\s\S]*?)['"]{1,3}$/);
-  if (m) return m[1]!.trim();
-  return raw.trim();
-}
-
-function getDocstring(funcDef: Parser.SyntaxNode): string | undefined {
-  // Find the 'block' child of the function definition
-  let block: Parser.SyntaxNode | null = null;
-  for (const child of funcDef.children) {
-    if (child.type === 'block') {
-      block = child;
-      break;
-    }
-  }
-  if (!block) return undefined;
-
-  // First child of block that is expression_statement containing a string
-  for (const child of block.children) {
-    if (child.type === 'expression_statement') {
-      for (const inner of child.children) {
-        if (inner.type === 'string') {
-          return stripDocstring(inner.text);
-        }
-      }
-    }
-  }
-  return undefined;
-}
-
-function walkTree(node: Parser.SyntaxNode, source: string): CliCommandDefinition[] {
-  const results: CliCommandDefinition[] = [];
-
-  if (node.type === 'decorated_definition') {
-    const decorators: Parser.SyntaxNode[] = [];
-    let funcDef: Parser.SyntaxNode | null = null;
-
-    for (const child of node.children) {
-      if (child.type === 'decorator') {
-        decorators.push(child);
-      } else if (child.type === 'function_definition') {
-        funcDef = child;
-      }
-    }
-
-    if (funcDef) {
-      // Check if any decorator is a command decorator (but not group)
-      let isCommand = false;
-      const options: CliCommandOptionDefinition[] = [];
-
-      for (const dec of decorators) {
-        const decoratorText = source.slice(dec.startIndex, dec.endIndex);
-        const isCommandDec =
-          /\.command\s*\(\s*\)/.test(decoratorText) ||
-          /^@click\.command\s*\(/.test(decoratorText);
-        const isOptionDec = /\.option\s*\(/.test(decoratorText);
-
-        if (isCommandDec) {
-          isCommand = true;
-        } else if (isOptionDec) {
-          const flagMatch = decoratorText.match(/\.option\s*\(\s*['"]([^'"]+)['"]/);
-          const helpMatch = decoratorText.match(/help\s*=\s*['"]([^'"]+)['"]/);
-          const isFlag = /is_flag\s*=\s*True/.test(decoratorText);
-
-          if (flagMatch) {
-            options.push({
-              name: flagMatch[1]!,
-              description: helpMatch ? helpMatch[1] : undefined,
-              takesValue: !isFlag,
-            });
-          }
-        }
-      }
-
-      if (isCommand) {
-        const nameNode = funcDef.childForFieldName('name');
-        const rawName = nameNode ? nameNode.text : '';
-        const commandName = rawName.replace(/_/g, '-');
-        const description = getDocstring(funcDef);
-
-        const def: CliCommandDefinition = { command: commandName };
-        if (description !== undefined) def.description = description;
-        if (options.length > 0) def.options = options;
-
-        results.push(def);
-      }
-    }
-  }
-
-  for (const child of node.children) {
-    results.push(...walkTree(child, source));
-  }
-
-  return results;
-}
-
-/**
- * Extract click command definitions from a Python source file using tree-sitter.
- */
-export async function extractClick(filePath: string): Promise<CliCommandDefinition[]> {
-  const source = readFileSync(filePath, 'utf-8');
-  const parser = await getSdkParser('python');
-  const tree = parser.parse(source);
-  return walkTree(tree.rootNode, source);
-}
diff --git a/src/import/extractors/rs-clap.ts b/src/import/extractors/rs-clap.ts
deleted file mode 100644
index 4390afe..0000000
--- a/src/import/extractors/rs-clap.ts
+++ /dev/null
@@ -1,76 +0,0 @@
-import { readFileSync } from 'node:fs';
-import { getSdkParser } from '../../benchmark/extractors/sdk/parser.js';
-import type { CliCommandDefinition, CliCommandOptionDefinition } from '../types.js';
-
-function extractParenBlock(source: string, start: number): string {
-  let depth = 0;
-  let i = start;
-  while (i < source.length) {
-    if (source[i] === '(') depth++;
-    else if (source[i] === ')') {
-      depth--;
-      if (depth === 0) return source.slice(start, i + 1);
-    }
-    i++;
-  }
-  return source.slice(start);
-}
-
-/**
- * Extract clap command definitions from a Rust source file using tree-sitter for validation,
- * then text-based analysis for extraction.
- */
-export async function extractClap(filePath: string): Promise<CliCommandDefinition[]> {
-  const source = readFileSync(filePath, 'utf-8');
-  const parser = await getSdkParser('rust');
-  parser.parse(source); // validate only
-
-  const commands: CliCommandDefinition[] = [];
-  const subcommandSearchStr = '.subcommand(';
-  let searchStart = 0;
-
-  while (true) {
-    const subIdx = source.indexOf(subcommandSearchStr, searchStart);
-    if (subIdx === -1) break;
-
-    const blockStart = subIdx + subcommandSearchStr.length - 1;
-    const block = extractParenBlock(source, blockStart);
-
-    const nameMatch = block.match(/Command\s*::\s*new\s*\(\s*"([^"]+)"\s*\)/);
-    if (!nameMatch) { searchStart = subIdx + 1; continue; }
-    const commandName = nameMatch[1]!;
-
-    const aboutMatch = block.match(/\.about\s*\(\s*"([^"]+)"\s*\)/);
-    const description = aboutMatch ? aboutMatch[1] : undefined;
-
-    const options: CliCommandOptionDefinition[] = [];
-    const argSearchStr = '.arg(';
-    let argSearchStart = 0;
-    while (true) {
-      const argIdx = block.indexOf(argSearchStr, argSearchStart);
-      if (argIdx === -1) break;
-
-      const argBlockStart = argIdx + argSearchStr.length - 1;
-      const argBlock = extractParenBlock(block, argBlockStart);
-
-      const longMatch = argBlock.match(/\.long\s*\(\s*"([^"]+)"\s*\)/);
-      if (longMatch) {
-        const longFlag = '--' + longMatch[1];
-        const helpMatch = argBlock.match(/\.help\s*\(\s*"([^"]+)"\s*\)/);
-        const isBool = /SetTrue|SetFalse|store_true/.test(argBlock);
-        options.push({
-          name: longFlag,
-          description: helpMatch ? helpMatch[1] : undefined,
-          takesValue: !isBool,
-        });
-      }
-
-      argSearchStart = argIdx + 1;
-    }
-
-    commands.push({ command: commandName, description, options: options.length > 0 ? options : undefined });
-    searchStart = subIdx + 1;
-  }
-
-  return commands;
-}
diff --git a/src/import/extractors/ts-commander.ts b/src/import/extractors/ts-commander.ts
deleted file mode 100644
index 05c7392..0000000
--- a/src/import/extractors/ts-commander.ts
+++ /dev/null
@@ -1,71 +0,0 @@
-import { readFileSync } from 'node:fs';
-
-import ts from 'typescript';
-
-import type { CliCommandDefinition } from '../types.js';
-
-function getScriptKind(filePath: string): ts.ScriptKind {
-  if (filePath.endsWith('.tsx')) return ts.ScriptKind.TSX;
-  if (filePath.endsWith('.js')) return ts.ScriptKind.JS;
-  return ts.ScriptKind.TS;
-}
-
-function walkChainUp(node: ts.Node, def: CliCommandDefinition): void {
-  const propAccess = node.parent;
-  if (!propAccess || !ts.isPropertyAccessExpression(propAccess)) return;
-  const nextCall = propAccess.parent;
-  if (!nextCall || !ts.isCallExpression(nextCall)) return;
-
-  const method = propAccess.name.text;
-  if (method === 'description' && nextCall.arguments[0] !== undefined && ts.isStringLiteral(nextCall.arguments[0])) {
-    def.description = nextCall.arguments[0].text;
-  } else if (method === 'option' && nextCall.arguments[0] !== undefined && ts.isStringLiteral(nextCall.arguments[0])) {
-    const flagStr = nextCall.arguments[0].text;
-    const desc =
-      nextCall.arguments[1] !== undefined && ts.isStringLiteral(nextCall.arguments[1])
-        ? nextCall.arguments[1].text
-        : undefined;
-    const takesValue = /<\w+>|\[\w+\]/.test(flagStr);
-    if (!def.options) def.options = [];
-    def.options.push({ name: flagStr, description: desc, takesValue });
-  }
-  walkChainUp(nextCall, def);
-}
-
-/**
- * Extract commander.js command definitions from a TypeScript/JavaScript source file
- * using the TypeScript Compiler API.
- */
-export function extractCommander(filePath: string): CliCommandDefinition[] {
-  const source = readFileSync(filePath, 'utf-8');
-  const scriptKind = getScriptKind(filePath);
-  const sourceFile = ts.createSourceFile(filePath, source, ts.ScriptTarget.Latest, true, scriptKind);
-
-  const commands: CliCommandDefinition[] = [];
-
-  function visit(node: ts.Node): void {
-    if (ts.isCallExpression(node)) {
-      const expr = node.expression;
-      if (
-        ts.isPropertyAccessExpression(expr) &&
-        expr.name.text === 'command' &&
-        node.arguments[0] !== undefined &&
-        ts.isStringLiteral(node.arguments[0])
-      ) {
-        const rawName = node.arguments[0].text;
-        // Strip positional placeholders: 'delete <id>' → 'delete'
-        const commandName = rawName.split(' ')[0]!;
-
-        const def: CliCommandDefinition = { command: commandName };
-        walkChainUp(node, def);
-        commands.push(def);
-      }
-    }
-
-    ts.forEachChild(node, visit);
-  }
-
-  visit(sourceFile);
-
-  return commands;
-}
diff --git a/src/import/extractors/ts-yargs.ts b/src/import/extractors/ts-yargs.ts
deleted file mode 100644
index 08b7ce1..0000000
--- a/src/import/extractors/ts-yargs.ts
+++ /dev/null
@@ -1,83 +0,0 @@
-import { readFileSync } from 'node:fs';
-import { extname } from 'node:path';
-import ts from 'typescript';
-import type { CliCommandDefinition, CliCommandOptionDefinition } from '../types.js';
-
-function scriptKind(filePath: string): ts.ScriptKind {
-  const ext = extname(filePath).toLowerCase();
-  if (ext === '.tsx') return ts.ScriptKind.TSX;
-  if (ext === '.js' || ext === '.mjs' || ext === '.cjs') return ts.ScriptKind.JS;
-  return ts.ScriptKind.TS;
-}
-
-function extractOptionsFromBuilder(builder: ts.Expression): CliCommandOptionDefinition[] {
-  const options: CliCommandOptionDefinition[] = [];
-
-  function visitForOptions(node: ts.Node): void {
-    if (
-      ts.isCallExpression(node) &&
-      ts.isPropertyAccessExpression(node.expression) &&
-      node.expression.name.text === 'option' &&
-      node.arguments.length >= 1 &&
-      ts.isStringLiteral(node.arguments[0]!)
-    ) {
-      const optName = `--${node.arguments[0].text}`;
-      let desc: string | undefined;
-      let takesValue = false;
-
-      const config = node.arguments[1];
-      if (config && ts.isObjectLiteralExpression(config)) {
-        for (const prop of config.properties) {
-          if (!ts.isPropertyAssignment(prop)) continue;
-          const key = ts.isIdentifier(prop.name) ? prop.name.text : null;
-          if (key === 'describe' && ts.isStringLiteral(prop.initializer)) {
-            desc = prop.initializer.text;
-          }
-          if (key === 'type' && ts.isStringLiteral(prop.initializer)) {
-            const t = prop.initializer.text;
-            takesValue = t === 'string' || t === 'array' || t === 'number';
-          }
-        }
-      }
-
-      options.push({ name: optName, description: desc, takesValue });
-    }
-    ts.forEachChild(node, visitForOptions);
-  }
-
-  visitForOptions(builder);
-  return options;
-}
-
-export function extractYargs(filePath: string): CliCommandDefinition[] {
-  const source = readFileSync(filePath, 'utf-8');
-  const sf = ts.createSourceFile(filePath, source, ts.ScriptTarget.Latest, true, scriptKind(filePath));
-  const commands: CliCommandDefinition[] = [];
-
-  function visit(node: ts.Node): void {
-    if (
-      ts.isCallExpression(node) &&
-      ts.isPropertyAccessExpression(node.expression) &&
-      node.expression.name.text === 'command' &&
-      node.arguments.length >= 2 &&
-      ts.isStringLiteral(node.arguments[0]!) &&
-      ts.isStringLiteral(node.arguments[1]!)
-    ) {
-      const commandName = node.arguments[0].text.split(' ')[0]!;
-      const description = node.arguments[1].text;
-      const def: CliCommandDefinition = { command: commandName, description };
-
-      const builder = node.arguments[2];
-      if (builder) {
-        const opts = extractOptionsFromBuilder(builder);
-        if (opts.length > 0) def.options = opts;
-      }
-
-      commands.push(def);
-    }
-    ts.forEachChild(node, visit);
-  }
-
-  visit(sf);
-  return commands;
-}
diff --git a/src/import/index.ts b/src/import/index.ts
deleted file mode 100644
index 8b41faf..0000000
--- a/src/import/index.ts
+++ /dev/null
@@ -1,84 +0,0 @@
-import { existsSync, mkdirSync } from 'node:fs';
-import { dirname, resolve } from 'node:path';
-import type { ImportOptions, CliCommandDefinition } from './types.js';
-import { detectFramework } from './detect.js';
-import { extractCommander } from './extractors/ts-commander.js';
-import { extractYargs } from './extractors/ts-yargs.js';
-import { extractClick } from './extractors/py-click.js';
-import { extractArgparse } from './extractors/py-argparse.js';
-import { extractClap } from './extractors/rs-clap.js';
-import { scrapeHelp } from './extractors/help-scraper.js';
-import { discoverCliSurfaceFromSources } from '../discovery/cli.js';
-import { promptOverwrite, writeOutput } from './output.js';
-
-export async function importCommands(opts: ImportOptions): Promise<void> {
-  const { from, out, scrape, depth, cwd } = opts;
-  const absFrom = resolve(cwd, from);
-  const absOut = resolve(cwd, out);
-
-  const detection = scrape ? { kind: 'unknown' as const, binaryHint: from } : detectFramework(from, cwd);
-  console.log(`\nnpx skill-optimizer import-commands — detecting framework from ${from}`);
-  if (detection.kind !== 'unknown') {
-    console.log(`  Detected: ${detection.kind}`);
-  }
-
-  let commands: CliCommandDefinition[] = [];
-  if (!scrape) {
-    console.log('  Extracting commands...');
-    try {
-      if (detection.kind === 'commander') {
-        commands = extractCommander(absFrom);
-      } else if (detection.kind === 'yargs') {
-        commands = extractYargs(absFrom);
-      } else if (detection.kind === 'optique') {
-        const snapshot = discoverCliSurfaceFromSources([absFrom]);
-        commands = snapshot.actions.map(a => ({
-          command: a.name,
-          description: a.description,
-          options: a.args.map(arg => ({
-            name: arg.name,
-            description: arg.description,
-            takesValue: arg.type === 'string',
-          })),
-        }));
-      } else if (detection.kind === 'click' || detection.kind === 'typer') {
-        commands = await extractClick(absFrom);
-      } else if (detection.kind === 'argparse') {
-        commands = await extractArgparse(absFrom);
-      } else if (detection.kind === 'clap') {
-        commands = await extractClap(absFrom);
-      }
-    } catch (err) {
-      console.error(`  Warning: static extraction failed: ${err instanceof Error ? err.message : err}`);
-      commands = [];
-    }
-  }
-
-  if (commands.length === 0 && detection.binaryHint) {
-    console.log('  Static extraction yielded 0 commands — falling back to --help scraping...');
-    try {
-      commands = await scrapeHelp(detection.binaryHint, { depth, cwd });
-    } catch (err) {
-      console.error(`  Warning: --help scraping failed: ${err instanceof Error ? err.message : err}`);
-    }
-  }
-
-  if (commands.length === 0) {
-    throw new Error('No commands found after all strategies. Try: npx skill-optimizer import-commands --from <binary> --scrape');
-  }
-
-  console.log(`  Found ${commands.length} commands`);
-
-  if (existsSync(absOut) && !opts.force) {
-    const overwrite = await promptOverwrite(absOut);
-    if (!overwrite) {
-      console.log('\n  Aborted. Output file unchanged.');
-      return;
-    }
-  }
-
-  mkdirSync(dirname(absOut), { recursive: true });
-  writeOutput(commands, absOut);
-  console.log(`\n  Wrote ${commands.length} commands to ${out}`);
-  console.log(`  Done. Review the file and run 'npx skill-optimizer doctor' to validate.\n`);
-}
diff --git a/src/import/output.ts b/src/import/output.ts
deleted file mode 100644
index e77d686..0000000
--- a/src/import/output.ts
+++ /dev/null
@@ -1,21 +0,0 @@
-import { writeFileSync } from 'node:fs';
-import { createInterface } from 'node:readline';
-import type { CliCommandDefinition } from './types.js';
-
-export async function promptOverwrite(outPath: string): Promise<boolean> {
-  return new Promise((resolve, reject) => {
-    const rl = createInterface({ input: process.stdin, output: process.stdout });
-    rl.on('error', (err) => {
-      rl.close();
-      reject(err);
-    });
-    rl.question(`  Output: ${outPath} already exists.\n  Overwrite? [y/N] `, (answer) => {
-      rl.close();
-      resolve(answer.trim().toLowerCase() === 'y');
-    });
-  });
-}
-
-export function writeOutput(commands: CliCommandDefinition[], outPath: string): void {
-  writeFileSync(outPath, JSON.stringify(commands, null, 2) + '\n', 'utf-8');
-}
diff --git a/src/import/types.ts b/src/import/types.ts
deleted file mode 100644
index 5052206..0000000
--- a/src/import/types.ts
+++ /dev/null
@@ -1,26 +0,0 @@
-export type { CliCommandDefinition, CliCommandOptionDefinition } from '../benchmark/types.js';
-
-export type FrameworkKind =
-  | 'commander'
-  | 'yargs'
-  | 'optique'
-  | 'click'
-  | 'typer'
-  | 'argparse'
-  | 'clap'
-  | 'unknown';
-
-export interface DetectionResult {
-  kind: FrameworkKind;
-  binaryHint?: string;
-}
-
-export interface ImportOptions {
-  from: string;
-  out: string;
-  scrape: boolean;
-  depth: number;
-  cwd: string;
-  /** Skip the interactive overwrite prompt and always overwrite the output file. */
-  force?: boolean;
-}
diff --git a/src/index.ts b/src/index.ts
index c550f71..3df0e5b 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,5 +1 @@
-export * from './benchmark/index.js';
-export * from './actions/index.js';
-export * from './optimizer/index.js';
-export * from './project/index.js';
-export * from './tasks/index.js';
+export * from './workbench/index.js';
diff --git a/src/init/answers.ts b/src/init/answers.ts
deleted file mode 100644
index b05ef1f..0000000
--- a/src/init/answers.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import { readFileSync } from 'node:fs';
-
-export interface WizardAnswers {
-  surface: 'sdk' | 'cli' | 'mcp' | 'prompt';
-  repoPath: string;
-  models: string[];
-  maxTasks: number;
-  maxIterations: number;
-  /** Target overall pass rate (0–1) at which optimization stops early. Default: 0.8 */
-  targetPassRate?: number;
-  /** Path to SKILL.md, relative to repoPath (default: SKILL.md) */
-  skillPath?: string;
-  /** For cli/mcp: path to entry file or binary (relative to repoPath or absolute) */
-  entryFile?: string;
-  name?: string;
-}
-
-const DEFAULT_MODELS = [
-  'openrouter/anthropic/claude-sonnet-4.6',
-  'openrouter/deepseek/deepseek-v3.2',
-  'openrouter/google/gemini-2.5-flash',
-];
-
-export function buildDefaultAnswers(surface: 'sdk' | 'cli' | 'mcp' | 'prompt' = 'sdk', repoPath?: string): WizardAnswers {
-  return {
-    surface,
-    repoPath: repoPath ?? process.cwd(),
-    models: DEFAULT_MODELS,
-    maxTasks: 20,
-    maxIterations: 5,
-    targetPassRate: 0.8,
-  };
-}
-
-export function readAnswersFile(filePath: string): WizardAnswers {
-  const raw = JSON.parse(readFileSync(filePath, 'utf-8')) as Partial<WizardAnswers>;
-  if (!raw.surface || !['sdk', 'cli', 'mcp', 'prompt'].includes(raw.surface)) {
-    throw new Error(`answers file must have surface: sdk | cli | mcp | prompt (got: ${JSON.stringify(raw.surface)})`);
-  }
-  if (!raw.repoPath) {
-    throw new Error('answers file must have repoPath');
-  }
-  if (!Array.isArray(raw.models) || raw.models.length === 0) {
-    throw new Error('answers file must have at least one model (as a JSON array)');
-  }
-  return {
-    surface: raw.surface,
-    repoPath: raw.repoPath,
-    models: raw.models,
-    maxTasks: raw.maxTasks ?? 20,
-    maxIterations: raw.maxIterations ?? 5,
-    targetPassRate: raw.targetPassRate ?? 0.8,
-    skillPath: raw.skillPath,
-    entryFile: raw.entryFile,
-    name: raw.name,
-  };
-}
diff --git a/src/init/detect-project.ts b/src/init/detect-project.ts
deleted file mode 100644
index c037ba6..0000000
--- a/src/init/detect-project.ts
+++ /dev/null
@@ -1,159 +0,0 @@
-import { existsSync, readFileSync } from 'node:fs';
-import { join, basename } from 'node:path';
-import type { WizardAnswers } from './answers.js';
-
-interface DetectedProject {
-  surface: 'sdk' | 'cli' | 'mcp';
-  name: string;
-  repoPath: string;
-  /** Relative to repoPath */
-  entryFile: string;
-  /** Relative to repoPath, if found */
-  skillFile?: string;
-  confidence: 'high' | 'medium' | 'low';
-  /** Human-readable list of signals that drove detection */
-  signals: string[];
-}
-
-export function detectProject(dir: string): DetectedProject {
-  const signals: string[] = [];
-  let surface: 'sdk' | 'cli' | 'mcp' = 'sdk';
-  let name = basename(dir);
-  let entryFile = 'src/index.ts';
-  let confidence: 'high' | 'medium' | 'low' = 'low';
-
-  // ── package.json ──────────────────────────────────────────────────────────
-  const pkgPath = join(dir, 'package.json');
-  if (existsSync(pkgPath)) {
-    const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8')) as {
-      name?: string;
-      bin?: Record<string, string> | string;
-      main?: string;
-      dependencies?: Record<string, string>;
-      devDependencies?: Record<string, string>;
-    };
-
-    if (pkg.name) name = pkg.name;
-
-    const allDeps = { ...pkg.dependencies, ...pkg.devDependencies };
-
-    // MCP: @modelcontextprotocol/sdk in deps
-    if (allDeps['@modelcontextprotocol/sdk']) {
-      surface = 'mcp';
-      confidence = 'high';
-      signals.push('package.json: @modelcontextprotocol/sdk dependency');
-      entryFile = existsSync(join(dir, 'src', 'server.ts'))
-        ? 'src/server.ts'
-        : 'src/index.ts';
-    } else if (pkg.bin && (typeof pkg.bin === 'string' || Object.keys(pkg.bin).length > 0)) {
-      surface = 'cli';
-      confidence = 'high';
-      signals.push('package.json: bin field');
-      const firstBin = typeof pkg.bin === 'string' ? pkg.bin : Object.values(pkg.bin)[0] ?? '';
-      const srcGuess = firstBin.replace(/dist\//, 'src/').replace(/\.js$/, '.ts');
-      entryFile = existsSync(join(dir, srcGuess)) ? srcGuess : 'src/cli.ts';
-    } else {
-      surface = 'sdk';
-      confidence = 'medium';
-      signals.push('package.json: no bin field (SDK assumed)');
-      const mainGuess = pkg.main?.replace(/dist\//, 'src/').replace(/\.js$/, '.ts');
-      entryFile = mainGuess && existsSync(join(dir, mainGuess))
-        ? mainGuess
-        : 'src/index.ts';
-    }
-  }
-
-  // ── pyproject.toml ────────────────────────────────────────────────────────
-  const pyprojectPath = join(dir, 'pyproject.toml');
-  if (existsSync(pyprojectPath) && confidence === 'low') {
-    const content = readFileSync(pyprojectPath, 'utf-8');
-
-    const nameMatch = content.match(/^\s*name\s*=\s*"([^"]+)"/m);
-    if (nameMatch) name = nameMatch[1]!;
-
-    if (/^\s*mcp\s*[=\[>]/m.test(content) || content.includes('"mcp"')) {
-      surface = 'mcp';
-      confidence = 'high';
-      signals.push('pyproject.toml: mcp dependency');
-      entryFile = existsSync(join(dir, 'server.py')) ? 'server.py' : 'main.py';
-    } else if (/\[project\.scripts\]/m.test(content)) {
-      surface = 'cli';
-      confidence = 'high';
-      signals.push('pyproject.toml: [project.scripts] section');
-      const scriptMatch = content.match(/\[project\.scripts\][^\[]*\n\s*\S+\s*=\s*"([^:]+):/m);
-      entryFile = scriptMatch ? scriptMatch[1]!.replace(/\./g, '/') + '.py' : 'main.py';
-    } else {
-      surface = 'sdk';
-      confidence = 'medium';
-      signals.push('pyproject.toml: no [project.scripts] (SDK assumed)');
-      entryFile = 'src/__init__.py';
-    }
-  }
-
-  // ── Cargo.toml ────────────────────────────────────────────────────────────
-  const cargoPath = join(dir, 'Cargo.toml');
-  if (existsSync(cargoPath) && confidence === 'low') {
-    const content = readFileSync(cargoPath, 'utf-8');
-
-    const nameMatch = content.match(/^\s*name\s*=\s*"([^"]+)"/m);
-    if (nameMatch) name = nameMatch[1]!;
-
-    if (/^\[\[bin\]\]/m.test(content)) {
-      surface = 'cli';
-      confidence = 'high';
-      signals.push('Cargo.toml: [[bin]] section');
-      entryFile = 'src/main.rs';
-    } else if (/^\[lib\]/m.test(content)) {
-      surface = 'sdk';
-      confidence = 'high';
-      signals.push('Cargo.toml: [lib] section');
-      entryFile = 'src/lib.rs';
-    } else if (existsSync(join(dir, 'src', 'main.rs'))) {
-      surface = 'cli';
-      confidence = 'medium';
-      signals.push('Cargo.toml: inferred cli from src/main.rs existence');
-      entryFile = 'src/main.rs';
-    } else {
-      surface = 'sdk';
-      confidence = 'medium';
-      signals.push('Cargo.toml: no [[bin]] or [lib] found, no src/main.rs — defaulting to sdk');
-      entryFile = 'src/lib.rs';
-    }
-  }
-
-  if (signals.length === 0) {
-    signals.push('No manifest found — defaulting to sdk');
-  }
-
-  // ── Skill file ────────────────────────────────────────────────────────────
-  const skillCandidates = ['SKILL.md', 'docs/SKILL.md', 'README.md'];
-  const skillFile = skillCandidates.find(f => existsSync(join(dir, f)));
-
-  return { surface, name, repoPath: dir, entryFile, skillFile, confidence, signals };
-}
-
-export function detectedToPreseed(detected: DetectedProject): Partial<WizardAnswers> {
-  return {
-    surface: detected.surface,
-    repoPath: detected.repoPath,
-    entryFile: detected.entryFile,
-    name: detected.name,
-  };
-}
-
-function confidenceLabel(confidence: DetectedProject['confidence']): string {
-  switch (confidence) {
-    case 'high': return 'high confidence';
-    case 'medium': return 'medium confidence';
-    case 'low': return 'low confidence — review carefully';
-  }
-}
-
-export function printDetectionSummary(detected: DetectedProject): void {
-  console.log(`\nDetected: ${detected.surface} (${confidenceLabel(detected.confidence)})`);
-  console.log(`  Name:    ${detected.name}`);
-  console.log(`  Entry:   ${detected.entryFile}`);
-  if (detected.skillFile) console.log(`  Skill:   ${detected.skillFile}`);
-  console.log(`  Signals: ${detected.signals.join('; ')}`);
-  console.log('');
-}
diff --git a/src/init/scaffold.ts b/src/init/scaffold.ts
deleted file mode 100644
index 0846d8a..0000000
--- a/src/init/scaffold.ts
+++ /dev/null
@@ -1,293 +0,0 @@
-import { writeFileSync, existsSync, mkdirSync } from 'node:fs';
-import { isAbsolute, resolve, relative, join, basename } from 'node:path';
-import type { WizardAnswers } from './answers.js';
-import { importCommands } from '../import/index.js';
-
-const KNOWN_MODELS: Record<string, { name: string; tier: 'flagship' | 'mid' | 'low' }> = {
-  // DeepSeek
-  'openrouter/deepseek/deepseek-v3.2': { name: 'DeepSeek V3.2', tier: 'flagship' },
-  // Anthropic
-  'openrouter/anthropic/claude-opus-4.6': { name: 'Claude Opus 4.6', tier: 'flagship' },
-  'openrouter/anthropic/claude-sonnet-4.6': { name: 'Claude Sonnet 4.6', tier: 'flagship' },
-  // MiniMax
-  'openrouter/minimax/minimax-m2.7': { name: 'MiniMax M2.7', tier: 'flagship' },
-  'openrouter/minimax/minimax-m2.5': { name: 'MiniMax M2.5', tier: 'mid' },
-  // Google
-  'openrouter/google/gemini-3-flash-preview': { name: 'Gemini 3 Flash Preview', tier: 'mid' },
-  'openrouter/google/gemini-3.1-pro-preview': { name: 'Gemini 3.1 Pro Preview', tier: 'flagship' },
-  'openrouter/google/gemini-2.5-flash': { name: 'Gemini 2.5 Flash', tier: 'mid' },
-  'openrouter/google/gemini-2.5-flash-lite': { name: 'Gemini 2.5 Flash Lite', tier: 'low' },
-  'openrouter/google/gemma-4-31b-it': { name: 'Gemma 4 31B-it', tier: 'mid' },
-  // Qwen
-  'openrouter/qwen/qwen3.5-397b-a17b': { name: 'Qwen3.5 397B A17B', tier: 'flagship' },
-  'openrouter/qwen/qwen3.6-plus': { name: 'Qwen 3.6 Plus', tier: 'mid' },
-  // Xiaomi
-  'openrouter/xiaomi/mimo-v2-pro': { name: 'MiMo-V2-Pro', tier: 'mid' },
-  // Nvidia
-  'openrouter/nvidia/nemotron-3-super-120b-a12b': { name: 'Nemotron 3 Super 120B', tier: 'mid' },
-  // Moonshot
-  'openrouter/moonshotai/kimi-k2.5': { name: 'Kimi K2.5', tier: 'flagship' },
-  // xAI
-  'openrouter/x-ai/grok-4.1-fast': { name: 'Grok 4.1 Fast', tier: 'flagship' },
-  // OpenAI
-  'openrouter/openai/gpt-5.4': { name: 'GPT-5.4', tier: 'flagship' },
-  'openrouter/openai/gpt-4o-mini': { name: 'GPT-4o Mini', tier: 'mid' },
-  'openrouter/openai/gpt-oss-120b': { name: 'GPT-OSS 120B', tier: 'mid' },
-  // Meta
-  'openrouter/meta-llama/llama-4-maverick': { name: 'Llama 4 Maverick', tier: 'mid' },
-  // Z-AI
-  'openrouter/z-ai/glm-5': { name: 'GLM 5', tier: 'mid' },
-  'openrouter/z-ai/glm-5.1': { name: 'GLM 5.1', tier: 'mid' },
-  'openrouter/z-ai/glm-5-turbo': { name: 'GLM 5 Turbo', tier: 'low' },
-};
-
-function resolveModel(id: string): { id: string; name: string; tier: 'flagship' | 'mid' | 'low' } {
-  const known = KNOWN_MODELS[id];
-  if (known) return { id, ...known };
-  const slug = id.split('/').pop() ?? id;
-  const name = slug.replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase());
-  return { id, name, tier: 'mid' };
-}
-
-
-function buildConfigFromAnswers(answers: WizardAnswers, configDir: string): object {
-  const { surface, repoPath, models, maxTasks, maxIterations, name } = answers;
-  const targetPassRate = answers.targetPassRate ?? 0.8;
-  const projectName = name ?? basename(repoPath);
-
-  // Paths stored in the JSON are relative to configDir so the config is portable
-  const relRepo = relative(configDir, repoPath) || '.';
-  // skillPath may be absolute (from wizard) or relative to repoPath (from answers file)
-  const skillAbsPath = answers.skillPath
-    ? (isAbsolute(answers.skillPath) ? answers.skillPath : resolve(repoPath, answers.skillPath))
-    : resolve(repoPath, 'SKILL.md');
-  // target.skill and discovery.sources are resolved relative to configDir by the loader
-  const skillConfigPath = relative(configDir, skillAbsPath);
-  // optimize.allowedPaths are validated relative to repoPath (not configDir) by the validator
-  const skillAllowedPath = relative(repoPath, skillAbsPath);
-
-  const commonBenchmark = {
-    format: 'pi',
-    timeout: 240000,
-    taskGeneration: { enabled: true, maxTasks, outputDir: '.' },
-    models: models.map(resolveModel),
-    output: { dir: '../benchmark-results' },
-    verdict: { perModelFloor: Math.max(0.5, targetPassRate - 0.2), targetWeightedAverage: targetPassRate },
-  };
-
-  const commonOptimize = {
-    model: 'openrouter/anthropic/claude-sonnet-4.6',
-    allowedPaths: [skillAllowedPath],
-    validation: [],
-    maxIterations,
-  };
-
-  if (surface === 'sdk') {
-    return {
-      name: projectName,
-      target: {
-        surface: 'sdk',
-        repoPath: relRepo,
-        skill: skillConfigPath,
-        discovery: { mode: 'auto', sources: [join(relRepo, 'src/index.ts')] },
-      },
-      benchmark: commonBenchmark,
-      optimize: commonOptimize,
-    };
-  }
-
-  const defaultEntry = surface === 'cli' ? 'src/cli.ts' : 'src/server.ts';
-  // entryFile from wizard is now absolute; compute relative to configDir for config JSON
-  const entryAbsPath = answers.entryFile
-    ? answers.entryFile
-    : resolve(repoPath, defaultEntry);
-  const entryConfigPath = relative(configDir, entryAbsPath);
-
-  if (surface === 'cli') {
-    return {
-      name: projectName,
-      target: {
-        surface: 'cli',
-        repoPath: relRepo,
-        skill: skillConfigPath,
-        discovery: { mode: 'auto', sources: [entryConfigPath] },
-        cli: { commands: './cli-commands.json' },
-      },
-      benchmark: commonBenchmark,
-      optimize: commonOptimize,
-    };
-  }
-
-  if (surface === 'prompt') {
-    return {
-      name: projectName,
-      target: {
-        surface: 'prompt',
-        repoPath: relRepo,
-        skill: skillConfigPath,
-      },
-      benchmark: commonBenchmark,
-      optimize: commonOptimize,
-    };
-  }
-
-  // mcp
-  return {
-    name: projectName,
-    target: {
-      surface: 'mcp',
-      repoPath: relRepo,
-      skill: skillConfigPath,
-      discovery: { mode: 'auto', sources: [entryConfigPath] },
-      mcp: { tools: './tools.json' },
-    },
-    benchmark: commonBenchmark,
-    optimize: commonOptimize,
-  };
-}
-
-export async function scaffoldInit(answers: WizardAnswers, cwd: string): Promise<void> {
-  const generatedDir = resolve(cwd, '.skill-optimizer');
-  mkdirSync(generatedDir, { recursive: true });
-
-  const configPath = resolve(generatedDir, 'skill-optimizer.json');
-  const configExisted = existsSync(configPath);
-  writeFileSync(configPath, JSON.stringify(buildConfigFromAnswers(answers, generatedDir), null, 2) + '\n', 'utf-8');
-  console.log(`[init] ${configExisted ? 'Updated' : 'Created'} ${configPath}`);
-
-  // 'extracted' = auto-extracted from source, 'template' = placeholder written, undefined = n/a
-  let commandsSource: 'extracted' | 'template' | undefined;
-
-  if (answers.surface === 'cli') {
-    const commandsPath = resolve(generatedDir, 'cli-commands.json');
-    if (answers.entryFile) {
-      console.log(`[init] Running import-commands from ${answers.entryFile}...`);
-      try {
-        const TIMEOUT_MS = 20_000;
-        const timeout = new Promise<never>((_, reject) =>
-          setTimeout(() => reject(new Error(`timed out after ${TIMEOUT_MS / 1000}s`)), TIMEOUT_MS),
-        );
-        await Promise.race([
-          importCommands({ from: answers.entryFile, out: commandsPath, scrape: false, depth: 2, cwd: answers.repoPath, force: true }),
-          timeout,
-        ]);
-        commandsSource = 'extracted';
-      } catch (err) {
-        console.warn(`[init] Warning: import-commands failed: ${err instanceof Error ? err.message : err}`);
-        console.warn('[init] Writing template cli-commands.json instead — edit it with your real commands.');
-        writeCliTemplate(commandsPath);
-        commandsSource = 'template';
-      }
-    } else if (!existsSync(commandsPath)) {
-      writeCliTemplate(commandsPath);
-      commandsSource = 'template';
-    }
-    // If it already exists and no entry file was given, leave it untouched (commandsSource stays undefined)
-  }
-
-  if (answers.surface === 'mcp') {
-    const toolsPath = resolve(generatedDir, 'tools.json');
-    if (!existsSync(toolsPath)) {
-      writeMcpTemplate(toolsPath);
-    }
-  }
-
-  printNextSteps(answers, configPath, commandsSource);
-}
-
-function writeCliTemplate(commandsPath: string): void {
-  const commands = [
-    {
-      command: 'example-create',
-      description: 'Create a new item',
-      options: [{ name: '--name', takesValue: true, description: 'Name for the item' }],
-    },
-    {
-      command: 'example-list',
-      description: 'List all items',
-      options: [{ name: '--format', takesValue: true, description: 'Output format: json | table (default: table)' }],
-    },
-  ];
-  writeFileSync(commandsPath, JSON.stringify(commands, null, 2) + '\n', 'utf-8');
-  console.log(`[init] Created ${commandsPath} (template — edit or run import-commands to replace)`);
-}
-
-function writeMcpTemplate(toolsPath: string): void {
-  const tools = [
-    {
-      type: 'function',
-      function: {
-        name: 'get_data',
-        description: 'Get data for a given item ID',
-        parameters: {
-          type: 'object',
-          properties: { item_id: { type: 'string', description: 'The item identifier' } },
-          required: ['item_id'],
-        },
-      },
-    },
-    {
-      type: 'function',
-      function: {
-        name: 'send_data',
-        description: 'Send data to a recipient',
-        parameters: {
-          type: 'object',
-          properties: {
-            value: { type: 'string', description: 'The data to send' },
-            recipient: { type: 'string', description: 'The recipient identifier' },
-          },
-          required: ['value', 'recipient'],
-        },
-      },
-    },
-  ];
-  writeFileSync(toolsPath, JSON.stringify(tools, null, 2) + '\n', 'utf-8');
-  console.log(`[init] Created ${toolsPath} (template — edit with your real tools)`);
-}
-
-function printNextSteps(answers: WizardAnswers, configPath: string, commandsSource: 'extracted' | 'template' | undefined): void {
-  const skillAbsPath = answers.skillPath
-    ? (isAbsolute(answers.skillPath) ? answers.skillPath : resolve(answers.repoPath, answers.skillPath))
-    : resolve(answers.repoPath, 'SKILL.md');
-  const skillMissing = !existsSync(skillAbsPath);
-
-  console.log('\n[init] Done!');
-  console.log(`  Surface:    ${answers.surface}`);
-  console.log(`  Repo:       ${answers.repoPath}`);
-  console.log(`  SKILL.md:   ${skillAbsPath}${skillMissing ? ' (not found yet — create it)' : ''}`);
-  console.log(`  Models:     ${answers.models.length} — ${answers.models.map(m => m.split('/').pop()).join(', ')}`);
-  console.log(`  Tasks:      up to ${answers.maxTasks} per run`);
-  console.log(`  Iterations: up to ${answers.maxIterations}`);
-  console.log(`  Target:     ${Math.round((answers.targetPassRate ?? 0.8) * 100)}% pass rate`);
-  console.log(`  Config:     ${configPath}`);
-
-  // Only show a manifest step if user action is actually required
-  const needsManifestEdit =
-    (answers.surface === 'cli' && commandsSource === 'template') ||
-    answers.surface === 'mcp';
-
-  const steps: string[] = [];
-
-  if (needsManifestEdit) {
-    if (answers.surface === 'cli') {
-      steps.push(
-        'Edit .skill-optimizer/cli-commands.json — replace the template with your real commands\n' +
-        '     (or rerun with an entry file: skill-optimizer import-commands --from <entry-file>)',
-      );
-    } else {
-      steps.push('Edit .skill-optimizer/tools.json — replace the template with your real MCP tools');
-    }
-  }
-
-  if (skillMissing) {
-    steps.push(`Create ${skillAbsPath}\n     Explain your surface to the model: what it does, key concepts, usage examples`);
-  }
-
-  steps.push('Run: skill-optimizer optimize --config ./.skill-optimizer/skill-optimizer.json');
-
-  if (steps.length > 0) {
-    console.log('\n  Next steps:');
-    steps.forEach((s, i) => console.log(`  ${i + 1}. ${s}`));
-  }
-}
diff --git a/src/init/wizard.ts b/src/init/wizard.ts
deleted file mode 100644
index e4c3e4b..0000000
--- a/src/init/wizard.ts
+++ /dev/null
@@ -1,190 +0,0 @@
-import * as p from '@clack/prompts';
-import { resolve } from 'node:path';
-import type { WizardAnswers } from './answers.js';
-import { scaffoldInit } from './scaffold.js';
-
-// All values use OpenRouter model IDs (openrouter/provider/model).
-// Ordered by OpenRouter weekly token volume (top 20).
-export const MODEL_PRESETS = [
-  // DeepSeek
-  { value: 'openrouter/deepseek/deepseek-v3.2',            label: 'DeepSeek V3.2          · DeepSeek',  hint: '#1' },
-  // Anthropic
-  { value: 'openrouter/anthropic/claude-opus-4.6',         label: 'Claude Opus 4.6        · Anthropic', hint: 'flagship' },
-  { value: 'openrouter/anthropic/claude-sonnet-4.6',       label: 'Claude Sonnet 4.6      · Anthropic' },
-  // MiniMax
-  { value: 'openrouter/minimax/minimax-m2.7',              label: 'MiniMax M2.7           · MiniMax',   hint: 'flagship' },
-  { value: 'openrouter/minimax/minimax-m2.5',              label: 'MiniMax M2.5           · MiniMax' },
-  // Google
-  { value: 'openrouter/google/gemini-3-flash-preview',     label: 'Gemini 3 Flash Preview · Google',    hint: 'fast' },
-  { value: 'openrouter/google/gemini-3.1-pro-preview',     label: 'Gemini 3.1 Pro Preview · Google',    hint: 'flagship' },
-  { value: 'openrouter/google/gemini-2.5-flash',           label: 'Gemini 2.5 Flash       · Google',    hint: 'fast' },
-  { value: 'openrouter/google/gemini-2.5-flash-lite',      label: 'Gemini 2.5 Flash Lite  · Google',    hint: 'fast' },
-  { value: 'openrouter/google/gemma-4-31b-it',             label: 'Gemma 4 31B-it         · Google',    hint: 'open' },
-  // Qwen
-  { value: 'openrouter/qwen/qwen3.5-397b-a17b',            label: 'Qwen3.5 397B A17B      · Alibaba',   hint: 'MoE' },
-  { value: 'openrouter/qwen/qwen3.6-plus',                 label: 'Qwen 3.6 Plus          · Alibaba',   hint: 'open' },
-  // Xiaomi
-  { value: 'openrouter/xiaomi/mimo-v2-pro',                label: 'MiMo-V2-Pro            · Xiaomi' },
-  // Nvidia
-  { value: 'openrouter/nvidia/nemotron-3-super-120b-a12b', label: 'Nemotron 3 Super 120B  · Nvidia',    hint: 'open' },
-  // Moonshot
-  { value: 'openrouter/moonshotai/kimi-k2.5',              label: 'Kimi K2.5              · Moonshot',  hint: 'flagship' },
-  // xAI
-  { value: 'openrouter/x-ai/grok-4.1-fast',               label: 'Grok 4.1 Fast          · xAI' },
-  // OpenAI
-  { value: 'openrouter/openai/gpt-5.4',                    label: 'GPT-5.4                · OpenAI',    hint: 'flagship' },
-  { value: 'openrouter/openai/gpt-4o-mini',                label: 'GPT-4o Mini            · OpenAI',    hint: 'fast' },
-  { value: 'openrouter/openai/gpt-oss-120b',               label: 'GPT-OSS 120B           · OpenAI',    hint: 'open' },
-  // Meta
-  { value: 'openrouter/meta-llama/llama-4-maverick',       label: 'Llama 4 Maverick       · Meta',      hint: 'open' },
-  // Z-AI
-  { value: 'openrouter/z-ai/glm-5',                        label: 'GLM 5                  · Z-AI' },
-  { value: 'openrouter/z-ai/glm-5.1',                      label: 'GLM 5.1                · Z-AI',      hint: 'new' },
-  { value: 'openrouter/z-ai/glm-5-turbo',                  label: 'GLM 5 Turbo            · Z-AI',      hint: 'fast' },
-];
-
-function cancelGuard<T>(value: T | symbol): T {
-  if (p.isCancel(value)) {
-    p.cancel('Cancelled.');
-    process.exit(0);
-  }
-  return value as T;
-}
-
-export async function runWizard(cwd: string, preseed?: Partial<WizardAnswers>): Promise<void> {
-  p.intro('skill-optimizer init');
-
-  // 1. Surface
-  let surface: 'sdk' | 'cli' | 'mcp' | 'prompt';
-  if (preseed?.surface) {
-    surface = preseed.surface;
-    p.log.info(`Surface: ${surface}`);
-  } else {
-    surface = cancelGuard(await p.select({
-      message: 'What surface are you targeting?',
-      options: [
-        { value: 'sdk', label: 'sdk', hint: 'TypeScript / Python / Rust library' },
-        { value: 'cli', label: 'cli', hint: 'command-line tool with subcommands' },
-        { value: 'mcp', label: 'mcp', hint: 'MCP server with tools' },
-        { value: 'prompt', label: 'prompt', hint: 'markdown skill/prompt file' },
-      ],
-    }) as 'sdk' | 'cli' | 'mcp' | 'prompt');
-  }
-
-  // 2. Target repo path
-  const repoPathRaw = cancelGuard(await p.text({
-    message: 'Target repo path (absolute):',
-    defaultValue: preseed?.repoPath ?? cwd,
-    placeholder: preseed?.repoPath ?? cwd,
-    validate: (v) => (v !== undefined && v.trim().length === 0 ? 'Required — enter the absolute path to your project' : undefined),
-  }) as string);
-  const repoPath = resolve(repoPathRaw.trim() || preseed?.repoPath || cwd);
-
-  // 3. Entry file (cli / mcp only) — grouped with paths
-  let entryFile: string | undefined;
-  if (surface === 'cli' || surface === 'mcp') {
-    const message = surface === 'cli'
-      ? 'Absolute path to CLI entry file or binary (leave blank to skip auto-extraction):'
-      : 'Absolute path to MCP server entry file (leave blank to skip auto-extraction):';
-    const defaultEntry = surface === 'cli' ? 'src/cli.ts' : 'src/server.ts';
-    const defaultEntryAbs = preseed?.entryFile
-      ? resolve(preseed.entryFile.startsWith('/') ? preseed.entryFile : resolve(repoPath, preseed.entryFile))
-      : resolve(repoPath, defaultEntry);
-    const raw = cancelGuard(await p.text({ message, placeholder: defaultEntryAbs, defaultValue: defaultEntryAbs }) as string);
-    const rawTrimmed = raw.trim();
-    entryFile = rawTrimmed
-      ? (rawTrimmed.startsWith('/') ? rawTrimmed : resolve(repoPath, rawTrimmed))
-      : undefined;
-  }
-
-  // 4. SKILL.md path
-  const defaultSkillPath = preseed?.skillPath
-    ? resolve(preseed.skillPath.startsWith('/') ? preseed.skillPath : resolve(repoPath, preseed.skillPath))
-    : resolve(repoPath, 'SKILL.md');
-  const skillPathRaw = cancelGuard(await p.text({
-    message: 'Absolute path to your SKILL.md:',
-    placeholder: defaultSkillPath,
-    defaultValue: defaultSkillPath,
-  }) as string);
-  const skillPathTrimmed = skillPathRaw.trim();
-  const skillPath = skillPathTrimmed
-    ? (skillPathTrimmed.startsWith('/') ? skillPathTrimmed : resolve(repoPath, skillPathTrimmed))
-    : defaultSkillPath;
-
-  // 5. Models (multi-select)
-  const selectedPresets = cancelGuard(await p.multiselect({
-    message: 'Which models to benchmark? (space to toggle, enter to confirm)',
-    options: MODEL_PRESETS,
-    required: true,
-    initialValues: ['openrouter/anthropic/claude-sonnet-4.6', 'openrouter/deepseek/deepseek-v3.2', 'openrouter/google/gemini-2.5-flash'],
-  }) as string[]);
-  const models: string[] = selectedPresets;
-
-  // Optional custom model
-  const customModel = cancelGuard(await p.text({
-    message: 'Add a custom model ID? (leave blank to skip)',
-    placeholder: 'openrouter/provider/model-name',
-    validate: (v) => {
-      if (!v || !v.trim()) return undefined;
-      if (!v.startsWith('openrouter/') && !v.startsWith('anthropic/') && !v.startsWith('openai/')) {
-        return 'Must start with openrouter/, anthropic/, or openai/';
-      }
-      return undefined;
-    },
-  }) as string);
-  if (customModel.trim()) models.push(customModel.trim());
-
-  // 6. Max tasks
-  const maxTasksRaw = cancelGuard(await p.text({
-    message: 'Max tasks to generate per benchmark run:',
-    defaultValue: '20',
-    placeholder: '20',
-    validate: (v) => {
-      const n = parseInt(v ?? '', 10);
-      if (isNaN(n) || n < 1) return 'Must be a positive integer';
-      return undefined;
-    },
-  }) as string);
-  const maxTasks = parseInt(maxTasksRaw || '20', 10);
-
-  // 7. Max iterations
-  const maxIterationsRaw = cancelGuard(await p.text({
-    message: 'Max optimize iterations:',
-    defaultValue: '5',
-    placeholder: '5',
-    validate: (v) => {
-      const n = parseInt(v ?? '', 10);
-      if (isNaN(n) || n < 1) return 'Must be a positive integer';
-      return undefined;
-    },
-  }) as string);
-  const maxIterations = parseInt(maxIterationsRaw || '5', 10);
-
-  // 8. Target pass rate
-  const targetPassRateRaw = cancelGuard(await p.text({
-    message: 'Target pass rate to stop optimization early (%):',
-    defaultValue: '80',
-    placeholder: '80',
-    validate: (v) => {
-      const n = parseFloat(v ?? '');
-      if (isNaN(n) || n < 1 || n > 100) return 'Must be a number between 1 and 100';
-      return undefined;
-    },
-  }) as string);
-  const targetPassRate = parseFloat(targetPassRateRaw || '80') / 100;
-
-  const answers: WizardAnswers = { surface, repoPath, models, maxTasks, maxIterations, targetPassRate, skillPath, entryFile, name: preseed?.name };
-
-  const spinner = p.spinner();
-  spinner.start('Scaffolding...');
-  try {
-    await scaffoldInit(answers, cwd);
-    spinner.stop('Done!');
-  } catch (err) {
-    spinner.stop('Error during scaffolding');
-    p.log.error(err instanceof Error ? err.message : String(err));
-    process.exit(1);
-  }
-
-  p.outro('Config written. Next: skill-optimizer optimize --config ./.skill-optimizer/skill-optimizer.json');
-}
diff --git a/src/optimizer/benchmark-adapter.ts b/src/optimizer/benchmark-adapter.ts
deleted file mode 100644
index ed9be0a..0000000
--- a/src/optimizer/benchmark-adapter.ts
+++ /dev/null
@@ -1,35 +0,0 @@
-import { mkdirSync } from 'node:fs';
-import { resolve } from 'node:path';
-
-import { runBenchmark } from '../benchmark/runner.js';
-import type { BenchmarkReport } from '../benchmark/types.js';
-
-interface BenchmarkAdapterRunOptions {
-  outputDir: string;
-  label: string;
-  verdictPolicy?: { perModelFloor: number; targetWeightedAverage: number };
-  /** Override the skill source for this benchmark run (local versioned copy). */
-  skillOverride?: string;
-}
-
-interface BenchmarkAdapterRunResult {
-  report: BenchmarkReport;
-  reportPath: string;
-}
-
-export function createBenchmarkAdapter(): {
-  run(configPath: string, opts: BenchmarkAdapterRunOptions): Promise<BenchmarkAdapterRunResult>;
-} {
-  return {
-    async run(configPath: string, opts: BenchmarkAdapterRunOptions) {
-      const runOutputDir = resolve(opts.outputDir, opts.label);
-      mkdirSync(runOutputDir, { recursive: true });
-
-      const report = await runBenchmark({ configPath, outputDir: runOutputDir, verdictPolicy: opts.verdictPolicy, skillOverride: opts.skillOverride });
-      return {
-        report,
-        reportPath: resolve(runOutputDir, 'report.json'),
-      };
-    },
-  };
-}
diff --git a/src/optimizer/failure-analysis.ts b/src/optimizer/failure-analysis.ts
deleted file mode 100644
index 9d1df76..0000000
--- a/src/optimizer/failure-analysis.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-import type { BenchmarkReport } from '../benchmark/types.js';
-import type { FailureBucket, FailureBucketKind } from './types.js';
-
-const BUCKET_PRIORITY: Record<FailureBucketKind, number> = {
-  'missing-tool': 0,
-  'bad-args': 1,
-  hallucination: 2,
-  error: 3,
-};
-
-export function analyzeFailures(report: BenchmarkReport): FailureBucket[] {
-  const grouped = new Map<FailureBucketKind, FailureBucket>();
-
-  for (const result of report.results) {
-    const kind = classifyFailure(result);
-    if (!kind) continue;
-
-    const bucket = grouped.get(kind) ?? {
-      kind,
-      count: 0,
-      taskIds: [],
-      modelIds: [],
-    };
-
-    bucket.count += 1;
-    if (!bucket.taskIds.includes(result.task.id)) bucket.taskIds.push(result.task.id);
-    if (!bucket.modelIds.includes(result.model.id)) bucket.modelIds.push(result.model.id);
-    grouped.set(kind, bucket);
-  }
-
-  return [...grouped.values()].sort((a, b) => {
-    if (b.count !== a.count) return b.count - a.count;
-    return BUCKET_PRIORITY[a.kind] - BUCKET_PRIORITY[b.kind];
-  });
-}
-
-function classifyFailure(result: BenchmarkReport['results'][number]): FailureBucketKind | null {
-  if (result.error) return 'error';
-  if (result.metrics.hallucinatedActions.length > 0) return 'hallucination';
-  if (result.actionMatches.some((match) => match.methodFound && !match.argsCorrect)) return 'bad-args';
-  if (result.actionMatches.some((match) => !match.methodFound)) return 'missing-tool';
-  return null;
-}
diff --git a/src/optimizer/feedback/failure-details.ts b/src/optimizer/feedback/failure-details.ts
deleted file mode 100644
index 2f96cd8..0000000
--- a/src/optimizer/feedback/failure-details.ts
+++ /dev/null
@@ -1,88 +0,0 @@
-import type { TaskResult } from '../../benchmark/types.js';
-import { getExpectedActionName } from '../../benchmark/types.js';
-
-export type FailureKind = 'missing-tool' | 'bad-args' | 'hallucination' | 'error';
-
-export interface FailureDetail {
-  task_id: string;
-  model_id: string;
-  kind: FailureKind;
-  expected_action: string;
-  expected_args: Record<string, unknown>;
-  actual_calls: Array<{ action: string; args: Record<string, unknown> }>;
-  mismatch_detail: string;
-}
-
-export function extractFailureDetails(results: TaskResult[]): FailureDetail[] {
-  const out: FailureDetail[] = [];
-  for (const r of results) {
-    if (r.metrics.taskPassed) continue;
-
-    const actual = r.extractedCalls.map((c) => ({
-      action: c.method,
-      args: c.args,
-    }));
-
-    if (r.error) {
-      out.push({
-        task_id: r.task.id,
-        model_id: r.model.id,
-        kind: 'error',
-        expected_action: '',
-        expected_args: {},
-        actual_calls: actual,
-        mismatch_detail: r.error,
-      });
-      continue;
-    }
-
-    const matches = r.actionMatches;
-    for (const m of matches) {
-      const expectedName = getExpectedActionName(m.expected);
-      // Cross-check whether the expected method actually appears in extracted calls,
-      // regardless of the methodFound flag (fixture data may set methodFound=false even
-      // when the method name is present but args differ).
-      const methodActuallyFound = m.methodFound
-        || actual.some((a) => a.action === expectedName);
-      if (!methodActuallyFound) {
-        const alts = actual.map((a) => a.action).filter(Boolean);
-        out.push({
-          task_id: r.task.id,
-          model_id: r.model.id,
-          kind: 'missing-tool',
-          expected_action: expectedName,
-          expected_args: m.expected.args ?? {},
-          actual_calls: actual,
-          mismatch_detail: alts.length > 0 ? `called ${alts.join(', ')} instead` : 'no action calls produced',
-        });
-      } else if (!m.argsCorrect) {
-        const wrongArgs: string[] = [];
-        for (const [k, v] of Object.entries(m.argResults ?? {})) {
-          if (!v.match) wrongArgs.push(`${k}: expected ${v.expected}, got ${JSON.stringify(v.got)}`);
-        }
-        out.push({
-          task_id: r.task.id,
-          model_id: r.model.id,
-          kind: 'bad-args',
-          expected_action: expectedName,
-          expected_args: m.expected.args ?? {},
-          actual_calls: actual,
-          mismatch_detail: wrongArgs.join('; ') || 'args differed',
-        });
-      }
-    }
-
-    if (r.metrics.hallucinatedActions.length > 0) {
-      out.push({
-        task_id: r.task.id,
-        model_id: r.model.id,
-        kind: 'hallucination',
-        expected_action: matches.map((m) => getExpectedActionName(m.expected)).join(', '),
-        expected_args: {},
-        actual_calls: actual,
-        mismatch_detail: `hallucinated: ${r.metrics.hallucinatedActions.join(', ')}`,
-      });
-    }
-  }
-  return out;
-}
diff --git a/src/optimizer/feedback/mutation-context.ts b/src/optimizer/feedback/mutation-context.ts
deleted file mode 100644
index 474a16d..0000000
--- a/src/optimizer/feedback/mutation-context.ts
+++ /dev/null
@@ -1,46 +0,0 @@
-import type { BenchmarkReport } from '../../benchmark/types.js';
-import { extractFailureDetails, type FailureDetail } from './failure-details.js';
-import { detectPatterns, type Pattern } from './patterns.js';
-import { buildPassingFailingDiff, type PassingFailingDiff } from './passing-failing-diff.js';
-
-interface FeedbackPackage {
-  failureDetails: FailureDetail[];
-  patterns: Pattern[];
-  passingFailingDiffs: PassingFailingDiff[];
-  serialized: string;
-}
-
-export function buildMutationContext(report: BenchmarkReport, maxBytes: number): FeedbackPackage {
-  const failureDetails = extractFailureDetails(report.results);
-  const patterns = detectPatterns(failureDetails);
-  const diffs = buildPassingFailingDiff(report.results);
-
-  const details = budgetSlice(failureDetails, Math.floor(maxBytes * 0.3));
-  const patternSlice = budgetSlice(patterns, Math.floor(maxBytes * 0.4));
-  const diffSlice = budgetSlice(diffs, Math.floor(maxBytes * 0.3));
-
-  const serialized = [
-    '## Failure details',
-    JSON.stringify(details, null, 2),
-    '',
-    '## Cross-task patterns',
-    JSON.stringify(patternSlice, null, 2),
-    '',
-    '## Passing vs failing by task',
-    JSON.stringify(diffSlice, null, 2),
-  ].join('\n');
-
-  return { failureDetails: details, patterns: patternSlice, passingFailingDiffs: diffSlice, serialized };
-}
-
-function budgetSlice<T>(items: T[], maxBytes: number): T[] {
-  const kept: T[] = [];
-  let bytes = 0;
-  for (const item of items) {
-    const size = Buffer.byteLength(JSON.stringify(item));
-    if (bytes + size > maxBytes) break;
-    kept.push(item);
-    bytes += size;
-  }
-  return kept;
-}
diff --git a/src/optimizer/feedback/passing-failing-diff.ts b/src/optimizer/feedback/passing-failing-diff.ts
deleted file mode 100644
index b869ef6..0000000
--- a/src/optimizer/feedback/passing-failing-diff.ts
+++ /dev/null
@@ -1,41 +0,0 @@
-import type { TaskResult } from '../../benchmark/types.js';
-
-export interface PassingFailingDiff {
-  task_id: string;
-  prompt: string;
-  passing_models: string[];
-  failing_models: string[];
-  passing_calls: Array<{ model: string; actions: string[] }>;
-  failing_calls: Array<{ model: string; actions: string[] }>;
-}
-
-export function buildPassingFailingDiff(results: TaskResult[]): PassingFailingDiff[] {
-  const byTask = new Map<string, TaskResult[]>();
-  for (const r of results) {
-    const arr = byTask.get(r.task.id) ?? [];
-    arr.push(r);
-    byTask.set(r.task.id, arr);
-  }
-
-  const diffs: PassingFailingDiff[] = [];
-  for (const [taskId, rs] of byTask) {
-    const passing = rs.filter((r) => r.metrics.taskPassed);
-    const failing = rs.filter((r) => !r.metrics.taskPassed);
-    if (passing.length === 0 || failing.length === 0) continue;
-    diffs.push({
-      task_id: taskId,
-      prompt: rs[0]!.task.prompt,
-      passing_models: passing.map((r) => r.model.name),
-      failing_models: failing.map((r) => r.model.name),
-      passing_calls: passing.map((r) => ({
-        model: r.model.name,
-        actions: r.extractedCalls.map((c) => c.method),
-      })),
-      failing_calls: failing.map((r) => ({
-        model: r.model.name,
-        actions: r.extractedCalls.map((c) => c.method),
-      })),
-    });
-  }
-  return diffs;
-}
diff --git a/src/optimizer/feedback/patterns.ts b/src/optimizer/feedback/patterns.ts
deleted file mode 100644
index d13dc3c..0000000
--- a/src/optimizer/feedback/patterns.ts
+++ /dev/null
@@ -1,69 +0,0 @@
-import type { FailureDetail } from './failure-details.js';
-
-export type PatternKind = 'naming-mismatch' | 'systematic-hallucination' | 'arg-type-confusion';
-
-export interface Pattern {
-  kind: PatternKind;
-  summary: string;
-  modelCount: number;
-  taskCount: number;
-  evidence: string[];
-}
-
-export function detectPatterns(details: FailureDetail[]): Pattern[] {
-  const patterns: Pattern[] = [];
-
-  const hallucinationKey: Record<string, { tasks: Set<string>; models: Set<string> }> = {};
-  for (const d of details) {
-    if (d.kind === 'missing-tool') {
-      for (const call of d.actual_calls) {
-        if (call.action && call.action !== d.expected_action) {
-          const key = `${d.expected_action}→${call.action}`;
-          if (!hallucinationKey[key]) hallucinationKey[key] = { tasks: new Set(), models: new Set() };
-          hallucinationKey[key].tasks.add(d.task_id);
-          hallucinationKey[key].models.add(d.model_id);
-        }
-      }
-    }
-  }
-  for (const [key, s] of Object.entries(hallucinationKey)) {
-    if (s.models.size >= 2) {
-      patterns.push({
-        kind: 'systematic-hallucination',
-        summary: `Multiple models substitute ${key}`,
-        modelCount: s.models.size,
-        taskCount: s.tasks.size,
-        evidence: [...s.tasks],
-      });
-    }
-  }
-
-  const argConfusion: Record<string, { tasks: Set<string>; models: Set<string>; evidence: string[] }> = {};
-  for (const d of details) {
-    if (d.kind === 'bad-args') {
-      for (const line of d.mismatch_detail.split(';')) {
-        const keyMatch = line.match(/^\s*([A-Za-z0-9_]+):/);
-        if (keyMatch) {
-          const argKey = `${d.expected_action}.${keyMatch[1]}`;
-          if (!argConfusion[argKey]) argConfusion[argKey] = { tasks: new Set(), models: new Set(), evidence: [] };
-          argConfusion[argKey].tasks.add(d.task_id);
-          argConfusion[argKey].models.add(d.model_id);
-          argConfusion[argKey].evidence.push(line.trim());
-        }
-      }
-    }
-  }
-  for (const [key, s] of Object.entries(argConfusion)) {
-    if (s.tasks.size >= 2) {
-      patterns.push({
-        kind: 'arg-type-confusion',
-        summary: `Arg ${key} confused across ${s.tasks.size} tasks`,
-        modelCount: s.models.size,
-        taskCount: s.tasks.size,
-        evidence: [...new Set(s.evidence)].slice(0, 5),
-      });
-    }
-  }
-
-  return patterns;
-}
diff --git a/src/optimizer/index.ts b/src/optimizer/index.ts
deleted file mode 100644
index fc98cd9..0000000
--- a/src/optimizer/index.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-export { loadOptimizeManifest } from './manifest.js';
-export { analyzeFailures } from './failure-analysis.js';
-export { runOptimizeLoop } from './loop.js';
-export { createBenchmarkAdapter } from './benchmark-adapter.js';
-export { createRepoStateManager } from './repo-state.js';
-export { createValidationRunner } from './validation.js';
-export { createJsonLedger } from './ledger.js';
-export { PiCodingMutationExecutor } from './mutation/pi-coding.js';
-export { getMockRepoTemplatePath, listMockRepoTemplates, materializeMockRepo } from './mock-repos.js';
-export { createDefaultPiTaskGenerator, generateTasksForProject } from '../tasks/index.js';
-
-export type {
-  FailureBucket,
-  FailureBucketKind,
-  MutationCandidate,
-  MutationContext,
-  OptimizeIteration,
-  OptimizeLoopDependencies,
-  OptimizeManifest,
-  OptimizePolicy,
-  OptimizeResult,
-  OptimizeTargetRepo,
-  ResolvedOptimizeManifest,
-  StopReason,
-  TaskGenerationResult,
-  ValidationCommandResult,
-  ValidationResult,
-} from './types.js';
diff --git a/src/optimizer/ledger.ts b/src/optimizer/ledger.ts
deleted file mode 100644
index 679e80f..0000000
--- a/src/optimizer/ledger.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-import { existsSync, mkdirSync, readFileSync, renameSync, rmSync, writeFileSync } from 'node:fs';
-import { dirname } from 'node:path';
-
-export function createJsonLedger(path: string) {
-  return {
-    async record(event: Record<string, unknown>): Promise<void> {
-      mkdirSync(dirname(path), { recursive: true });
-      const corruptPath = `${path}.corrupt`;
-      let current = { version: 1, events: [] as Record<string, unknown>[] };
-
-      if (existsSync(path)) {
-        try {
-          current = JSON.parse(readFileSync(path, 'utf-8')) as { version: number; events: Record<string, unknown>[] };
-        } catch {
-          if (existsSync(corruptPath)) {
-            rmSync(corruptPath, { force: true });
-          }
-          renameSync(path, corruptPath);
-        }
-      }
-
-      current.events.push(event);
-      const tempPath = `${path}.tmp`;
-      writeFileSync(tempPath, JSON.stringify(current, null, 2) + '\n', 'utf-8');
-      renameSync(tempPath, path);
-    },
-  };
-}
diff --git a/src/optimizer/loop.ts b/src/optimizer/loop.ts
deleted file mode 100644
index 2014461..0000000
--- a/src/optimizer/loop.ts
+++ /dev/null
@@ -1,543 +0,0 @@
-import { copyFileSync, existsSync, mkdirSync } from 'node:fs';
-import { isAbsolute, join, relative, resolve } from 'node:path';
-
-import { analyzeFailures } from './failure-analysis.js';
-import { accept } from '../benchmark/scoring.js';
-import type { BenchmarkReport } from '../benchmark/types.js';
-import { getExpectedActionName } from '../benchmark/types.js';
-import type {
-  MutationCandidate,
-  OptimizeIteration,
-  OptimizeLoopDependencies,
-  OptimizeManifest,
-  OptimizeResult,
-  ResolvedOptimizeManifest,
-  TaskGenerationResult,
-} from './types.js';
-
-export async function runOptimizeLoop(
-  manifest: OptimizeManifest | ResolvedOptimizeManifest,
-  deps: OptimizeLoopDependencies,
-): Promise<OptimizeResult> {
-  const resolvedManifest = resolveManifest(manifest);
-  const sourceBenchmarkConfig = resolvedManifest.benchmarkConfig;
-  console.log(`[optimize] Target repo: ${resolvedManifest.targetRepo.path}`);
-  await deps.repo.ensureReady(resolvedManifest.targetRepo);
-  console.log('[optimize] Repository is ready.');
-
-  const outputDir = resolvedManifest.optimizer.taskGeneration.outputDir;
-  mkdirSync(outputDir, { recursive: true });
-  console.log(`[optimize] Artifact output dir: ${outputDir}`);
-
-  // ── Local skill versioning ──────────────────────────────────────────────────
-  // Copy the source skill from the target repo into outputDir as skill-v0.md.
-  // All subsequent iterations mutate a local versioned copy — the target repo
-  // is never written to. Each accepted iteration produces skill-v{N}.md.
-  let currentBestSkillPath: string | undefined;
-  if (resolvedManifest.skillPath && existsSync(resolvedManifest.skillPath)) {
-    const v0Path = join(outputDir, 'skill-v0.md');
-    copyFileSync(resolvedManifest.skillPath, v0Path);
-    currentBestSkillPath = v0Path;
-    console.log(`[optimize] Skill baseline copied to ${v0Path}`);
-  }
-
-  let generation: TaskGenerationResult | undefined;
-  if (resolvedManifest.optimizer.taskGeneration.enabled) {
-    if (!deps.taskGenerator) {
-      throw new Error('Optimize loop requires a task generator when optimizer.taskGeneration.enabled=true');
-    }
-
-    console.log('[optimize] Task generation is enabled.');
-    generation = await deps.taskGenerator.generate(resolvedManifest, { outputDir });
-    resolvedManifest.benchmarkConfig = generation.benchmarkConfigPath;
-    console.log(
-      `[optimize] Using generated benchmark config: ${generation.benchmarkConfigPath} ` +
-        `(tasks=${generation.taskCount}, rejected=${generation.rejectedCount})`,
-    );
-  } else {
-    // Task generation disabled (e.g. --skip-generation). Try to reuse existing frozen benchmark config.
-    const frozenConfigPath = join(outputDir, 'benchmark.generated.json');
-    if (existsSync(frozenConfigPath)) {
-      resolvedManifest.benchmarkConfig = frozenConfigPath;
-      console.log(`[optimize] Using existing frozen benchmark config: ${frozenConfigPath}`);
-    }
-  }
-  if (resolvedManifest.optimizer.mode === 'surface-changing' && !resolvedManifest.optimizer.taskGeneration.enabled) {
-    throw new Error('surface-changing optimize mode requires task generation to stay enabled so new epochs can regenerate tasks');
-  }
-
-  let acceptedCheckpoint = await deps.repo.captureCheckpoint(resolvedManifest.targetRepo);
-  console.log('[optimize] Captured initial checkpoint.');
-  console.log('[optimize] Running baseline benchmark...');
-  const verdictPolicy = {
-    perModelFloor: resolvedManifest.optimizer.perModelFloor,
-    targetWeightedAverage: resolvedManifest.optimizer.targetWeightedAverage,
-  };
-  const baselineResult = await deps.benchmark.run(resolvedManifest.benchmarkConfig, {
-    outputDir,
-    label: 'baseline',
-    verdictPolicy,
-    skillOverride: currentBestSkillPath,
-  });
-  const baselineReport = baselineResult.report;
-  let bestReport = baselineResult.report;
-  let lastReportPath = baselineResult.reportPath;
-  console.log(
-    `[optimize] Baseline complete: ${(baselineReport.summary.overallPassRate * 100).toFixed(1)}% ` +
-      `(report: ${baselineResult.reportPath})`,
-  );
-  let consecutiveStableIterations = 0;
-  const iterations: OptimizeIteration[] = [];
-
-  await deps.ledger.record({
-    type: 'baseline',
-    score: baselineReport.summary.overallPassRate,
-  });
-
-  for (let index = 1; index <= resolvedManifest.optimizer.maxIterations; index++) {
-    console.log(`\n[optimize] Iteration ${index}/${resolvedManifest.optimizer.maxIterations}`);
-    const failureBuckets = analyzeFailures(bestReport);
-    const failureSummaryLines = summarizeTopFailures(bestReport);
-    if (failureSummaryLines.length > 0) {
-      console.log('[optimize] Benchmark failure analysis (derived from the report, not agent output):');
-      for (const line of failureSummaryLines) {
-        console.log(`  - ${line}`);
-      }
-    }
-    let candidate: MutationCandidate | null = null;
-
-    // Prepare a versioned local skill file for this iteration.
-    // The mutation writes here; the benchmark reads from here.
-    let localSkillPath: string | undefined;
-    if (currentBestSkillPath) {
-      localSkillPath = join(outputDir, `skill-v${index}.md`);
-      copyFileSync(currentBestSkillPath, localSkillPath);
-    }
-
-    try {
-      console.log('[optimize] Applying mutation...');
-      candidate = await deps.mutation.apply({
-        manifest: resolvedManifest,
-        iteration: index,
-        currentReport: bestReport,
-        failureBuckets,
-        reportPath: lastReportPath,
-        localSkillPath,
-      });
-      if (candidate.toolActivity && candidate.toolActivity.length > 0) {
-        console.log('[optimize] Orchestrator tool activity:');
-        for (const line of candidate.toolActivity) {
-          console.log(`  ${line}`);
-        }
-      }
-      console.log('[optimize] Orchestrator response:');
-      for (const line of candidate.summary.split('\n')) {
-        if (line.trim()) {
-          console.log(`  ${line}`);
-        }
-      }
-
-      let changedFiles = await getChangedFiles(deps, resolvedManifest, candidate, localSkillPath);
-      console.log(
-        `[optimize] Changed files: ${changedFiles.length > 0 ? changedFiles.join(', ') : '(none)'}`,
-      );
-      // In local-skill mode: restore the repo to undo any rogue writes the agent may have made
-      // (the agent cwd is dirname(localSkillPath), not the target repo, so rogue writes are unlikely but not impossible),
-      // then skip scope validation (the local file is always in scope).
-      if (localSkillPath) {
-        await deps.repo.restoreCheckpoint(resolvedManifest.targetRepo, acceptedCheckpoint);
-      }
-      const scopeValidation = localSkillPath
-        ? null
-        : validateChangedFiles(changedFiles, resolvedManifest);
-      console.log('[optimize] Running validation...');
-      let validation = scopeValidation ?? await deps.validation.run(resolvedManifest.targetRepo);
-
-      const iteration: OptimizeIteration = {
-        index,
-        accepted: false,
-        summary: candidate.summary,
-        changedFiles,
-        validation,
-        scoreBefore: bestReport.summary.overallPassRate,
-        delta: 0,
-        failureBuckets,
-      };
-
-      if (!validation.ok) {
-        console.log(
-          `[optimize] Validation failed: ${validation.commands.map((command) => command.stderr || command.command).join(' | ')}`,
-        );
-        console.log('[optimize] Restoring checkpoint.');
-        await deps.repo.restoreCheckpoint(resolvedManifest.targetRepo, acceptedCheckpoint);
-        iterations.push(iteration);
-        await deps.ledger.record({ type: 'iteration', iteration });
-        continue;
-      }
-
-      changedFiles = await getChangedFiles(deps, resolvedManifest, candidate, localSkillPath);
-      iteration.changedFiles = changedFiles;
-      const postValidationScopeValidation = localSkillPath
-        ? null
-        : validateChangedFiles(changedFiles, resolvedManifest);
-      if (postValidationScopeValidation) {
-        console.log('[optimize] Validation introduced out-of-scope changes. Restoring checkpoint.');
-        iteration.validation = postValidationScopeValidation;
-        await deps.repo.restoreCheckpoint(resolvedManifest.targetRepo, acceptedCheckpoint);
-        iterations.push(iteration);
-        await deps.ledger.record({ type: 'iteration', iteration });
-        continue;
-      }
-
-      const surfaceChanged = didSurfaceChange(changedFiles, resolvedManifest);
-      if (surfaceChanged && resolvedManifest.optimizer.mode === 'stable-surface') {
-        console.log('[optimize] Callable surface changed during stable-surface mode. Restoring checkpoint.');
-        iteration.validation = buildInvariantValidationError(
-          'surface-invariant',
-          'stable-surface mode rejects callable surface changes; switch to surface-changing mode to allow them',
-        );
-        await deps.repo.restoreCheckpoint(resolvedManifest.targetRepo, acceptedCheckpoint);
-        iterations.push(iteration);
-        await deps.ledger.record({ type: 'iteration', iteration });
-        continue;
-      }
-      if (surfaceChanged && resolvedManifest.optimizer.mode === 'surface-changing') {
-        console.log('[optimize] Callable surface changed. Starting a new benchmark epoch...');
-        if (resolvedManifest.optimizer.taskGeneration.enabled) {
-          if (!deps.taskGenerator) {
-            throw new Error('Optimize loop requires a task generator to regenerate tasks after surface changes');
-          }
-          const regenerated = await deps.taskGenerator.generate(
-            { ...resolvedManifest, benchmarkConfig: sourceBenchmarkConfig },
-            { outputDir },
-          );
-          resolvedManifest.benchmarkConfig = regenerated.benchmarkConfigPath;
-          generation = regenerated;
-          console.log(
-            `[optimize] New epoch uses regenerated benchmark config: ${regenerated.benchmarkConfigPath} ` +
-              `(tasks=${regenerated.taskCount}, rejected=${regenerated.rejectedCount})`,
-          );
-        }
-
-        const epochBaseline = await deps.benchmark.run(resolvedManifest.benchmarkConfig, {
-          outputDir,
-          label: `epoch-${index + 1}-baseline`,
-          verdictPolicy,
-          skillOverride: localSkillPath,
-        });
-        iteration.accepted = true;
-        iteration.scoreAfter = epochBaseline.report.summary.overallPassRate;
-        iteration.perModelAfter = epochBaseline.report.summary.perModel;
-        iteration.delta = epochBaseline.report.summary.overallPassRate - bestReport.summary.overallPassRate;
-        bestReport = epochBaseline.report;
-        lastReportPath = epochBaseline.reportPath;
-        if (localSkillPath) {
-          currentBestSkillPath = localSkillPath;
-        } else {
-          acceptedCheckpoint = await deps.repo.updateAcceptedCheckpoint(
-            resolvedManifest.targetRepo,
-            acceptedCheckpoint,
-            candidate,
-            changedFiles,
-          );
-        }
-        consecutiveStableIterations = 0;
-        console.log(
-          `[optimize] Started new epoch baseline at ${(bestReport.summary.overallPassRate * 100).toFixed(1)}% ` +
-            `(report: ${epochBaseline.reportPath}).`,
-        );
-        iterations.push(iteration);
-        await deps.ledger.record({ type: 'iteration', iteration });
-        continue;
-      }
-
-      console.log('[optimize] Re-running benchmark for candidate changes...');
-      const candidateResult = await deps.benchmark.run(resolvedManifest.benchmarkConfig, {
-        outputDir,
-        label: `iteration-${index}`,
-        verdictPolicy,
-        skillOverride: localSkillPath,
-      });
-      const candidateReport = candidateResult.report;
-      changedFiles = await getChangedFiles(deps, resolvedManifest, candidate, localSkillPath);
-      iteration.changedFiles = changedFiles;
-      const postBenchmarkScopeValidation = localSkillPath
-        ? null
-        : validateChangedFiles(changedFiles, resolvedManifest);
-      if (postBenchmarkScopeValidation) {
-        console.log('[optimize] Benchmark rerun introduced out-of-scope changes. Restoring checkpoint.');
-        iteration.validation = postBenchmarkScopeValidation;
-        await deps.repo.restoreCheckpoint(resolvedManifest.targetRepo, acceptedCheckpoint);
-        iterations.push(iteration);
-        await deps.ledger.record({ type: 'iteration', iteration });
-        continue;
-      }
-
-      const beforeReport = bestReport;
-      const afterReport = candidateReport;
-      iteration.scoreAfter = afterReport.summary.overallPassRate;
-      iteration.perModelAfter = afterReport.summary.perModel;
-      iteration.delta = afterReport.summary.overallPassRate - beforeReport.summary.overallPassRate;
-
-      const accepted = accept(beforeReport, afterReport, resolvedManifest.optimizer.models, {
-        perModelFloor: resolvedManifest.optimizer.perModelFloor,
-        targetWeightedAverage: resolvedManifest.optimizer.targetWeightedAverage,
-        minImprovement: resolvedManifest.optimizer.minImprovement,
-      });
-
-      if (accepted) {
-        iteration.accepted = true;
-        bestReport = afterReport;
-        lastReportPath = candidateResult.reportPath;
-        const beforeAvg = (beforeReport.summary.weightedAverage ?? 0) * 100;
-        const afterAvg = (afterReport.summary.weightedAverage ?? 0) * 100;
-        console.log(
-          `[optimize] Accepted iteration ${index}: weighted average ${beforeAvg.toFixed(1)}% -> ${afterAvg.toFixed(1)}%.`,
-        );
-        if (localSkillPath) {
-          currentBestSkillPath = localSkillPath;
-        } else {
-          acceptedCheckpoint = await deps.repo.updateAcceptedCheckpoint(
-            resolvedManifest.targetRepo,
-            acceptedCheckpoint,
-            candidate,
-            changedFiles,
-          );
-        }
-        consecutiveStableIterations = 0;
-      } else {
-        const beforeAvg = (beforeReport.summary.weightedAverage ?? 0) * 100;
-        const afterAvg = (afterReport.summary.weightedAverage ?? 0) * 100;
-        console.log(
-          `[optimize] Rejected iteration ${index}: gates not satisfied ` +
-            `(weighted ${beforeAvg.toFixed(1)}% -> ${afterAvg.toFixed(1)}%; ` +
-            `min improvement ${(resolvedManifest.optimizer.minImprovement * 100).toFixed(1)} pts; ` +
-            `per-model floor ${(resolvedManifest.optimizer.perModelFloor * 100).toFixed(1)}%).`,
-        );
-        console.log('[optimize] Restoring checkpoint.');
-        await deps.repo.restoreCheckpoint(resolvedManifest.targetRepo, acceptedCheckpoint);
-        consecutiveStableIterations += 1;
-      }
-
-      iterations.push(iteration);
-      await deps.ledger.record({ type: 'iteration', iteration });
-
-      if (consecutiveStableIterations >= resolvedManifest.optimizer.stabilityWindow) {
-        console.log(
-          `[optimize] Stopping because we saw ${resolvedManifest.optimizer.stabilityWindow} consecutive iterations ` +
-            'without a meaningful improvement.',
-        );
-        return { baselineReport, bestReport, iterations, stopReason: 'stable', generation };
-      }
-    } catch (error) {
-      try {
-        await deps.repo.restoreCheckpoint(resolvedManifest.targetRepo, acceptedCheckpoint);
-      } catch (restoreError) {
-        throw new AggregateError(
-          [error, restoreError],
-          'Optimize iteration failed and checkpoint restore also failed',
-        );
-      }
-      throw error;
-    }
-  }
-
-  return {
-    baselineReport,
-    bestReport,
-    iterations,
-    stopReason: 'max-iterations',
-    generation,
-  };
-}
-
-function summarizeTopFailures(report: BenchmarkReport, limit = 3): string[] {
-  const grouped = new Map<string, {
-    prompt: string;
-    failCount: number;
-    models: Set<string>;
-    missing: Set<string>;
-    badArgs: Set<string>;
-  }>();
-
-  for (const result of report.results) {
-    if (result.metrics.taskPassed) continue;
-
-    const existing = grouped.get(result.task.id) ?? {
-      prompt: result.task.prompt,
-      failCount: 0,
-      models: new Set<string>(),
-      missing: new Set<string>(),
-      badArgs: new Set<string>(),
-    };
-    existing.failCount += 1;
-    existing.models.add(result.model.name);
-
-    for (const match of result.actionMatches) {
-      if (!match.methodFound) {
-        existing.missing.add(getExpectedActionName(match.expected));
-      } else if (!match.argsCorrect) {
-        existing.badArgs.add(getExpectedActionName(match.expected));
-      }
-    }
-
-    grouped.set(result.task.id, existing);
-  }
-
-  return [...grouped.entries()]
-    .sort((a, b) => b[1].failCount - a[1].failCount)
-    .slice(0, limit)
-    .map(([taskId, info]) => {
-      const details: string[] = [];
-      if (info.missing.size > 0) {
-        details.push(`missing tools: ${[...info.missing].join(', ')}`);
-      }
-      if (info.badArgs.size > 0) {
-        details.push(`bad args: ${[...info.badArgs].join(', ')}`);
-      }
-
-      return `${taskId} fails on ${[...info.models].join(', ')}. ${details.join('; ') || 'See report for details.'}`;
-    });
-}
-
-function validateChangedFiles(changedFiles: string[], manifest: ResolvedOptimizeManifest) {
-  const repoRoot = manifest.targetRepo.path;
-  const disallowedFiles = changedFiles.filter(
-    (file) => !isFrameworkArtifactPath(file, manifest) && !isAllowedPath(file, manifest.targetRepo.allowedPaths, repoRoot),
-  );
-  if (disallowedFiles.length === 0) {
-    return null;
-  }
-
-  return buildInvariantValidationError('scope-check', `Changed files outside allowed paths: ${disallowedFiles.join(', ')}`);
-}
-
-function buildInvariantValidationError(command: string, message: string) {
-  return {
-    ok: false,
-    commands: [
-      {
-        command,
-        ok: false,
-        exitCode: 1,
-        stdout: '',
-        stderr: message,
-      },
-    ],
-  };
-}
-
-function didSurfaceChange(changedFiles: string[], manifest: ResolvedOptimizeManifest): boolean {
-  const surfacePaths = manifest.targetRepo.surfacePaths ?? [];
-  const relevantSurfacePaths = new Set(
-    surfacePaths.map((surfacePath) => normalizeRelativePath(toRelativeTargetPath(surfacePath, manifest))),
-  );
-
-  return changedFiles.some((file) => relevantSurfacePaths.has(normalizeRelativePath(file)));
-}
-
-function toRelativeTargetPath(path: string, manifest: ResolvedOptimizeManifest): string {
-  if (path.startsWith('/') || path.startsWith('\\')) {
-    return relative(manifest.targetRepo.path, path);
-  }
-
-  return path;
-}
-
-function isAllowedPath(file: string, allowedPaths: string[], repoRoot?: string): boolean {
-  const normalizedFile = normalizeRelativePath(
-    repoRoot && isAbsolute(file) ? relative(repoRoot, file) : file,
-  );
-  return allowedPaths.some((allowedPath) => {
-    const normalizedAllowed = normalizeRelativePath(allowedPath);
-    return normalizedFile === normalizedAllowed || normalizedFile.startsWith(`${normalizedAllowed}/`);
-  });
-}
-
-function normalizeRelativePath(path: string): string {
-  return path.replace(/^\.\//, '').replace(/^\/+/, '').replace(/\/+$/, '');
-}
-
-async function getChangedFiles(
-  deps: OptimizeLoopDependencies,
-  manifest: ResolvedOptimizeManifest,
-  candidate: MutationCandidate,
-  localSkillPath?: string,
-): Promise<string[]> {
-  // In local-skill mode the changed file lives outside the target repo, so git
-  // status will never list it. Return the skill path directly so logs and
-  // iteration metadata correctly reflect what the agent wrote.
-  if (localSkillPath) {
-    return [localSkillPath];
-  }
-  const changedFiles = deps.repo.listChangedFiles
-    ? deps.repo.listChangedFiles(manifest.targetRepo)
-    : [...candidate.changedFiles];
-  return (await changedFiles).filter((file) => !isFrameworkArtifactPath(file, manifest));
-}
-
-function isFrameworkArtifactPath(file: string, manifest: ResolvedOptimizeManifest): boolean {
-  const artifactDir = relative(manifest.targetRepo.path, manifest.optimizer.taskGeneration.outputDir);
-  if (!artifactDir || artifactDir.startsWith('..')) {
-    return false;
-  }
-
-  const normalizedFile = normalizeRelativePath(file);
-  const normalizedArtifactDir = normalizeRelativePath(artifactDir);
-  return normalizedFile === normalizedArtifactDir || normalizedFile.startsWith(`${normalizedArtifactDir}/`);
-}
-
-function resolveManifest(manifest: OptimizeManifest | ResolvedOptimizeManifest): ResolvedOptimizeManifest {
-  const unresolved = manifest as OptimizeManifest;
-  if (unresolved.targetRepo.requireCleanGit === false) {
-    throw new Error('Optimize target repos must keep requireCleanGit=true in v1');
-  }
-
-  return {
-    benchmarkConfig: unresolved.benchmarkConfig,
-    skillPath: (unresolved as Partial<ResolvedOptimizeManifest>).skillPath,
-    targetRepo: {
-      ...unresolved.targetRepo,
-      surfacePaths: unresolved.targetRepo.surfacePaths ?? [],
-      cleanIgnorePaths: unresolved.targetRepo.cleanIgnorePaths ?? deriveCleanIgnorePaths(
-        unresolved.targetRepo.path,
-        unresolved.optimizer?.taskGeneration?.outputDir,
-      ),
-      requireCleanGit: unresolved.targetRepo.requireCleanGit ?? true,
-    },
-    optimizer: {
-      maxIterations: unresolved.optimizer?.maxIterations ?? 5,
-      stabilityWindow: unresolved.optimizer?.stabilityWindow ?? 2,
-      minImprovement: unresolved.optimizer?.minImprovement ?? 0.02,
-      taskGeneration: {
-        enabled: unresolved.optimizer?.taskGeneration?.enabled ?? false,
-        maxGenerated: unresolved.optimizer?.taskGeneration?.maxGenerated ?? 10,
-        seed: unresolved.optimizer?.taskGeneration?.seed ?? 1,
-        outputDir: resolve(unresolved.optimizer?.taskGeneration?.outputDir ?? '.skill-optimizer'),
-      },
-      mode: unresolved.optimizer?.mode ?? 'stable-surface',
-      perModelFloor: unresolved.optimizer?.perModelFloor ?? 0.6,
-      targetWeightedAverage: unresolved.optimizer?.targetWeightedAverage ?? 0.7,
-      models: unresolved.optimizer?.models ?? [],
-    },
-    mutation: unresolved.mutation
-      ? {
-          ...unresolved.mutation,
-          reportContextMaxBytes: unresolved.mutation.reportContextMaxBytes ?? 16_000,
-        }
-      : undefined,
-  };
-}
-
-function deriveCleanIgnorePaths(targetRepoPath: string, outputDir?: string): string[] {
-  if (!outputDir) {
-    return [];
-  }
-
-  const relativeOutputDir = relative(targetRepoPath, resolve(outputDir));
-  if (!relativeOutputDir || relativeOutputDir.startsWith('..')) {
-    return [];
-  }
-
-  return [relativeOutputDir];
-}
diff --git a/src/optimizer/main.ts b/src/optimizer/main.ts
deleted file mode 100644
index 647dd34..0000000
--- a/src/optimizer/main.ts
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/usr/bin/env node
-
-import { resolve } from 'node:path';
-import { pathToFileURL } from 'node:url';
-import {
-  createBenchmarkAdapter,
-  createJsonLedger,
-  createRepoStateManager,
-  createValidationRunner,
-  loadOptimizeManifest,
-  PiCodingMutationExecutor,
-  runOptimizeLoop,
-} from './index.js';
-import { createDefaultPiTaskGenerator, generateTasksForProject } from '../tasks/index.js';
-import { renderProgressTable } from './progress-table.js';
-
-function printUsage(): void {
-  console.log(`
-Usage:
-  tsx src/optimizer/main.ts <skill-optimizer.json> [--max-iterations <n>] [--skip-generation]
-
-Examples:
-  tsx src/optimizer/main.ts ./skill-optimizer.json
-  tsx src/optimizer/main.ts ./skill-optimizer.json --max-iterations 8
-  tsx src/optimizer/main.ts ./skill-optimizer.json --skip-generation
-`);
-}
-
-function getFlag(args: string[], flag: string): string | undefined {
-  const idx = args.indexOf(flag);
-  if (idx === -1) return undefined;
-  const value = args[idx + 1];
-  if (!value || value.startsWith('--')) {
-    throw new Error(`Flag ${flag} requires a value`);
-  }
-  return value;
-}
-
-async function main(): Promise<void> {
-  const args = process.argv.slice(2);
-  if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
-    printUsage();
-    process.exit(args.length === 0 ? 1 : 0);
-  }
-
-  const manifestPath = args[0];
-  if (!manifestPath || manifestPath.startsWith('--')) {
-    printUsage();
-    process.exit(1);
-  }
-
-  const { result, resolvedManifest, ledgerPath } = await runOptimizeFromConfig(manifestPath, {
-    maxIterationsRaw: getFlag(args, '--max-iterations'),
-    skipGeneration: args.includes('--skip-generation'),
-  });
-
-  printOptimizeSummary(result, resolvedManifest, ledgerPath);
-}
-
-export async function runOptimizeFromConfig(
-  manifestPath: string,
-  options: { maxIterationsRaw?: string; skipGeneration?: boolean } = {},
-) {
-  const manifest = await loadOptimizeManifest(manifestPath);
-  const maxIterations = options.maxIterationsRaw ? Number(options.maxIterationsRaw) : undefined;
-  if (options.maxIterationsRaw && (!Number.isInteger(maxIterations) || (maxIterations ?? 0) <= 0)) {
-    throw new Error(`Invalid --max-iterations value '${options.maxIterationsRaw}'. Must be a positive integer.`);
-  }
-
-  const resolvedManifest = maxIterations
-    ? {
-        ...manifest,
-        optimizer: {
-          ...manifest.optimizer,
-          maxIterations,
-          taskGeneration: {
-            ...manifest.optimizer.taskGeneration,
-            enabled: options.skipGeneration ? false : manifest.optimizer.taskGeneration.enabled,
-          },
-        },
-      }
-    : {
-        ...manifest,
-        optimizer: {
-          ...manifest.optimizer,
-          taskGeneration: {
-            ...manifest.optimizer.taskGeneration,
-            enabled: options.skipGeneration ? false : manifest.optimizer.taskGeneration.enabled,
-          },
-        },
-      };
-
-  const taskGenerator = resolvedManifest.optimizer.taskGeneration.enabled
-    ? {
-        generate: async (loopManifest: typeof resolvedManifest, opts: { outputDir: string }) => {
-          const mutation = loopManifest.mutation;
-          if (!mutation) {
-            throw new Error('Optimize manifest must define a mutation section when task generation is enabled');
-          }
-
-          const deps = createDefaultPiTaskGenerator({
-            provider: mutation.provider,
-            model: mutation.model,
-            authMode: mutation.authMode,
-            apiKeyEnv: mutation.apiKeyEnv,
-          });
-          const generation = await generateTasksForProject({
-            configPath: loopManifest.benchmarkConfig,
-            maxTasks: loopManifest.optimizer.taskGeneration.maxGenerated,
-            seed: loopManifest.optimizer.taskGeneration.seed,
-            outputDir: opts.outputDir,
-            deps,
-          });
-          return {
-            benchmarkConfigPath: generation.artifacts.benchmarkPath,
-            taskCount: generation.kept.length,
-            rejectedCount: generation.rejected.length,
-          };
-        },
-      }
-    : undefined;
-
-  const ledgerPath = resolve(resolvedManifest.optimizer.taskGeneration.outputDir, 'optimize-ledger.json');
-  const result = await runOptimizeLoop(resolvedManifest, {
-    benchmark: createBenchmarkAdapter(),
-    repo: createRepoStateManager(),
-    mutation: new PiCodingMutationExecutor(),
-    taskGenerator,
-    validation: createValidationRunner(),
-    ledger: createJsonLedger(ledgerPath),
-  });
-
-  return { result, resolvedManifest, ledgerPath };
-}
-
-export function printOptimizeSummary(
-  result: Awaited<ReturnType<typeof runOptimizeLoop>>,
-  resolvedManifest: Awaited<ReturnType<typeof loadOptimizeManifest>>,
-  ledgerPath: string,
-): void {
-  console.log('');
-  if (result.generation) {
-    console.log(`Generated tasks: ${result.generation.taskCount} (rejected: ${result.generation.rejectedCount})`);
-    console.log(`Frozen config: ${result.generation.benchmarkConfigPath}`);
-  }
-  console.log(`Iterations: ${result.iterations.length}`);
-  if (result.stopReason === 'stable') {
-    console.log(
-      `Stop reason: stable (${resolvedManifest.optimizer.stabilityWindow} consecutive iterations without a meaningful improvement)`,
-    );
-  } else {
-    console.log(`Stop reason: max iterations reached (${resolvedManifest.optimizer.maxIterations})`);
-  }
-  console.log(`Run log: ${ledgerPath}`);
-  console.log(renderProgressTable(result.baselineReport, result.bestReport, result.iterations));
-}
-
-function isExecutedDirectly(): boolean {
-  const entry = process.argv[1];
-  return Boolean(entry) && import.meta.url === pathToFileURL(entry).href;
-}
-
-if (isExecutedDirectly()) {
-  main().catch((error) => {
-    console.error(`FATAL: ${error instanceof Error ? error.message : String(error)}`);
-    if (error instanceof Error && error.stack) {
-      console.error(error.stack);
-    }
-    process.exit(1);
-  });
-}
diff --git a/src/optimizer/manifest.ts b/src/optimizer/manifest.ts
deleted file mode 100644
index 8bcebc8..0000000
--- a/src/optimizer/manifest.ts
+++ /dev/null
@@ -1,8 +0,0 @@
-import { loadProjectConfig, toOptimizeManifest } from '../project/index.js';
-
-import type { ResolvedOptimizeManifest } from './types.js';
-
-export async function loadOptimizeManifest(configPath: string): Promise<ResolvedOptimizeManifest> {
-  const project = await loadProjectConfig(configPath);
-  return toOptimizeManifest(project);
-}
diff --git a/src/optimizer/materialize-mock-repo.ts b/src/optimizer/materialize-mock-repo.ts
deleted file mode 100644
index a30b59a..0000000
--- a/src/optimizer/materialize-mock-repo.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env node
-
-import { mkdtempSync } from 'node:fs';
-import { join, resolve } from 'node:path';
-import { tmpdir } from 'node:os';
-
-import { listMockRepoTemplates, materializeMockRepo } from './mock-repos.js';
-
-function printUsage(): void {
-  console.log(`
-Usage:
-  tsx src/optimizer/materialize-mock-repo.ts <mcp-tracker-demo|sdk-counter-demo|cli-taskfile-demo> [destination-root]
-
-Examples:
-  tsx src/optimizer/materialize-mock-repo.ts mcp-tracker-demo
-  tsx src/optimizer/materialize-mock-repo.ts sdk-counter-demo ./.tmp/mock-repos
-  tsx src/optimizer/materialize-mock-repo.ts cli-taskfile-demo
-`);
-}
-
-async function main(): Promise<void> {
-  const [name, destinationRootArg] = process.argv.slice(2);
-  if (!name || name === '--help' || name === '-h') {
-    printUsage();
-    process.exit(name ? 0 : 1);
-  }
-
-  if (!listMockRepoTemplates().includes(name as never)) {
-    throw new Error(`Unknown mock repo '${name}'. Expected one of: ${listMockRepoTemplates().join(', ')}`);
-  }
-
-  const destinationRoot = destinationRootArg
-    ? resolve(destinationRootArg)
-    : mkdtempSync(join(tmpdir(), 'skill-optimizer-materialized-'));
-
-  const repoPath = await materializeMockRepo(name as never, destinationRoot);
-  console.log(repoPath);
-}
-
-main().catch((error) => {
-  console.error(`FATAL: ${error instanceof Error ? error.message : String(error)}`);
-  process.exit(1);
-});
diff --git a/src/optimizer/mock-repos.ts b/src/optimizer/mock-repos.ts
deleted file mode 100644
index 89993d1..0000000
--- a/src/optimizer/mock-repos.ts
+++ /dev/null
@@ -1,59 +0,0 @@
-import { cpSync, existsSync, mkdirSync, rmSync } from 'node:fs';
-import { execFile } from 'node:child_process';
-import { dirname, join, resolve, sep } from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { promisify } from 'node:util';
-
-const execFileAsync = promisify(execFile);
-
-const MOCK_REPO_NAMES = ['mcp-tracker-demo', 'sdk-counter-demo', 'cli-taskfile-demo'] as const;
-
-export type MockRepoName = (typeof MOCK_REPO_NAMES)[number];
-
-export function listMockRepoTemplates(): MockRepoName[] {
-  return MOCK_REPO_NAMES.filter((name) => existsSync(getMockRepoTemplatePath(name)));
-}
-
-export function getMockRepoTemplatePath(name: MockRepoName): string {
-  if (!MOCK_REPO_NAMES.includes(name)) {
-    throw new Error(`Unknown mock repo template: ${name}`);
-  }
-
-  const repoRoot = resolve(dirname(fileURLToPath(import.meta.url)), '..', '..');
-  return join(repoRoot, 'mock-repos', name);
-}
-
-export async function materializeMockRepo(name: MockRepoName, destinationRoot: string): Promise<string> {
-  const templatePath = getMockRepoTemplatePath(name);
-  if (!existsSync(templatePath)) {
-    throw new Error(`Mock repo template not found: ${templatePath}`);
-  }
-
-  mkdirSync(destinationRoot, { recursive: true });
-  const destinationPath = resolve(destinationRoot, name);
-  assertNoPathOverlap(templatePath, destinationPath);
-  rmSync(destinationPath, { recursive: true, force: true });
-  cpSync(templatePath, destinationPath, {
-    recursive: true,
-    force: true,
-  });
-
-  await initializeGitRepo(destinationPath);
-  return destinationPath;
-}
-
-async function initializeGitRepo(cwd: string): Promise<void> {
-  await execFileAsync('git', ['init'], { cwd, encoding: 'utf-8' });
-  await execFileAsync('git', ['config', 'user.name', 'OpenCode Mock Repo'], { cwd, encoding: 'utf-8' });
-  await execFileAsync('git', ['config', 'user.email', 'opencode-mock@example.com'], { cwd, encoding: 'utf-8' });
-  await execFileAsync('git', ['add', '-A'], { cwd, encoding: 'utf-8' });
-  await execFileAsync('git', ['-c', 'commit.gpgsign=false', 'commit', '-m', 'chore: initialize mock repo'], { cwd, encoding: 'utf-8' });
-}
-
-function assertNoPathOverlap(templatePath: string, destinationPath: string): void {
-  const source = resolve(templatePath);
-  const destination = resolve(destinationPath);
-  if (source === destination || source.startsWith(`${destination}${sep}`) || destination.startsWith(`${source}${sep}`)) {
-    throw new Error(`Materialized destination overlaps the tracked template path: ${destination}`);
-  }
-}
diff --git a/src/optimizer/mutation/git-changes.ts b/src/optimizer/mutation/git-changes.ts
deleted file mode 100644
index 5b3432a..0000000
--- a/src/optimizer/mutation/git-changes.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import { execFile } from 'node:child_process';
-import { promisify } from 'node:util';
-
-const execFileAsync = promisify(execFile);
-
-export async function collectGitChangedFiles(cwd: string): Promise<string[]> {
-  const [unstaged, staged, untracked, ignored] = await Promise.all([
-    gitList(cwd, ['diff', '--name-only']),
-    gitList(cwd, ['diff', '--name-only', '--cached']),
-    gitList(cwd, ['ls-files', '--others', '--exclude-standard']),
-    gitList(cwd, ['ls-files', '--others', '-i', '--exclude-standard']),
-  ]);
-
-  return [...new Set([...unstaged, ...staged, ...untracked, ...ignored])].sort();
-}
-
-async function gitList(cwd: string, args: string[]): Promise<string[]> {
-  const result = await execFileAsync('git', args, { cwd, encoding: 'utf-8' });
-  return result.stdout
-    .split('\n')
-    .map((line) => line.trim())
-    .filter(Boolean);
-}
diff --git a/src/optimizer/mutation/pi-coding.ts b/src/optimizer/mutation/pi-coding.ts
deleted file mode 100644
index 697c9c0..0000000
--- a/src/optimizer/mutation/pi-coding.ts
+++ /dev/null
@@ -1,216 +0,0 @@
-import { readFileSync } from 'node:fs';
-import { createHash } from 'node:crypto';
-import { dirname, basename } from 'node:path';
-import type { AgentMessage } from '@mariozechner/pi-agent-core';
-import type { MutationCandidate, MutationContext } from '../types.js';
-import { collectGitChangedFiles } from './git-changes.js';
-import { buildMutationContext } from '../feedback/mutation-context.js';
-import { createCodingOrchestratorSession } from '../../runtime/pi/index.js';
-import { SKILL_WRITING_GUIDE } from './skill-writing-guide.js';
-
-export class PiCodingMutationExecutor {
-  async apply(context: MutationContext): Promise<MutationCandidate> {
-    const mutation = context.manifest.mutation;
-    if (!mutation) {
-      throw new Error('Optimize manifest must define a "mutation" section for pi-coding execution');
-    }
-
-    // When localSkillPath is provided, the skill file is a local versioned copy
-    // outside the target repo. Set cwd to the skill file's directory so the agent
-    // operates in isolation — it can't see or accidentally edit the target repo,
-    // and the file it needs to edit is right in its working directory.
-    const agentCwd = context.localSkillPath
-      ? dirname(context.localSkillPath)
-      : context.manifest.targetRepo.path;
-
-    // Snapshot the skill file before mutation so we can detect no-ops.
-    const beforeHash = context.localSkillPath ? hashFile(context.localSkillPath) : null;
-
-    const { session } = await createCodingOrchestratorSession({
-      cwd: agentCwd,
-      modelRef: `${mutation.provider}/${mutation.model}`,
-      authMode: mutation.authMode,
-      apiKeyEnv: mutation.apiKeyEnv,
-      thinkingLevel: mutation.thinkingLevel ?? 'medium',
-    });
-
-    await session.prompt(buildMutationPrompt(context));
-
-    const messages = session.state.messages;
-    const toolActivity = extractToolActivity(messages);
-    const assistantText = extractLatestAssistantText(messages);
-
-    // If the orchestrator produced no text and no tool calls the model failed to act.
-    // This usually means the optimizer model is too weak or the API call silently failed.
-    // Throw rather than silently producing an identical skill file and wasting a benchmark run.
-    if (!assistantText && toolActivity.length === 0) {
-      const modelRef = `${mutation.provider}/${mutation.model}`;
-      throw new Error(
-        `Orchestrator model "${modelRef}" produced no output (no tool calls, no text response). ` +
-        `The model may be too weak for coding-orchestrator tasks or the API call failed silently. ` +
-        `Try a more capable model such as openrouter/anthropic/claude-sonnet-4.6.`,
-      );
-    }
-
-    // Warn when the agent responded with text but never called a tool to modify the file.
-    if (context.localSkillPath && toolActivity.length === 0) {
-      console.warn('[mutation] WARNING: orchestrator produced text but made no tool calls — skill file unchanged');
-    }
-
-    // Detect no-op: agent ran but the file content did not change.
-    const afterHash = context.localSkillPath ? hashFile(context.localSkillPath) : null;
-    if (beforeHash !== null && afterHash !== null && beforeHash === afterHash) {
-      console.warn('[mutation] WARNING: skill file content is identical before and after mutation');
-    }
-
-    // Local skill file: return the path directly; git detection only applies to target-repo mutations.
-    const changedFiles = context.localSkillPath
-      ? [context.localSkillPath]
-      : await collectGitChangedFiles(context.manifest.targetRepo.path);
-    const summary = assistantText
-      ?? context.failureBuckets[0]?.kind
-      ?? 'benchmark failures';
-
-    return {
-      summary,
-      changedFiles,
-      toolActivity,
-    };
-  }
-}
-
-function buildMutationPrompt(context: MutationContext): string {
-  // If we have a local skill path, that's the only file the agent should edit.
-  // The path is absolute so the agent can find it regardless of cwd.
-  const allowedPaths = context.localSkillPath
-    ? `- ${basename(context.localSkillPath)}  (in current working directory)`
-    : context.manifest.targetRepo.allowedPaths.map((p) => `- ${p}`).join('\n');
-  const feedbackCtx = buildMutationContext(
-    context.currentReport,
-    context.manifest.mutation?.reportContextMaxBytes ?? 16_000,
-  );
-  const reportContext = feedbackCtx.serialized || null;
-  const fallbackFailureSummary = context.failureBuckets.length === 0
-    ? '- No failure buckets were detected; improve benchmark pass rate conservatively.'
-    : context.failureBuckets
-      .slice(0, 5)
-      .map((bucket) => `- ${bucket.kind}: ${bucket.count} failures`)
-      .join('\n');
-
-  const skillWritingSection = context.localSkillPath
-    ? [
-        '',
-        SKILL_WRITING_GUIDE,
-        '',
-      ].join('\n')
-    : '';
-
-  const skillFileName = context.localSkillPath ? basename(context.localSkillPath) : null;
-  const skillPreamble = skillFileName
-    ? [
-        `Improve the skill documentation file: ${skillFileName}`,
-        '(This file is in your current working directory.)',
-        '',
-        'IMPORTANT: Read the file first, then make surgical edits.',
-        '- Do NOT rewrite or replace the file — patch only the sections that are weak or missing.',
-        '- Preserve every command/action that is already documented and passing.',
-        '- The skill must continue to cover ALL commands in the surface, not just the failing ones.',
-        '- Add or expand only the sections that address the benchmark failures below.',
-      ].join('\n')
-    : 'Improve this repository for LLM usability based on benchmark feedback.';
-
-  return [
-    skillPreamble,
-    '',
-    'Constraints:',
-    '- The benchmark tool schema is frozen. Do not modify benchmark tool definitions, expected tool APIs, or benchmark task contracts.',
-    '- Only edit files under these allowed paths:',
-    allowedPaths,
-    '- Do not edit files outside the allowed paths, even if they seem related.',
-    '- Preserve overall product correctness.',
-    '- Prefer the smallest change that improves agent usability.',
-    '- If the tool names are cryptic, you may introduce a friendly alias glossary in the docs (for example, "get_ticket -> get_tkt") without changing the actual schema.',
-    '- Make the docs explicit about what each parameter means and which values are allowed.',
-    '',
-    `Current overall pass rate: ${context.currentReport.summary.overallPassRate.toFixed(3)}`,
-    reportContext
-      ? 'Use the following persisted benchmark details as the primary source of truth:'
-      : 'Fallback failure summary (use this only because no persisted report context is available):',
-    reportContext ?? fallbackFailureSummary,
-    '',
-    skillWritingSection,
-    'Make the changes directly in the repo and stop when the changes are applied.',
-    'In your final response, explain in 2-4 concise bullet points:',
-    '- what you changed',
-    '- why it should improve model tool use',
-    '- any remaining weak spots you still see',
-  ].join('\n');
-}
-
-function extractLatestAssistantText(messages: AgentMessage[]): string | null {
-  for (let index = messages.length - 1; index >= 0; index--) {
-    const message = messages[index] as {
-      role?: string;
-      content?: Array<{ type?: string; text?: string }> | string;
-    };
-    if (message.role !== 'assistant') continue;
-
-    if (typeof message.content === 'string' && message.content.trim()) {
-      return message.content.trim();
-    }
-
-    if (Array.isArray(message.content)) {
-      const text = message.content
-        .filter((block) => block?.type === 'text' && typeof block.text === 'string')
-        .map((block) => block.text!.trim())
-        .filter(Boolean)
-        .join('\n')
-        .trim();
-      if (text) return text;
-    }
-  }
-
-  return null;
-}
-
-function extractToolActivity(messages: AgentMessage[]): string[] {
-  const lines: string[] = [];
-  for (const message of messages as Array<{
-    role?: string;
-    toolName?: string;
-    content?: Array<{ type?: string; text?: string; name?: string; arguments?: Record<string, unknown> }> | string;
-  }>) {
-    if (message.role === 'assistant' && Array.isArray(message.content)) {
-      for (const block of message.content) {
-        if (block?.type !== 'toolCall' || typeof block.name !== 'string') continue;
-        const args = block.arguments && Object.keys(block.arguments).length > 0
-          ? ` ${JSON.stringify(block.arguments)}`
-          : '';
-        lines.push(`tool call: ${block.name}${args}`);
-      }
-      continue;
-    }
-
-    if (message.role === 'toolResult' && Array.isArray(message.content)) {
-      const text = message.content
-        .filter((block) => block?.type === 'text' && typeof block.text === 'string')
-        .map((block) => block.text!.trim())
-        .filter(Boolean)
-        .join(' ')
-        .trim();
-      if (text) {
-        lines.push(`tool result (${message.toolName ?? 'tool'}): ${text}`);
-      }
-    }
-  }
-
-  return lines;
-}
-
-function hashFile(filePath: string): string | null {
-  try {
-    return createHash('sha256').update(readFileSync(filePath)).digest('hex');
-  } catch {
-    return null;
-  }
-}
diff --git a/src/optimizer/mutation/skill-writing-guide.ts b/src/optimizer/mutation/skill-writing-guide.ts
deleted file mode 100644
index fb68381..0000000
--- a/src/optimizer/mutation/skill-writing-guide.ts
+++ /dev/null
@@ -1,117 +0,0 @@
-/**
- * Guidance for writing and improving SKILL.md files.
- *
- * Sourced from the anthropics/skills skill-creator skill.
- * Sections included: Write the SKILL.md, Skill Writing Guide, Improving the skill.
- * Sections omitted: eval/viewer infrastructure, test cases, description optimization,
- * packaging — none of those apply to the mutation context.
- */
-export const SKILL_WRITING_GUIDE = `
----BEGIN SKILL WRITING GUIDE (source: anthropics/skills skill-creator)---
-
-## Write the SKILL.md
-
-Fill in these components:
-
-- **name**: Skill identifier (do not change)
-- **description**: When to trigger, what it does. This is the primary triggering mechanism —
-  include both what the skill does AND specific contexts for when to use it. All "when to use"
-  info goes here, not in the body. Claude has a tendency to "undertrigger" skills. To combat
-  this, make the description a little bit "pushy". For instance, instead of "How to use the
-  fast CLI tool", write "How to use the fast CLI tool. Use this skill whenever the user mentions
-  sending tokens, checking balances, managing accounts, or working with blockchain transactions —
-  even if they don't explicitly ask for the fast CLI."
-- **the rest of the skill**: markdown instructions that guide the model
-
-## Skill Writing Guide
-
-### Anatomy of a Skill
-
-\`\`\`
-skill-name/
-├── SKILL.md (required)
-│   ├── YAML frontmatter (name, description required)
-│   └── Markdown instructions
-└── Bundled Resources (optional)
-    ├── scripts/    - Executable code for deterministic/repetitive tasks
-    ├── references/ - Docs loaded into context as needed
-    └── assets/     - Files used in output (templates, icons, fonts)
-\`\`\`
-
-### Progressive Disclosure
-
-Skills use a three-level loading system:
-1. **Metadata** (name + description) — Always in context (~100 words)
-2. **SKILL.md body** — In context whenever skill triggers (<500 lines ideal)
-3. **Bundled resources** — As needed (unlimited, scripts can execute without loading)
-
-Key patterns:
-- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers
-- Reference files clearly from SKILL.md with guidance on when to read them
-- For large reference files (>300 lines), include a table of contents
-
-### Writing Patterns
-
-Prefer using the imperative form in instructions.
-
-**Defining output formats:**
-\`\`\`markdown
-## Report structure
-ALWAYS use this exact template:
-# [Title]
-## Executive summary
-## Key findings
-## Recommendations
-\`\`\`
-
-**Examples pattern — include concrete examples of correct call patterns:**
-\`\`\`markdown
-## Send tokens
-**Example:** Send 0.01 ETH from Base to Ethereum
-fast send 0x1234...abcd 0.01 --token ETH --from-chain base --to-chain ethereum
-\`\`\`
-
-### Writing Style
-
-Try to explain to the model *why* things are important in lieu of heavy-handed MUSTs. Use theory
-of mind and try to make the skill general and not super-narrow to specific examples. Start by
-writing a draft and then look at it with fresh eyes and improve it.
-
-## How to think about improvements
-
-1. **Generalize from the feedback.** We're trying to create skills that work across many different
-   prompts, not just the benchmark tasks. Here we're iterating on a small set of failing examples
-   because it helps move faster — but if the fix only works for those examples, it's useless.
-   Rather than adding narrow rules for each failing case, find the underlying confusion and address
-   the root cause with clearer conceptual framing. Avoid fiddly overfitty changes and oppressively
-   constrictive MUSTs. If some issue is stubborn, try branching out and using different metaphors
-   or recommending different patterns of working — it's relatively cheap to try.
-
-2. **Keep the prompt lean.** Remove things that aren't pulling their weight. Make sure to read the
-   actual failure details, not just the summary — if guidance doesn't change model behavior in
-   practice, cut it. If some instruction is making the model waste time doing unproductive things,
-   remove the part of the skill that's causing it. Every line the model reads takes attention.
-
-3. **Explain the why.** Try hard to explain the *why* behind everything you're asking the model to
-   do. Today's LLMs are smart — they have good theory of mind and when given good reasoning they
-   apply it intelligently to novel situations. Even if the feedback is terse or frustrated, try to
-   actually understand the underlying confusion and transmit that understanding into the
-   instructions. If you find yourself writing ALWAYS or NEVER in all caps, or using super rigid
-   structures, that's a yellow flag — reframe and explain the reasoning instead. That's a more
-   humane, powerful, and effective approach.
-
-4. **Surface the non-obvious.** Gotchas and pre-conditions that a model wouldn't infer from tool
-   names are the highest-value content. Things like required ordering, mutually exclusive options,
-   cases where a flag has no effect, and default behaviors that surprise users.
-
-5. **Be explicit about parameters.** Models frequently hallucinate argument names or invent values.
-   For each important command/method/tool: name the required parameters, list valid values or
-   formats, and clarify which parameters are optional and their defaults.
-
-6. **Look for patterns across failures.** If multiple failing tasks share the same root confusion
-   (e.g., they all misuse the same flag, or they all pick the wrong subcommand for the same
-   reason), that's a signal to fix the conceptual framing for that area rather than patching each
-   case individually.
-
----END SKILL WRITING GUIDE---
-`.trim();
diff --git a/src/optimizer/progress-table.ts b/src/optimizer/progress-table.ts
deleted file mode 100644
index e3ef4ce..0000000
--- a/src/optimizer/progress-table.ts
+++ /dev/null
@@ -1,107 +0,0 @@
-import type { BenchmarkReport } from '../benchmark/types.js';
-import type { OptimizeIteration } from './types.js';
-
-function pct(rate: number): string {
-  return `${(rate * 100).toFixed(1)}%`;
-}
-
-function delta(before: number, after: number): string {
-  const d = (after - before) * 100;
-  return d >= 0 ? `+${d.toFixed(1)}%` : `${d.toFixed(1)}%`;
-}
-
-function modelDisplayName(id: string): string {
-  // Strip openrouter/ prefix for readability
-  return id.startsWith('openrouter/') ? id.slice('openrouter/'.length) : id;
-}
-
-function pad(s: string, width: number, align: 'left' | 'right' | 'center' = 'right'): string {
-  if (s.length >= width) return s;
-  const spaces = width - s.length;
-  if (align === 'left') return s + ' '.repeat(spaces);
-  if (align === 'center') {
-    const left = Math.floor(spaces / 2);
-    return ' '.repeat(left) + s + ' '.repeat(spaces - left);
-  }
-  return ' '.repeat(spaces) + s;
-}
-
-export function renderProgressTable(
-  baselineReport: BenchmarkReport,
-  bestReport: BenchmarkReport,
-  iterations: OptimizeIteration[],
-): string {
-  // Only iterations where a benchmark actually ran (scoreAfter is set)
-  const benchedIterations = iterations.filter(it => it.scoreAfter !== undefined);
-
-  // Collect all model IDs from baseline (same set across all runs)
-  const modelIds = Object.keys(baselineReport.summary.perModel);
-  if (modelIds.length === 0 && benchedIterations.length === 0) return '';
-
-  // Display names
-  const displayNames = modelIds.map(modelDisplayName);
-  const modelColWidth = Math.max(
-    'Model'.length,
-    'Overall'.length,
-    ...displayNames.map(n => n.length),
-  );
-
-  // Column headers: Baseline | I1 | I2 | ... | Final | Δ
-  const iterHeaders = benchedIterations.map(it => `I${it.index}`);
-  const dataColWidth = 8; // enough for "100.0%  " or "+100.0%"
-
-  const allHeaders = ['Baseline', ...iterHeaders, 'Final', 'Δ'];
-  const colWidths = allHeaders.map(h => Math.max(h.length, dataColWidth));
-
-  function row(label: string, cells: string[]): string {
-    const labelCell = pad(label, modelColWidth, 'left');
-    const dataCells = cells.map((c, i) => pad(c, colWidths[i]!, 'center'));
-    return `│ ${labelCell} │ ${dataCells.join(' │ ')} │`;
-  }
-
-  function divider(left: string, mid: string, right: string, sep: string): string {
-    const modelSeg = '─'.repeat(modelColWidth + 2);
-    const datasegs = colWidths.map(w => '─'.repeat(w + 2));
-    return left + modelSeg + sep + datasegs.join(sep) + right;
-  }
-
-  function modelRow(modelId: string, displayName: string): string {
-    const baseline = baselineReport.summary.perModel[modelId];
-    const final = bestReport.summary.perModel[modelId];
-    if (!baseline || !final) return row(displayName, allHeaders.map(() => '—'));
-
-    const iterCells = benchedIterations.map(it => {
-      const pm = it.perModelAfter?.[modelId];
-      return pm ? pct(pm.passRate) : '—';
-    });
-
-    const d = delta(baseline.passRate, final.passRate);
-    return row(displayName, [pct(baseline.passRate), ...iterCells, pct(final.passRate), d]);
-  }
-
-  function overallRow(): string {
-    const baselineRate = baselineReport.summary.overallPassRate;
-    const finalRate = bestReport.summary.overallPassRate;
-    const iterCells = benchedIterations.map(it =>
-      it.scoreAfter !== undefined ? pct(it.scoreAfter) : '—'
-    );
-    const d = delta(baselineRate, finalRate);
-    return row('Overall', [pct(baselineRate), ...iterCells, pct(finalRate), d]);
-  }
-
-  const headerRow = row('Model', allHeaders.map((h, i) => pad(h, colWidths[i]!, 'center').trim()));
-
-  const lines: string[] = [
-    '',
-    '=== Optimization Progress ===',
-    divider('┌', '┬', '┐', '┬'),
-    headerRow,
-    divider('├', '┼', '┤', '┼'),
-    ...modelIds.map((id, i) => modelRow(id, displayNames[i]!)),
-    divider('├', '┼', '┤', '┼'),
-    overallRow(),
-    divider('└', '┴', '┘', '┴'),
-  ];
-
-  return lines.join('\n');
-}
diff --git a/src/optimizer/repo-state.ts b/src/optimizer/repo-state.ts
deleted file mode 100644
index beb4a4c..0000000
--- a/src/optimizer/repo-state.ts
+++ /dev/null
@@ -1,104 +0,0 @@
-import { execFile } from 'node:child_process';
-import { promisify } from 'node:util';
-
-import type { MutationCandidate, ResolvedOptimizeManifest } from './types.js';
-import { collectGitChangedFiles } from './mutation/git-changes.js';
-
-const execFileAsync = promisify(execFile);
-
-export function createRepoStateManager() {
-  return {
-    async ensureReady(targetRepo: ResolvedOptimizeManifest['targetRepo']): Promise<string> {
-      const status = await git(targetRepo.path, ['status', '--porcelain']);
-      const dirtyPaths = parseStatusPaths(status.stdout).filter(
-        (file) => !isIgnoredCleanPath(file, targetRepo.cleanIgnorePaths ?? []),
-      );
-      if (targetRepo.requireCleanGit && dirtyPaths.length > 0) {
-        throw new Error(`Target repo must be clean before optimize runs: ${targetRepo.path}`);
-      }
-      return 'ready';
-    },
-
-    async captureCheckpoint(targetRepo: ResolvedOptimizeManifest['targetRepo']): Promise<string> {
-      const result = await git(targetRepo.path, ['rev-parse', 'HEAD']);
-      return result.stdout.trim();
-    },
-
-    async restoreCheckpoint(targetRepo: ResolvedOptimizeManifest['targetRepo'], checkpoint: string): Promise<void> {
-      await git(targetRepo.path, ['restore', `--source=${checkpoint}`, '--staged', '--worktree', '.']);
-      await git(targetRepo.path, buildCleanArgs(targetRepo.cleanIgnorePaths ?? []));
-    },
-
-    async listChangedFiles(targetRepo: ResolvedOptimizeManifest['targetRepo']): Promise<string[]> {
-      return collectGitChangedFiles(targetRepo.path);
-    },
-
-    async updateAcceptedCheckpoint(
-      targetRepo: ResolvedOptimizeManifest['targetRepo'],
-      checkpoint: string,
-      candidate: MutationCandidate,
-      changedFiles: string[] = candidate.changedFiles,
-    ): Promise<string> {
-      const status = await git(targetRepo.path, ['status', '--porcelain']);
-      if (status.stdout.trim() === '') {
-        return checkpoint;
-      }
-
-      if (changedFiles.length === 0) {
-        return checkpoint;
-      }
-
-      await git(targetRepo.path, ['add', '-A', '--', ...changedFiles]);
-      await git(targetRepo.path, ['commit', '-m', buildIterationCommitMessage(candidate.summary)], {
-        GIT_AUTHOR_NAME: process.env.GIT_AUTHOR_NAME ?? 'skill-optimizer',
-        GIT_AUTHOR_EMAIL: process.env.GIT_AUTHOR_EMAIL ?? 'skill-optimizer@local',
-        GIT_COMMITTER_NAME: process.env.GIT_COMMITTER_NAME ?? process.env.GIT_AUTHOR_NAME ?? 'skill-optimizer',
-        GIT_COMMITTER_EMAIL: process.env.GIT_COMMITTER_EMAIL ?? process.env.GIT_AUTHOR_EMAIL ?? 'skill-optimizer@local',
-      });
-      const result = await git(targetRepo.path, ['rev-parse', 'HEAD']);
-      return result.stdout.trim();
-    },
-  };
-}
-
-function parseStatusPaths(statusOutput: string): string[] {
-  return statusOutput
-    .split(/\r?\n/)
-    .map((line) => line.trimEnd())
-    .filter(Boolean)
-    .map((line) => line.slice(3));
-}
-
-function isIgnoredCleanPath(file: string, ignoredPaths: string[]): boolean {
-  const normalizedFile = normalizeRelativePath(file);
-  return ignoredPaths.some((ignoredPath) => {
-    const normalizedIgnored = normalizeRelativePath(ignoredPath);
-    return normalizedFile === normalizedIgnored || normalizedFile.startsWith(`${normalizedIgnored}/`);
-  });
-}
-
-function normalizeRelativePath(path: string): string {
-  return path.replace(/^\.\//, '').replace(/^\/+/, '').replace(/\/+$/, '');
-}
-
-function buildIterationCommitMessage(summary: string): string {
-  const normalized = summary.trim().replace(/\s+/g, ' ');
-  return `chore(optimize): ${normalized || 'accept optimizer iteration'}`;
-}
-
-function buildCleanArgs(cleanIgnorePaths: string[]): string[] {
-  const args = ['clean', '-fdx'];
-  for (const ignoredPath of cleanIgnorePaths) {
-    args.push('-e', ignoredPath);
-  }
-  return args;
-}
-
-async function git(cwd: string, args: string[], env?: Record<string, string>): Promise<{ stdout: string; stderr: string }> {
-  try {
-    return await execFileAsync('git', args, { cwd, encoding: 'utf-8', env: env ? { ...process.env, ...env } : process.env });
-  } catch (error) {
-    const err = error as Error & { stdout?: string; stderr?: string };
-    throw new Error(`git ${args.join(' ')} failed in ${cwd}: ${err.stderr ?? err.message}`);
-  }
-}
diff --git a/src/optimizer/types.ts b/src/optimizer/types.ts
deleted file mode 100644
index 77d2c8e..0000000
--- a/src/optimizer/types.ts
+++ /dev/null
@@ -1,181 +0,0 @@
-import type { BenchmarkReport, BenchmarkSurface, ModelConfig, ModelSummary } from '../benchmark/types.js';
-import type { PiAuthMode } from '../runtime/pi/auth.js';
-
-export type FailureBucketKind = 'missing-tool' | 'bad-args' | 'hallucination' | 'error';
-export type StopReason = 'max-iterations' | 'stable';
-
-export interface OptimizeTaskGenerationConfig {
-  enabled?: boolean;
-  maxGenerated?: number;
-  seed?: number;
-  outputDir?: string;
-}
-
-export interface OptimizeTargetRepo {
-  path: string;
-  surface: BenchmarkSurface;
-  allowedPaths: string[];
-  surfacePaths?: string[];
-  cleanIgnorePaths?: string[];
-  validation: string[];
-  requireCleanGit?: boolean;
-}
-
-export interface OptimizePolicy {
-  mode?: 'stable-surface' | 'surface-changing';
-  maxIterations?: number;
-  stabilityWindow?: number;
-  minImprovement?: number;
-  taskGeneration?: OptimizeTaskGenerationConfig;
-  perModelFloor?: number;
-  targetWeightedAverage?: number;
-  models?: ModelConfig[];
-}
-
-export interface OptimizeMutationConfig {
-  provider: string;
-  model: string;
-  thinkingLevel?: 'off' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh';
-  authMode?: PiAuthMode;
-  apiKeyEnv?: string;
-  reportContextMaxBytes?: number;
-}
-
-export interface OptimizeManifest {
-  benchmarkConfig: string;
-  targetRepo: OptimizeTargetRepo;
-  optimizer?: OptimizePolicy;
-  mutation?: OptimizeMutationConfig;
-}
-
-export interface ResolvedOptimizeManifest {
-  benchmarkConfig: string;
-  /** Absolute path to the source SKILL.md in the target repo, if it is a local file. */
-  skillPath?: string;
-  targetRepo: {
-    path: string;
-    surface: BenchmarkSurface;
-    allowedPaths: string[];
-    surfacePaths?: string[];
-    cleanIgnorePaths?: string[];
-    validation: string[];
-    requireCleanGit: boolean;
-  };
-  optimizer: {
-    mode: 'stable-surface' | 'surface-changing';
-    maxIterations: number;
-    stabilityWindow: number;
-    minImprovement: number;
-    taskGeneration: {
-      enabled: boolean;
-      maxGenerated: number;
-      seed: number;
-      outputDir: string;
-    };
-    perModelFloor: number;
-    targetWeightedAverage: number;
-    models: ModelConfig[];
-  };
-  mutation?: OptimizeMutationConfig & {
-    reportContextMaxBytes: number;
-  };
-}
-
-export interface TaskGenerationResult {
-  benchmarkConfigPath: string;
-  taskCount: number;
-  rejectedCount: number;
-}
-
-export interface FailureBucket {
-  kind: FailureBucketKind;
-  count: number;
-  taskIds: string[];
-  modelIds: string[];
-}
-
-export interface MutationCandidate {
-  summary: string;
-  changedFiles: string[];
-  toolActivity?: string[];
-}
-
-export interface ValidationCommandResult {
-  command: string;
-  ok: boolean;
-  exitCode: number;
-  stdout: string;
-  stderr: string;
-}
-
-export interface ValidationResult {
-  ok: boolean;
-  commands: ValidationCommandResult[];
-}
-
-export interface OptimizeIteration {
-  index: number;
-  accepted: boolean;
-  summary: string;
-  changedFiles: string[];
-  validation: ValidationResult;
-  scoreBefore: number;
-  scoreAfter?: number;
-  /** Per-model pass rates after this iteration's benchmark run (absent when no benchmark ran) */
-  perModelAfter?: Record<string, ModelSummary>;
-  delta: number;
-  failureBuckets: FailureBucket[];
-}
-
-export interface OptimizeResult {
-  baselineReport: BenchmarkReport;
-  bestReport: BenchmarkReport;
-  iterations: OptimizeIteration[];
-  stopReason: StopReason;
-  generation?: TaskGenerationResult;
-}
-
-export interface MutationContext {
-  manifest: ResolvedOptimizeManifest;
-  iteration: number;
-  currentReport: BenchmarkReport;
-  failureBuckets: FailureBucket[];
-  reportPath: string | null;
-  /**
-   * Absolute path to the local skill file for this iteration
-   * (e.g. `.skill-optimizer/skill-v1.md`). When present, the mutation
-   * executor must write its changes to this path instead of the target repo.
-   */
-  localSkillPath?: string;
-}
-
-export interface OptimizeLoopDependencies {
-  benchmark: {
-    run(
-      configPath: string,
-      opts: { outputDir: string; label: string; verdictPolicy?: { perModelFloor: number; targetWeightedAverage: number }; skillOverride?: string },
-    ): Promise<{ report: BenchmarkReport; reportPath: string }>;
-  };
-  repo: {
-    ensureReady(targetRepo: ResolvedOptimizeManifest['targetRepo']): Promise<string>;
-    captureCheckpoint(targetRepo: ResolvedOptimizeManifest['targetRepo']): Promise<string>;
-    restoreCheckpoint(targetRepo: ResolvedOptimizeManifest['targetRepo'], checkpoint: string): Promise<void>;
-    updateAcceptedCheckpoint(targetRepo: ResolvedOptimizeManifest['targetRepo'], checkpoint: string, candidate: MutationCandidate, changedFiles?: string[]): Promise<string>;
-    listChangedFiles?(targetRepo: ResolvedOptimizeManifest['targetRepo']): Promise<string[]>;
-  };
-  mutation: {
-    apply(context: MutationContext): Promise<MutationCandidate>;
-  };
-  taskGenerator?: {
-    generate(
-      manifest: ResolvedOptimizeManifest,
-      opts: { outputDir: string },
-    ): Promise<TaskGenerationResult>;
-  };
-  validation: {
-    run(targetRepo: ResolvedOptimizeManifest['targetRepo']): Promise<ValidationResult>;
-  };
-  ledger: {
-    record(event: Record<string, unknown>): Promise<void>;
-  };
-}
diff --git a/src/optimizer/validation.ts b/src/optimizer/validation.ts
deleted file mode 100644
index 6d33268..0000000
--- a/src/optimizer/validation.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-import { exec } from 'node:child_process';
-import { promisify } from 'node:util';
-
-import type { ResolvedOptimizeManifest, ValidationCommandResult, ValidationResult } from './types.js';
-
-const execAsync = promisify(exec);
-
-export function createValidationRunner() {
-  return {
-    async run(targetRepo: ResolvedOptimizeManifest['targetRepo']): Promise<ValidationResult> {
-      const commands: ValidationCommandResult[] = [];
-
-      for (const command of targetRepo.validation) {
-        try {
-          const result = await execAsync(command, {
-            cwd: targetRepo.path,
-            encoding: 'utf-8',
-            maxBuffer: 10 * 1024 * 1024,
-          });
-          commands.push({
-            command,
-            ok: true,
-            exitCode: 0,
-            stdout: result.stdout,
-            stderr: result.stderr,
-          });
-        } catch (error) {
-          const err = error as Error & { code?: number; stdout?: string; stderr?: string };
-          commands.push({
-            command,
-            ok: false,
-            exitCode: typeof err.code === 'number' ? err.code : 1,
-            stdout: err.stdout ?? '',
-            stderr: err.stderr ?? err.message,
-          });
-          return { ok: false, commands };
-        }
-      }
-
-      return { ok: true, commands };
-    },
-  };
-}
diff --git a/src/project/adapters.ts b/src/project/adapters.ts
deleted file mode 100644
index f0f3187..0000000
--- a/src/project/adapters.ts
+++ /dev/null
@@ -1,120 +0,0 @@
-import type { BenchmarkConfig, SkillConfig } from '../benchmark/types.js';
-import type { ResolvedOptimizeManifest } from '../optimizer/types.js';
-import type { ResolvedProjectConfig } from './types.js';
-import { parseModelRef } from './types.js';
-import { buildMcpToolDefinitionsFromSnapshot, buildSurfaceSnapshot } from './snapshot.js';
-
-export function toBenchmarkConfig(project: ResolvedProjectConfig): BenchmarkConfig {
-  const surfaceSnapshot = buildSurfaceSnapshot(project);
-  return {
-    name: project.name,
-    surface: project.target.surface,
-    sdk: project.target.sdk && {
-      language: project.target.sdk.language,
-      style: project.target.sdk.style,
-      apiSurface: project.target.sdk.apiSurface,
-    },
-    cli: project.target.cli,
-    mcp: project.target.mcp,
-    skill: project.target.skill as SkillConfig | undefined,
-    tasks: project.benchmark.tasks ?? '__generated__',
-    llm: {
-      format: project.benchmark.format,
-      baseUrl: project.benchmark.baseUrl,
-      authMode: project.benchmark.authMode,
-      apiKeyEnv: project.benchmark.apiKeyEnv,
-      timeout: project.benchmark.timeout,
-      headers: project.benchmark.headers,
-      models: project.benchmark.models,
-    },
-    output: project.benchmark.output,
-    agentic: project.benchmark.agentic,
-    surfaceSnapshot,
-    mcpToolDefinitions: project.target.surface === 'mcp'
-      ? buildMcpToolDefinitionsFromSnapshot(surfaceSnapshot)
-      : undefined,
-  };
-}
-
-export function toOptimizeManifest(project: ResolvedProjectConfig): ResolvedOptimizeManifest {
-  const optimize = project.optimize;
-  if (!optimize || !optimize.enabled) {
-    throw new Error(`Project ${project.configPath} does not have optimization enabled`);
-  }
-
-  const mutationModel = parseModelRef(optimize.model);
-
-  // Resolve the local skill path — only for file-system sources (not github:/https:)
-  const skillSource = project.target.skill?.source;
-  const skillPath = skillSource && !skillSource.startsWith('github:') && !skillSource.startsWith('http')
-    ? skillSource
-    : undefined;
-
-  return {
-    benchmarkConfig: project.configPath,
-    skillPath,
-    targetRepo: {
-      path: project.target.repoPath,
-      surface: project.target.surface,
-      allowedPaths: optimize.allowedPaths,
-      surfacePaths: getProjectSurfacePaths(project),
-      validation: optimize.validation,
-      requireCleanGit: optimize.requireCleanGit,
-    },
-    optimizer: {
-      mode: optimize.mode,
-      maxIterations: optimize.maxIterations,
-      stabilityWindow: optimize.stabilityWindow,
-      minImprovement: optimize.minImprovement,
-      taskGeneration: {
-        enabled: project.benchmark.taskGeneration.enabled,
-        maxGenerated: project.benchmark.taskGeneration.maxTasks,
-        seed: project.benchmark.taskGeneration.seed,
-        outputDir: project.benchmark.taskGeneration.outputDir,
-      },
-      perModelFloor: project.benchmark.verdict.perModelFloor,
-      targetWeightedAverage: project.benchmark.verdict.targetWeightedAverage,
-      models: project.benchmark.models,
-    },
-    mutation: {
-      provider: mutationModel.provider,
-      model: mutationModel.model,
-      authMode: optimize.authMode,
-      apiKeyEnv: optimize.apiKeyEnv,
-      thinkingLevel: optimize.thinkingLevel,
-      reportContextMaxBytes: optimize.reportContextMaxBytes,
-    },
-  };
-}
-
-function getProjectSurfacePaths(project: ResolvedProjectConfig): string[] {
-  const paths = new Set<string>();
-
-  for (const source of project.target.discovery?.sources ?? []) {
-    paths.add(source);
-  }
-
-  if (project.target.discovery?.fallbackManifest) {
-    paths.add(project.target.discovery.fallbackManifest);
-  }
-
-  for (const entrypoint of project.target.sdk?.entrypoints ?? []) {
-    paths.add(entrypoint);
-  }
-
-  if (project.target.cli?.commands) {
-    paths.add(project.target.cli.commands);
-  }
-
-  if (project.target.mcp?.tools) {
-    paths.add(project.target.mcp.tools);
-  }
-
-  // Pinned surface snapshots are part of the surface definition — edits to
-  // them must be tracked so stable-surface mode can detect drift.
-  if (project.benchmark.surfaceSnapshot) {
-    paths.add(project.benchmark.surfaceSnapshot);
-  }
-
-  return [...paths];
-}
diff --git a/src/project/discover-prompt.ts b/src/project/discover-prompt.ts
deleted file mode 100644
index 7f242af..0000000
--- a/src/project/discover-prompt.ts
+++ /dev/null
@@ -1,369 +0,0 @@
-import type { ActionDefinition } from '../actions/types.js';
-
-export type PromptCapabilityType = 'phase' | 'instruction' | 'output' | 'decision';
-
-export interface PromptCapability {
-  name: string;
-  description: string;
-  section: string;
-  type: PromptCapabilityType;
-}
-
-// Imperative verbs that start an instruction line.
-const IMPERATIVE_VERBS = [
-  'write', 'generate', 'create', 'check', 'verify', 'ensure', 'add',
-  'remove', 'delete', 'update', 'modify', 'set', 'configure', 'run',
-  'execute', 'build', 'deploy', 'test', 'validate', 'send', 'fetch',
-  'parse', 'extract', 'transform', 'convert', 'compute', 'calculate',
-  'define', 'implement', 'install', 'import', 'export', 'open', 'close',
-  'start', 'stop', 'initialize', 'list', 'search', 'find', 'filter',
-  'sort', 'map', 'reduce', 'merge', 'split', 'read', 'load', 'save',
-  'store', 'output', 'print', 'log', 'return', 'emit', 'publish',
-  'subscribe', 'ask', 'prompt', 'collect', 'gather', 'summarize',
-  'analyze', 'review', 'approve', 'reject', 'iterate', 'loop', 'repeat',
-  'wait', 'retry', 'handle', 'catch', 'throw', 'raise', 'assert',
-  'call', 'invoke', 'trigger', 'notify', 'alert', 'warn',
-];
-
-const IMPERATIVE_RE = new RegExp(
-  `^(?:[-*]\\s+)?(?:\\*\\*)?(?:${IMPERATIVE_VERBS.join('|')})\\b`,
-  'i',
-);
-
-// Heading patterns for phases/steps.
-const PHASE_HEADING_RE = /^##\s+(?:phase|step)\s+(\d+)[:\s—–-]*\s*(.*)/i;
-
-// General ## heading (non-phase).
-const HEADING_RE = /^##\s+(.+)/;
-
-// Decision-point patterns (if/when/then, conditional logic).
-const DECISION_RE = /\b(?:if|when|unless|otherwise|then|else|in case|provided that|assuming)\b/i;
-
-// Code block detection.
-const CODE_BLOCK_OPEN_RE = /^```(\w*)/;
-const CODE_BLOCK_CLOSE_RE = /^```\s*$/;
-
-interface MarkdownSection {
-  heading: string;
-  level: number;
-  body: string;
-  isPhase: boolean;
-  phaseNumber?: number;
-}
-
-function stripFrontmatter(content: string): string {
-  if (!content.startsWith('---')) return content;
-  const closingMatch = content.slice(3).match(/\n---\s*(\n|$)/);
-  if (!closingMatch || closingMatch.index === undefined) return content;
-  return content.slice(3 + closingMatch.index + closingMatch[0].length);
-}
-
-/**
- * Split markdown content into sections by ## headings.
- */
-function splitSections(content: string): MarkdownSection[] {
-  const lines = content.split('\n');
-  const sections: MarkdownSection[] = [];
-  let current: MarkdownSection | null = null;
-  const bodyLines: string[] = [];
-
-  function flushCurrent(): void {
-    if (current) {
-      current.body = bodyLines.join('\n').trim();
-      sections.push(current);
-      bodyLines.length = 0;
-    }
-  }
-
-  for (const line of lines) {
-    const headingMatch = HEADING_RE.exec(line);
-    if (headingMatch) {
-      flushCurrent();
-      bodyLines.length = 0; // discard preamble lines before first heading
-      const phaseMatch = PHASE_HEADING_RE.exec(line);
-      current = {
-        heading: headingMatch[1]!.trim(),
-        level: 2,
-        body: '',
-        isPhase: Boolean(phaseMatch),
-        phaseNumber: phaseMatch ? parseInt(phaseMatch[1]!, 10) : undefined,
-      };
-    } else if (current) {
-      bodyLines.push(line);
-    } else {
-      // Lines before the first ## heading — accumulate in case there is a preamble.
-      bodyLines.push(line);
-    }
-  }
-
-  flushCurrent();
-  return sections;
-}
-
-/**
- * Slugify a heading string into a snake_case identifier.
- *   "Phase 1: Requirements Discovery" -> "phase_1_requirements_discovery"
- */
-function slugify(text: string): string {
-  return text
-    .toLowerCase()
-    .replace(/[^a-z0-9]+/g, '_')
-    .replace(/^_|_$/g, '');
-}
-
-/**
- * Extract imperative instructions from section body text.
- * Returns an array of instruction sentences.
- */
-function extractInstructions(body: string): string[] {
-  const results: string[] = [];
-  let inCodeBlock = false;
-
-  for (const line of body.split('\n')) {
-    if (CODE_BLOCK_OPEN_RE.test(line) && !inCodeBlock) {
-      inCodeBlock = true;
-      continue;
-    }
-    if (CODE_BLOCK_CLOSE_RE.test(line) && inCodeBlock) {
-      inCodeBlock = false;
-      continue;
-    }
-    if (inCodeBlock) continue;
-
-    const trimmed = line.trim();
-    if (IMPERATIVE_RE.test(trimmed)) {
-      // Strip leading markdown list markers and bold markers for the description.
-      const clean = trimmed.replace(/^[-*]\s+/, '').replace(/\*\*/g, '');
-      results.push(clean);
-    }
-  }
-
-  return results;
-}
-
-/**
- * Extract code blocks that represent expected output formats.
- * Returns the raw code block content strings.
- */
-function extractOutputFormats(body: string): string[] {
-  const blocks: string[] = [];
-  let inCodeBlock = false;
-  let currentBlock: string[] = [];
-
-  for (const line of body.split('\n')) {
-    if (CODE_BLOCK_OPEN_RE.test(line) && !inCodeBlock) {
-      inCodeBlock = true;
-      currentBlock = [];
-      continue;
-    }
-    if (CODE_BLOCK_CLOSE_RE.test(line) && inCodeBlock) {
-      inCodeBlock = false;
-      if (currentBlock.length > 0) {
-        blocks.push(currentBlock.join('\n'));
-      }
-      continue;
-    }
-    if (inCodeBlock) {
-      currentBlock.push(line);
-    }
-  }
-
-  return blocks;
-}
-
-/**
- * Detect whether the section body contains decision-point language.
- */
-function hasDecisionPoints(body: string): boolean {
-  let inCodeBlock = false;
-
-  for (const line of body.split('\n')) {
-    if (CODE_BLOCK_OPEN_RE.test(line) && !inCodeBlock) {
-      inCodeBlock = true;
-      continue;
-    }
-    if (CODE_BLOCK_CLOSE_RE.test(line) && inCodeBlock) {
-      inCodeBlock = false;
-      continue;
-    }
-    if (inCodeBlock) continue;
-
-    if (DECISION_RE.test(line)) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-/**
- * Convert a PromptCapability into an ActionDefinition compatible with the
- * benchmark runner and surface snapshot system.
- */
-function capabilityToAction(cap: PromptCapability): ActionDefinition {
-  const key = slugify(cap.name);
-
-  // Build args based on the capability type.
-  const args: ActionDefinition['args'] = [];
-
-  if (cap.type === 'phase') {
-    args.push({
-      name: 'user_brief',
-      required: true,
-      type: 'string',
-      description: 'The product brief from the user',
-    });
-  } else if (cap.type === 'instruction') {
-    args.push({
-      name: 'input',
-      required: true,
-      type: 'string',
-      description: 'The input to process for this instruction',
-    });
-  } else if (cap.type === 'output') {
-    args.push({
-      name: 'content',
-      required: true,
-      type: 'string',
-      description: 'The content to format into the expected output',
-    });
-  } else if (cap.type === 'decision') {
-    args.push(
-      {
-        name: 'condition',
-        required: true,
-        type: 'string',
-        description: 'The condition or context to evaluate',
-      },
-      {
-        name: 'context',
-        required: false,
-        type: 'string',
-        description: 'Additional context for the decision',
-      },
-    );
-  }
-
-  return {
-    key,
-    name: key,
-    description: cap.description,
-    args,
-    source: 'prompt',
-  };
-}
-
-export interface PromptCapabilityWithSection {
-  action: ActionDefinition;
-  /** Raw markdown body text of the section this capability was extracted from. */
-  section: string;
-}
-
-/**
- * Discover capabilities from a markdown prompt/skill file, returning each
- * capability paired with its raw section body text.
- */
-export function discoverPromptCapabilitiesWithSections(
-  skillContent: string,
-): PromptCapabilityWithSection[] {
-  const content = stripFrontmatter(skillContent);
-  const sections = splitSections(content);
-  const result: PromptCapabilityWithSection[] = [];
-  const seenNames = new Set<string>();
-
-  function addWithSection(cap: PromptCapability): void {
-    const key = slugify(cap.name);
-    if (seenNames.has(key)) return;
-    seenNames.add(key);
-    result.push({ action: capabilityToAction(cap), section: cap.section });
-  }
-
-  for (const section of sections) {
-    // 1. Phase capabilities (## Phase N / ## Step N headings).
-    if (section.isPhase && section.phaseNumber !== undefined) {
-      const phaseName = `phase_${section.phaseNumber}_${slugify(section.heading.replace(/^(?:phase|step)\s+\d+[:\s—–-]*/i, ''))}`;
-      const firstSentence = section.body.split(/[.!?\n]/)[0]?.trim() ?? section.heading;
-      addWithSection({
-        name: phaseName,
-        description: firstSentence || section.heading,
-        section: section.body,
-        type: 'phase',
-      });
-    }
-
-    // 2. Instruction capabilities from imperative sentences.
-    const instructions = extractInstructions(section.body);
-    for (const instruction of instructions) {
-      const instructionName = `${slugify(section.heading)}_${slugify(instruction.slice(0, 60))}`;
-      addWithSection({
-        name: instructionName,
-        description: instruction,
-        section: section.body,
-        type: 'instruction',
-      });
-    }
-
-    // 3. Output format capabilities from code blocks.
-    const outputs = extractOutputFormats(section.body);
-    for (let i = 0; i < outputs.length; i++) {
-      const snippet = outputs[i]!;
-      const outputName = `${slugify(section.heading)}_output${outputs.length > 1 ? `_${i + 1}` : ''}`;
-      const preview = snippet.split('\n')[0]?.trim().slice(0, 80) ?? 'code block';
-      addWithSection({
-        name: outputName,
-        description: `Expected output format: ${preview}`,
-        section: section.body,
-        type: 'output',
-      });
-    }
-
-    // 4. Decision-point capabilities.
-    if (hasDecisionPoints(section.body)) {
-      const decisionName = `${slugify(section.heading)}_decision`;
-      addWithSection({
-        name: decisionName,
-        description: `Decision point in "${section.heading}" — evaluate conditional logic`,
-        section: section.body,
-        type: 'decision',
-      });
-    }
-  }
-
-  // Fallback: no ## sections were found — extract from whole content.
-  if (sections.length === 0) {
-    const fallbackInstructions = extractInstructions(content);
-    for (const instruction of fallbackInstructions) {
-      addWithSection({
-        name: slugify(instruction.slice(0, 60)) || 'instruction',
-        description: instruction,
-        section: content,
-        type: 'instruction',
-      });
-    }
-    // Last resort: use the first non-empty content line as a single capability.
-    // Only do this for non-empty content; empty files should still return [] so
-    // buildPromptSurfaceSnapshot's 0-capability guard fires correctly.
-    if (result.length === 0 && content.trim().length > 0) {
-      const firstLine = content.trim().split('\n').find(l => l.trim().length > 0) ?? 'skill';
-      const cleaned = firstLine.replace(/^#+\s*/, '').trim();
-      addWithSection({
-        name: slugify(cleaned) || 'skill',
-        description: cleaned || 'Skill capability',
-        section: content,
-        type: 'instruction',
-      });
-    }
-  }
-
-  return result;
-}
-
-/**
- * Discover capabilities from a markdown prompt/skill file.
- * Parses headings, imperative instructions, code-block output formats,
- * and decision-point logic into ActionDefinition[] compatible with the
- * benchmark runner.
- */
-export function discoverPromptCapabilities(skillContent: string): ActionDefinition[] {
-  return discoverPromptCapabilitiesWithSections(skillContent).map(({ action }) => action);
-}
diff --git a/src/project/fix.ts b/src/project/fix.ts
deleted file mode 100644
index 28ad3a3..0000000
--- a/src/project/fix.ts
+++ /dev/null
@@ -1,45 +0,0 @@
-import type { Issue } from './validate.js';
-
-/**
- * Apply auto-fixable changes to a raw config JSON object.
- * Pure function — deep-clones input, never mutates, never writes to disk.
- */
-export function applyFixes(
-  rawJson: Record<string, unknown>,
-  issues: Issue[],
-  _configDir: string,
-): Record<string, unknown> {
-  const result = JSON.parse(JSON.stringify(rawJson)) as Record<string, unknown>;
-
-  // Track which model indices had their prefix fixed so we don't also apply
-  // the dot-format fix on top (the bad-format issue was generated from the
-  // original ID before the prefix was added).
-  const prefixFixedIndices = new Set<number>();
-
-  for (const issue of issues.filter((i) => i.fixable)) {
-    if (issue.code === 'model-id-missing-prefix' || issue.code === 'model-id-bad-format') {
-      const match = issue.field.match(/^benchmark\.models\[(\d+)\]\.id$/);
-      if (!match) continue;
-      const idx = parseInt(match[1]!, 10);
-      const models = (result.benchmark as Record<string, unknown> | undefined)?.models as Array<Record<string, unknown>> | undefined;
-      if (!models?.[idx]) continue;
-
-      if (issue.code === 'model-id-missing-prefix') {
-        models[idx]!.id = `openrouter/${models[idx]!.id as string}`;
-        prefixFixedIndices.add(idx);
-      }
-
-      if (issue.code === 'model-id-bad-format' && !prefixFixedIndices.has(idx)) {
-        const currentId = models[idx]!.id as string;
-        // Only anthropic/ direct-API IDs get dots rewritten to hyphens.
-        // openrouter/ slugs are passed verbatim; openai/ direct-API IDs use dots (e.g. gpt-5.4).
-        if (!currentId.startsWith('openrouter/') && !currentId.startsWith('openai/')) {
-          models[idx]!.id = currentId.replace(/(\d+)\.(\d+)/g, '$1-$2');
-        }
-      }
-    }
-
-  }
-
-  return result;
-}
diff --git a/src/project/index.ts b/src/project/index.ts
deleted file mode 100644
index ba0f3d7..0000000
--- a/src/project/index.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-export { DEFAULT_PROJECT_CONFIG_NAME, loadProjectConfig } from './load.js';
-export { resolveProjectConfig } from './resolve.js';
-export { buildMcpToolDefinitionsFromSnapshot, buildSurfaceSnapshot, loadSurfaceSnapshotFile } from './snapshot.js';
-export { checkConfig, validateProjectConfig, type Issue, type IssueSeverity } from './validate.js';
-export { toBenchmarkConfig, toOptimizeManifest } from './adapters.js';
-
-export type { ActionCatalog } from '../actions/types.js';
-
-export type {
-  ParsedModelRef,
-  ProjectBenchmarkConfig,
-  ProjectBenchmarkVerdictConfig,
-  ProjectConfig,
-  ProjectDiscoveryConfig,
-  ProjectOptimizeConfig,
-  ProjectScopeConfig,
-  ProjectTargetConfig,
-  ProjectTaskGenerationConfig,
-  ResolvedProjectBenchmarkConfig,
-  ResolvedProjectConfig,
-  SurfaceSnapshot,
-  SurfaceSnapshotAction,
-  ResolvedProjectOptimizeConfig,
-  ResolvedProjectTargetConfig,
-  ResolvedProjectTaskGenerationConfig,
-} from './types.js';
-
-export { isSdkLanguage, parseModelRef } from './types.js';
diff --git a/src/project/load.ts b/src/project/load.ts
deleted file mode 100644
index 6ce09d4..0000000
--- a/src/project/load.ts
+++ /dev/null
@@ -1,44 +0,0 @@
-import { existsSync, readFileSync } from 'node:fs';
-import { resolve } from 'node:path';
-
-import type { ProjectConfig, ResolvedProjectConfig } from './types.js';
-import { resolveProjectConfig } from './resolve.js';
-import { validateProjectConfig } from './validate.js';
-
-export const DEFAULT_PROJECT_CONFIG_NAME = '.skill-optimizer/skill-optimizer.json';
-
-export async function loadProjectConfig(configPath?: string, opts?: { skipDirtyGitCheck?: boolean }): Promise<ResolvedProjectConfig> {
-  const resolvedPath = configPath
-    ? resolve(configPath)
-    : resolve(process.cwd(), DEFAULT_PROJECT_CONFIG_NAME);
-
-  if (!existsSync(resolvedPath)) {
-    throw new Error(
-      `Project config not found: ${resolvedPath}\n` +
-      `Run 'skill-optimizer init' to create one, or specify --config <path>.`,
-    );
-  }
-
-  let raw: string;
-  try {
-    raw = readFileSync(resolvedPath, 'utf-8');
-  } catch (error) {
-    throw new Error(
-      `Failed to read project config: ${resolvedPath}: ` +
-        `${error instanceof Error ? error.message : String(error)}`,
-    );
-  }
-
-  let parsed: ProjectConfig;
-  try {
-    parsed = JSON.parse(raw) as ProjectConfig;
-  } catch (error) {
-    throw new Error(
-      `Invalid JSON in project config ${resolvedPath}: ` +
-        `${error instanceof Error ? error.message : String(error)}`,
-    );
-  }
-
-  await validateProjectConfig(parsed, resolvedPath, { skipDirtyGitCheck: opts?.skipDirtyGitCheck });
-  return resolveProjectConfig(parsed, resolvedPath);
-}
diff --git a/src/project/resolve.ts b/src/project/resolve.ts
deleted file mode 100644
index cbc3ecf..0000000
--- a/src/project/resolve.ts
+++ /dev/null
@@ -1,141 +0,0 @@
-import { dirname, resolve } from 'node:path';
-
-import type { ProjectConfig, ResolvedProjectConfig } from './types.js';
-
-const DEFAULT_BENCHMARK_FORMAT = 'pi';
-const DEFAULT_AUTH_MODE = 'env';
-const DEFAULT_TIMEOUT = 240_000;
-const DEFAULT_OUTPUT_DIR = 'benchmark-results';
-const DEFAULT_GENERATION_OUTPUT_DIR = '.skill-optimizer';
-const DEFAULT_MAX_TASKS = 10;
-const DEFAULT_TASK_SEED = 1;
-const DEFAULT_OPTIMIZE_ITERATIONS = 5;
-const DEFAULT_STABILITY_WINDOW = 2;
-const DEFAULT_MIN_IMPROVEMENT = 0.02;
-const DEFAULT_PER_MODEL_FLOOR = 0.6;
-const DEFAULT_TARGET_WEIGHTED_AVERAGE = 0.7;
-const DEFAULT_REPORT_CONTEXT_MAX_BYTES = 16_000;
-
-export function resolveProjectConfig(config: ProjectConfig, configPath: string): ResolvedProjectConfig {
-  const configDir = dirname(configPath);
-  const skill = config.target.skill
-    ? typeof config.target.skill === 'string'
-      ? { source: resolve(configDir, config.target.skill), cache: true }
-      : {
-          ...config.target.skill,
-          source: resolve(configDir, config.target.skill.source),
-          cache: config.target.skill.cache ?? true,
-        }
-    : undefined;
-  const discovery = config.target.discovery
-    ? {
-        mode: config.target.discovery.mode ?? 'auto',
-        sources: (config.target.discovery.sources ?? []).map((source) => resolve(configDir, source)),
-        fallbackManifest: config.target.discovery.fallbackManifest
-          ? resolve(configDir, config.target.discovery.fallbackManifest)
-          : undefined,
-        language: config.target.discovery.language,
-      }
-    : undefined;
-
-  const sdkConfig = config.target.sdk
-    ? {
-        language: config.target.sdk.language ?? discovery?.language,
-        style: config.target.sdk.style,
-        apiSurface: config.target.sdk.apiSurface,
-        entrypoints: (config.target.sdk.entrypoints ?? []).map((entrypoint) => resolve(configDir, entrypoint)),
-      }
-    : discovery?.language
-      ? {
-          language: discovery.language,
-          style: undefined,
-          apiSurface: undefined,
-          entrypoints: [],
-        }
-      : undefined;
-
-  const cliConfig = config.target.cli
-    ? {
-        ...config.target.cli,
-        commands: resolve(configDir, config.target.cli.commands),
-      }
-    : config.target.surface === 'cli' && discovery?.fallbackManifest
-      ? {
-          commands: discovery.fallbackManifest,
-        }
-      : undefined;
-
-  const mcpConfig = config.target.mcp
-    ? {
-        ...config.target.mcp,
-        tools: resolve(configDir, config.target.mcp.tools),
-      }
-    : config.target.surface === 'mcp' && discovery?.fallbackManifest
-      ? {
-          tools: discovery.fallbackManifest,
-        }
-      : undefined;
-
-  return {
-    configPath,
-    configDir,
-    name: config.name,
-    target: {
-      surface: config.target.surface,
-      repoPath: resolve(configDir, config.target.repoPath ?? '.'),
-      skill,
-      discovery,
-      sdk: sdkConfig,
-      cli: cliConfig,
-      mcp: mcpConfig,
-      scope: {
-        include: config.target.scope?.include && config.target.scope.include.length > 0
-          ? [...config.target.scope.include]
-          : ['*'],
-        exclude: config.target.scope?.exclude ? [...config.target.scope.exclude] : [],
-      },
-    },
-    benchmark: {
-      format: config.benchmark.format ?? DEFAULT_BENCHMARK_FORMAT,
-      baseUrl: config.benchmark.baseUrl,
-      authMode: config.benchmark.authMode ?? DEFAULT_AUTH_MODE,
-      apiKeyEnv: config.benchmark.apiKeyEnv,
-      timeout: config.benchmark.timeout ?? DEFAULT_TIMEOUT,
-      headers: config.benchmark.headers,
-      models: config.benchmark.models,
-      tasks: config.benchmark.tasks ? resolve(configDir, config.benchmark.tasks) : undefined,
-      surfaceSnapshot: config.benchmark.surfaceSnapshot ? resolve(configDir, config.benchmark.surfaceSnapshot) : undefined,
-      taskGeneration: {
-        enabled: config.benchmark.taskGeneration?.enabled ?? false,
-        maxTasks: config.benchmark.taskGeneration?.maxTasks ?? DEFAULT_MAX_TASKS,
-        seed: config.benchmark.taskGeneration?.seed ?? DEFAULT_TASK_SEED,
-        outputDir: resolve(configDir, config.benchmark.taskGeneration?.outputDir ?? DEFAULT_GENERATION_OUTPUT_DIR),
-      },
-      output: {
-        dir: resolve(configDir, config.benchmark.output?.dir ?? DEFAULT_OUTPUT_DIR),
-      },
-      agentic: config.benchmark.agentic,
-      verdict: {
-        perModelFloor: config.benchmark.verdict?.perModelFloor ?? DEFAULT_PER_MODEL_FLOOR,
-        targetWeightedAverage: config.benchmark.verdict?.targetWeightedAverage ?? DEFAULT_TARGET_WEIGHTED_AVERAGE,
-      },
-    },
-    optimize: config.optimize
-      ? {
-          enabled: config.optimize.enabled ?? true,
-          mode: config.optimize.mode ?? 'stable-surface',
-          model: config.optimize.model ?? config.benchmark.models[0]!.id,
-          authMode: config.optimize.authMode ?? config.benchmark.authMode ?? DEFAULT_AUTH_MODE,
-          apiKeyEnv: config.optimize.apiKeyEnv ?? config.benchmark.apiKeyEnv,
-          thinkingLevel: config.optimize.thinkingLevel ?? 'medium',
-          allowedPaths: [...(config.optimize.allowedPaths ?? [])],
-          validation: [...(config.optimize.validation ?? [])],
-          requireCleanGit: config.optimize.requireCleanGit ?? true,
-          maxIterations: config.optimize.maxIterations ?? DEFAULT_OPTIMIZE_ITERATIONS,
-          stabilityWindow: config.optimize.stabilityWindow ?? DEFAULT_STABILITY_WINDOW,
-          minImprovement: config.optimize.minImprovement ?? DEFAULT_MIN_IMPROVEMENT,
-          reportContextMaxBytes: config.optimize.reportContextMaxBytes ?? DEFAULT_REPORT_CONTEXT_MAX_BYTES,
-        }
-      : undefined,
-  };
-}
diff --git a/src/project/schema.ts b/src/project/schema.ts
deleted file mode 100644
index 13fa0d9..0000000
--- a/src/project/schema.ts
+++ /dev/null
@@ -1,96 +0,0 @@
-// src/project/schema.ts
-// Zod schema for config documentation generation only.
-// Runtime validation stays in src/project/validate.ts.
-import { z } from 'zod/v3';
-
-const ModelConfigSchema = z.object({
-  id: z.string().describe('Provider-prefixed model ID — openrouter/<provider>/<slug>, anthropic/<slug>, or openai/<slug>. Example: openrouter/anthropic/claude-sonnet-4.6'),
-  name: z.string().describe('Human-readable model name for reports'),
-  tier: z.enum(['flagship', 'mid', 'low']).optional().describe('Model tier — affects weighting in weighted average'),
-  weight: z.number().optional().describe('Weight in weighted average (default 1.0). Higher = more influence'),
-});
-
-const DiscoveryConfigSchema = z.object({
-  mode: z.enum(['auto', 'manifest']).optional().describe('"auto" = code-first tree-sitter; "manifest" = use provided file only'),
-  sources: z.array(z.string()).optional().describe('Source files to scan for callable methods/commands/tools'),
-  fallbackManifest: z.string().optional().describe('Path to manifest JSON when code-first discovery is incomplete'),
-  language: z.enum(['typescript', 'python', 'rust']).optional().describe('Language for code-first discovery'),
-});
-
-const SdkConfigSchema = z.object({
-  language: z.enum(['typescript', 'python', 'rust']).optional().describe('SDK language'),
-  entrypoints: z.array(z.string()).optional().describe('SDK entry files for discovery'),
-});
-
-const CliConfigSchema = z.object({
-  commands: z.string().optional().describe('Path to CLI commands manifest JSON (CliCommandDefinition[])'),
-});
-
-const McpConfigSchema = z.object({
-  tools: z.string().optional().describe('Path to MCP tools manifest JSON (OpenAI function tool definitions)'),
-});
-
-const ScopeConfigSchema = z.object({
-  include: z.array(z.string()).optional().describe('Glob patterns for actions to include (default ["*"])'),
-  exclude: z.array(z.string()).optional().describe('Glob patterns for actions to exclude (default [])'),
-});
-
-const TargetConfigSchema = z.object({
-  surface: z.enum(['sdk', 'cli', 'mcp', 'prompt']).describe('Type of callable surface'),
-  repoPath: z.string().optional().describe('Path to the target repo (default ".")'),
-  skill: z.union([
-    z.string(),
-    z.object({ source: z.string(), cache: z.boolean().optional() }),
-  ]).optional().describe('Path to SKILL.md or { source, cache } object'),
-  discovery: DiscoveryConfigSchema.optional().describe('How to discover callable actions'),
-  sdk: SdkConfigSchema.optional().describe('SDK-specific config'),
-  cli: CliConfigSchema.optional().describe('CLI-specific config'),
-  mcp: McpConfigSchema.optional().describe('MCP-specific config'),
-  scope: ScopeConfigSchema.optional().describe('Scope filter — which actions to benchmark'),
-});
-
-const TaskGenerationConfigSchema = z.object({
-  enabled: z.boolean().optional().describe('Whether to generate tasks automatically (default false)'),
-  maxTasks: z.number().int().positive().optional().describe('Max tasks to generate — must be >= in-scope action count (default 10)'),
-  seed: z.number().int().nonnegative().optional().describe('RNG seed for reproducible generation (default 1)'),
-  outputDir: z.string().optional().describe('Where to write generated task artifacts (default ".skill-optimizer")'),
-});
-
-const VerdictConfigSchema = z.object({
-  perModelFloor: z.number().min(0).max(1).optional().describe('Minimum per-model pass fraction for PASS verdict (default 0.6)'),
-  targetWeightedAverage: z.number().min(0).max(1).optional().describe('Minimum weighted average across all models for PASS (default 0.7)'),
-});
-
-const BenchmarkConfigSchema = z.object({
-  format: z.enum(['pi', 'openai', 'anthropic']).optional().describe('LLM transport format: "pi" routes through OpenRouter/Pi (use openrouter/* or openai/* model refs); "openai" calls the OpenAI API directly (supports Codex auth); "anthropic" calls the Anthropic API directly'),
-  authMode: z.enum(['env', 'codex', 'auto']).optional().describe('How to resolve credentials: env var, ~/.codex/auth.json browser-login tokens, or env-then-codex fallback'),
-  apiKeyEnv: z.string().optional().describe('Env var name for the API key (default is determined by the model provider prefix: openrouter/ → OPENROUTER_API_KEY, openai/ → OPENAI_API_KEY, anthropic/ → ANTHROPIC_API_KEY; leave unset to use the per-provider default)'),
-  timeout: z.number().int().positive().optional().describe('Milliseconds per model call (default 240000)'),
-  models: z.array(ModelConfigSchema).describe('Models to benchmark — at least one required'),
-  taskGeneration: TaskGenerationConfigSchema.optional().describe('Automatic task generation config'),
-  output: z.object({
-    dir: z.string().optional().describe('Directory where reports are saved (default "benchmark-results/")'),
-  }).optional().describe('Output configuration'),
-  verdict: VerdictConfigSchema.optional().describe('PASS/FAIL thresholds'),
-});
-
-const OptimizeConfigSchema = z.object({
-  model: z.string().optional().describe('Model for mutation, e.g. openrouter/anthropic/claude-sonnet-4.6'),
-  authMode: z.enum(['env', 'codex', 'auto']).optional().describe('How to resolve optimizer credentials: env var, ~/.codex/auth.json browser-login tokens, or env-then-codex fallback'),
-  apiKeyEnv: z.string().optional().describe('Env var for the optimizer API key'),
-  thinkingLevel: z.enum(['off', 'minimal', 'low', 'medium', 'high', 'xhigh']).optional()
-    .describe('Reasoning depth for mutation calls (default "medium")'),
-  allowedPaths: z.array(z.string()).optional().describe('Paths the optimizer may edit — safety boundary'),
-  validation: z.array(z.string()).optional().describe('Shell commands to run to validate each mutation'),
-  requireCleanGit: z.boolean().optional().describe('Require clean git state before starting (default true)'),
-  maxIterations: z.number().int().positive().optional().describe('Maximum optimization iterations (default 5)'),
-  minImprovement: z.number().nonnegative().optional().describe('Minimum weighted-average gain per accepted iteration (default 0.02)'),
-  reportContextMaxBytes: z.number().int().positive().optional().describe('Byte budget for mutation context (default 16000)'),
-});
-
-export const ProjectConfigSchema = z.object({
-  name: z.string().describe('Human-readable project name'),
-  target: TargetConfigSchema.describe('Target surface configuration'),
-  benchmark: BenchmarkConfigSchema.describe('Benchmark configuration'),
-  optimize: OptimizeConfigSchema.optional().describe('Optimizer configuration (optional)'),
-});
diff --git a/src/project/snapshot.ts b/src/project/snapshot.ts
deleted file mode 100644
index e5ecd5f..0000000
--- a/src/project/snapshot.ts
+++ /dev/null
@@ -1,144 +0,0 @@
-import { existsSync, readFileSync } from 'node:fs';
-import { resolve, isAbsolute } from 'node:path';
-
-import type { SurfaceSnapshot } from './types.js';
-import type { ResolvedProjectConfig } from './types.js';
-import type { McpToolDefinition } from '../benchmark/types.js';
-import { discoverActions } from '../actions/discover.js';
-import { loadActionSnapshotFile, toSurfaceSnapshot } from '../actions/snapshot.js';
-import { discoverPromptCapabilities } from './discover-prompt.js';
-
-export function buildSurfaceSnapshot(project: ResolvedProjectConfig): SurfaceSnapshot {
-  if (project.benchmark.surfaceSnapshot) {
-    return loadSurfaceSnapshotFile(project.benchmark.surfaceSnapshot);
-  }
-
-  if (project.target.surface === 'prompt') {
-    return buildPromptSurfaceSnapshot(project);
-  }
-
-  return toSurfaceSnapshot(discoverActions(project));
-}
-
-function buildPromptSurfaceSnapshot(project: ResolvedProjectConfig): SurfaceSnapshot {
-  const skillSource = project.target.skill?.source;
-  if (!skillSource) {
-    throw new Error('Prompt surface requires target.skill to point at a markdown skill file');
-  }
-
-  const absPath = isAbsolute(skillSource) ? skillSource : resolve(project.configDir, skillSource);
-  if (!existsSync(absPath)) {
-    throw new Error(`Prompt skill file not found: ${absPath}`);
-  }
-
-  const content = readFileSync(absPath, 'utf-8');
-  const actions = discoverPromptCapabilities(content);
-
-  if (actions.length === 0) {
-    throw new Error(`Prompt discovery found 0 capabilities in ${absPath}`);
-  }
-
-  return {
-    surface: 'prompt',
-    actions: actions.map(({ key: _key, ...rest }) => rest),
-  };
-}
-
-function normalizeCliArgName(name: string): string {
-  return name.replace(/^-+/, '');
-}
-
-export function loadSurfaceSnapshotFile(snapshotPath: string): SurfaceSnapshot {
-  if (!existsSync(snapshotPath)) {
-    throw new Error(`Surface snapshot file not found: ${snapshotPath}`);
-  }
-
-  const raw = readFileSync(snapshotPath, 'utf-8');
-  let parsed: unknown;
-  try {
-    parsed = JSON.parse(raw) as unknown;
-  } catch (error) {
-    throw new Error(`Invalid surface snapshot file: ${snapshotPath} (invalid JSON: ${error instanceof Error ? error.message : String(error)})`);
-  }
-
-  if (!isActionSnapshotArtifactShape(parsed)) {
-    if (
-      parsed
-      && typeof parsed === 'object'
-      && 'surface' in parsed
-      && 'actions' in parsed
-    ) {
-      throw new Error(
-        `Snapshot file format is not supported: ${snapshotPath} — delete .skill-optimizer/ and re-run the benchmark to regenerate.`,
-      );
-    }
-    throw new Error(`Invalid surface snapshot file: ${snapshotPath}`);
-  }
-
-  return normalizeCliArgs(toSurfaceSnapshot(loadActionSnapshotFile(snapshotPath).catalog));
-}
-
-function isActionSnapshotArtifactShape(value: unknown): value is { version: number; catalog: { surface: string; actions: unknown[] } } {
-  if (!value || typeof value !== 'object') {
-    return false;
-  }
-
-  const candidate = value as {
-    version?: unknown;
-    catalog?: {
-      surface?: unknown;
-      actions?: unknown;
-    };
-  };
-
-  return typeof candidate.version === 'number'
-    && Boolean(candidate.catalog)
-    && typeof candidate.catalog === 'object'
-    && ['sdk', 'cli', 'mcp', 'prompt'].includes(String(candidate.catalog.surface))
-    && Array.isArray(candidate.catalog.actions);
-}
-
-function normalizeCliArgs(snapshot: SurfaceSnapshot): SurfaceSnapshot {
-  if (snapshot.surface !== 'cli') {
-    return snapshot;
-  }
-
-  return {
-    ...snapshot,
-    actions: snapshot.actions.map((action) => ({
-      ...action,
-      args: action.args.map((arg) => ({
-        ...arg,
-        name: normalizeCliArgName(arg.name),
-      })),
-    })),
-  };
-}
-
-export function buildMcpToolDefinitionsFromSnapshot(snapshot: SurfaceSnapshot): McpToolDefinition[] {
-  if (snapshot.surface !== 'mcp') {
-    throw new Error(`Cannot build MCP tool definitions from surface ${snapshot.surface}`);
-  }
-
-  return snapshot.actions.map((action) => ({
-    type: 'function',
-    function: {
-      name: action.name,
-      description: action.description,
-      parameters: {
-        type: 'object',
-        properties: Object.fromEntries(
-          action.args.map((arg) => [
-            arg.name,
-            {
-              ...(arg.schema ?? {}),
-              ...(arg.type ? { type: arg.type } : {}),
-              ...(arg.description ? { description: arg.description } : {}),
-            },
-          ]),
-        ),
-        required: action.args.filter((arg) => arg.required).map((arg) => arg.name),
-      },
-    },
-  }));
-}
diff --git a/src/project/types.ts b/src/project/types.ts
deleted file mode 100644
index a7e1007..0000000
--- a/src/project/types.ts
+++ /dev/null
@@ -1,183 +0,0 @@
-import type {
-  AgenticConfig,
-  BenchmarkSurface,
-  CliSurfaceConfig,
-  LLMConfig,
-  McpSurfaceConfig,
-  ModelConfig,
-  OutputConfig,
-  SdkLanguage,
-  SdkSurfaceConfig,
-  SkillConfig,
-} from '../benchmark/types.js';
-import type { ActionCatalog, ActionDefinition } from '../actions/types.js';
-
-export interface ProjectTaskGenerationConfig {
-  enabled?: boolean;
-  maxTasks?: number;
-  seed?: number;
-  outputDir?: string;
-}
-
-export interface ProjectDiscoveryConfig {
-  mode?: 'auto' | 'manifest';
-  sources?: string[];
-  fallbackManifest?: string;
-  language?: SdkLanguage;
-}
-
-export interface ProjectScopeConfig {
-  include?: string[];
-  exclude?: string[];
-}
-
-export interface ProjectTargetConfig {
-  surface: BenchmarkSurface;
-  repoPath?: string;
-  skill?: string | SkillConfig;
-  discovery?: ProjectDiscoveryConfig;
-  sdk?: Pick<SdkSurfaceConfig, 'language' | 'style' | 'apiSurface'> & {
-    entrypoints?: string[];
-  };
-  cli?: CliSurfaceConfig;
-  mcp?: McpSurfaceConfig;
-  scope?: ProjectScopeConfig;
-}
-
-export interface ProjectBenchmarkVerdictConfig {
-  perModelFloor?: number;
-  targetWeightedAverage?: number;
-}
-
-export interface ProjectBenchmarkConfig {
-  format?: LLMConfig['format'];
-  baseUrl?: string;
-  authMode?: LLMConfig['authMode'];
-  apiKeyEnv?: string;
-  timeout?: number;
-  headers?: Record<string, string>;
-  models: ModelConfig[];
-  tasks?: string;
-  surfaceSnapshot?: string;
-  taskGeneration?: ProjectTaskGenerationConfig;
-  output?: OutputConfig;
-  agentic?: AgenticConfig;
-  verdict?: ProjectBenchmarkVerdictConfig;
-}
-
-export interface ProjectOptimizeConfig {
-  enabled?: boolean;
-  mode?: 'stable-surface' | 'surface-changing';
-  model?: string;
-  authMode?: LLMConfig['authMode'];
-  apiKeyEnv?: string;
-  thinkingLevel?: 'off' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh';
-  allowedPaths?: string[];
-  validation?: string[];
-  requireCleanGit?: boolean;
-  maxIterations?: number;
-  stabilityWindow?: number;
-  minImprovement?: number;
-  reportContextMaxBytes?: number;
-}
-
-export interface ProjectConfig {
-  name: string;
-  target: ProjectTargetConfig;
-  benchmark: ProjectBenchmarkConfig;
-  optimize?: ProjectOptimizeConfig;
-}
-
-export interface ResolvedProjectTaskGenerationConfig {
-  enabled: boolean;
-  maxTasks: number;
-  seed: number;
-  outputDir: string;
-}
-
-export interface ResolvedProjectTargetConfig {
-  surface: BenchmarkSurface;
-  repoPath: string;
-  skill?: SkillConfig;
-  discovery?: {
-    mode: 'auto' | 'manifest';
-    sources: string[];
-    fallbackManifest?: string;
-    language?: SdkLanguage;
-  };
-  sdk?: Pick<SdkSurfaceConfig, 'language' | 'style' | 'apiSurface'> & {
-    entrypoints: string[];
-  };
-  cli?: CliSurfaceConfig;
-  mcp?: McpSurfaceConfig;
-  scope: { include: string[]; exclude: string[] };
-}
-
-export interface ResolvedProjectBenchmarkConfig {
-  format: LLMConfig['format'];
-  baseUrl?: string;
-  authMode: NonNullable<LLMConfig['authMode']>;
-  apiKeyEnv?: string;
-  timeout: number;
-  headers?: Record<string, string>;
-  models: ModelConfig[];
-  tasks?: string;
-  surfaceSnapshot?: string;
-  taskGeneration: ResolvedProjectTaskGenerationConfig;
-  output: { dir: string };
-  agentic?: AgenticConfig;
-  verdict: { perModelFloor: number; targetWeightedAverage: number };
-}
-
-export interface ResolvedProjectOptimizeConfig {
-  enabled: boolean;
-  mode: 'stable-surface' | 'surface-changing';
-  model: string;
-  authMode: NonNullable<LLMConfig['authMode']>;
-  apiKeyEnv?: string;
-  thinkingLevel: 'off' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh';
-  allowedPaths: string[];
-  validation: string[];
-  requireCleanGit: boolean;
-  maxIterations: number;
-  stabilityWindow: number;
-  minImprovement: number;
-  reportContextMaxBytes: number;
-}
-
-export interface ResolvedProjectConfig {
-  configPath: string;
-  configDir: string;
-  name: string;
-  target: ResolvedProjectTargetConfig;
-  benchmark: ResolvedProjectBenchmarkConfig;
-  optimize?: ResolvedProjectOptimizeConfig;
-}
-
-export type SurfaceSnapshotAction = Omit<ActionDefinition, 'key'>;
-
-export interface SurfaceSnapshot extends Omit<ActionCatalog, 'actions'> {
-  surface: BenchmarkSurface;
-  actions: SurfaceSnapshotAction[];
-}
-
-export interface ParsedModelRef {
-  provider: string;
-  model: string;
-}
-
-export function parseModelRef(modelRef: string): ParsedModelRef {
-  const slashIndex = modelRef.indexOf('/');
-  if (slashIndex === -1) {
-    throw new Error(`Model references must be in provider/model form, got "${modelRef}"`);
-  }
-
-  return {
-    provider: modelRef.slice(0, slashIndex),
-    model: modelRef.slice(slashIndex + 1),
-  };
-}
-
-export function isSdkLanguage(value: string): value is SdkLanguage {
-  return value === 'typescript' || value === 'python' || value === 'rust';
-}
diff --git a/src/project/validate.ts b/src/project/validate.ts
deleted file mode 100644
index 8b15586..0000000
--- a/src/project/validate.ts
+++ /dev/null
@@ -1,495 +0,0 @@
-import { existsSync } from 'node:fs';
-import { resolve, dirname, isAbsolute } from 'node:path';
-import { execFile } from 'node:child_process';
-import { promisify } from 'node:util';
-
-import type { ProjectConfig } from './types.js';
-import { isSdkLanguage, parseModelRef } from './types.js';
-import { resolveApiKey } from '../runtime/pi/auth.js';
-
-const execFileAsync = promisify(execFile);
-
-export type IssueSeverity = 'error' | 'warning' | 'info';
-
-export interface Issue {
-  code: string;
-  severity: IssueSeverity;
-  field: string;
-  message: string;
-  hint?: string;
-  fixable: boolean;
-}
-
-export async function checkConfig(
-  config: unknown,
-  _configPath: string,
-  opts?: { skipDirtyGitCheck?: boolean },
-): Promise<Issue[]> {
-  const issues: Issue[] = [];
-
-  function err(code: string, field: string, message: string, hint?: string): void {
-    issues.push({ code, severity: 'error', field, message, hint, fixable: false });
-  }
-
-  const cfg = config as ProjectConfig;
-
-  if (!cfg.name || typeof cfg.name !== 'string') {
-    err('missing-name', 'name', '"name" is required');
-    return issues;
-  }
-
-  if (!cfg.target || typeof cfg.target !== 'object') {
-    err('missing-target', 'target', '"target" is required');
-    return issues;
-  }
-
-  const { target, benchmark, optimize } = cfg;
-
-  if (target.surface !== 'sdk' && target.surface !== 'cli' && target.surface !== 'mcp' && target.surface !== 'prompt') {
-    err('invalid-surface', 'target.surface', '"target.surface" must be sdk, cli, mcp, or prompt');
-  }
-
-  if (target.skill !== undefined) {
-    const skillSource = typeof target.skill === 'string' ? target.skill : target.skill?.source;
-    if (!skillSource || typeof skillSource !== 'string') {
-      err('invalid-skill', 'target.skill', '"target.skill" must be a path string or { source } object');
-    }
-  }
-
-  if (target.scope !== undefined) {
-    if (target.scope.include !== undefined) {
-      if (!Array.isArray(target.scope.include) || target.scope.include.some((s) => typeof s !== 'string')) {
-        err('invalid-scope-include', 'target.scope.include', '"target.scope.include" must be an array of glob strings');
-      }
-    }
-    if (target.scope.exclude !== undefined) {
-      if (!Array.isArray(target.scope.exclude) || target.scope.exclude.some((s) => typeof s !== 'string')) {
-        err('invalid-scope-exclude', 'target.scope.exclude', '"target.scope.exclude" must be an array of glob strings');
-      }
-    }
-  }
-
-  if (benchmark?.taskGeneration?.enabled === true && target.skill === undefined) {
-    err('missing-skill-for-generation', 'target.skill', '"target.skill" is required when benchmark.taskGeneration.enabled=true');
-  }
-
-  if (target.surface === 'sdk') {
-    const sdkLanguage = target.sdk?.language ?? target.discovery?.language;
-    if (!sdkLanguage || !isSdkLanguage(sdkLanguage)) {
-      err('invalid-sdk-language', 'target.sdk.language', '"target.sdk.language" must be typescript, python, or rust');
-    }
-    const hasCodeSources = Array.isArray(target.discovery?.sources) && target.discovery.sources.length > 0;
-    const hasApiSurface = Array.isArray(target.sdk?.apiSurface) && target.sdk.apiSurface.length > 0;
-    if (!hasCodeSources && !hasApiSurface) {
-      err('missing-sdk-surface', 'target', 'SDK targets need discovery.sources or target.sdk.apiSurface');
-    }
-  }
-
-  if (target.surface === 'cli') {
-    const discoveryMode = target.discovery?.mode ?? 'auto';
-    const hasCodeSources = Array.isArray(target.discovery?.sources) && target.discovery.sources.length > 0;
-    const hasManifest = Boolean(target.cli?.commands || target.discovery?.fallbackManifest);
-    if (discoveryMode === 'manifest' && !hasManifest) {
-      err('missing-cli-manifest', 'target', 'CLI manifest mode requires target.cli.commands or target.discovery.fallbackManifest');
-    }
-    if (!hasManifest && !hasCodeSources) {
-      err('missing-cli-surface', 'target', 'CLI targets need discovery.sources, target.cli.commands, or target.discovery.fallbackManifest');
-    }
-  }
-
-  if (target.surface === 'mcp') {
-    const discoveryMode = target.discovery?.mode ?? 'auto';
-    const hasCodeSources = Array.isArray(target.discovery?.sources) && target.discovery?.sources.length > 0;
-    const hasManifest = Boolean(target.mcp?.tools || target.discovery?.fallbackManifest);
-    if (discoveryMode === 'manifest' && !hasManifest) {
-      err('missing-mcp-manifest', 'target', 'MCP manifest mode requires target.mcp.tools or target.discovery.fallbackManifest');
-    }
-    if (!hasManifest && !hasCodeSources) {
-      err('missing-mcp-surface', 'target', 'MCP targets need discovery.sources, target.mcp.tools, or target.discovery.fallbackManifest');
-    }
-  }
-
-  if (target.surface === 'prompt') {
-    if (target.skill === undefined) {
-      err('missing-prompt-skill', 'target.skill', 'Prompt targets require target.skill (path to the markdown file)');
-    }
-  }
-
-  if (target.discovery) {
-    if (target.discovery.mode && target.discovery.mode !== 'auto' && target.discovery.mode !== 'manifest') {
-      err('invalid-discovery-mode', 'target.discovery.mode', '"target.discovery.mode" must be auto or manifest');
-    }
-    if (target.discovery.sources !== undefined && !Array.isArray(target.discovery.sources)) {
-      err('invalid-discovery-sources', 'target.discovery.sources', '"target.discovery.sources" must be an array when present');
-    }
-    if (target.discovery.language !== undefined && !isSdkLanguage(target.discovery.language)) {
-      err('invalid-discovery-language', 'target.discovery.language', '"target.discovery.language" must be typescript, python, or rust when present');
-    }
-  }
-
-  if (!benchmark || typeof benchmark !== 'object') {
-    err('missing-benchmark', 'benchmark', '"benchmark" is required');
-    return issues;
-  }
-
-  if (!Array.isArray(benchmark.models) || benchmark.models.length === 0) {
-    err('missing-models', 'benchmark.models', '"benchmark.models" must be a non-empty array');
-  } else {
-    for (const model of benchmark.models) {
-      if (!model.id || !model.name || !model.tier) {
-        err('invalid-model', 'benchmark.models', 'each benchmark model needs id, name, and tier');
-      }
-    }
-
-    for (const [i, model] of benchmark.models.entries()) {
-      if (model.weight !== undefined && (!Number.isFinite(model.weight) || model.weight < 0)) {
-        err('invalid-model-weight', `benchmark.models[${i}].weight`, `model "${model.id}" has invalid weight; must be a non-negative number`);
-      }
-    }
-
-    if (benchmark.authMode === 'codex') {
-      for (const [i, model] of benchmark.models.entries()) {
-        try {
-          const { provider } = parseModelRef(model.id);
-          if (provider !== 'openai') {
-            err(
-              'codex-auth-provider-mismatch',
-              `benchmark.models[${i}].id`,
-              `benchmark.authMode="codex" only supports openai/* models, but found "${model.id}"`,
-              'Use openai/* model IDs with codex auth, or switch benchmark.authMode to "env" / "auto"',
-            );
-          }
-        } catch {
-          // model-id format issues are reported separately
-        }
-      }
-    }
-  }
-
-  if (benchmark.verdict !== undefined) {
-    if (benchmark.verdict.perModelFloor !== undefined) {
-      const v = benchmark.verdict.perModelFloor;
-      if (!Number.isFinite(v) || v < 0 || v > 1) {
-        err('invalid-per-model-floor', 'benchmark.verdict.perModelFloor', '"benchmark.verdict.perModelFloor" must be between 0 and 1');
-      }
-    }
-    if (benchmark.verdict.targetWeightedAverage !== undefined) {
-      const v = benchmark.verdict.targetWeightedAverage;
-      if (!Number.isFinite(v) || v < 0 || v > 1) {
-        err('invalid-target-weighted-average', 'benchmark.verdict.targetWeightedAverage', '"benchmark.verdict.targetWeightedAverage" must be between 0 and 1');
-      }
-    }
-  }
-
-  if (benchmark.format && benchmark.format !== 'pi' && benchmark.format !== 'openai' && benchmark.format !== 'anthropic') {
-    err('invalid-format', 'benchmark.format', '"benchmark.format" must be pi, openai, or anthropic');
-  }
-
-  // Check: format/model prefix compatibility — openai format requires openai/ prefix, anthropic format requires anthropic/
-  if (benchmark.format === 'openai' || benchmark.format === 'anthropic') {
-    const requiredPrefix = `${benchmark.format}/`;
-    if (Array.isArray(benchmark.models)) {
-      for (let i = 0; i < benchmark.models.length; i++) {
-        const model = benchmark.models[i]!;
-        if (model.id && !model.id.startsWith(requiredPrefix)) {
-          issues.push({
-            code: 'format-model-prefix-mismatch', severity: 'error', field: `benchmark.models[${i}].id`,
-            message: `model ID "${model.id}" does not match format "${benchmark.format}" — expected prefix "${requiredPrefix}"`,
-            hint: `Change model ID to start with "${requiredPrefix}" or change benchmark.format to match your model provider`,
-            fixable: false,
-          });
-        }
-      }
-    }
-    if (optimize?.model && !optimize.model.startsWith(requiredPrefix)) {
-      issues.push({
-        code: 'format-model-prefix-mismatch', severity: 'error', field: 'optimize.model',
-        message: `optimize.model "${optimize.model}" does not match format "${benchmark.format}" — expected prefix "${requiredPrefix}"`,
-        hint: `Change optimize.model to start with "${requiredPrefix}" or change benchmark.format to match your model provider`,
-        fixable: false,
-      });
-    }
-  }
-
-  if (benchmark.taskGeneration?.enabled !== true && !benchmark.tasks) {
-    err('missing-tasks', 'benchmark.tasks', '"benchmark.tasks" is required when task generation is disabled');
-  }
-
-  if (benchmark.taskGeneration?.maxTasks !== undefined && (!Number.isInteger(benchmark.taskGeneration.maxTasks) || benchmark.taskGeneration.maxTasks <= 0)) {
-    err('invalid-max-tasks', 'benchmark.taskGeneration.maxTasks', '"benchmark.taskGeneration.maxTasks" must be a positive integer');
-  }
-
-  if (benchmark.taskGeneration?.seed !== undefined && (!Number.isInteger(benchmark.taskGeneration.seed) || benchmark.taskGeneration.seed < 0)) {
-    err('invalid-seed', 'benchmark.taskGeneration.seed', '"benchmark.taskGeneration.seed" must be a non-negative integer');
-  }
-
-  if (optimize) {
-    if (optimize.mode !== undefined && optimize.mode !== 'stable-surface' && optimize.mode !== 'surface-changing') {
-      err('invalid-optimize-mode', 'optimize.mode', '"optimize.mode" must be stable-surface or surface-changing');
-    }
-    if (optimize.mode === 'surface-changing' && benchmark.taskGeneration?.enabled !== true) {
-      err('surface-changing-needs-generation', 'optimize.mode', 'surface-changing optimization requires benchmark.taskGeneration.enabled=true');
-    }
-    if (optimize.enabled !== false) {
-      if (!Array.isArray(optimize.allowedPaths) || optimize.allowedPaths.length === 0) {
-        err('missing-allowed-paths', 'optimize.allowedPaths', '"optimize.allowedPaths" must be a non-empty array when optimization is enabled');
-      }
-    }
-
-    if (optimize.maxIterations !== undefined && (!Number.isInteger(optimize.maxIterations) || optimize.maxIterations <= 0)) {
-      err('invalid-max-iterations', 'optimize.maxIterations', '"optimize.maxIterations" must be a positive integer');
-    }
-
-    if (optimize.stabilityWindow !== undefined && (!Number.isInteger(optimize.stabilityWindow) || optimize.stabilityWindow <= 0)) {
-      err('invalid-stability-window', 'optimize.stabilityWindow', '"optimize.stabilityWindow" must be a positive integer');
-    }
-
-    if (optimize.minImprovement !== undefined && (!Number.isFinite(optimize.minImprovement) || optimize.minImprovement < 0)) {
-      err('invalid-min-improvement', 'optimize.minImprovement', '"optimize.minImprovement" must be a non-negative number');
-    }
-
-    if (optimize.reportContextMaxBytes !== undefined && (!Number.isInteger(optimize.reportContextMaxBytes) || optimize.reportContextMaxBytes <= 0)) {
-      err('invalid-report-context-max-bytes', 'optimize.reportContextMaxBytes', '"optimize.reportContextMaxBytes" must be a positive integer');
-    }
-
-    if (optimize.requireCleanGit === false) {
-      err('require-clean-git-disabled', 'optimize.requireCleanGit', '"optimize.requireCleanGit" must remain true in v1');
-    }
-
-    const effectiveOptimizeAuthMode = optimize.authMode ?? benchmark.authMode;
-    if (effectiveOptimizeAuthMode === 'codex') {
-      const optimizeModelRef = optimize.model
-        ?? (Array.isArray(benchmark.models) && benchmark.models.length > 0 ? benchmark.models[0]!.id : undefined);
-      if (typeof optimizeModelRef === 'string') {
-        try {
-          const { provider } = parseModelRef(optimizeModelRef);
-          if (provider !== 'openai') {
-            err(
-              'codex-auth-provider-mismatch',
-              'optimize.model',
-              `optimize.authMode="codex" only supports openai/* models, but found "${optimizeModelRef}"`,
-              'Use an openai/* optimize.model with codex auth, or switch optimize.authMode to "env" / "auto"',
-            );
-          }
-        } catch {
-          // model-id format issues are reported separately
-        }
-      }
-    }
-  }
-
-  const configDir = dirname(_configPath);
-
-  // Check: target.repoPath exists
-  if (target.repoPath !== undefined) {
-    const absRepo = isAbsolute(target.repoPath) ? target.repoPath : resolve(configDir, target.repoPath);
-    if (!existsSync(absRepo)) {
-      issues.push({
-        code: 'repo-path-missing', severity: 'error', field: 'target.repoPath',
-        message: `"target.repoPath" does not exist: ${absRepo}`,
-        hint: `Set target.repoPath to the absolute path of your project root`,
-        fixable: false,
-      });
-    }
-  }
-
-  // Check: target.skill file exists (skip for remote sources — github: / https: / http:)
-  if (target.skill !== undefined) {
-    const skillSource = typeof target.skill === 'string' ? target.skill : target.skill.source;
-    const isRemoteSkill = skillSource.startsWith('github:') || skillSource.startsWith('https://') || skillSource.startsWith('http://');
-    if (skillSource && !isRemoteSkill) {
-      const absSkill = isAbsolute(skillSource) ? skillSource : resolve(configDir, skillSource);
-      if (!existsSync(absSkill)) {
-        issues.push({
-          code: 'skill-file-missing', severity: 'error', field: 'target.skill',
-          message: `"target.skill" does not exist: ${absSkill}`,
-          hint: `Create SKILL.md at that path or update target.skill`,
-          fixable: false,
-        });
-      }
-    }
-  }
-
-  // Check: target.discovery.sources all exist
-  if (Array.isArray(target.discovery?.sources)) {
-    for (const src of target.discovery!.sources) {
-      const absSrc = isAbsolute(src) ? src : resolve(configDir, src);
-      if (!existsSync(absSrc)) {
-        issues.push({
-          code: 'discovery-source-missing', severity: 'error', field: 'target.discovery.sources',
-          message: `discovery source does not exist: ${absSrc}`,
-          hint: `Update target.discovery.sources to point at your entry file`,
-          fixable: false,
-        });
-      }
-    }
-  }
-
-  // Check: CLI/MCP manifest file exists if configured
-  const manifestPath = target.cli?.commands ?? target.mcp?.tools ?? target.discovery?.fallbackManifest;
-  if (manifestPath) {
-    const absManifest = isAbsolute(manifestPath) ? manifestPath : resolve(configDir, manifestPath);
-    if (!existsSync(absManifest)) {
-      let field: string;
-      if (target.cli?.commands) {
-        field = 'target.cli.commands';
-      } else if (target.mcp?.tools) {
-        field = 'target.mcp.tools';
-      } else {
-        field = 'target.discovery.fallbackManifest';
-      }
-      issues.push({
-        code: 'manifest-file-missing', severity: 'error', field,
-        message: `manifest file does not exist: ${absManifest}`,
-        hint: `Run 'skill-optimizer init ${target.surface}' to generate a template manifest`,
-        fixable: false,
-      });
-    }
-  }
-
-  // Check: model ID format
-  if (Array.isArray(benchmark.models)) {
-    for (let i = 0; i < benchmark.models.length; i++) {
-      const model = benchmark.models[i]!;
-      if (!model.id) continue;
-
-      const hasProviderPrefix = model.id.includes('/') && (
-        model.id.startsWith('openrouter/') ||
-        model.id.startsWith('anthropic/') ||
-        model.id.startsWith('openai/')
-      );
-      if (!hasProviderPrefix) {
-        issues.push({
-          code: 'model-id-missing-prefix', severity: 'error', field: `benchmark.models[${i}].id`,
-          message: `model ID "${model.id}" is missing a provider prefix`,
-          hint: `Change to: openrouter/${model.id}`,
-          fixable: true,
-        });
-      }
-
-      // OpenAI's own API uses dots in some model slugs (e.g. gpt-4.5), so skip the
-      // dot check for openai/ IDs. OpenRouter model slugs come from the provider's
-      // own catalog and are passed verbatim to OpenRouter — don't rewrite them either.
-      if (!model.id.startsWith('openai/') && !model.id.startsWith('openrouter/') && /\d+\.\d+/.test(model.id)) {
-        const corrected = model.id.replace(/(\d+)\.(\d+)/g, '$1-$2');
-        issues.push({
-          code: 'model-id-bad-format', severity: 'warning', field: `benchmark.models[${i}].id`,
-          message: `model ID "${model.id}" uses dots in version segment — use hyphens instead`,
-          hint: `Change to: ${corrected}`,
-          fixable: true,
-        });
-      }
-    }
-  }
-
-  // Check: optimize.allowedPaths inside target.repoPath
-  if (optimize?.allowedPaths && target.repoPath) {
-    const absRepo = isAbsolute(target.repoPath) ? target.repoPath : resolve(configDir, target.repoPath);
-    for (const ap of optimize.allowedPaths) {
-      const absAp = isAbsolute(ap) ? ap : resolve(absRepo, ap);
-      if (!absAp.startsWith(absRepo + '/') && absAp !== absRepo) {
-        issues.push({
-          code: 'allowed-path-outside-repo', severity: 'error', field: 'optimize.allowedPaths',
-          message: `allowedPath "${ap}" is not inside target.repoPath "${absRepo}"`,
-          hint: `Use a path inside ${absRepo}`,
-          fixable: false,
-        });
-      }
-    }
-  }
-
-  // Check: API key env var / Codex auth
-  const authMode = benchmark.authMode ?? 'env';
-  function warnMissingApiKey(provider: string, effectiveAuthMode: typeof authMode, apiKeyEnv: string | undefined, fieldPrefix: 'benchmark' | 'optimize'): void {
-    const defaultEnvName = apiKeyEnv
-      ?? (provider === 'openai' ? 'OPENAI_API_KEY'
-        : provider === 'anthropic' ? 'ANTHROPIC_API_KEY'
-        : 'OPENROUTER_API_KEY');
-    const hint = effectiveAuthMode === 'codex'
-      ? `Sign in with Codex so ~/.codex/auth.json contains a browser-login access token or OPENAI_API_KEY, or switch ${fieldPrefix}.authMode to "env"`
-      : effectiveAuthMode === 'auto' && provider === 'openai'
-        ? `Run: export ${defaultEnvName}=... or sign in with Codex`
-        : `Run: export ${defaultEnvName}=...`;
-    issues.push({
-      code: 'api-key-not-set', severity: 'warning',
-      field: effectiveAuthMode === 'codex' ? `${fieldPrefix}.authMode` : `${fieldPrefix}.apiKeyEnv`,
-      message: effectiveAuthMode === 'codex'
-        ? 'Codex auth is enabled but no usable browser-login access token or OPENAI_API_KEY was found in ~/.codex/auth.json'
-        : `No API key was found for authMode "${effectiveAuthMode}"`,
-      hint,
-      fixable: false,
-    });
-  }
-
-  if (benchmark.format === 'openai' || benchmark.format === 'anthropic') {
-    // Single direct-API provider — one credential to check
-    const benchmarkProvider = benchmark.format === 'openai' ? 'openai' : 'anthropic';
-    const apiKey = resolveApiKey({ provider: benchmarkProvider, authMode, apiKeyEnv: benchmark.apiKeyEnv });
-    if (!apiKey) warnMissingApiKey(benchmarkProvider, authMode, benchmark.apiKeyEnv, 'benchmark');
-  } else {
-    // Pi format: each model may route through a different provider — check all unique ones
-    const modelList = Array.isArray(benchmark.models) ? benchmark.models : [];
-    const providers = Array.from(new Set(
-      modelList.length > 0
-        ? modelList.map(m => String(m.id ?? '').split('/')[0] || 'openrouter')
-        : ['openrouter'],
-    ));
-    for (const provider of providers) {
-      const apiKey = resolveApiKey({ provider, authMode, apiKeyEnv: benchmark.apiKeyEnv });
-      if (!apiKey) warnMissingApiKey(provider, authMode, benchmark.apiKeyEnv, 'benchmark');
-    }
-  }
-
-  // Check: optimize API key env var / Codex auth (skip when optimization is disabled)
-  if (optimize !== undefined && optimize.enabled !== false) {
-    const optimizeAuthMode = optimize.authMode ?? benchmark.authMode ?? 'env';
-    const optimizeModelRef = optimize.model
-      ?? (Array.isArray(benchmark.models) && benchmark.models.length > 0 ? benchmark.models[0]!.id : undefined);
-    const optimizeProvider = typeof optimizeModelRef === 'string'
-      ? (optimizeModelRef.split('/')[0] || 'openrouter')
-      : 'openrouter';
-    const optimizeApiKeyEnv = optimize.apiKeyEnv ?? benchmark.apiKeyEnv;
-    const optimizeApiKey = resolveApiKey({ provider: optimizeProvider, authMode: optimizeAuthMode, apiKeyEnv: optimizeApiKeyEnv });
-    if (!optimizeApiKey) warnMissingApiKey(optimizeProvider, optimizeAuthMode, optimizeApiKeyEnv, 'optimize');
-  }
-
-  // Check: dirty git (uses a fixed arg array, not a shell string, to prevent injection)
-  // Skipped inside the optimizer loop — the loop manages git state itself via ensureReady.
-  if (!opts?.skipDirtyGitCheck && optimize !== undefined && optimize.requireCleanGit !== false && target.repoPath) {
-    const absRepo = isAbsolute(target.repoPath) ? target.repoPath : resolve(configDir, target.repoPath);
-    if (existsSync(absRepo)) {
-      try {
-        // Verify the git root is exactly target.repoPath, not a parent directory.
-        // If target.repoPath is a subdirectory of a larger repo (e.g. a mock template
-        // inside the tool's own repo), git status would reflect the parent's state —
-        // skip the check to avoid false positives.
-        const { stdout: rootOut } = await execFileAsync('git', ['rev-parse', '--show-toplevel'], { cwd: absRepo });
-        const gitRoot = rootOut.trim();
-        if (gitRoot === absRepo) {
-          const { stdout } = await execFileAsync('git', ['status', '--porcelain'], { cwd: absRepo });
-          const dirtyTracked = stdout.split('\n').filter(l => l.trim() && !l.startsWith('??'));
-          if (dirtyTracked.length > 0) {
-            issues.push({
-              code: 'dirty-git', severity: 'error', field: 'target.repoPath',
-              message: `target repo has uncommitted changes (optimize.requireCleanGit is enabled)`,
-              hint: `Run: git stash  or commit your changes in ${absRepo}`,
-              fixable: false,
-            });
-          }
-        }
-      } catch {
-        // Not a git repo or git unavailable — skip silently
-      }
-    }
-  }
-
-  return issues;
-}
-
-export async function validateProjectConfig(config: ProjectConfig, configPath: string, opts?: { skipDirtyGitCheck?: boolean }): Promise<void> {
-  const issues = await checkConfig(config, configPath, opts);
-  const errors = issues.filter((i) => i.severity === 'error');
-  if (errors.length > 0) {
-    throw new Error(errors.map((i) => `${i.field}: ${i.message}${i.hint ? ` — ${i.hint}` : ''}`).join('\n'));
-  }
-}
diff --git a/src/runtime/pi/auth.ts b/src/runtime/pi/auth.ts
deleted file mode 100644
index e50da25..0000000
--- a/src/runtime/pi/auth.ts
+++ /dev/null
@@ -1,172 +0,0 @@
-import { readFileSync } from 'node:fs';
-import { homedir } from 'node:os';
-import { resolve } from 'node:path';
-
-import { AuthStorage } from '@mariozechner/pi-coding-agent';
-
-export type PiAuthMode = 'env' | 'codex' | 'auto';
-
-export interface PiAuthOptions {
-  provider: string;
-  authMode?: PiAuthMode;
-  apiKeyEnv?: string;
-  apiKeyOverride?: string;
-}
-
-export interface ResolvedApiCredential {
-  apiKey?: string;
-  source?: 'override' | 'env' | 'codex';
-}
-
-const CODEX_ACCESS_TOKEN_REFRESH_SKEW_MS = 120_000;
-
-export function createPiAuthStorage(options?: PiAuthOptions): ReturnType<typeof AuthStorage.create> {
-  const authStorage = AuthStorage.create();
-  if (!options) {
-    return authStorage;
-  }
-
-  const apiKey = resolveApiKey({
-    provider: options.provider,
-    authMode: options.authMode,
-    apiKeyEnv: options.apiKeyEnv,
-    apiKeyOverride: options.apiKeyOverride,
-  });
-  if (apiKey) {
-    authStorage.setRuntimeApiKey(options.provider as never, apiKey);
-  }
-  return authStorage;
-}
-
-export function resolveApiKey(options: PiAuthOptions): string | undefined {
-  return resolveApiCredential(options).apiKey;
-}
-
-export function resolveApiCredential(options: PiAuthOptions): ResolvedApiCredential {
-  if (options.apiKeyOverride) {
-    return { apiKey: options.apiKeyOverride, source: 'override' };
-  }
-
-  const authMode = options.authMode ?? 'env';
-  const envName = options.apiKeyEnv ?? defaultApiKeyEnvForProvider(options.provider);
-  const envApiKey = envName ? process.env[envName] : undefined;
-
-  if (authMode === 'env') {
-    return envApiKey ? { apiKey: envApiKey, source: 'env' } : {};
-  }
-
-  if (authMode === 'codex') {
-    return readCodexApiKey(options.provider);
-  }
-
-  if (options.authMode !== 'codex' && envApiKey) {
-    return { apiKey: envApiKey, source: 'env' };
-  }
-
-  return readCodexApiKey(options.provider);
-}
-
-export function requireConfiguredApiKey(options: PiAuthOptions): string {
-  const apiKey = resolveApiKey(options);
-  if (apiKey) {
-    return apiKey;
-  }
-
-  const authMode = options.authMode ?? 'env';
-  if (authMode === 'codex') {
-    if (options.provider !== 'openai') {
-      throw new Error(
-        `Codex auth only supports the "openai" provider, got "${options.provider}". ` +
-        `Use authMode: "env" with an appropriate API key env var instead.`,
-      );
-    }
-    throw new Error(
-      `Codex auth is enabled for provider "${options.provider}" but no usable access token or OPENAI_API_KEY was found in ~/.codex/auth.json.`,
-    );
-  }
-
-  if (authMode === 'auto' && options.provider === 'openai') {
-    const envName = options.apiKeyEnv ?? 'OPENAI_API_KEY';
-    throw new Error(
-      `Could not resolve auth for provider "${options.provider}". ` +
-      `Checked env var "${envName}" and ~/.codex/auth.json for a browser-login access token or OPENAI_API_KEY.`,
-    );
-  }
-
-  const envName = options.apiKeyEnv ?? defaultApiKeyEnvForProvider(options.provider);
-  if (!envName) {
-    throw new Error(`No default API key env var is known for provider "${options.provider}"`);
-  }
-
-  throw new Error(`Missing API key env var: ${envName}`);
-}
-
-function readCodexApiKey(provider: string): ResolvedApiCredential {
-  if (provider !== 'openai') {
-    return {};
-  }
-
-  const authPath = resolve(homedir(), '.codex', 'auth.json');
-  let raw: string;
-  try {
-    raw = readFileSync(authPath, 'utf-8');
-  } catch {
-    return {};
-  }
-
-  try {
-    const parsed = JSON.parse(raw) as {
-      OPENAI_API_KEY?: unknown;
-      tokens?: { OPENAI_API_KEY?: unknown; access_token?: unknown };
-    };
-    // Browser-login JWT takes highest priority: it represents an active user session.
-    // A stale static key must not shadow a valid browser-login token.
-    // However, an expired JWT must not shadow a valid static key — fall through on expiry.
-    if (typeof parsed.tokens?.access_token === 'string' && parsed.tokens.access_token.trim()) {
-      if (!isJwtExpired(parsed.tokens.access_token)) {
-        return { apiKey: parsed.tokens.access_token, source: 'codex' };
-      }
-      // JWT is expired — fall through to static key fallbacks below
-    }
-    if (typeof parsed.tokens?.OPENAI_API_KEY === 'string' && parsed.tokens.OPENAI_API_KEY.trim()) {
-      return { apiKey: parsed.tokens.OPENAI_API_KEY, source: 'env' };
-    }
-    if (typeof parsed.OPENAI_API_KEY === 'string' && parsed.OPENAI_API_KEY.trim()) {
-      return { apiKey: parsed.OPENAI_API_KEY, source: 'env' };
-    }
-    return {};
-  } catch {
-    return {};
-  }
-}
-
-function isJwtExpired(token: string): boolean {
-  const parts = token.split('.');
-  if (parts.length !== 3) {
-    return true;
-  }
-
-  try {
-    const payload = JSON.parse(Buffer.from(parts[1]!, 'base64url').toString('utf-8')) as { exp?: unknown };
-    if (typeof payload.exp !== 'number') {
-      return false;
-    }
-    return payload.exp * 1000 <= Date.now() + CODEX_ACCESS_TOKEN_REFRESH_SKEW_MS;
-  } catch {
-    return false;
-  }
-}
-
-function defaultApiKeyEnvForProvider(provider: string): string | undefined {
-  switch (provider) {
-    case 'openai':
-    case 'openai-codex':
-      return 'OPENAI_API_KEY';
-    case 'anthropic':
-      return 'ANTHROPIC_API_KEY';
-    case 'openrouter':
-      return 'OPENROUTER_API_KEY';
-    default:
-      return undefined;
-  }
-}
diff --git a/src/runtime/pi/coding-orchestrator.ts b/src/runtime/pi/coding-orchestrator.ts
deleted file mode 100644
index ae8f16d..0000000
--- a/src/runtime/pi/coding-orchestrator.ts
+++ /dev/null
@@ -1,33 +0,0 @@
-import {
-  SessionManager,
-  createAgentSession,
-  createCodingTools,
-} from '@mariozechner/pi-coding-agent';
-
-import { parseModelRef } from '../../project/types.js';
-import { resolvePiModel } from './models.js';
-import type { PiAuthMode } from './auth.js';
-
-export async function createCodingOrchestratorSession(params: {
-  cwd: string;
-  modelRef: string;
-  authMode?: PiAuthMode;
-  apiKeyEnv?: string;
-  thinkingLevel?: 'off' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh';
-}) {
-  const { provider, model } = parseModelRef(params.modelRef);
-  const resolved = await resolvePiModel(provider, model, {
-    authMode: params.authMode,
-    apiKeyEnv: params.apiKeyEnv,
-  });
-
-  return createAgentSession({
-    cwd: params.cwd,
-    model: resolved.model,
-    thinkingLevel: params.thinkingLevel ?? 'medium',
-    authStorage: resolved.authStorage,
-    modelRegistry: resolved.modelRegistry,
-    tools: createCodingTools(params.cwd),
-    sessionManager: SessionManager.inMemory(),
-  });
-}
diff --git a/src/runtime/pi/index.ts b/src/runtime/pi/index.ts
deleted file mode 100644
index b85353c..0000000
--- a/src/runtime/pi/index.ts
+++ /dev/null
@@ -1,6 +0,0 @@
-export { createPiAuthStorage, requireConfiguredApiKey, resolveApiCredential, resolveApiKey } from './auth.js';
-export { createCodingOrchestratorSession } from './coding-orchestrator.js';
-export { resolvePiModel, resolvePiModelByRef } from './models.js';
-
-export type { PiAuthMode, PiAuthOptions } from './auth.js';
-export type { ResolvedPiModelRequest } from './models.js';
diff --git a/src/runtime/pi/models.ts b/src/runtime/pi/models.ts
deleted file mode 100644
index d4c378b..0000000
--- a/src/runtime/pi/models.ts
+++ /dev/null
@@ -1,125 +0,0 @@
-import type { Api, Model } from '@mariozechner/pi-ai';
-import { getModel } from '@mariozechner/pi-ai';
-import { ModelRegistry } from '@mariozechner/pi-coding-agent';
-
-import { createPiAuthStorage, resolveApiCredential } from './auth.js';
-import type { PiAuthMode } from './auth.js';
-import { parseModelRef } from '../../project/types.js';
-
-export interface ResolvedPiModelRequest {
-  model: Model<Api>;
-  authStorage: ReturnType<typeof createPiAuthStorage>;
-  modelRegistry: ReturnType<typeof ModelRegistry.create>;
-  auth: {
-    apiKey?: string;
-    headers?: Record<string, string>;
-  };
-}
-
-export async function resolvePiModelByRef(
-  modelRef: string,
-  options?: { authMode?: PiAuthMode; apiKeyEnv?: string; apiKeyOverride?: string },
-): Promise<ResolvedPiModelRequest> {
-  const { provider, model } = parseModelRef(modelRef);
-  return resolvePiModel(provider, model, options);
-}
-
-/**
- * Synthesizes a Model entry for OpenRouter models that are not pre-registered in pi-ai.
- * OpenRouter exposes an OpenAI-compatible completions API, so any model routed through
- * it can use the "openai-completions" api type with openrouter.ai/api/v1 as the base URL.
- * The model ID passed to OpenRouter is the portion after "openrouter/" in the full ref.
- */
-function synthesizeOpenRouterModel(provider: string, modelName: string): Model<'openai-completions'> | undefined {
-  if (provider !== 'openrouter') return undefined;
-  return {
-    id: modelName,
-    name: modelName,
-    api: 'openai-completions' as const,
-    provider: 'openrouter' as const,
-    baseUrl: 'https://openrouter.ai/api/v1',
-    reasoning: false,
-    input: ['text'] as ('text' | 'image')[],
-    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
-    contextWindow: 128000,
-    maxTokens: 16384,
-  };
-}
-
-function synthesizeOpenAICodexModel(provider: string, modelName: string): Model<'openai-codex-responses'> | undefined {
-  if (provider !== 'openai-codex') return undefined;
-  return {
-    id: modelName,
-    name: modelName,
-    api: 'openai-codex-responses' as const,
-    provider: 'openai-codex' as const,
-    baseUrl: 'https://chatgpt.com/backend-api/codex',
-    reasoning: true,
-    input: ['text', 'image'] as ('text' | 'image')[],
-    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
-    contextWindow: 272000,
-    maxTokens: 32768,
-  };
-}
-
-export async function resolvePiModel(
-  provider: string,
-  modelName: string,
-  options?: { authMode?: PiAuthMode; apiKeyEnv?: string; apiKeyOverride?: string },
-): Promise<ResolvedPiModelRequest> {
-  // Guard: direct-provider model + OpenRouter key = guaranteed 401.
-  // Catches stale scaffolded configs that had apiKeyEnv:"OPENROUTER_API_KEY" and were
-  // later updated to use a direct openai/ or anthropic/ model without fixing the key.
-  if (provider === 'anthropic' && options?.apiKeyEnv === 'OPENROUTER_API_KEY') {
-    throw new Error(
-      `Model "${provider}/${modelName}" routes through the Anthropic API directly and requires ANTHROPIC_API_KEY. ` +
-      `To use Claude via OpenRouter, change the model ID to "openrouter/anthropic/${modelName}".`,
-    );
-  }
-  if (provider === 'openai' && options?.apiKeyEnv === 'OPENROUTER_API_KEY') {
-    throw new Error(
-      `Model "${provider}/${modelName}" routes through the OpenAI API directly and requires OPENAI_API_KEY. ` +
-      `To use this model via OpenRouter, change the model ID to "openrouter/openai/${modelName}".`,
-    );
-  }
-
-  const credential = resolveApiCredential({
-    provider,
-    authMode: options?.authMode,
-    apiKeyEnv: options?.apiKeyEnv,
-    apiKeyOverride: options?.apiKeyOverride,
-  });
-  const resolvedProvider = provider === 'openai' && credential.source === 'codex'
-    ? 'openai-codex'
-    : provider;
-  const authStorage = createPiAuthStorage({
-    provider: resolvedProvider,
-    authMode: options?.authMode,
-    apiKeyEnv: options?.apiKeyEnv,
-    apiKeyOverride: credential.apiKey,
-  });
-  const modelRegistry = ModelRegistry.create(authStorage);
-  const registryModel = modelRegistry.find(resolvedProvider, modelName);
-  const resolvedModel = registryModel
-    ?? getModel(resolvedProvider as never, modelName)
-    ?? synthesizeOpenRouterModel(resolvedProvider, modelName)
-    ?? synthesizeOpenAICodexModel(resolvedProvider, modelName);
-  if (!resolvedModel) {
-    throw new Error(`Could not resolve pi model ${resolvedProvider}/${modelName}`);
-  }
-
-  const auth = await modelRegistry.getApiKeyAndHeaders(resolvedModel);
-  if (!auth.ok) {
-    throw new Error(auth.error);
-  }
-
-  return {
-    model: resolvedModel,
-    authStorage,
-    modelRegistry,
-    auth: {
-      apiKey: auth.apiKey,
-      headers: auth.headers,
-    },
-  };
-}
diff --git a/src/tasks/coverage.ts b/src/tasks/coverage.ts
deleted file mode 100644
index ac791cd..0000000
--- a/src/tasks/coverage.ts
+++ /dev/null
@@ -1,49 +0,0 @@
-import type { ActionDefinition } from '../actions/types.js';
-import type { GeneratedTask } from './types.js';
-import type { CoverageReport } from '../benchmark/types.js';
-
-function actionNamesOf(task: GeneratedTask): string[] {
-  const fromActions = task.expected_actions.map((a) => a.name).filter(Boolean);
-  if (fromActions.length > 0) return fromActions;
-  // Prompt surface: tasks use capabilityId instead of expected_actions.
-  // action.key === action.name for prompt capabilities, so capabilityId matches.
-  return task.capabilityId ? [task.capabilityId] : [];
-}
-
-export function computeCoverage(
-  actions: ActionDefinition[],
-  tasks: GeneratedTask[],
-  outOfScopeActions: ActionDefinition[] = [],
-): CoverageReport {
-  const tasksPerAction: Record<string, number> = {};
-  for (const action of actions) tasksPerAction[action.name] = 0;
-  for (const task of tasks) {
-    for (const name of actionNamesOf(task)) {
-      if (name in tasksPerAction) tasksPerAction[name] += 1;
-    }
-  }
-  const covered = actions.filter((a) => tasksPerAction[a.name] > 0).map((a) => a.name);
-  const uncovered = actions.filter((a) => tasksPerAction[a.name] === 0).map((a) => a.name);
-  return {
-    inScopeActions: actions.map((a) => a.name),
-    outOfScopeActions: outOfScopeActions.map((a) => a.name),
-    coveredActions: covered,
-    uncoveredActions: uncovered,
-    tasksPerAction,
-    coverageViolation: uncovered.length > 0,
-  };
-}
-
-export function computeUncovered(actions: ActionDefinition[], tasks: GeneratedTask[]): string[] {
-  return computeCoverage(actions, tasks).uncoveredActions;
-}
-
-export function buildRetryPrompt(uncovered: string[]): string {
-  return [
-    'The prior pass did not cover these actions. Generate tasks for EACH of them.',
-    'Exactly one task per action minimum. Use only arguments documented in the surface snapshot.',
-    '',
-    'Uncovered actions:',
-    ...uncovered.map((name) => `- ${name}`),
-  ].join('\n');
-}
diff --git a/src/tasks/default-pi-critic.ts b/src/tasks/default-pi-critic.ts
deleted file mode 100644
index ea0e6ab..0000000
--- a/src/tasks/default-pi-critic.ts
+++ /dev/null
@@ -1,41 +0,0 @@
-import type { CriticDeps } from '../verdict/recommendations.js';
-import { piSimpleComplete, NoTextBlocksError } from './pi-simple-complete.js';
-import type { PiAuthMode } from '../runtime/pi/auth.js';
-
-export interface DefaultPiCriticOptions {
-  provider: string;
-  model: string;
-  authMode?: PiAuthMode;
-  apiKeyEnv?: string;
-  timeoutMs?: number;
-  headers?: Record<string, string>;
-}
-
-export function createDefaultPiCritic(options: DefaultPiCriticOptions): CriticDeps {
-  return {
-    async complete(input) {
-      try {
-        return await piSimpleComplete(
-          {
-            provider: options.provider,
-            model: options.model,
-            authMode: options.authMode,
-            apiKeyEnv: options.apiKeyEnv,
-            timeoutMs: options.timeoutMs,
-            headers: options.headers,
-            reasoning: 'minimal',
-          },
-          { system: input.system, prompt: input.prompt },
-        );
-      } catch (err) {
-        // A model that returns no text blocks is treated as "no recommendations"
-        // rather than a hard failure — the verdict flow continues with an empty list.
-        // Real provider errors (stopReason === 'error') are re-thrown.
-        if (err instanceof NoTextBlocksError) {
-          return '[]';
-        }
-        throw err;
-      }
-    },
-  };
-}
diff --git a/src/tasks/default-pi-generator.ts b/src/tasks/default-pi-generator.ts
deleted file mode 100644
index a17bef5..0000000
--- a/src/tasks/default-pi-generator.ts
+++ /dev/null
@@ -1,36 +0,0 @@
-import type { SimpleStreamOptions } from '@mariozechner/pi-ai';
-
-import type { TaskGeneratorDeps } from './types.js';
-import { piSimpleComplete } from './pi-simple-complete.js';
-import type { PiAuthMode } from '../runtime/pi/auth.js';
-
-type ThinkingLevel = NonNullable<SimpleStreamOptions['reasoning']>;
-
-export interface DefaultPiGeneratorOptions {
-  provider: string;
-  model: string;
-  authMode?: PiAuthMode;
-  apiKeyEnv?: string;
-  timeoutMs?: number;
-  headers?: Record<string, string>;
-  thinkingLevel?: ThinkingLevel;
-}
-
-export function createDefaultPiTaskGenerator(options: DefaultPiGeneratorOptions): TaskGeneratorDeps {
-  return {
-    async complete(input) {
-      return await piSimpleComplete(
-        {
-          provider: options.provider,
-          model: options.model,
-          authMode: options.authMode,
-          apiKeyEnv: options.apiKeyEnv,
-          timeoutMs: options.timeoutMs,
-          headers: options.headers,
-          reasoning: options.thinkingLevel ?? 'minimal',
-        },
-        { system: input.system, prompt: input.prompt },
-      );
-    },
-  };
-}
diff --git a/src/tasks/discover.ts b/src/tasks/discover.ts
deleted file mode 100644
index 5b7c16b..0000000
--- a/src/tasks/discover.ts
+++ /dev/null
@@ -1,27 +0,0 @@
-import { readFileSync } from 'node:fs';
-
-import { buildSurfaceSnapshot, loadProjectConfig } from '../project/index.js';
-
-import type { DiscoveredTaskSurface } from './types.js';
-
-export async function discoverTaskSurface(configPath: string): Promise<DiscoveredTaskSurface> {
-  const project = await loadProjectConfig(configPath);
-  const skillPath = project.target.skill?.source;
-  if (!skillPath) {
-    throw new Error('Project config must define target.skill for task generation');
-  }
-
-  let skillMarkdown: string;
-  try {
-    skillMarkdown = readFileSync(skillPath, 'utf-8');
-  } catch (error) {
-    throw new Error(`Could not read skill markdown from ${skillPath}: ${error instanceof Error ? error.message : String(error)}`);
-  }
-
-  return {
-    project,
-    skillMarkdown,
-    skillPath,
-    snapshot: buildSurfaceSnapshot(project),
-  };
-}
diff --git a/src/tasks/freeze.ts b/src/tasks/freeze.ts
deleted file mode 100644
index 94a32b3..0000000
--- a/src/tasks/freeze.ts
+++ /dev/null
@@ -1,80 +0,0 @@
-import { mkdirSync, writeFileSync } from 'node:fs';
-import { join, resolve } from 'node:path';
-
-import { fromSurfaceSnapshot, writeActionSnapshotFile } from '../actions/snapshot.js';
-import type { ResolvedProjectConfig, SurfaceSnapshot } from '../project/types.js';
-
-import type { FrozenTaskArtifacts, GeneratedTask } from './types.js';
-
-export interface FreezeTaskArtifactsParams {
-  project: ResolvedProjectConfig;
-  snapshot: SurfaceSnapshot;
-  outputDir: string;
-  kept: GeneratedTask[];
-  rejected: Array<{ task: GeneratedTask; reason: string }>;
-}
-
-export function freezeTaskArtifacts(params: FreezeTaskArtifactsParams): FrozenTaskArtifacts {
-  if (!params.project.target.skill?.source) {
-    throw new Error('Project config must define target.skill before freezing generated benchmark');
-  }
-
-  const outputDir = resolve(params.outputDir);
-  mkdirSync(outputDir, { recursive: true });
-
-  const tasksPath = join(outputDir, 'tasks.generated.json');
-  const benchmarkPath = join(outputDir, 'benchmark.generated.json');
-  const logPath = join(outputDir, 'generation.log.json');
-  const snapshotPath = join(outputDir, 'surface.snapshot.json');
-
-  writeFileSync(tasksPath, JSON.stringify({ tasks: params.kept }, null, 2), 'utf-8');
-  writeActionSnapshotFile(snapshotPath, fromSurfaceSnapshot(params.snapshot));
-
-  const generatedProject = {
-    name: params.project.name,
-    target: {
-      surface: params.project.target.surface,
-      repoPath: params.project.target.repoPath,
-      skill: {
-        source: params.project.target.skill.source,
-        cache: params.project.target.skill.cache,
-      },
-      discovery: params.project.target.discovery,
-      sdk: params.project.target.sdk,
-      cli: params.project.target.cli,
-      mcp: params.project.target.mcp,
-      scope: params.project.target.scope,
-    },
-    benchmark: {
-      format: params.project.benchmark.format,
-      baseUrl: params.project.benchmark.baseUrl,
-      authMode: params.project.benchmark.authMode,
-      apiKeyEnv: params.project.benchmark.apiKeyEnv,
-      timeout: params.project.benchmark.timeout,
-      headers: params.project.benchmark.headers,
-      models: params.project.benchmark.models,
-      tasks: tasksPath,
-      surfaceSnapshot: snapshotPath,
-      verdict: params.project.benchmark.verdict,
-      taskGeneration: {
-        enabled: false,
-        maxTasks: params.project.benchmark.taskGeneration.maxTasks,
-        seed: params.project.benchmark.taskGeneration.seed,
-        outputDir: params.project.benchmark.taskGeneration.outputDir,
-      },
-      output: params.project.benchmark.output,
-      agentic: params.project.benchmark.agentic,
-    },
-  };
-  writeFileSync(benchmarkPath, JSON.stringify(generatedProject, null, 2), 'utf-8');
-
-  writeFileSync(logPath, JSON.stringify({
-    benchmarkConfigPath: params.project.configPath,
-    generatedAt: new Date().toISOString(),
-    keptCount: params.kept.length,
-    rejectedCount: params.rejected.length,
-    rejected: params.rejected.map((entry) => ({ id: entry.task.id, reason: entry.reason })),
-  }, null, 2), 'utf-8');
-
-  return { tasksPath, benchmarkPath, logPath, snapshotPath };
-}
diff --git a/src/tasks/generate.ts b/src/tasks/generate.ts
deleted file mode 100644
index 7b3389b..0000000
--- a/src/tasks/generate.ts
+++ /dev/null
@@ -1,325 +0,0 @@
-import { createHash } from 'node:crypto';
-
-import type { ExpectedAction, CoverageReport } from '../benchmark/types.js';
-
-import type { ActionDefinition } from '../actions/types.js';
-import { computeUncovered, buildRetryPrompt, computeCoverage } from './coverage.js';
-import type { DiscoveredTaskSurface, GeneratedTask, TaskGeneratorConfig, TaskGeneratorDeps } from './types.js';
-
-// Derive a stable task ID from the expected action names.
-// Action names are surface-stable (they come from discovered code, not LLM free-form output),
-// so the same surface produces the same IDs across regenerations.
-function stableTaskId(actionNames: string[]): string {
-  const key = [...actionNames].sort().join('\x00');
-  return createHash('sha1').update(key).digest('hex').slice(0, 12);
-}
-
-export async function generateCandidateTasks(
-  surface: DiscoveredTaskSurface,
-  config: TaskGeneratorConfig,
-  deps: TaskGeneratorDeps,
-): Promise<GeneratedTask[]> {
-  const system = [
-    `You generate ${surface.snapshot.surface.toUpperCase()} benchmark tasks.`,
-    'Output strict JSON only with no markdown and no extra prose.',
-    'Never invent action names or arguments that are not present in the provided discovered surface snapshot.',
-  ].join(' ');
-
-  const prompt = buildPrompt(surface, config);
-  const completion = await deps.complete({ system, prompt });
-
-  // For prompt surface, pass the known capability keys so parseGeneratedTasks
-  // can attach capabilityId to each task; membership validation is in ground.ts.
-  const knownCapabilityKeys = surface.snapshot.surface === 'prompt'
-    ? surface.snapshot.actions.map((a) => a.name)
-    : undefined;
-
-  const tasks = parseGeneratedTasks(completion, knownCapabilityKeys);
-  return tasks.slice(0, Math.max(1, Math.floor(config.maxTasks)));
-}
-
-function buildPrompt(surface: DiscoveredTaskSurface, config: TaskGeneratorConfig): string {
-  const clampedMax = Math.max(1, Math.floor(config.maxTasks));
-
-  if (surface.snapshot.surface === 'prompt') {
-    const capKeys = surface.snapshot.actions.map((a) => a.name);
-    return [
-      'Generate benchmark evaluation tasks for a prompt/skill document.',
-      'These tasks will be evaluated by content quality, not action matching.',
-      '',
-      'Return a JSON object with EXACTLY this shape:',
-      '{"tasks":[{"id":"string","prompt":"string","expected_actions":[],"capabilityId":"string"}]}',
-      '',
-      'RULES:',
-      '- Each task has EXACTLY four keys: id, prompt, expected_actions, capabilityId.',
-      '- expected_actions MUST always be an empty array [].',
-      '- id: short snake_case identifier (e.g. "deploy_service_to_staging").',
-      '- prompt: ask the model to perform a realistic task from the skill.',
-      '- capabilityId: set to the action key of the discovered capability this task exercises.',
-      `- Valid capabilityId values: ${capKeys.join(', ')}.`,
-      '- Every task MUST have a capabilityId from the valid list above — no other values are accepted.',
-      `- Produce at most ${clampedMax} tasks. Seed: ${config.seed}.`,
-      '',
-      'Full SKILL.md:',
-      '---BEGIN SKILL---',
-      surface.skillMarkdown,
-      '---END SKILL---',
-      '',
-      'Discovered prompt surface snapshot (capabilities for reference):',
-      '---BEGIN SURFACE SNAPSHOT---',
-      JSON.stringify(surface.snapshot, null, 2),
-      '---END SURFACE SNAPSHOT---',
-    ].join('\n');
-  }
-
-  return [
-    `Generate benchmark tasks for a ${surface.snapshot.surface} callable surface.`,
-    '',
-    'Return a JSON object with EXACTLY this shape and no other keys:',
-    '{"tasks":[{"id":"string","prompt":"string","expected_actions":[{"name":"string","args":{"key":"value"}}]}]}',
-    '',
-    'STRICT SCHEMA RULES - violations cause test failures:',
-    '- Each task object has EXACTLY three keys: id, prompt, expected_actions.',
-    '- Do NOT add keys like: cli_command, instruction, action, description, expected_outcome, expected_args, source, steps, calls.',
-    '- expected_actions is an ARRAY of objects, each with exactly two keys: name and args.',
-    '- name is the action name string (e.g. "account create", "network list").',
-    '- args is a flat object of key-value argument pairs (e.g. {"name": "my-wallet"}).',
-    '',
-    `Task count limit: produce at most ${clampedMax} tasks.`,
-    `Seed for deterministic variety: ${config.seed}.`,
-    'Variety requirement: include a mix of simple, medium, and multi-step tasks using different actions and argument combinations.',
-    '',
-    'Additional rules:',
-    '1) Use ONLY action names that exist in the provided discovered surface snapshot.',
-    '2) For each expected_actions entry, args keys MUST match discovered action argument names.',
-    '3) Include required params in args when an action marks them as required.',
-    '4) expected_actions must never be empty.',
-    '',
-    'Full SKILL.md:',
-    '---BEGIN SKILL---',
-    surface.skillMarkdown,
-    '---END SKILL---',
-    '',
-    `Discovered ${surface.snapshot.surface} surface snapshot:`,
-    '---BEGIN SURFACE SNAPSHOT---',
-    JSON.stringify(surface.snapshot, null, 2),
-    '---END SURFACE SNAPSHOT---',
-  ].join('\n');
-}
-
-function stripCodeFence(raw: string): string {
-  const trimmed = raw.trim();
-  const match = trimmed.match(/^```(?:json)?\s*\n([\s\S]*?)\n```\s*$/);
-  return match ? match[1].trim() : trimmed;
-}
-
-function parseGeneratedTasks(raw: string, knownCapabilityKeys?: string[]): GeneratedTask[] {
-  let parsed: unknown;
-  try {
-    parsed = JSON.parse(stripCodeFence(raw));
-  } catch (error) {
-    throw new Error(`Task generator returned invalid JSON: ${error instanceof Error ? error.message : String(error)}`);
-  }
-
-  if (!parsed || typeof parsed !== 'object') {
-    throw new Error('Task generator response must be a JSON object');
-  }
-
-  const tasks = (parsed as { tasks?: unknown }).tasks;
-  if (!Array.isArray(tasks)) {
-    throw new Error('Task generator response must contain a top-level "tasks" array');
-  }
-
-  const validated = tasks.map((task, index) => validateTask(task, index, knownCapabilityKeys));
-
-  // Sort by (id, prompt) before deduplication so the numeric suffix assigned to
-  // colliding IDs is determined by content order, not by the LLM's output order.
-  // Without this sort, swapping two same-action tasks between runs would swap their
-  // suffixes (e.g. id-1 and id-2), making --task filters unstable for multi-variant cases.
-  validated.sort((a, b) => a.id < b.id ? -1 : a.id > b.id ? 1 : a.prompt < b.prompt ? -1 : 1);
-
-  // Deduplicate IDs: two tasks with the same action-name set get a numeric suffix.
-  const seen = new Map<string, number>();
-  return validated.map(task => {
-    const n = seen.get(task.id) ?? 0;
-    seen.set(task.id, n + 1);
-    return n > 0 ? { ...task, id: `${task.id}-${n}` } : task;
-  });
-}
-
-function resolveStringField(obj: Record<string, unknown>, ...keys: string[]): string | null {
-  for (const key of keys) {
-    if (typeof obj[key] === 'string' && (obj[key] as string).trim() !== '') {
-      return (obj[key] as string).trim();
-    }
-  }
-  return null;
-}
-
-/**
- * Last-resort prompt recovery: pick the longest non-empty string value in the object.
- * Models routinely invent field names (name, command, task_description, …).
- * Rather than maintaining an ever-growing alias list, we grab whatever string is there.
- * Longest wins because action names tend to be short while natural-language prompts are longer.
- */
-function pickLongestStringValue(obj: Record<string, unknown>): string | null {
-  let best: string | null = null;
-  for (const val of Object.values(obj)) {
-    if (typeof val === 'string') {
-      const trimmed = val.trim();
-      if (trimmed && (!best || trimmed.length > best.length)) {
-        best = trimmed;
-      }
-    }
-  }
-  return best;
-}
-
-function validateTask(task: unknown, index: number, knownCapabilityKeys?: string[]): GeneratedTask {
-  if (!task || typeof task !== 'object') {
-    throw new Error(`Task at index ${index} must be an object`);
-  }
-
-  const candidate = task as Record<string, unknown>;
-
-  // Resolve prompt — try known aliases first, then fall back to any string in the object.
-  // Models frequently invent field names; we recover rather than crash.
-  const taskPrompt =
-    resolveStringField(candidate, 'prompt', 'user_prompt', 'description', 'instruction', 'task', 'action', 'method', 'name', 'command') ??
-    pickLongestStringValue(candidate);
-
-  // Resolve expected_actions before computing the ID so action names can anchor the ID.
-  // LLM-supplied IDs are intentionally ignored — they vary across runs for the same task,
-  // breaking --task filters after regeneration. For SDK/CLI/MCP surfaces, action names come
-  // from the surface definition and are stable across runs. Prompt-surface tasks have no
-  // actions (expected_actions is always []), so they fall back to hashing the prompt text.
-  let rawExpectedActions = (
-    ['expected_actions', 'actions', 'steps', 'calls', 'expected_calls', 'tool_calls', 'cli_command'] as const
-  )
-    .map((key) => candidate[key])
-    .find((v) => Array.isArray(v)) as unknown[] | undefined;
-
-  // Fallback: model returned a single action at task level (e.g. {action:"send", args:{...}})
-  if (!rawExpectedActions) {
-    const actionName =
-      typeof candidate['action'] === 'string' ? candidate['action'] :
-      typeof candidate['command'] === 'string' ? candidate['command'] : null;
-    if (actionName && actionName.trim()) {
-      rawExpectedActions = [{ name: actionName.trim(), args: candidate['args'] }];
-    }
-  }
-
-  const actionNamesForId = (rawExpectedActions ?? [])
-    .filter((a): a is Record<string, unknown> => !!a && typeof a === 'object')
-    .map(a => (typeof a['name'] === 'string' ? a['name'].trim() : ''))
-    .filter(Boolean);
-
-  const taskId =
-    actionNamesForId.length > 0 ? stableTaskId(actionNamesForId)
-    : taskPrompt ? stableTaskId([taskPrompt])
-    : `task-${index}`;
-
-  if (!taskPrompt) {
-    // Only reachable if the object has no string values at all.
-    const received = JSON.stringify(Object.keys(candidate));
-    throw new Error(`Task ${taskId} must include a non-empty string prompt (received keys: ${received})`);
-  }
-
-  if (!rawExpectedActions && knownCapabilityKeys !== undefined) {
-    rawExpectedActions = [];
-  }
-
-  if (!rawExpectedActions) {
-    const received = JSON.stringify(Object.keys(candidate));
-    throw new Error(`Task ${taskId} must include an expected_actions array (received keys: ${received})`);
-  }
-
-  const expected_actions = rawExpectedActions.map((action, actionIndex) => validateExpectedAction(taskId, action, actionIndex));
-
-  // Extract capabilityId for prompt-surface tasks. The field is stored as-is here;
-  // grounding validates it against the known capability keys and rejects bad values.
-  const rawCapabilityId = typeof candidate['capabilityId'] === 'string' ? candidate['capabilityId'].trim() : undefined;
-  const capabilityId = knownCapabilityKeys !== undefined && rawCapabilityId ? rawCapabilityId : undefined;
-
-  return {
-    id: taskId,
-    prompt: taskPrompt,
-    expected_actions,
-    ...(capabilityId !== undefined ? { capabilityId } : {}),
-  };
-}
-
-function validateExpectedAction(taskId: string, action: unknown, actionIndex: number): ExpectedAction {
-  if (!action || typeof action !== 'object') {
-    throw new Error(`Task ${taskId} expected_actions[${actionIndex}] must be an object`);
-  }
-
-  const typed = action as { name?: unknown; args?: unknown };
-  const name = typeof typed.name === 'string' ? typed.name : null;
-  if (!name || name.trim() === '') {
-    throw new Error(`Task ${taskId} expected_actions[${actionIndex}] must include a non-empty name`);
-  }
-
-  if (typed.args !== undefined && (!typed.args || typeof typed.args !== 'object' || Array.isArray(typed.args))) {
-    throw new Error(`Task ${taskId} expected_actions[${actionIndex}] args must be an object when present`);
-  }
-
-  return {
-    name,
-    args: typed.args as Record<string, unknown> | undefined,
-  };
-}
-
-export async function generateCandidateTasksWithCoverage(
-  surface: DiscoveredTaskSurface,
-  config: TaskGeneratorConfig,
-  deps: TaskGeneratorDeps,
-  inScopeActions: ActionDefinition[],
-  outOfScopeActions: ActionDefinition[] = [],
-): Promise<{ tasks: GeneratedTask[]; coverage: CoverageReport }> {
-  if (surface.snapshot.surface === 'prompt') {
-    throw new Error('generateCandidateTasksWithCoverage must not be called for prompt surface — use generateCandidateTasks directly');
-  }
-
-  // Iteration 1 — existing one-shot prompt
-  const firstPass = await generateCandidateTasks(surface, config, deps);
-
-  let uncovered = computeUncovered(inScopeActions, firstPass);
-  if (uncovered.length === 0) {
-    return { tasks: firstPass, coverage: computeCoverage(inScopeActions, firstPass, outOfScopeActions) };
-  }
-
-  // Iteration 2 — focused retry for uncovered actions
-  const retrySystem = 'You generate benchmark tasks targeting specific missing actions. JSON only.';
-  const retryPrompt = [
-    buildRetryPrompt(uncovered),
-    '',
-    `Respond with {"tasks":[...]} using the same schema as before.`,
-    '',
-    'Surface snapshot:',
-    '---BEGIN SURFACE SNAPSHOT---',
-    JSON.stringify(surface.snapshot, null, 2),
-    '---END SURFACE SNAPSHOT---',
-  ].join('\n');
-
-  const retryRaw = await deps.complete({ system: retrySystem, prompt: retryPrompt });
-  const retryTasks = parseGeneratedTasks(retryRaw);
-
-  // Dedup by id
-  const byId = new Map<string, GeneratedTask>();
-  for (const t of [...firstPass, ...retryTasks]) {
-    if (!byId.has(t.id)) byId.set(t.id, t);
-  }
-  const combined = [...byId.values()];
-
-  uncovered = computeUncovered(inScopeActions, combined);
-  if (uncovered.length > 0) {
-    throw new Error(
-      `Task generation could not cover ${uncovered.length} in-scope action(s) after 2 iterations: ` +
-        `${uncovered.join(', ')}. ` +
-        `Improve SKILL.md guidance for these actions, or add them to target.scope.exclude.`,
-    );
-  }
-
-  return { tasks: combined, coverage: computeCoverage(inScopeActions, combined, outOfScopeActions) };
-}
diff --git a/src/tasks/ground.ts b/src/tasks/ground.ts
deleted file mode 100644
index 5b97f6d..0000000
--- a/src/tasks/ground.ts
+++ /dev/null
@@ -1,82 +0,0 @@
-import { getExpectedActionName } from '../benchmark/types.js';
-import type { SurfaceSnapshot } from '../project/types.js';
-
-import type { GeneratedTask, GroundedTasksResult } from './types.js';
-
-export function groundTasks(tasks: GeneratedTask[], snapshot: SurfaceSnapshot): GroundedTasksResult {
-  const kept: GeneratedTask[] = [];
-  const rejected: Array<{ task: GeneratedTask; reason: string }> = [];
-
-  const seenIds = new Set<string>();
-  const actions = new Map(snapshot.actions.map((action) => [action.name, action]));
-
-  for (const task of tasks) {
-    const rejection = getRejectionReason(task, seenIds, actions, snapshot.surface);
-    if (rejection) {
-      rejected.push({ task, reason: rejection });
-      continue;
-    }
-
-    seenIds.add(task.id);
-    kept.push(task);
-  }
-
-  return { kept, rejected };
-}
-
-function getRejectionReason(
-  task: GeneratedTask,
-  seenIds: Set<string>,
-  actions: Map<string, SurfaceSnapshot['actions'][number]>,
-  surface: SurfaceSnapshot['surface'],
-): string | null {
-  const expectedActions = task.expected_actions;
-  if (seenIds.has(task.id)) {
-    return `duplicate task id "${task.id}"`;
-  }
-
-  // Prompt surface tasks must have expected_actions: [] — evaluated on content, not tool calls.
-  // They must also carry a valid capabilityId referencing a known discovered capability.
-  if (surface === 'prompt') {
-    if (expectedActions.length > 0) {
-      return `prompt task "${task.id}" must have empty expected_actions, got ${expectedActions.length}`;
-    }
-    const knownKeys = [...actions.keys()];
-    if (!task.capabilityId) {
-      return `prompt task "${task.id}" is missing capabilityId (known: ${knownKeys.join(', ')})`;
-    }
-    if (!actions.has(task.capabilityId)) {
-      return `prompt task "${task.id}" has unknown capabilityId "${task.capabilityId}" (known: ${knownKeys.join(', ')})`;
-    }
-    return null;
-  }
-
-  if (expectedActions.length === 0) {
-    return `task "${task.id}" has empty expected_actions`;
-  }
-
-  for (const expectedAction of expectedActions) {
-    const actionName = getExpectedActionName(expectedAction);
-    const action = actions.get(actionName);
-    if (!action) {
-      return `task "${task.id}" uses unknown method/action "${actionName}"`;
-    }
-
-    const args = expectedAction.args ?? {};
-    const allowedKeys = new Set(action.args.map((arg) => arg.name));
-
-    for (const key of Object.keys(args)) {
-      if (!allowedKeys.has(key)) {
-        return `task "${task.id}" uses unknown arg key "${key}" for action "${actionName}"`;
-      }
-    }
-
-    for (const requiredArg of action.args.filter((arg) => arg.required)) {
-      if (!(requiredArg.name in args)) {
-        return `task "${task.id}" is missing required param "${requiredArg.name}" for action "${actionName}"`;
-      }
-    }
-  }
-
-  return null;
-}
diff --git a/src/tasks/index.ts b/src/tasks/index.ts
deleted file mode 100644
index c5b4208..0000000
--- a/src/tasks/index.ts
+++ /dev/null
@@ -1,123 +0,0 @@
-import { discoverTaskSurface } from './discover.js';
-import { freezeTaskArtifacts } from './freeze.js';
-import { generateCandidateTasks, generateCandidateTasksWithCoverage } from './generate.js';
-import { groundTasks } from './ground.js';
-import { resolveScope } from './scope.js';
-import { computeCoverage } from './coverage.js';
-
-import type { GenerateTasksForProjectResult, GeneratedTask, TaskGeneratorDeps } from './types.js';
-import { buildSurfaceSnapshot } from '../project/index.js';
-import type { ResolvedProjectConfig } from '../project/types.js';
-import type { SurfaceSnapshotAction } from '../project/types.js';
-
-export * from './default-pi-critic.js';
-export * from './default-pi-generator.js';
-export * from './discover.js';
-export * from './freeze.js';
-export * from './generate.js';
-export * from './ground.js';
-export * from './scope.js';
-export * from './types.js';
-
-export function discoverActionsOnly(project: ResolvedProjectConfig): SurfaceSnapshotAction[] {
-  const snapshot = buildSurfaceSnapshot(project);
-  return snapshot.actions;
-}
-
-export async function generateTasksForProject(
-  params: {
-    configPath: string;
-    maxTasks: number;
-    seed: number;
-    outputDir: string;
-    deps: TaskGeneratorDeps;
-  },
-): Promise<GenerateTasksForProjectResult> {
-  console.log('[optimize] Discovering surface for task generation...');
-  const surface = await discoverTaskSurface(params.configPath);
-  console.log(`[optimize] Loaded ${surface.snapshot.surface} surface with ${surface.snapshot.actions.length} actions.`);
-
-  const { inScope, outOfScope } = resolveScope(surface.snapshot.actions, surface.project.target.scope);
-  if (inScope.length === 0) {
-    console.warn(
-      `[warn] Discovery found 0 in-scope actions for surface "${surface.snapshot.surface}".` +
-      ` If using CLI surface, add target.cli.commands in your config.` +
-      ` Run 'npx skill-optimizer doctor' for a full diagnosis.`
-    );
-  }
-  console.log(`[optimize] Scope filter: ${inScope.length} in scope, ${outOfScope.length} out of scope.`);
-
-  const maxTasks = params.maxTasks;
-  if (surface.snapshot.surface !== 'prompt' && maxTasks < inScope.length) {
-    throw new Error(
-      `benchmark.taskGeneration.maxTasks (${maxTasks}) is smaller than in-scope action count (${inScope.length}). ` +
-        `Raise maxTasks in ${params.configPath} or tighten target.scope.exclude.`,
-    );
-  }
-
-  const filteredSurface = {
-    ...surface,
-    snapshot: {
-      ...surface.snapshot,
-      actions: inScope,
-    },
-  };
-
-  console.log('[optimize] Generating candidate tasks...');
-  // Synthesize key=name: SurfaceSnapshotAction omits 'key', but ActionDefinition requires it.
-  // Coverage matching only reads action.name so key=name is always correct here.
-  const inScopeActions = inScope.map((a) => ({ key: a.name, ...a }));
-  const outOfScopeActions = outOfScope.map((a) => ({ key: a.name, ...a }));
-
-  let generated: GeneratedTask[];
-  if (filteredSurface.snapshot.surface === 'prompt') {
-    // Prompt surface: tasks have expected_actions:[] and are evaluated on content.
-    // Coverage enforcement does not apply — skip the retry/coverage-gap loop.
-    generated = await generateCandidateTasks(
-      filteredSurface,
-      { maxTasks: params.maxTasks, seed: params.seed },
-      params.deps,
-    );
-  } else {
-    ({ tasks: generated } = await generateCandidateTasksWithCoverage(
-      filteredSurface,
-      { maxTasks: params.maxTasks, seed: params.seed },
-      params.deps,
-      inScopeActions,
-      outOfScopeActions,
-    ));
-  }
-  console.log(`[optimize] Model proposed ${generated.length} tasks.`);
-
-  console.log('[optimize] Grounding generated tasks against the discovered surface snapshot...');
-  const grounded = groundTasks(generated, filteredSurface.snapshot);
-  if (grounded.kept.length === 0) {
-    throw new Error('Task generation produced zero valid tasks after grounding');
-  }
-
-  // Recompute from kept tasks only — pre-grounding coverage may include rejected tasks.
-  const taskCoverage = computeCoverage(inScopeActions, grounded.kept, outOfScopeActions);
-  console.log(`[optimize] Grounded ${grounded.kept.length} tasks, rejected ${grounded.rejected.length}.`);
-  console.log('[optimize] Benchmark tasks:');
-  for (const task of grounded.kept) {
-    console.log(`  - ${task.id}: ${task.prompt}`);
-  }
-
-  console.log('[optimize] Freezing generated benchmark artifacts...');
-  const artifacts = freezeTaskArtifacts({
-    project: filteredSurface.project,
-    snapshot: filteredSurface.snapshot,
-    outputDir: params.outputDir,
-    kept: grounded.kept,
-    rejected: grounded.rejected,
-  });
-
-  return {
-    surface: filteredSurface,
-    generated,
-    kept: grounded.kept,
-    rejected: grounded.rejected,
-    artifacts,
-    coverage: taskCoverage,
-  };
-}
diff --git a/src/tasks/pi-simple-complete.ts b/src/tasks/pi-simple-complete.ts
deleted file mode 100644
index 12c72e4..0000000
--- a/src/tasks/pi-simple-complete.ts
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Shared helper for tasks that call Pi's completeSimple API.
- *
- * Both the default task-generator and the default critic perform an identical
- * sequence: resolve the model, set up an abort timer, call completeSimple, check
- * for errors, and extract text blocks. This module encapsulates that sequence so
- * each consumer only needs to handle what's unique to it.
- */
-
-import { completeSimple } from '@mariozechner/pi-ai';
-import type { SimpleStreamOptions } from '@mariozechner/pi-ai';
-
-import { resolvePiModel } from '../runtime/pi/index.js';
-import type { PiAuthMode } from '../runtime/pi/auth.js';
-
-interface PiSimpleCompleteOptions {
-  provider: string;
-  model: string;
-  authMode?: PiAuthMode;
-  apiKeyEnv?: string;
-  timeoutMs?: number;
-  headers?: Record<string, string>;
-  reasoning?: NonNullable<SimpleStreamOptions['reasoning']>;
-}
-
-interface PiSimpleCompleteInput {
-  system: string;
-  prompt: string;
-}
-
-export class NoTextBlocksError extends Error {
-  readonly contentTypes: string;
-  constructor(contentTypes: string) {
-    super(`Model returned no text blocks${contentTypes ? ` (content types: ${contentTypes})` : ''}`);
-    this.name = 'NoTextBlocksError';
-    this.contentTypes = contentTypes;
-  }
-}
-
-/**
- * Resolve a Pi model, call completeSimple with a timeout, check for errors,
- * and return the concatenated text from all text blocks.
- *
- * Throws if the model signals an error or if the response contains no text blocks.
- */
-export async function piSimpleComplete(
-  options: PiSimpleCompleteOptions,
-  input: PiSimpleCompleteInput,
-): Promise<string> {
-  const resolved = await resolvePiModel(options.provider, options.model, {
-    authMode: options.authMode,
-    apiKeyEnv: options.apiKeyEnv,
-  });
-
-  const controller = new AbortController();
-  const timeoutMs = options.timeoutMs ?? 120_000;
-  const timer = setTimeout(() => controller.abort(), timeoutMs);
-  timer.unref?.();
-
-  const response = await completeSimple(
-    resolved.model,
-    {
-      systemPrompt: input.system,
-      messages: [{ role: 'user', content: input.prompt, timestamp: Date.now() }],
-    },
-    {
-      signal: controller.signal,
-      apiKey: resolved.auth.apiKey,
-      headers: { ...(resolved.auth.headers ?? {}), ...(options.headers ?? {}) },
-      reasoning: options.reasoning ?? 'minimal',
-    },
-  ).finally(() => clearTimeout(timer));
-
-  if (response.stopReason === 'error') {
-    throw new Error(response.errorMessage ?? 'Model returned stop reason "error" with no message');
-  }
-
-  const text = response.content
-    .filter((block): block is Extract<typeof block, { type: 'text' }> => block.type === 'text')
-    .map((block) => block.text)
-    .join('\n')
-    .trim();
-
-  if (!text) {
-    const contentTypes = response.content.map((b) => b.type).join(', ');
-    throw new NoTextBlocksError(contentTypes);
-  }
-
-  return text;
-}
diff --git a/src/tasks/scope.ts b/src/tasks/scope.ts
deleted file mode 100644
index 78ebf79..0000000
--- a/src/tasks/scope.ts
+++ /dev/null
@@ -1,37 +0,0 @@
-export interface ScopeConfig {
-  include: string[];
-  exclude: string[];
-}
-
-export function matchesGlob(name: string, pattern: string): boolean {
-  // Single operator '*' matches any sequence of characters including separators.
-  const escaped = pattern.replace(/[.+?^${}()|[\]\\]/g, '\\$&').replace(/\*/g, '.*');
-  return new RegExp(`^${escaped}$`).test(name);
-}
-
-function matchesAny(name: string, patterns: string[]): boolean {
-  return patterns.some((p) => matchesGlob(name, p));
-}
-
-export function resolveScope<T extends { name: string }>(
-  actions: T[],
-  scope: ScopeConfig,
-): { inScope: T[]; outOfScope: T[] } {
-  const include = scope.include.length === 0 ? ['*'] : scope.include;
-  const exclude = scope.exclude;
-
-  const inScope: T[] = [];
-  const outOfScope: T[] = [];
-
-  for (const action of actions) {
-    const included = matchesAny(action.name, include);
-    const excluded = exclude.length > 0 && matchesAny(action.name, exclude);
-    if (included && !excluded) {
-      inScope.push(action);
-    } else {
-      outOfScope.push(action);
-    }
-  }
-
-  return { inScope, outOfScope };
-}
diff --git a/src/tasks/types.ts b/src/tasks/types.ts
deleted file mode 100644
index cbd51bb..0000000
--- a/src/tasks/types.ts
+++ /dev/null
@@ -1,44 +0,0 @@
-import type { CoverageReport, ExpectedAction } from '../benchmark/types.js';
-import type { ResolvedProjectConfig, SurfaceSnapshot } from '../project/types.js';
-
-export interface GeneratedTask {
-  id: string;
-  prompt: string;
-  expected_actions: ExpectedAction[];
-  capabilityId?: string;  // prompt surface only; SDK/CLI/MCP don't set this
-}
-
-export interface TaskGeneratorConfig {
-  maxTasks: number;
-  seed: number;
-}
-
-export interface TaskGeneratorDeps {
-  complete(input: { system: string; prompt: string }): Promise<string>;
-}
-
-export interface DiscoveredTaskSurface {
-  project: ResolvedProjectConfig;
-  skillMarkdown: string;
-  skillPath: string;
-  snapshot: SurfaceSnapshot;
-}
-
-export interface GroundedTasksResult {
-  kept: GeneratedTask[];
-  rejected: Array<{ task: GeneratedTask; reason: string }>;
-}
-
-export interface FrozenTaskArtifacts {
-  tasksPath: string;
-  benchmarkPath: string;
-  logPath: string;
-  snapshotPath: string;
-}
-
-export interface GenerateTasksForProjectResult extends GroundedTasksResult {
-  surface: DiscoveredTaskSurface;
-  generated: GeneratedTask[];
-  artifacts: FrozenTaskArtifacts;
-  coverage: CoverageReport;
-}
diff --git a/src/verdict/recommendations.ts b/src/verdict/recommendations.ts
deleted file mode 100644
index 1cb0cbb..0000000
--- a/src/verdict/recommendations.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import type { BenchmarkReport } from '../benchmark/types.js';
-import { buildMutationContext } from '../optimizer/feedback/mutation-context.js';
-
-export interface Recommendation {
-  priority: 'high' | 'medium' | 'low';
-  area: string;
-  action: string;
-  rationale: string;
-}
-
-export interface CriticDeps {
-  complete: (args: { system: string; prompt: string }) => Promise<string>;
-}
-
-export async function generateRecommendations(
-  report: BenchmarkReport,
-  deps: CriticDeps,
-  contextMaxBytes: number = 16_000,
-): Promise<Recommendation[]> {
-  if (!report.verdict || report.verdict.result !== 'FAIL') return [];
-
-  const ctx = buildMutationContext(report, contextMaxBytes);
-  const system = 'You review benchmark failures and produce actionable skill / doc / SDK improvement recommendations. JSON array only.';
-  const prompt = [
-    'Return a JSON array of {priority:"high"|"medium"|"low", area:string, action:string, rationale:string}.',
-    'Focus on concrete edits, not generic advice.',
-    '',
-    `Verdict: FAIL — ${report.verdict.reasons.join('; ')}`,
-    '',
-    ctx.serialized,
-  ].join('\n');
-
-  let raw: string;
-  try {
-    raw = await deps.complete({ system, prompt });
-  } catch {
-    return [];
-  }
-
-  try {
-    const parsed = JSON.parse(raw);
-    if (!Array.isArray(parsed)) return [];
-    return parsed
-      .filter((r) => r && typeof r === 'object')
-      .map((r) => ({
-        priority: (['high', 'medium', 'low'].includes((r as { priority?: string }).priority ?? '')
-          ? (r as { priority: 'high' | 'medium' | 'low' }).priority
-          : 'medium'),
-        area: String((r as { area?: string }).area ?? 'unspecified'),
-        action: String((r as { action?: string }).action ?? ''),
-        rationale: String((r as { rationale?: string }).rationale ?? ''),
-      }))
-      .filter((r) => r.action.length > 0);
-  } catch {
-    return [];
-  }
-}
diff --git a/src/verdict/render.ts b/src/verdict/render.ts
deleted file mode 100644
index b861551..0000000
--- a/src/verdict/render.ts
+++ /dev/null
@@ -1,93 +0,0 @@
-import type { BenchmarkReport, CoverageReport } from '../benchmark/types.js';
-import type { Recommendation } from './recommendations.js';
-
-export function renderVerdictConsole(
-  report: BenchmarkReport,
-  recommendations: Recommendation[],
-): string {
-  const lines: string[] = [];
-  lines.push('');
-  lines.push('=== Verdict ===');
-  if (!report.verdict) {
-    lines.push('No verdict policy configured.');
-    return lines.join('\n');
-  }
-  lines.push(`Result: ${report.verdict.result}`);
-  for (const reason of report.verdict.reasons) {
-    lines.push(`  - ${reason}`);
-  }
-  const cov = renderCoverageBlock(report.scopeCoverage);
-  if (cov) lines.push(cov);
-  if (recommendations.length > 0) {
-    lines.push('');
-    lines.push('Recommendations:');
-    for (const rec of recommendations) {
-      lines.push(`  [${rec.priority}] ${rec.area}: ${rec.action}`);
-      if (rec.rationale) lines.push(`      ${rec.rationale}`);
-    }
-  }
-  return lines.join('\n');
-}
-
-export function renderVerdictMarkdown(
-  report: BenchmarkReport,
-  recommendations: Recommendation[],
-): string {
-  if (!report.verdict) return '';
-  const lines: string[] = [];
-  lines.push('## Verdict');
-  lines.push(`- **Result:** ${report.verdict.result}`);
-  lines.push(`- **Per-model floor:** ${(report.verdict.policy.perModelFloor * 100).toFixed(1)}%`);
-  lines.push(`- **Target weighted average:** ${(report.verdict.policy.targetWeightedAverage * 100).toFixed(1)}%`);
-  if (report.verdict.reasons.length > 0) {
-    lines.push('');
-    lines.push('**Reasons:**');
-    for (const r of report.verdict.reasons) lines.push(`- ${r}`);
-  }
-  const cov = renderCoverageBlockMarkdown(report.scopeCoverage);
-  if (cov) { lines.push(''); lines.push(cov); }
-  if (recommendations.length > 0) {
-    lines.push('');
-    lines.push('## Recommendations');
-    for (const rec of recommendations) {
-      lines.push(`- **[${rec.priority}] ${rec.area}** — ${rec.action}`);
-      if (rec.rationale) lines.push(`  - _${rec.rationale}_`);
-    }
-  }
-  return lines.join('\n');
-}
-
-function renderCoverageBlock(cov?: CoverageReport): string {
-  if (!cov) return '';
-  const total = cov.inScopeActions.length;
-  const covered = cov.coveredActions.length;
-  const pct = total > 0 ? (covered / total) * 100 : 0;
-  const lines = [
-    '',
-    'Surface coverage:',
-    `  In scope:      ${total} action(s)`,
-    `  Out of scope:  ${cov.outOfScopeActions.length} action(s)`,
-    `  Covered:       ${covered} / ${total} (${pct.toFixed(0)}%)`,
-  ];
-  if (cov.uncoveredActions.length > 0) {
-    lines.push(`  Uncovered:     ${cov.uncoveredActions.join(', ')}`);
-  }
-  return lines.join('\n');
-}
-
-function renderCoverageBlockMarkdown(cov?: CoverageReport): string {
-  if (!cov) return '';
-  const total = cov.inScopeActions.length;
-  const covered = cov.coveredActions.length;
-  const pct = total > 0 ? (covered / total) * 100 : 0;
-  const lines: string[] = [
-    '## Coverage',
-    `- In scope: ${total}`,
-    `- Out of scope: ${cov.outOfScopeActions.length}`,
-    `- Covered: ${covered}/${total} (${pct.toFixed(0)}%)`,
-  ];
-  if (cov.uncoveredActions.length > 0) {
-    lines.push(`- Uncovered: ${cov.uncoveredActions.join(', ')}`);
-  }
-  return lines.join('\n');
-}
diff --git a/src/workbench/case-loader.ts b/src/workbench/case-loader.ts
new file mode 100644
index 0000000..5f55653
--- /dev/null
+++ b/src/workbench/case-loader.ts
@@ -0,0 +1,489 @@
+import { existsSync, readFileSync, statSync } from 'node:fs';
+import { dirname, extname, resolve } from 'node:path';
+
+import { parse as parseYaml } from 'yaml';
+
+import { ensureOpenRouterModelRef } from './models.js';
+import type {
+  ResolvedWorkbenchCase,
+  WorkbenchGraderConfig,
+  WorkbenchMcpJsonValue,
+  WorkbenchMcpServerConfig,
+  WorkbenchMcpServersConfig,
+  WorkbenchMcpServiceConfig,
+  WorkbenchMcpServicesConfig,
+} from './types.js';
+
+export const DEFAULT_WORKBENCH_MODEL = 'openrouter/google/gemini-2.5-flash';
+export const DEFAULT_WORKBENCH_TIMEOUT_SECONDS = 600;
+const ENV_NAME_PATTERN = /^[A-Za-z_][A-Za-z0-9_]*$/;
+
+/**
+ * Reads a workbench case file from disk, parses it, and resolves it into a
+ * validated case. The path is made absolute first, and the directory that
+ * contains the file is handed on as the base directory for resolution.
+ *
+ * @throws Error if the file is missing, unreadable, malformed, or invalid.
+ */
+export function loadWorkbenchCase(configPath: string): ResolvedWorkbenchCase {
+  const absolutePath = resolve(configPath);
+  if (!existsSync(absolutePath)) {
+    throw new Error(`Workbench case file not found: ${absolutePath}`);
+  }
+
+  let contents: string;
+  try {
+    contents = readFileSync(absolutePath, 'utf-8');
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`Failed to read workbench case file ${absolutePath}: ${reason}`);
+  }
+  return resolveWorkbenchCaseConfig(parseWorkbenchCase(contents, absolutePath), absolutePath, dirname(absolutePath));
+}
+
+/**
+ * Validates a parsed workbench case object and resolves it into its final form.
+ * The "check" and "artifacts" fields are rejected outright. Remaining fields are
+ * validated in a fixed order, so the first invalid field determines the error.
+ * `model` and `timeoutSeconds` fall back to the module defaults, and `references`
+ * is resolved against `configDir` and must name an existing directory.
+ * @throws Error naming `configPath` and the offending field or path.
+ */
+export function resolveWorkbenchCaseConfig(parsed: Record<string, unknown>, configPath: string, configDir: string): ResolvedWorkbenchCase {
+  const baseDir = resolve(configDir);
+
+  if (parsed.check !== undefined) {
+    throw new Error(`Workbench case ${configPath}: field "check" is invalid; define graders as a non-empty array of { name, command } objects`);
+  }
+  if (parsed.artifacts !== undefined) {
+    throw new Error(`Workbench case ${configPath}: field "artifacts" is invalid; inspect outputs in the workspace or use --keep-workspace`);
+  }
+
+  const name = requireNonEmptyString(parsed, 'name', configPath);
+  const referencesPath = requireNonEmptyString(parsed, 'references', configPath);
+  const task = requireNonEmptyString(parsed, 'task', configPath);
+  const graders = readGraders(parsed, configPath);
+  const mcpServers = readMcpServers(parsed, configPath);
+  const mcpServices = readMcpServices(parsed, configPath);
+  validateMcpServiceServers(mcpServices, mcpServers, configPath);
+  const env = readStringArray(parsed, 'env', configPath);
+  const setup = readStringArray(parsed, 'setup', configPath);
+  const cleanup = readStringArray(parsed, 'cleanup', configPath);
+  const configuredModel = readOptionalString(parsed, 'model', configPath);
+  const model = ensureOpenRouterModelRef(configuredModel ?? DEFAULT_WORKBENCH_MODEL);
+  const timeoutSeconds = readOptionalTimeoutSeconds(parsed, configPath) ?? DEFAULT_WORKBENCH_TIMEOUT_SECONDS;
+
+  const referencesDir = resolve(baseDir, referencesPath);
+  if (!existsSync(referencesDir)) {
+    throw new Error(`Workbench case ${configPath}: references path does not exist: ${referencesDir}`);
+  }
+  if (!statSync(referencesDir).isDirectory()) {
+    throw new Error(`Workbench case ${configPath}: references must resolve to a directory: ${referencesDir}`);
+  }
+
+  return {
+    configPath,
+    configDir: baseDir,
+    name,
+    referencesDir,
+    task,
+    graders,
+    mcpServers,
+    mcpServices,
+    env,
+    setup,
+    cleanup,
+    model,
+    timeoutSeconds,
+  };
+}
+
+/**
+ * Reads the optional "mcpServices" object into a map of validated services keyed
+ * by trimmed service name. Returns an empty map when the field is absent.
+ * @throws Error if the field, a service name, or a service entry is invalid.
+ */
+export function readMcpServices(parsed: Record<string, unknown>, configPath: string): WorkbenchMcpServicesConfig {
+  const rawServices = parsed.mcpServices;
+  if (rawServices === undefined) return {};
+  if (!isPlainObject(rawServices)) {
+    throw new Error(`Workbench case ${configPath}: field "mcpServices" must be an object`);
+  }
+
+  const result: WorkbenchMcpServicesConfig = {};
+  Object.entries(rawServices).forEach(([key, entry]) => {
+    const serviceName = key.trim();
+    if (serviceName === '') {
+      throw new Error(`Workbench case ${configPath}: field "mcpServices" service names must be non-empty strings`);
+    }
+    if (!isPlainObject(entry)) {
+      throw new Error(`Workbench case ${configPath}: field "mcpServices" service "${serviceName}" must be an object`);
+    }
+    result[serviceName] = readMcpService(entry, serviceName, configPath);
+  });
+
+  return result;
+}
+
+/**
+ * Validates one "mcpServices" entry: a required `command` plus optional `args`
+ * (default: empty list). `port` is rejected; the message points to the mcpServers URL.
+ */
+function readMcpService(parsed: Record<string, unknown>, name: string, configPath: string): WorkbenchMcpServiceConfig {
+  const command = readMcpServiceString(parsed.command, 'command', name, configPath);
+  let args: string[] = [];
+  if (parsed.args !== undefined) {
+    args = readMcpServiceStringArray(parsed.args, 'args', name, configPath);
+  }
+  if (parsed.port !== undefined) {
+    throw new Error(`Workbench case ${configPath}: field "mcpServices" service "${name}" port is not supported; set the port in the matching mcpServers URL`);
+  }
+  return { command, args };
+}
+
+/**
+ * Ensures every "mcpServices" entry has a matching "mcpServers" entry.
+ * Uses an own-property check so inherited names such as "toString" or
+ * "constructor" cannot satisfy the lookup through Object.prototype.
+ */
+function validateMcpServiceServers(services: WorkbenchMcpServicesConfig, servers: WorkbenchMcpServersConfig, configPath: string): void {
+  for (const name of Object.keys(services)) {
+    if (!Object.prototype.hasOwnProperty.call(servers, name) || servers[name] === undefined) {
+      throw new Error(`Workbench case ${configPath}: field "mcpServices" service "${name}" requires a matching "mcpServers" entry`);
+    }
+  }
+}
+
+/**
+ * Reads the optional "mcpServers" object into a map of validated server
+ * configs keyed by trimmed server name. Returns an empty map when absent.
+ * @throws Error if the field, a server name, or a server entry is invalid.
+ */
+export function readMcpServers(parsed: Record<string, unknown>, configPath: string): WorkbenchMcpServersConfig {
+  const rawServers = parsed.mcpServers;
+  if (rawServers === undefined) return {};
+  if (!isPlainObject(rawServers)) {
+    throw new Error(`Workbench case ${configPath}: field "mcpServers" must be an object`);
+  }
+
+  const result: WorkbenchMcpServersConfig = {};
+  for (const [key, entry] of Object.entries(rawServers)) {
+    const serverName = key.trim();
+    if (serverName === '') {
+      throw new Error(`Workbench case ${configPath}: field "mcpServers" server names must be non-empty strings`);
+    }
+    if (!isPlainObject(entry)) {
+      throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${serverName}" must be an object`);
+    }
+    result[serverName] = readMcpServer(entry, serverName, configPath);
+  }
+
+  return result;
+}
+
+/**
+ * Validates a single "mcpServers" entry.
+ *
+ * Known string, string-array, and string-record (`env`, `headers`) fields are
+ * type-checked; any other key is kept after being copied as a JSON-compatible
+ * value. Afterwards the entry is rejected if it uses auth "oauth", has no
+ * non-empty url/baseUrl/serverUrl/command, or sets both an allowed-tools and a
+ * blocked-tools list.
+ * @throws Error naming the config path and server when any check fails.
+ */
+function readMcpServer(parsed: Record<string, unknown>, name: string, configPath: string): WorkbenchMcpServerConfig {
+  const isNonEmptyString = (candidate: unknown): boolean => typeof candidate === 'string' && candidate.trim() !== '';
+  const server: WorkbenchMcpServerConfig = {};
+
+  for (const [key, value] of Object.entries(parsed)) {
+    if (isMcpStringField(key)) {
+      server[key] = readMcpString(value, key, name, configPath);
+    } else if (isMcpStringArrayField(key)) {
+      server[key] = readMcpStringArray(value, key, name, configPath);
+    } else if (key === 'env' || key === 'headers') {
+      server[key] = readMcpStringRecord(value, key, name, configPath);
+    } else {
+      server[key] = cloneMcpJsonValue(value, key, name, configPath);
+    }
+  }
+
+  if (server.auth === 'oauth') {
+    throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${name}" auth "oauth" is not supported; use non-interactive headers or env credentials`);
+  }
+
+  const urlPresent = isNonEmptyString(server.url) || isNonEmptyString(server.baseUrl) || isNonEmptyString(server.serverUrl);
+  if (!urlPresent && !isNonEmptyString(server.command)) {
+    throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${name}" must define a non-empty url, baseUrl, serverUrl, or command`);
+  }
+
+  const restrictsToAllowed = Boolean(server.allowedTools || server.allowed_tools);
+  const restrictsByBlocking = Boolean(server.blockedTools || server.blocked_tools);
+  if (restrictsToAllowed && restrictsByBlocking) {
+    throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${name}" cannot define both allowedTools and blockedTools`);
+  }
+
+  return server;
+}
+
+/** Type guard: true for any non-null, non-array value whose `typeof` is 'object'. */
+const isPlainObject = (value: unknown): value is Record<string, unknown> =>
+  typeof value === 'object' && value !== null && !Array.isArray(value);
+
+/**
+ * Reports whether `key` is one of the "mcpServers" fields read as a non-empty string.
+ */
+function isMcpStringField(key: string): boolean {
+  const stringFields = new Set<string>([
+    'description',
+    'baseUrl', 'url', 'serverUrl',
+    'command', 'auth',
+    'tokenCacheDir', 'clientName',
+    'oauthRedirectUrl', 'oauthScope',
+  ]);
+
+  return stringFields.has(key);
+}
+
+/** Reports whether `key` is one of the "mcpServers" fields read as an array of strings. */
+const isMcpStringArrayField = (key: string): boolean =>
+  new Set(['args', 'allowedTools', 'allowed_tools', 'blockedTools', 'blocked_tools']).has(key);
+
+/**
+ * Returns the trimmed `value` of an "mcpServers" string field.
+ * @throws Error if `value` is not a string or is blank after trimming.
+ */
+function readMcpString(value: unknown, field: string, serverName: string, configPath: string): string {
+  const trimmed = typeof value === 'string' ? value.trim() : '';
+  if (trimmed === '') {
+    throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${serverName}" ${field} must be a non-empty string`);
+  }
+  return trimmed;
+}
+
+/**
+ * Returns the trimmed `value` of an "mcpServices" string field.
+ * @throws Error if `value` is not a string or is blank after trimming.
+ */
+function readMcpServiceString(value: unknown, field: string, serviceName: string, configPath: string): string {
+  if (typeof value === 'string' && value.trim() !== '') {
+    return value.trim();
+  }
+
+  throw new Error(`Workbench case ${configPath}: field "mcpServices" service "${serviceName}" ${field} must be a non-empty string`);
+}
+
+/**
+ * Returns a trimmed copy of an "mcpServers" string-array field.
+ * @throws Error if `value` is not an array or any item is not a non-blank string.
+ */
+function readMcpStringArray(value: unknown, field: string, serverName: string, configPath: string): string[] {
+  if (!Array.isArray(value)) {
+    throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${serverName}" ${field} must be an array of non-empty strings`);
+  }
+  return value.map((item: unknown, index: number): string => {
+    const text = typeof item === 'string' ? item.trim() : '';
+    if (text === '') {
+      throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${serverName}" ${field} item at index ${index} must be a non-empty string`);
+    }
+    return text;
+  });
+}
+
+/**
+ * Returns a trimmed copy of an "mcpServices" string-array field.
+ * @throws Error if `value` is not an array or any item is not a non-blank string.
+ */
+function readMcpServiceStringArray(value: unknown, field: string, serviceName: string, configPath: string): string[] {
+  if (!Array.isArray(value)) {
+    throw new Error(`Workbench case ${configPath}: field "mcpServices" service "${serviceName}" ${field} must be an array of non-empty strings`);
+  }
+  return value.map((item: unknown, index: number): string => {
+    const text = typeof item === 'string' ? item.trim() : '';
+    if (text === '') {
+      throw new Error(`Workbench case ${configPath}: field "mcpServices" service "${serviceName}" ${field} item at index ${index} must be a non-empty string`);
+    }
+    return text;
+  });
+}
+
+/**
+ * Reads an "mcpServers" record field into a string-to-string map.
+ * Keys are trimmed; values are copied unchanged.
+ * @throws Error if `value` is not an object, a key is blank, or a value is not a string.
+ */
+function readMcpStringRecord(value: unknown, field: string, serverName: string, configPath: string): Record<string, string> {
+  if (!isPlainObject(value)) {
+    throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${serverName}" ${field} must be an object of string values`);
+  }
+
+  const result: Record<string, string> = {};
+  for (const [rawKey, rawValue] of Object.entries(value)) {
+    const trimmedKey = rawKey.trim();
+    if (typeof rawValue !== 'string' || trimmedKey === '') {
+      throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${serverName}" ${field} entries must have non-empty string keys and string values`);
+    }
+    result[trimmedKey] = rawValue;
+  }
+  return result;
+}
+
+/**
+ * Deep-copies `value`, accepting only JSON-compatible data: null, strings,
+ * booleans, finite numbers, arrays, and objects built from those.
+ * `field` and `serverName` are used only to label the error message.
+ * @throws Error if any nested value is something else (e.g. NaN or undefined).
+ */
+function cloneMcpJsonValue(value: unknown, field: string, serverName: string, configPath: string): WorkbenchMcpJsonValue {
+  const recurse = (item: unknown): WorkbenchMcpJsonValue => cloneMcpJsonValue(item, field, serverName, configPath);
+
+  if (value === null || typeof value === 'string' || typeof value === 'boolean') {
+    return value;
+  }
+  if (typeof value === 'number' && Number.isFinite(value)) {
+    return value;
+  }
+  if (Array.isArray(value)) {
+    return value.map((item) => recurse(item));
+  }
+  if (isPlainObject(value)) {
+    const copy: Record<string, WorkbenchMcpJsonValue> = {};
+    for (const [key, nested] of Object.entries(value)) {
+      copy[key] = recurse(nested);
+    }
+    return copy;
+  }
+
+  throw new Error(`Workbench case ${configPath}: field "mcpServers" server "${serverName}" ${field} must be JSON-compatible`);
+}
+
+/**
+ * Parses raw case-file text as JSON (.json) or YAML (.yml/.yaml), chosen by the
+ * file extension, and returns the root object.
+ *
+ * Only syntax errors are reported as "Invalid JSON/YAML"; a well-formed document
+ * whose root is not an object surfaces the root-object error unwrapped.
+ *
+ * @throws Error for an unsupported extension, a syntax error, or a non-object root.
+ */
+function parseWorkbenchCase(raw: string, configPath: string): Record<string, unknown> {
+  const extension = extname(configPath).toLowerCase();
+  const parser = extension === '.json' ? 'JSON' : extension === '.yml' || extension === '.yaml' ? 'YAML' : undefined;
+  if (parser === undefined) {
+    throw new Error(`Unsupported workbench case file extension for ${configPath}. Expected .json, .yml, or .yaml.`);
+  }
+
+  let parsed: unknown;
+  try {
+    parsed = parser === 'JSON' ? JSON.parse(raw) : parseYaml(raw);
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`Invalid ${parser} in workbench case file ${configPath}: ${reason}`);
+  }
+  // Validate the root outside the try block so this error is not mislabelled as a syntax error.
+  ensurePlainObject(parsed, configPath);
+  return parsed;
+}
+
+/** Asserts that `value` is a non-null, non-array object; otherwise throws an Error naming `configPath`. */
+function ensurePlainObject(value: unknown, configPath: string): asserts value is Record<string, unknown> {
+  const isObject = typeof value === 'object' && value !== null && !Array.isArray(value);
+  if (!isObject) throw new Error(`Workbench case file ${configPath} must contain an object at the root`);
+}
+
+/**
+ * Returns the trimmed value of a required string field; throws an Error if it is absent, non-string, or blank.
+ */
+function requireNonEmptyString(parsed: Record<string, unknown>, field: 'name' | 'references' | 'task', configPath: string): string {
+  const raw = parsed[field];
+  const trimmed = typeof raw === 'string' ? raw.trim() : '';
+  if (trimmed === '') {
+    throw new Error(`Workbench case ${configPath}: field "${field}" must be a non-empty string`);
+  }
+  return trimmed;
+}
+
+/**
+ * Reads the required "graders" array, returning `{ name, command }` for each item.
+ * @throws Error if the field is not a non-empty array or an item is not an object with both strings.
+ */
+function readGraders(parsed: Record<string, unknown>, configPath: string): WorkbenchGraderConfig[] {
+  const rawGraders = parsed.graders;
+  if (!Array.isArray(rawGraders) || rawGraders.length === 0) {
+    throw new Error(`Workbench case ${configPath}: field "graders" must be a non-empty array`);
+  }
+  return rawGraders.map((entry: unknown, index: number): WorkbenchGraderConfig => {
+    if (entry === null || typeof entry !== 'object' || Array.isArray(entry)) {
+      throw new Error(`Workbench case ${configPath}: field "graders" item at index ${index} must be an object`);
+    }
+    const { name, command } = entry as Record<string, unknown>;
+    return { name: readGraderString(name, 'name', index, configPath), command: readGraderString(command, 'command', index, configPath) };
+  });
+}
+
+/**
+ * Returns the trimmed `field` string of the grader at `index`.
+ * @throws Error if `value` is not a string or is blank after trimming.
+ */
+function readGraderString(value: unknown, field: keyof WorkbenchGraderConfig, index: number, configPath: string): string {
+  const trimmed = typeof value === 'string' ? value.trim() : '';
+  if (trimmed !== '') {
+    return trimmed;
+  }
+  throw new Error(
+    `Workbench case ${configPath}: field "graders" item at index ${index} ${field} must be a non-empty string`,
+  );
+}
+
+/**
+ * Returns the trimmed value of an optional string field, or `undefined` when the field is absent.
+ * @throws Error if the field is present but is not a string or is blank.
+ */
+function readOptionalString(parsed: Record<string, unknown>, field: 'model', configPath: string): string | undefined {
+  const raw = parsed[field];
+  if (raw === undefined) {
+    return undefined;
+  }
+  if (typeof raw === 'string' && raw.trim() !== '') {
+    return raw.trim();
+  }
+  throw new Error(`Workbench case ${configPath}: field "${field}" must be a non-empty string when provided`);
+}
+
+/** Returns the optional "timeoutSeconds" (finite and > 0), or `undefined` when absent; throws an Error otherwise. */
+function readOptionalTimeoutSeconds(parsed: Record<string, unknown>, configPath: string): number | undefined {
+  const { timeoutSeconds } = parsed;
+  if (timeoutSeconds === undefined) return undefined;
+
+  if (typeof timeoutSeconds === 'number' && Number.isFinite(timeoutSeconds) && timeoutSeconds > 0) {
+    return timeoutSeconds;
+  }
+  throw new Error(`Workbench case ${configPath}: field "timeoutSeconds" must be a positive number when provided`);
+}
+
+/**
+ * Reads an optional list field ("env", "setup", or "cleanup") as trimmed,
+ * non-blank strings; an absent field yields an empty list. Items of "env"
+ * must additionally match ENV_NAME_PATTERN.
+ *
+ * @throws Error if the field is not an array or an item fails validation.
+ */
+function readStringArray(parsed: Record<string, unknown>, field: 'env' | 'setup' | 'cleanup', configPath: string): string[] {
+  const raw = parsed[field];
+  if (raw === undefined) {
+    return [];
+  }
+  if (!Array.isArray(raw)) {
+    throw new Error(`Workbench case ${configPath}: field "${field}" must be an array of non-empty strings`);
+  }
+
+  const mustBeEnvName = field === 'env';
+  return raw.map((entry: unknown, index: number): string => {
+    const trimmed = typeof entry === 'string' ? entry.trim() : '';
+    if (trimmed === '') {
+      throw new Error(`Workbench case ${configPath}: field "${field}" item at index ${index} must be a non-empty string`);
+    }
+    if (mustBeEnvName && !ENV_NAME_PATTERN.test(trimmed)) {
+      throw new Error(`Workbench case ${configPath}: field "env" item at index ${index} must match ^[A-Za-z_][A-Za-z0-9_]*$`);
+    }
+    return trimmed;
+  });
+}
diff --git a/src/workbench/check-runner.ts b/src/workbench/check-runner.ts
new file mode 100644
index 0000000..d940cab
--- /dev/null
+++ b/src/workbench/check-runner.ts
@@ -0,0 +1,143 @@
+import { runShellCommand } from './process.js';
+import type { ProcessResult } from './process.js';
+import type { WorkbenchGrade, WorkbenchGraderConfig, WorkbenchGraderResult } from './types.js';
+
+/**
+ * Extracts a JSON object from command output that may also contain other text.
+ *
+ * The span from the first `{` to the last `}` is tried first. When that span
+ * is not valid JSON (for example because the command printed two objects, or
+ * trailing text containing a brace), the text is scanned for the earliest
+ * balanced `{ ... }` region that parses as a JSON object.
+ *
+ * @param stdout - Raw standard output of the command.
+ * @returns The parsed object, or `null` when no JSON object can be found.
+ *   Arrays and primitives are never returned.
+ */
+function parseFirstJsonObject(stdout: string): Record<string, unknown> | null {
+  const parseObject = (text: string): Record<string, unknown> | null => {
+    try {
+      const parsed: unknown = JSON.parse(text);
+      if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
+        return null;
+      }
+      return parsed as Record<string, unknown>;
+    } catch {
+      return null;
+    }
+  };
+
+  const firstBrace = stdout.indexOf('{');
+  const lastBrace = stdout.lastIndexOf('}');
+  if (firstBrace < 0 || lastBrace <= firstBrace) {
+    return null;
+  }
+
+  const wholeSpan = parseObject(stdout.slice(firstBrace, lastBrace + 1));
+  if (wholeSpan !== null) {
+    return wholeSpan;
+  }
+
+  // Each candidate start costs one linear scan; cap the attempts so output
+  // with many stray braces cannot make this quadratic.
+  const maxCandidates = 32;
+  let start = firstBrace;
+  for (let attempt = 0; attempt < maxCandidates && start >= 0 && start < lastBrace; attempt += 1) {
+    let depth = 0;
+    let inString = false;
+    let escaped = false;
+    for (let index = start; index <= lastBrace; index += 1) {
+      const char = stdout[index];
+      if (inString) {
+        // Braces inside JSON strings must not affect nesting depth.
+        if (escaped) {
+          escaped = false;
+        } else if (char === '\\') {
+          escaped = true;
+        } else if (char === '"') {
+          inString = false;
+        }
+        continue;
+      }
+
+      if (char === '"') {
+        inString = true;
+      } else if (char === '{') {
+        depth += 1;
+      } else if (char === '}') {
+        depth -= 1;
+        if (depth === 0) {
+          const candidate = parseObject(stdout.slice(start, index + 1));
+          if (candidate !== null) {
+            return candidate;
+          }
+          break;
+        }
+      }
+    }
+    start = stdout.indexOf('{', start + 1);
+  }
+
+  return null;
+}
+
+/**
+ * Converts a grader's `evidence` value into a list of display strings.
+ *
+ * - `undefined` / `null` produce an empty list.
+ * - An array produces one string per element.
+ * - Any other value produces a single-element list.
+ *
+ * Objects (including nested arrays) are rendered as JSON rather than through
+ * `String()`, which would collapse them to the uninformative
+ * `[object Object]`.
+ *
+ * @param evidence - Value taken from the grader's JSON output.
+ * @returns Evidence lines, possibly empty.
+ */
+function normalizeEvidence(evidence: unknown): string[] {
+  const toText = (value: unknown): string => {
+    if (typeof value === 'string') {
+      return value;
+    }
+    if (value !== null && typeof value === 'object') {
+      try {
+        const json = JSON.stringify(value);
+        if (typeof json === 'string') {
+          return json;
+        }
+      } catch {
+        // Not serialisable (e.g. circular); fall back to String() below.
+      }
+    }
+    return String(value);
+  };
+
+  if (evidence === undefined || evidence === null) {
+    return [];
+  }
+
+  return Array.isArray(evidence) ? evidence.map((item) => toText(item)) : [toText(evidence)];
+}
+
+/**
+ * Resolves the score reported by a grader.
+ *
+ * @param score - Raw `score` value from the grader output.
+ * @param pass - Whether the check passed; decides the default score.
+ * @returns `score` limited to the range 0..1 when it is a finite number,
+ *   otherwise 1 for a pass and 0 for a failure.
+ */
+function clampScore(score: unknown, pass: boolean): number {
+  const isUsable = typeof score === 'number' && Number.isFinite(score);
+  if (isUsable) {
+    return Math.min(1, Math.max(0, score as number));
+  }
+
+  return pass ? 1 : 0;
+}
+
+/**
+ * Turns the raw outcome of a check command into a grade.
+ *
+ * Precedence:
+ * 1. A timed-out command always fails with score 0. Its evidence starts with
+ *    a timeout notice, followed by the JSON `evidence` (or the raw stdout when
+ *    no JSON object was printed), followed by stderr.
+ * 2. Otherwise, when stdout contains a JSON object, `pass`, `score` and
+ *    `evidence` are taken from it; a missing or non-boolean `pass` falls back
+ *    to the exit code.
+ * 3. Otherwise the exit code decides: exit 0 passes with stdout as evidence,
+ *    any other exit fails with stderr then stdout as evidence.
+ *
+ * @param result - Process result of the check command.
+ * @returns The normalized grade.
+ */
+export function normalizeCheckResult(result: ProcessResult): WorkbenchGrade {
+  const stdoutText = result.stdout.trim();
+  const stderrText = result.stderr.trim();
+  const report = parseFirstJsonObject(result.stdout);
+
+  if (result.timedOut === true) {
+    const lines = ['check command timed out'];
+    if (report !== null) {
+      lines.push(...normalizeEvidence(report.evidence));
+    } else if (stdoutText.length > 0) {
+      lines.push(stdoutText);
+    }
+    if (stderrText.length > 0) {
+      lines.push(stderrText);
+    }
+    return { pass: false, score: 0, evidence: lines };
+  }
+
+  if (report !== null) {
+    const pass = typeof report.pass === 'boolean' ? report.pass : result.exitCode === 0;
+    return {
+      pass,
+      score: clampScore(report.score, pass),
+      evidence: normalizeEvidence(report.evidence),
+    };
+  }
+
+  if (result.exitCode !== 0) {
+    // stderr is listed before stdout for failures.
+    return {
+      pass: false,
+      score: 0,
+      evidence: [stderrText, stdoutText].filter((text) => text.length > 0),
+    };
+  }
+
+  return {
+    pass: true,
+    score: 1,
+    evidence: stdoutText.length > 0 ? [stdoutText] : [],
+  };
+}
+
+/**
+ * Runs one check command through the shell and grades its outcome.
+ *
+ * @param command - Shell command to execute.
+ * @param opts - Working directory, optional environment and optional timeout,
+ *   forwarded unchanged to `runShellCommand`.
+ * @returns The grade derived by `normalizeCheckResult`.
+ */
+export async function runCheckCommand(
+  command: string,
+  opts: { cwd: string; env?: NodeJS.ProcessEnv; timeoutSeconds?: number },
+): Promise<WorkbenchGrade> {
+  return normalizeCheckResult(await runShellCommand(command, opts));
+}
+
+/**
+ * Runs every grader command in order and combines the individual grades.
+ *
+ * The combined grade passes only when at least one grader ran and all of them
+ * passed; the score is the fraction of graders that passed. An empty grader
+ * list therefore fails with score 0 instead of reporting a vacuous pass next
+ * to a zero score.
+ *
+ * @param graders - Graders to execute, one after another.
+ * @param opts - Working directory, optional environment and optional timeout
+ *   applied to each grader command.
+ * @returns The aggregate grade, including the per-grader results.
+ */
+export async function runGraderCommands(
+  graders: WorkbenchGraderConfig[],
+  opts: { cwd: string; env?: NodeJS.ProcessEnv; timeoutSeconds?: number },
+): Promise<WorkbenchGrade> {
+  const results: WorkbenchGraderResult[] = [];
+
+  for (const grader of graders) {
+    const grade = await runCheckCommand(grader.command, opts);
+    results.push({
+      ...grade,
+      name: grader.name,
+      command: grader.command,
+    });
+  }
+
+  const passed = results.filter((result) => result.pass).length;
+  const evidence = results.flatMap((result) => {
+    // A grader without evidence still gets one line so its verdict is visible.
+    if (result.evidence.length === 0) {
+      return [`${result.name}: ${result.pass ? 'PASS' : 'FAIL'}`];
+    }
+
+    return result.evidence.map((line) => `${result.name}: ${line}`);
+  });
+
+  if (results.length === 0) {
+    return {
+      pass: false,
+      score: 0,
+      evidence: ['no graders were configured'],
+      graders: results,
+    };
+  }
+
+  return {
+    pass: passed === results.length,
+    score: passed / results.length,
+    evidence,
+    graders: results,
+  };
+}
diff --git a/src/workbench/cli-args.ts b/src/workbench/cli-args.ts
new file mode 100644
index 0000000..03735d9
--- /dev/null
+++ b/src/workbench/cli-args.ts
@@ -0,0 +1,43 @@
+/**
+ * Looks up the value that follows a flag in an argument list.
+ *
+ * @param args - Command-line arguments.
+ * @param flag - Flag to search for, for example `--model`.
+ * @returns The argument right after the first occurrence of `flag`, or
+ *   `undefined` when the flag is not present.
+ * @throws Error when the flag is present but the next argument is missing,
+ *   empty, or starts with `--`.
+ */
+export function getFlag(args: string[], flag: string): string | undefined {
+  const flagPosition = args.indexOf(flag);
+  if (flagPosition === -1) {
+    return undefined;
+  }
+
+  const candidate = args[flagPosition + 1];
+  if (candidate === undefined || candidate === '' || candidate.startsWith('--')) {
+    throw new Error(`Flag ${flag} requires a value`);
+  }
+
+  return candidate;
+}
+
+/**
+ * Collects the positional (non-flag) arguments from an argument list.
+ *
+ * @param args - Command-line arguments.
+ * @param options - `valueFlags` are skipped together with the argument that
+ *   follows them; `booleanFlags` are skipped on their own.
+ * @returns The remaining arguments in their original order. Empty arguments
+ *   are dropped.
+ * @throws Error when an argument starts with `--` but is in neither flag list.
+ */
+export function positionals(args: string[], options: { valueFlags: string[]; booleanFlags?: string[] }): string[] {
+  const takesValue = new Set(options.valueFlags);
+  const standalone = new Set(options.booleanFlags ?? []);
+  const collected: string[] = [];
+
+  let cursor = 0;
+  while (cursor < args.length) {
+    const token = args[cursor];
+    cursor += 1;
+
+    if (!token || standalone.has(token)) {
+      continue;
+    }
+
+    if (takesValue.has(token)) {
+      // Also step over the flag's value.
+      cursor += 1;
+      continue;
+    }
+
+    if (token.startsWith('--')) {
+      throw new Error(`Unknown flag: ${token}`);
+    }
+
+    collected.push(token);
+  }
+
+  return collected;
+}
diff --git a/src/workbench/container-runner.ts b/src/workbench/container-runner.ts
new file mode 100644
index 0000000..742183b
--- /dev/null
+++ b/src/workbench/container-runner.ts
@@ -0,0 +1,592 @@
+import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { dirname, join } from 'node:path';
+
+import { runGraderCommands } from './check-runner.js';
+import { loadWorkbenchCase } from './case-loader.js';
+import { buildWorkbenchMetrics } from './metrics.js';
+import { createWorkbenchPiSession } from './pi-agent.js';
+import { runShellCommand } from './process.js';
+import { buildAgentSystemPrompt } from './sandbox.js';
+import { buildWorkbenchTrace, createTraceRecorder } from './trace.js';
+import type { TraceRecorder } from './trace.js';
+import type { WorkbenchGrade, WorkbenchResult, WorkbenchTrace, WorkbenchTraceEntry } from './types.js';
+import { isRecord, writeJsonFile } from './utils.js';
+import { buildWorkbenchEnv, prepareWorkbenchDirectory } from './workspace.js';
+
+export { prepareWorkbenchDirectory } from './workspace.js';
+export { buildAgentSystemPrompt } from './sandbox.js';
+
+/**
+ * Minimal structural view of the agent session that this runner relies on.
+ * Only `prompt` is required; the other members are optional and are accessed
+ * defensively by the code in this file.
+ */
+interface PromptSession {
+  /** Sends a prompt to the agent; the resolved value is not inspected here. */
+  prompt(prompt: string): Promise<unknown>;
+  /** System prompt reported by the session, when it exposes one. */
+  systemPrompt?: string;
+  /** Registers an event listener; the returned function is called by this file to stop listening. */
+  subscribe?: (listener: (event: unknown) => void) => () => void;
+  dispose?: () => void;
+  /** Session state; `messages` is read after a prompt to inspect the last message. */
+  state?: {
+    messages?: unknown[];
+  };
+}
+
+/** Arguments for `--agent` mode, as produced by `parseContainerRunnerArgs`. */
+interface AgentRunnerArgs {
+  mode: 'agent';
+  caseName: string;
+  model: string;
+  /** Task prompt, decoded from the `--task-base64` flag. */
+  task: string;
+  /** Extra system prompt text, decoded from `--append-system-prompt-base64` when given. */
+  appendSystemPrompt?: string;
+  /** Time limit for the agent prompt, in seconds; validated as a positive finite number. */
+  timeoutSeconds: number;
+  workDir: string;
+  resultsDir: string;
+  /** Value of the optional `--mcp-config` flag. */
+  mcpConfigPath?: string;
+}
+
+/** Arguments for `--grade` mode. */
+interface GradeRunnerArgs {
+  mode: 'grade';
+  casePath: string;
+  workDir: string;
+  resultsDir: string;
+}
+
+/** Arguments for `--setup` mode; unlike the other modes it has no results directory. */
+interface SetupRunnerArgs {
+  mode: 'setup';
+  casePath: string;
+  workDir: string;
+}
+
+/** Parsed runner arguments, discriminated by `mode`. */
+export type ContainerRunnerArgs = AgentRunnerArgs | GradeRunnerArgs | SetupRunnerArgs;
+
+/**
+ * Builds the environment for commands run by the container runner.
+ *
+ * @param params - Case file path, work directory, results directory and an
+ *   optional base environment.
+ * @returns The result of `buildWorkbenchEnv`, called with the directory that
+ *   contains the case file as `caseDir`.
+ */
+export function buildContainerWorkbenchEnv(params: {
+  casePath: string;
+  workDir: string;
+  resultsDir: string;
+  baseEnv?: NodeJS.ProcessEnv;
+}): NodeJS.ProcessEnv {
+  const { casePath, workDir, resultsDir, baseEnv } = params;
+  const caseDir = dirname(casePath);
+  return buildWorkbenchEnv({ caseDir, workDir, resultsDir, baseEnv });
+}
+
+/**
+ * Parses the container runner's command line into one of its three modes.
+ *
+ * Mode selection is checked in this order: `--agent`, then `--setup`, then
+ * `--grade`.
+ *
+ * @param args - Command-line arguments without the node/script prefix.
+ * @returns The parsed arguments, discriminated by `mode`.
+ * @throws Error with a usage message when a required flag for the selected
+ *   mode is missing or the timeout is not a positive finite number, and an
+ *   error from `getFlagValue` when a flag is present without a value.
+ */
+export function parseContainerRunnerArgs(args: string[]): ContainerRunnerArgs {
+  const decodeBase64 = (encoded: string): string => Buffer.from(encoded, 'base64').toString('utf-8');
+  const workDir = getFlagValue(args, '--work');
+  const resultsDir = getFlagValue(args, '--results');
+
+  if (args.includes('--agent')) {
+    const caseName = getFlagValue(args, '--case-name');
+    const model = getFlagValue(args, '--model');
+    const encodedTask = getFlagValue(args, '--task-base64');
+    const encodedAppendPrompt = getFlagValue(args, '--append-system-prompt-base64');
+    const mcpConfigPath = getFlagValue(args, '--mcp-config');
+    // A missing flag yields Number(undefined) === NaN, which fails validation below.
+    const timeoutSeconds = Number(getFlagValue(args, '--timeout-seconds'));
+    const timeoutIsValid = Number.isFinite(timeoutSeconds) && timeoutSeconds > 0;
+
+    if (!caseName || !model || !encodedTask || !timeoutIsValid || !workDir || !resultsDir) {
+      throw new Error('Usage: container-runner --agent --case-name <name> --model <model> --task-base64 <task> --timeout-seconds <seconds> --work <path> --results <path>');
+    }
+
+    return {
+      mode: 'agent',
+      caseName,
+      model,
+      task: decodeBase64(encodedTask),
+      appendSystemPrompt: encodedAppendPrompt ? decodeBase64(encodedAppendPrompt) : undefined,
+      timeoutSeconds,
+      workDir,
+      resultsDir,
+      mcpConfigPath,
+    };
+  }
+
+  const casePath = getFlagValue(args, '--case');
+
+  if (args.includes('--setup')) {
+    if (casePath && workDir) {
+      return { mode: 'setup', casePath, workDir };
+    }
+    throw new Error('Usage: container-runner --setup --case <path> --work <path>');
+  }
+
+  if (args.includes('--grade') && casePath && workDir && resultsDir) {
+    return { mode: 'grade', casePath, workDir, resultsDir };
+  }
+
+  throw new Error('Usage: container-runner --agent ... or --setup --case <path> --work <path> or --grade --case <path> --work <path> --results <path>');
+}
+
+/**
+ * Looks up the value that follows a flag in the runner's argument list.
+ *
+ * @param args - Command-line arguments.
+ * @param flag - Flag to search for.
+ * @returns The argument right after the first occurrence of `flag`, or
+ *   `undefined` when the flag is not present.
+ * @throws Error when the flag is present but the next argument is missing,
+ *   empty, or starts with `--`.
+ */
+function getFlagValue(args: string[], flag: string): string | undefined {
+  const flagPosition = args.indexOf(flag);
+  if (flagPosition === -1) {
+    return undefined;
+  }
+
+  const candidate = args[flagPosition + 1];
+  if (candidate === undefined || candidate === '' || candidate.startsWith('--')) {
+    throw new Error(`Flag ${flag} requires a value`);
+  }
+
+  return candidate;
+}
+
+/**
+ * Sends a prompt to the session and waits for it, bounded by a timeout.
+ *
+ * The prompt is raced against a timer. This function does not cancel the
+ * underlying prompt when the timer wins; it only stops waiting for it.
+ *
+ * After the prompt settles, the last session message is inspected: an
+ * assistant message whose `stopReason` is `error` or `aborted` is turned into
+ * a thrown error.
+ *
+ * @param session - Session to prompt.
+ * @param prompt - Prompt text.
+ * @param timeoutSeconds - Maximum time to wait, in seconds.
+ * @throws Error when the timeout elapses first, when the prompt rejects, or
+ *   when the last assistant message reports an error or abort.
+ */
+export async function runAgentPromptWithTimeout(
+  session: PromptSession,
+  prompt: string,
+  timeoutSeconds: number,
+): Promise<void> {
+  let timer: NodeJS.Timeout | undefined;
+  try {
+    const pending = session.prompt(prompt);
+    const deadline = new Promise<never>((_resolve, reject) => {
+      timer = setTimeout(() => {
+        reject(new Error(`Agent timed out after ${timeoutSeconds} seconds`));
+      }, timeoutSeconds * 1000);
+    });
+    await Promise.race([pending, deadline]);
+  } finally {
+    // Always release the timer so it cannot keep the process alive.
+    if (timer) {
+      clearTimeout(timer);
+    }
+  }
+
+  const history = session.state?.messages ?? [];
+  const finalMessage = history[history.length - 1];
+  const isAssistantMessage = isRecord(finalMessage) && finalMessage.role === 'assistant';
+  if (!isAssistantMessage) {
+    return;
+  }
+
+  const { stopReason, errorMessage } = finalMessage;
+  if (stopReason !== 'error' && stopReason !== 'aborted') {
+    return;
+  }
+
+  throw new Error(typeof errorMessage === 'string' ? errorMessage : `Agent request ${stopReason}`);
+}
+
+/**
+ * Writes whatever trace information is available, typically after a failure.
+ *
+ * Recorded events are preferred; the session's message list is used only when
+ * the recorder has no events.
+ *
+ * @param params - Trace destination plus optional run metadata, session and
+ *   recorder. `endedAt` defaults to the current time.
+ * @returns `true` when a trace file was written; `false` when `caseName`,
+ *   `model` or `startedAt` is missing, or when there are neither recorded
+ *   events nor session messages.
+ */
+export function writeBestEffortTrace(params: {
+  tracePath: string;
+  caseName?: string;
+  model?: string;
+  startedAt?: string;
+  endedAt?: string;
+  session?: PromptSession;
+  recorder?: TraceRecorder;
+}): boolean {
+  const { tracePath, caseName, model, startedAt, recorder } = params;
+  const messages = params.session?.state?.messages;
+  if (!caseName || !model || !startedAt) {
+    return false;
+  }
+
+  // Built lazily so the default end time is only computed when a trace is written.
+  const describeRun = () => ({
+    caseName,
+    model,
+    startedAt,
+    endedAt: params.endedAt ?? new Date().toISOString(),
+  });
+
+  if (recorder && recorder.events.length > 0) {
+    writeTraceFile(tracePath, recorder.toTrace({ ...describeRun(), messages: messages ?? [] }));
+    return true;
+  }
+
+  if (messages) {
+    writeTraceFile(tracePath, buildWorkbenchTrace({ ...describeRun(), messages }));
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ * Writes a trace as JSON Lines: a `trace_start` header line followed by one
+ * line per trace entry, with a trailing newline.
+ *
+ * @param tracePath - Destination file.
+ * @param trace - Trace to serialise; a missing `schemaVersion` is written as 1.
+ */
+export function writeTraceFile(tracePath: string, trace: WorkbenchTrace): void {
+  const { schemaVersion, caseName, model, startedAt, endedAt, entries } = trace;
+  const header = {
+    type: 'trace_start',
+    schemaVersion: schemaVersion ?? 1,
+    caseName,
+    model,
+    startedAt,
+    endedAt,
+  };
+
+  const serialized: string[] = [JSON.stringify(header)];
+  for (const entry of entries) {
+    serialized.push(JSON.stringify(entry));
+  }
+
+  writeFileSync(tracePath, `${serialized.join('\n')}\n`, 'utf-8');
+}
+
+/**
+ * Runs the cleanup commands in order and records failures to a file.
+ *
+ * Every command is run even when an earlier one fails. A command counts as
+ * failed when its exit code is anything other than 0.
+ *
+ * @param commands - Shell commands to run.
+ * @param opts - Working directory and environment for each command.
+ * @param cleanupErrorPath - File that receives the failure report; it is only
+ *   written when at least one command failed.
+ */
+async function runCleanupCommands(
+  commands: string[],
+  opts: { cwd: string; env: NodeJS.ProcessEnv },
+  cleanupErrorPath: string,
+): Promise<void> {
+  const failures: string[] = [];
+
+  for (const command of commands) {
+    const { exitCode, stdout, stderr } = await runShellCommand(command, {
+      cwd: opts.cwd,
+      env: opts.env,
+    });
+    if (exitCode === 0) {
+      continue;
+    }
+
+    const sections = [`Command failed: ${command}`, `exitCode: ${String(exitCode)}`];
+    const capturedStdout = stdout.trim();
+    if (capturedStdout) {
+      sections.push(`stdout:\n${capturedStdout}`);
+    }
+    const capturedStderr = stderr.trim();
+    if (capturedStderr) {
+      sections.push(`stderr:\n${capturedStderr}`);
+    }
+    failures.push(sections.join('\n\n'));
+  }
+
+  if (failures.length === 0) {
+    return;
+  }
+
+  writeFileSync(cleanupErrorPath, `${failures.join('\n\n---\n\n')}\n`, 'utf-8');
+}
+
+/**
+ * Runs the setup commands in order and collects a report for each failure.
+ *
+ * Every command is run even when an earlier one fails. A command counts as
+ * failed when its exit code is anything other than 0.
+ *
+ * @param commands - Shell commands to run.
+ * @param opts - Working directory and environment for each command.
+ * @returns One formatted report per failed command; empty when all succeeded.
+ */
+async function runSetupCommands(
+  commands: string[],
+  opts: { cwd: string; env: NodeJS.ProcessEnv },
+): Promise<string[]> {
+  const { cwd, env } = opts;
+  const reports: string[] = [];
+
+  for (const command of commands) {
+    const outcome = await runShellCommand(command, { cwd, env });
+    if (outcome.exitCode === 0) {
+      continue;
+    }
+
+    const out = outcome.stdout.trim();
+    const err = outcome.stderr.trim();
+    const report = [
+      `Command failed: ${command}`,
+      `exitCode: ${String(outcome.exitCode)}`,
+      ...(out ? [`stdout:\n${out}`] : []),
+      ...(err ? [`stderr:\n${err}`] : []),
+    ].join('\n\n');
+    reports.push(report);
+  }
+
+  return reports;
+}
+
+/**
+ * Builds the failing grade recorded when a run aborts with an error.
+ *
+ * @param error - The thrown value.
+ * @returns A grade with `pass: false`, `score: 0` and the error text as its
+ *   only evidence line.
+ */
+function buildFatalGrade(error: unknown): WorkbenchGrade {
+  const reason = error instanceof Error ? error.message : String(error);
+  return { pass: false, score: 0, evidence: [reason] };
+}
+
+/**
+ * Combines run metadata with a grade into the result record.
+ *
+ * @param params - Case name, model, start/end timestamps and the grade.
+ * @returns The metadata fields followed by every field of `grade`; grade
+ *   fields are spread last.
+ */
+function buildResult(params: {
+  caseName: string;
+  model: string;
+  startedAt: string;
+  endedAt: string;
+  grade: WorkbenchGrade;
+}): WorkbenchResult {
+  const { caseName, model, startedAt, endedAt, grade } = params;
+  return { caseName, model, startedAt, endedAt, ...grade };
+}
+
+/**
+ * Loads a previously written trace, tolerating a missing or damaged file.
+ *
+ * Two on-disk shapes are accepted:
+ * - a single JSON document with an `entries` array, returned as-is;
+ * - JSON Lines, where a `trace_start` row supplies the metadata and rows
+ *   accepted by `isTraceEntry` become the entries. Lines that are not JSON
+ *   objects are ignored.
+ *
+ * @param tracePath - Trace file to read.
+ * @param fallback - Metadata used for any field the file does not provide.
+ * @returns The loaded trace, or `fallback` with no entries when the file
+ *   cannot be read or contains neither a header nor any entry.
+ */
+function readTraceFile(tracePath: string, fallback: Omit<WorkbenchTrace, 'entries'>): WorkbenchTrace {
+  const parseRecord = (text: string): Record<string, unknown> | undefined => {
+    try {
+      const value = JSON.parse(text) as unknown;
+      return isRecord(value) ? value : undefined;
+    } catch {
+      return undefined;
+    }
+  };
+
+  try {
+    const content = readFileSync(tracePath, 'utf-8').trim();
+
+    if (content.startsWith('{')) {
+      // Multi-line JSONL fails this parse and falls through to line parsing.
+      const wholeDocument = parseRecord(content);
+      if (wholeDocument !== undefined && Array.isArray(wholeDocument.entries)) {
+        return wholeDocument as unknown as WorkbenchTrace;
+      }
+    }
+
+    const rows: Record<string, unknown>[] = [];
+    if (content.length > 0) {
+      for (const line of content.split(/\r?\n/)) {
+        const row = parseRecord(line);
+        if (row !== undefined) {
+          rows.push(row);
+        }
+      }
+    }
+
+    const header = rows.find((row) => row.type === 'trace_start');
+    const entries = rows.filter(isTraceEntry) as WorkbenchTraceEntry[];
+    if (header || entries.length > 0) {
+      const headerText = (key: 'caseName' | 'model' | 'startedAt' | 'endedAt'): string => {
+        const candidate = header?.[key];
+        return typeof candidate === 'string' ? candidate : fallback[key];
+      };
+      return {
+        schemaVersion: 1,
+        caseName: headerText('caseName'),
+        model: headerText('model'),
+        startedAt: headerText('startedAt'),
+        endedAt: headerText('endedAt'),
+        entries,
+      };
+    }
+  } catch {
+    // Grade results should still be written if trace persistence failed.
+  }
+
+  return { ...fallback, entries: [] };
+}
+
+/**
+ * Tells whether a parsed trace row is an entry rather than a header or other row.
+ *
+ * @param value - Parsed JSON object from the trace file.
+ * @returns `true` when `type` is `message`, `tool_call` or `tool_result`.
+ */
+function isTraceEntry(value: Record<string, unknown>): boolean {
+  switch (value.type) {
+    case 'message':
+    case 'tool_call':
+    case 'tool_result':
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Produces a short one-line preview of message content for logging.
+ *
+ * @param content - Message content; only arrays are summarised, and only
+ *   items that have a string `text` property contribute.
+ * @returns The joined text with whitespace runs collapsed to single spaces,
+ *   cut to 157 characters plus `...` when longer than 160; `undefined` when
+ *   `content` is not an array or yields no text.
+ */
+function summarizeContent(content: unknown): string | undefined {
+  if (!Array.isArray(content)) {
+    return undefined;
+  }
+
+  const fragments: string[] = [];
+  for (const item of content) {
+    if (isRecord(item) && typeof item.text === 'string') {
+      fragments.push(item.text);
+    }
+  }
+
+  const collapsed = fragments.join('\n').replace(/\s+/g, ' ').trim();
+  if (collapsed.length > 160) {
+    return `${collapsed.slice(0, 157)}...`;
+  }
+  return collapsed || undefined;
+}
+
+/**
+ * Prints a one-line progress message for selected agent events.
+ *
+ * Handled event types: `message_end` (role plus a content preview),
+ * `tool_execution_start` (tool name plus JSON arguments),
+ * `tool_execution_end` (tool name plus `ok`/`error`), and the bare markers
+ * `turn_start`, `turn_end`, `agent_start`, `agent_end`. Anything else,
+ * including a `message_end` without a message object, is ignored.
+ *
+ * @param event - Event emitted by the session.
+ */
+function logAgentEvent(event: unknown): void {
+  if (!isRecord(event) || typeof event.type !== 'string') {
+    return;
+  }
+
+  const tag = `[agent:${event.type}]`;
+  const toolLabel = (): string => (typeof event.toolName === 'string' ? event.toolName : 'unknown');
+
+  switch (event.type) {
+    case 'message_end': {
+      if (!isRecord(event.message)) {
+        return;
+      }
+      const speaker = typeof event.message.role === 'string' ? event.message.role : 'unknown';
+      const preview = summarizeContent(event.message.content);
+      console.log(`${tag} ${speaker}${preview ? `: ${preview}` : ''}`);
+      return;
+    }
+    case 'tool_execution_start': {
+      const name = toolLabel();
+      const renderedArgs = event.args === undefined ? '' : ` ${JSON.stringify(event.args)}`;
+      console.log(`${tag} ${name}${renderedArgs}`);
+      return;
+    }
+    case 'tool_execution_end': {
+      const name = toolLabel();
+      const outcome = event.isError === true ? 'error' : 'ok';
+      console.log(`${tag} ${name} ${outcome}`);
+      return;
+    }
+    case 'turn_start':
+    case 'turn_end':
+    case 'agent_start':
+    case 'agent_end':
+      console.log(tag);
+      return;
+    default:
+      return;
+  }
+}
+
+/**
+ * Prints the system prompt between start and end marker lines.
+ *
+ * @param systemPrompt - Prompt text to print verbatim.
+ */
+function logAgentSystemPrompt(systemPrompt: string): void {
+  const framed = ['[agent:system_prompt_start]', systemPrompt, '[agent:system_prompt_end]'];
+  for (const line of framed) {
+    console.log(line);
+  }
+}
+
+/**
+ * Runs the agent against the task and persists its trace.
+ *
+ * On success the trace is written to `trace.jsonl` in the results directory
+ * and 0 is returned; no `result.json` is written on this path. On any error a
+ * partial trace is written when possible, a failing `result.json` is written,
+ * and 1 is returned.
+ *
+ * `WORK`, `RESULTS` and `MCPORTER_CONFIG` in `process.env` are overridden for
+ * the duration of the run and restored afterwards.
+ *
+ * @param parsed - Arguments for agent mode.
+ * @returns Process exit code: 0 on success, 1 on failure.
+ */
+async function runAgentMode(parsed: AgentRunnerArgs): Promise<number> {
+  const resultPath = join(parsed.resultsDir, 'result.json');
+  const tracePath = join(parsed.resultsDir, 'trace.jsonl');
+  // Declared outside the try so the error path can use whatever was created.
+  let session: PromptSession | undefined;
+  let recorder: TraceRecorder | undefined;
+  let startedAt: string | undefined;
+  // Remember the current values so the finally block can restore them.
+  const previousWork = process.env.WORK;
+  const previousResults = process.env.RESULTS;
+  const previousMcporterConfig = process.env.MCPORTER_CONFIG;
+
+  mkdirSync(parsed.resultsDir, { recursive: true });
+  process.env.WORK = parsed.workDir;
+  process.env.RESULTS = parsed.resultsDir;
+  if (parsed.mcpConfigPath) {
+    process.env.MCPORTER_CONFIG = parsed.mcpConfigPath;
+  } else {
+    // Without an MCP config, make sure an inherited value is not picked up.
+    delete process.env.MCPORTER_CONFIG;
+  }
+
+  try {
+    try {
+      startedAt = new Date().toISOString();
+      const created = await createWorkbenchPiSession({
+        cwd: parsed.workDir,
+        modelRef: parsed.model,
+        apiKeyEnv: 'OPENROUTER_API_KEY',
+        appendSystemPrompt: parsed.appendSystemPrompt,
+        mcpConfigPath: parsed.mcpConfigPath,
+      });
+      session = created.session as PromptSession;
+      // Prefer the prompt the session reports; otherwise fall back to the built one.
+      const systemPrompt = typeof session.systemPrompt === 'string'
+        ? session.systemPrompt
+        : buildAgentSystemPrompt();
+      logAgentSystemPrompt(systemPrompt);
+      recorder = createTraceRecorder();
+      const unsubscribe = session.subscribe?.((event) => {
+        recorder?.record(event);
+        logAgentEvent(event);
+      });
+
+      try {
+        await runAgentPromptWithTimeout(session, parsed.task, parsed.timeoutSeconds);
+      } finally {
+        // Stop recording whether the prompt finished, failed or timed out.
+        unsubscribe?.();
+      }
+
+      const endedAt = new Date().toISOString();
+      const trace = recorder.toTrace({
+        caseName: parsed.caseName,
+        model: parsed.model,
+        startedAt,
+        endedAt,
+        messages: session.state?.messages ?? [],
+      });
+      // Put the system prompt first so the trace starts with it.
+      trace.entries.unshift({
+        type: 'message',
+        role: 'system',
+        text: systemPrompt,
+        timestamp: startedAt,
+      });
+
+      writeTraceFile(tracePath, trace);
+      return 0;
+    } catch (error) {
+      const endedAt = new Date().toISOString();
+      try {
+        writeBestEffortTrace({
+          tracePath,
+          caseName: parsed.caseName,
+          model: parsed.model,
+          startedAt,
+          endedAt,
+          session,
+          recorder,
+        });
+      } catch {
+        // Fatal result writing is more important than partial trace persistence.
+      }
+      writeJsonFile(resultPath, {
+        caseName: parsed.caseName,
+        model: parsed.model,
+        endedAt,
+        ...buildFatalGrade(error),
+        error: error instanceof Error ? error.message : String(error),
+      });
+      return 1;
+    }
+  } finally {
+    // Restore each variable to its earlier value, deleting it if it was unset.
+    if (previousWork === undefined) {
+      delete process.env.WORK;
+    } else {
+      process.env.WORK = previousWork;
+    }
+
+    if (previousResults === undefined) {
+      delete process.env.RESULTS;
+    } else {
+      process.env.RESULTS = previousResults;
+    }
+
+    if (previousMcporterConfig === undefined) {
+      delete process.env.MCPORTER_CONFIG;
+    } else {
+      process.env.MCPORTER_CONFIG = previousMcporterConfig;
+    }
+  }
+}
+
+/**
+ * Runs the case's setup commands inside the work directory.
+ *
+ * Failure reports are printed to stderr, separated by `---` lines.
+ *
+ * @param parsed - Arguments for setup mode.
+ * @returns Process exit code: 0 when every setup command succeeded, else 1.
+ */
+async function runSetupMode(parsed: SetupRunnerArgs): Promise<number> {
+  const { casePath, workDir } = parsed;
+  const { setup } = loadWorkbenchCase(casePath);
+  // Setup mode takes no results directory, so a fixed path is supplied for the environment.
+  const env = buildContainerWorkbenchEnv({
+    casePath,
+    workDir,
+    resultsDir: '/tmp/workbench-setup-results',
+  });
+
+  const failures = await runSetupCommands(setup, { cwd: workDir, env });
+  if (failures.length === 0) {
+    return 0;
+  }
+
+  console.error(failures.join('\n\n---\n\n'));
+  return 1;
+}
+
+/**
+ * Grades the work directory with the case's graders and writes `result.json`.
+ *
+ * The trace written by agent mode (when readable) supplies the start time and
+ * the metrics. If grading throws, a failing result is written instead.
+ *
+ * Cleanup commands run in a `finally` block, so they execute whether grading
+ * succeeded or threw, and a failure while cleaning up can never overwrite a
+ * result that was already written. Cleanup is best-effort: an unexpected
+ * error is recorded in `cleanup-error.txt` when possible and otherwise
+ * dropped.
+ *
+ * @param parsed - Arguments for grade mode.
+ * @returns Process exit code: 0 when all graders passed, otherwise 1.
+ */
+async function runGradeMode(parsed: GradeRunnerArgs): Promise<number> {
+  const resultPath = join(parsed.resultsDir, 'result.json');
+  const tracePath = join(parsed.resultsDir, 'trace.jsonl');
+  const cleanupErrorPath = join(parsed.resultsDir, 'cleanup-error.txt');
+  const resolved = loadWorkbenchCase(parsed.casePath);
+  const env = buildContainerWorkbenchEnv(parsed);
+  const now = new Date().toISOString();
+  const trace = readTraceFile(tracePath, {
+    caseName: resolved.name,
+    model: resolved.model,
+    startedAt: now,
+    endedAt: now,
+  });
+
+  try {
+    try {
+      const grade = await runGraderCommands(resolved.graders, {
+        cwd: parsed.workDir,
+        env,
+        timeoutSeconds: 120,
+      });
+      const result = buildResult({
+        caseName: resolved.name,
+        model: resolved.model,
+        startedAt: trace.startedAt,
+        endedAt: new Date().toISOString(),
+        grade: {
+          ...grade,
+          metrics: buildWorkbenchMetrics(trace),
+        },
+      });
+
+      writeJsonFile(resultPath, result);
+      return grade.pass ? 0 : 1;
+    } catch (error) {
+      writeJsonFile(resultPath, {
+        caseName: resolved.name,
+        model: resolved.model,
+        // Same start time the successful path reports.
+        startedAt: trace.startedAt,
+        endedAt: new Date().toISOString(),
+        ...buildFatalGrade(error),
+        error: error instanceof Error ? error.message : String(error),
+      });
+      return 1;
+    }
+  } finally {
+    try {
+      await runCleanupCommands(resolved.cleanup, { cwd: parsed.workDir, env }, cleanupErrorPath);
+    } catch (cleanupError) {
+      const message = cleanupError instanceof Error ? cleanupError.message : String(cleanupError);
+      try {
+        writeFileSync(cleanupErrorPath, `${message}\n`, 'utf-8');
+      } catch {
+        // The grade is already on disk; nothing more can be recorded.
+      }
+    }
+  }
+}
+
+/**
+ * Entry point of the container runner: parses the arguments and dispatches to
+ * the matching mode.
+ *
+ * @param args - Command-line arguments without the node/script prefix.
+ * @returns The exit code reported by the selected mode.
+ */
+export async function runContainerWorkbenchCase(args: string[]): Promise<number> {
+  const parsed = parseContainerRunnerArgs(args);
+  switch (parsed.mode) {
+    case 'agent':
+      return runAgentMode(parsed);
+    case 'setup':
+      return runSetupMode(parsed);
+    default:
+      return runGradeMode(parsed);
+  }
+}
+
+/**
+ * Tells whether this file was started directly as the process entry script.
+ *
+ * @returns `true` when `process.argv[1]`, with backslashes treated as path
+ *   separators, ends in `/container-runner.js` or `/container-runner.ts`.
+ */
+function isMainModule(): boolean {
+  const scriptPath = process.argv[1];
+  if (!scriptPath) {
+    return false;
+  }
+
+  const posixStyle = scriptPath.split('\\').join('/');
+  const entryNames = ['/container-runner.js', '/container-runner.ts'];
+  return entryNames.some((suffix) => posixStyle.endsWith(suffix));
+}
+
+
+// Run as a CLI only when this file is the process entry script. The process
+// exits with the runner's return value, or with 1 (after printing the error
+// message) when the runner rejects.
+if (isMainModule()) {
+  void runContainerWorkbenchCase(process.argv.slice(2))
+    .then((code) => {
+      process.exit(code);
+    })
+    .catch((error: unknown) => {
+      console.error(error instanceof Error ? error.message : String(error));
+      process.exit(1);
+    });
+}
diff --git a/src/workbench/docker-runner.ts b/src/workbench/docker-runner.ts
new file mode 100644
index 0000000..292ce6a
--- /dev/null
+++ b/src/workbench/docker-runner.ts
@@ -0,0 +1,620 @@
+import { cpSync, existsSync, mkdirSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { dirname, join, resolve } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+import { stringify as stringifyYaml } from 'yaml';
+
+import { loadWorkbenchCase } from './case-loader.js';
+import { MCPORTER_CONFIG_CONTAINER_PATH, writeWorkbenchMcpConfig } from './mcp/index.js';
+import { runShellCommand } from './process.js';
+import type { ResolvedWorkbenchCase, WorkbenchCaseConfig } from './types.js';
+import { timestampSlug } from './utils.js';
+import { prepareWorkbenchDirectory } from './workspace.js';
+
+// Default image tag for workbench containers (NOTE(review): its consumer is outside this excerpt — confirm).
+const DEFAULT_WORKBENCH_IMAGE = 'skill-optimizer-workbench:local';
+// Results directory inside the agent container; passed as `--results` and used as the `docker cp` source.
+const AGENT_RESULTS_DIR = '/tmp/workbench-results';
+// PATH value injected into the agent container via `-e PATH=...`.
+const AGENT_PATH = '/work/bin:/app/node_modules/.bin:/work/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin';
+
+/**
+ * Derives a directory three levels above a module file.
+ *
+ * @param moduleUrl - `file:` URL of a module, e.g. `import.meta.url`.
+ * @returns The path obtained by converting the URL to a file path and taking
+ *   its parent directory three times.
+ */
+export function packageRootFromModuleUrl(moduleUrl: string): string {
+  let current = fileURLToPath(moduleUrl);
+  for (let level = 0; level < 3; level += 1) {
+    current = dirname(current);
+  }
+  return current;
+}
+
+/**
+ * Options for running a workbench case in Docker. All fields are optional.
+ * NOTE(review): the function that consumes these options is outside this
+ * excerpt, so per-field behaviour is not documented here — confirm against it.
+ */
+export interface RunDockerWorkbenchCaseOptions {
+  casePath?: string;
+  case?: ResolvedWorkbenchCase;
+  outDir?: string;
+  resultsDir?: string;
+  model?: string;
+  image?: string;
+  keepWorkspace?: boolean;
+  appendSystemPrompt?: string;
+}
+
+/**
+ * Paths describing a prepared or completed Docker workbench run, plus a
+ * `cleanup` callback.
+ * NOTE(review): what `cleanup` removes is defined outside this excerpt — confirm.
+ */
+export interface DockerWorkbenchRunResult {
+  tempDir: string;
+  caseDir: string;
+  bundledCasePath: string;
+  workDir: string;
+  resultsDir: string;
+  resultPath: string;
+  tracePath: string;
+  summaryPath?: string;
+  workspacePath?: string;
+  mcpConfigPath?: string;
+  cleanup: () => void;
+}
+
+/**
+ * Options for preparing a Docker workbench run. All fields are optional.
+ * NOTE(review): `now` and `tempRoot` look like injection points for the clock
+ * and the temp directory root — confirm against the consumer.
+ */
+export interface PrepareDockerWorkbenchRunOptions {
+  casePath?: string;
+  case?: ResolvedWorkbenchCase;
+  outDir?: string;
+  resultsDir?: string;
+  model?: string;
+  now?: Date;
+  tempRoot?: string;
+}
+
+/**
+ * Wraps a value in single quotes for use as one shell word.
+ *
+ * @param value - Text to quote.
+ * @returns `value` in single quotes, with each embedded single quote written
+ *   as `'\''` (close quote, escaped quote, reopen quote).
+ */
+function shellQuote(value: string): string {
+  const escaped = value.split(`'`).join(`'\\''`);
+  return `'${escaped}'`;
+}
+
+/**
+ * Lists the `docker run` flags shared by every container started here: drop
+ * all capabilities, forbid gaining new privileges, and limit the process count.
+ *
+ * @returns A fresh array of flag strings.
+ */
+function dockerSandboxFlags(): string[] {
+  const dropCapabilities = '--cap-drop=ALL';
+  const noNewPrivileges = '--security-opt no-new-privileges';
+  const processLimit = '--pids-limit 512';
+  return [dropCapabilities, noNewPrivileges, processLimit];
+}
+
+/**
+ * Lists the `-e` flags that point the XDG, pip and npm caches at
+ * `/work/.cache` inside the container.
+ *
+ * @returns A fresh array of flag strings.
+ */
+function dockerCacheEnvFlags(): string[] {
+  const assignments = [
+    'XDG_CACHE_HOME=/work/.cache',
+    'PIP_CACHE_DIR=/work/.cache/pip',
+    'NPM_CONFIG_CACHE=/work/.cache/npm',
+  ];
+  return assignments.map((assignment) => `-e ${assignment}`);
+}
+
+/**
+ * Builds the `docker run` command line that starts the agent container.
+ *
+ * The task and the optional extra system prompt are passed base64-encoded.
+ * Environment variables in `envNames` are forwarded by name only (`-e NAME`).
+ * Optional pieces (network, MCP config, extra system prompt, an empty
+ * environment list) are left out of the command entirely.
+ *
+ * NOTE(review): the `MCPORTER_CONFIG` value is interpolated without shell
+ * quoting, unlike the `--mcp-config` argument built from the same path.
+ *
+ * @param params - Image, container name, mounts and agent settings.
+ * @returns The command as a single space-separated string.
+ */
+export function buildDockerAgentCommand(params: {
+  image: string;
+  containerName: string;
+  workDir: string;
+  caseName: string;
+  model: string;
+  task: string;
+  appendSystemPrompt?: string;
+  mcpConfigPath?: string;
+  networkName?: string;
+  timeoutSeconds: number;
+  envNames: string[];
+}): string {
+  const encodeBase64 = (text: string): string => Buffer.from(text, 'utf-8').toString('base64');
+  const { mcpConfigPath, networkName, appendSystemPrompt } = params;
+
+  const dockerOptions = [
+    `--name ${shellQuote(params.containerName)}`,
+    ...dockerSandboxFlags(),
+    '--workdir /work',
+    `-e PATH=${AGENT_PATH}`,
+    ...dockerCacheEnvFlags(),
+    networkName ? `--network ${shellQuote(networkName)}` : '',
+    `-v ${shellQuote(`${params.workDir}:/work:rw`)}`,
+    params.envNames.map((name) => `-e ${name}`).join(' '),
+    mcpConfigPath ? `-e MCPORTER_CONFIG=${mcpConfigPath}` : '',
+  ];
+
+  const runnerArguments = [
+    '--agent',
+    '--work /work',
+    `--results ${AGENT_RESULTS_DIR}`,
+    `--case-name ${shellQuote(params.caseName)}`,
+    `--model ${shellQuote(params.model)}`,
+    `--timeout-seconds ${params.timeoutSeconds}`,
+    `--task-base64 ${shellQuote(encodeBase64(params.task))}`,
+    mcpConfigPath ? `--mcp-config ${shellQuote(mcpConfigPath)}` : '',
+    appendSystemPrompt
+      ? `--append-system-prompt-base64 ${shellQuote(encodeBase64(appendSystemPrompt))}`
+      : '',
+  ];
+
+  // Empty strings mark optional pieces that were not requested.
+  return ['docker run', ...dockerOptions, shellQuote(params.image), ...runnerArguments]
+    .filter((piece) => piece.length > 0)
+    .join(' ');
+}
+
+/**
+ * Builds the `docker run -d` command line that starts an MCP service
+ * container on the given network under the given alias.
+ *
+ * The service command and its arguments are shell-quoted individually, joined
+ * with spaces, and passed as one quoted script to `/bin/sh -lc`. The MCP
+ * directory is mounted read-only at `/mcp`, which is also the working directory.
+ *
+ * @param params - Image, container name, network, alias, mount and command.
+ * @returns The command as a single space-separated string.
+ */
+export function buildDockerMcpServiceCommand(params: {
+  image: string;
+  containerName: string;
+  networkName: string;
+  alias: string;
+  mcpDir: string;
+  command: string;
+  args: string[];
+}): string {
+  const { image, containerName, networkName, alias, mcpDir, command, args } = params;
+
+  const quotedWords: string[] = [];
+  for (const word of [command, ...args]) {
+    quotedWords.push(shellQuote(word));
+  }
+  const serviceScript = quotedWords.join(' ');
+
+  const pieces = [
+    'docker run -d',
+    `--name ${shellQuote(containerName)}`,
+    ...dockerSandboxFlags(),
+    `--network ${shellQuote(networkName)}`,
+    `--network-alias ${shellQuote(alias)}`,
+    '--workdir /mcp',
+    `-v ${shellQuote(`${mcpDir}:/mcp:ro`)}`,
+    '--entrypoint /bin/sh',
+    shellQuote(image),
+    '-lc',
+    shellQuote(serviceScript),
+  ];
+  return pieces.filter((piece) => piece.length > 0).join(' ');
+}
+
+/**
+ * Builds the `docker run --rm` command line that runs
+ * `mcporter ... list <server> --schema` on the MCP network, using the config
+ * at `/work/mcporter.json`.
+ *
+ * @param params - Image, network, work directory (mounted read-write at
+ *   `/work`) and the server name to list.
+ * @returns The command as a single space-separated string.
+ */
+export function buildDockerMcpServiceProbeCommand(params: {
+  image: string;
+  networkName: string;
+  workDir: string;
+  serverName: string;
+}): string {
+  const { image, networkName, workDir, serverName } = params;
+
+  const probeScript = [
+    '/app/node_modules/.bin/mcporter',
+    '--config /work/mcporter.json',
+    '--root /work',
+    'list',
+    shellQuote(serverName),
+    '--schema',
+  ].join(' ');
+
+  const pieces = [
+    'docker run --rm',
+    ...dockerSandboxFlags(),
+    `--network ${shellQuote(networkName)}`,
+    '--workdir /work',
+    `-v ${shellQuote(`${workDir}:/work:rw`)}`,
+    '--entrypoint /bin/sh',
+    shellQuote(image),
+    '-lc',
+    // The script is quoted a second time so the outer shell passes it on as one argument.
+    shellQuote(probeScript),
+  ];
+  return pieces.filter((piece) => piece.length > 0).join(' ');
+}
+
+/**
+ * Builds the `docker run --rm` command line that runs the container runner in
+ * `--setup` mode.
+ *
+ * The case directory is mounted read-only at `/case` and the work directory
+ * read-write at `/work`. Environment variables in `envNames` are forwarded by
+ * name only.
+ *
+ * @param params - Image, case directory, work directory and variable names.
+ * @returns The command as a single space-separated string.
+ */
+export function buildDockerSetupCommand(params: {
+  image: string;
+  caseDir: string;
+  workDir: string;
+  envNames: string[];
+}): string {
+  const { image, caseDir, workDir, envNames } = params;
+  const forwardedEnv = envNames.map((name) => `-e ${name}`).join(' ');
+
+  const pieces = [
+    'docker run --rm',
+    ...dockerSandboxFlags(),
+    '--workdir /work',
+    ...dockerCacheEnvFlags(),
+    `-v ${shellQuote(`${caseDir}:/case:ro`)}`,
+    `-v ${shellQuote(`${workDir}:/work:rw`)}`,
+    forwardedEnv,
+    shellQuote(image),
+    '--setup',
+    '--case /case/case.yml',
+    '--work /work',
+  ];
+  // `forwardedEnv` is empty when no variables are forwarded.
+  return pieces.filter((piece) => piece.length > 0).join(' ');
+}
+
+/**
+ * Derives the agent container's name from the run's temporary directory.
+ *
+ * The last non-empty path segment is used. Both `/` and `\` count as
+ * separators, and trailing separators are ignored, so a trailing slash or a
+ * Windows-style path does not leak an empty name or a whole path into the
+ * container name.
+ *
+ * @param tempDir - Temporary directory of the run.
+ * @returns `skill-optimizer-agent-<leaf>`, with `run` as the leaf when the
+ *   path has no non-empty segment.
+ */
+function agentContainerName(tempDir: string): string {
+  const segments = tempDir.split(/[\\/]+/).filter((segment) => segment.length > 0);
+  const leaf = segments[segments.length - 1] ?? 'run';
+  return `skill-optimizer-agent-${leaf}`;
+}
+
+/**
+ * Copies the agent's results directory out of its container with `docker cp`.
+ *
+ * @param containerName - Container to copy from.
+ * @param resultsDir - Host directory that receives the files.
+ * @param repoRoot - Working directory for the `docker` command.
+ * @throws Error containing the command's trimmed stdout and stderr when
+ *   `docker cp` exits with a non-zero code.
+ */
+async function copyAgentResults(containerName: string, resultsDir: string, repoRoot: string): Promise<void> {
+  // The trailing `/.` names the directory's contents rather than the directory itself.
+  const containerSource = `${containerName}:${AGENT_RESULTS_DIR}/.`;
+  const { exitCode, stdout, stderr } = await runShellCommand(
+    `docker cp ${shellQuote(containerSource)} ${shellQuote(resultsDir)}`,
+    { cwd: repoRoot },
+  );
+  if (exitCode === 0) {
+    return;
+  }
+
+  const details = ['Failed to copy agent results from Docker container', stdout.trim(), stderr.trim()];
+  throw new Error(details.filter((part) => part.length > 0).join('\n\n'));
+}
+
+/**
+ * Force-removes a container with `docker rm -f`. The command's outcome is
+ * not inspected, so a failing removal is ignored.
+ *
+ * @param containerName - Container to remove.
+ * @param repoRoot - Working directory for the `docker` command.
+ */
+async function removeContainer(containerName: string, repoRoot: string): Promise<void> {
+  const command = `docker rm -f ${shellQuote(containerName)}`;
+  await runShellCommand(command, { cwd: repoRoot });
+}
+
+/**
+ * Builds the `docker run --rm` command line that runs the container runner in
+ * `--grade` mode.
+ *
+ * The case directory is mounted read-only at `/case`; the work and results
+ * directories are mounted read-write at `/work` and `/results`. Environment
+ * variables in `envNames` are forwarded by name only.
+ *
+ * @param params - Image, mounted directories and variable names.
+ * @returns The command as a single space-separated string.
+ */
+export function buildDockerGradeCommand(params: {
+  image: string;
+  caseDir: string;
+  workDir: string;
+  resultsDir: string;
+  envNames: string[];
+}): string {
+  const { image, caseDir, workDir, resultsDir, envNames } = params;
+  const mount = (hostPath: string, target: string): string => `-v ${shellQuote(`${hostPath}:${target}`)}`;
+  const forwardedEnv = envNames.map((name) => `-e ${name}`).join(' ');
+
+  const pieces = [
+    'docker run --rm',
+    ...dockerSandboxFlags(),
+    '--workdir /work',
+    ...dockerCacheEnvFlags(),
+    mount(caseDir, '/case:ro'),
+    mount(workDir, '/work:rw'),
+    mount(resultsDir, '/results:rw'),
+    forwardedEnv,
+    shellQuote(image),
+    '--grade',
+    '--case /case/case.yml',
+    '--work /work',
+    '--results /results',
+  ];
+  // `forwardedEnv` is empty when no variables are forwarded.
+  return pieces.filter((piece) => piece.length > 0).join(' ');
+}
+
+/**
+ * Builds the case configuration that is bundled with a run.
+ *
+ * `references` is always set to `./references`. The model override, when
+ * given, replaces the case's own model. The optional lists and maps (`env`,
+ * `setup`, `cleanup`, `mcpServers`, `mcpServices`) are copied only when they
+ * are non-empty; copies are shallow.
+ *
+ * @param params - The loaded case and an optional model override.
+ * @returns A new case configuration object.
+ */
+function buildBundledCaseFile(params: {
+  source: ReturnType<typeof loadWorkbenchCase>;
+  modelOverride?: string;
+}): WorkbenchCaseConfig {
+  const { source, modelOverride } = params;
+  const { env, setup, cleanup, mcpServers, mcpServices } = source;
+
+  const bundled: WorkbenchCaseConfig = {
+    name: source.name,
+    references: './references',
+    task: source.task,
+    graders: source.graders.map((grader) => ({ ...grader })),
+    model: modelOverride ?? source.model,
+    timeoutSeconds: source.timeoutSeconds,
+  };
+
+  if (env.length > 0) {
+    bundled.env = [...env];
+  }
+  if (setup.length > 0) {
+    bundled.setup = [...setup];
+  }
+  if (cleanup.length > 0) {
+    bundled.cleanup = [...cleanup];
+  }
+  if (Object.keys(mcpServers).length > 0) {
+    bundled.mcpServers = { ...mcpServers };
+  }
+  if (Object.keys(mcpServices).length > 0) {
+    bundled.mcpServices = { ...mcpServices };
+  }
+
+  return bundled;
+}
+
+/**
+ * Recursively copies every entry inside `sourceDir` into `destinationDir`,
+ * creating the destination first if it does not exist.
+ */
+function copyDirectoryContents(sourceDir: string, destinationDir: string): void {
+  mkdirSync(destinationDir, { recursive: true });
+  readdirSync(sourceDir).forEach((entryName) => {
+    const from = join(sourceDir, entryName);
+    const to = join(destinationDir, entryName);
+    cpSync(from, to, { recursive: true });
+  });
+}
+
+/**
+ * Copies the support directory `name` from the source case directory into the
+ * bundled case directory. Does nothing when the source directory is absent.
+ */
+function copyCaseSupportDir(sourceCaseDir: string, bundledCaseDir: string, name: string): void {
+  const from = join(sourceCaseDir, name);
+  if (existsSync(from)) {
+    const to = join(bundledCaseDir, name);
+    // Replace rather than merge: any existing destination is removed before copying.
+    rmSync(to, { recursive: true, force: true });
+    cpSync(from, to, { recursive: true });
+  }
+}
+
+/**
+ * Copies each known support directory (`checks`, `fixtures`, `bin`,
+ * `workspace`, `mcp`) that exists in the source case into the bundled case.
+ */
+function copyCaseSupportDirs(sourceCaseDir: string, bundledCaseDir: string): void {
+  const supportDirNames = ['checks', 'fixtures', 'bin', 'workspace', 'mcp'];
+  supportDirNames.forEach((dirName) => {
+    copyCaseSupportDir(sourceCaseDir, bundledCaseDir, dirName);
+  });
+}
+
+/**
+ * Derives the Docker network name used for a run's MCP services from the run's temp directory.
+ *
+ * The name is `skill-optimizer-mcp-` followed by the last non-empty path
+ * segment of `tempDir`. Both `/` and `\` are accepted as separators and
+ * trailing separators are ignored, so the suffix is never empty and never
+ * contains a path separator; `run` is used when no segment is left.
+ *
+ * @param tempDir Temp directory created for this run.
+ * @returns The network name (also used as the prefix of MCP service container names).
+ */
+function mcpNetworkName(tempDir: string): string {
+  // `split().pop()` never yields undefined, so the fallback must key off emptiness.
+  const leaf = tempDir.split(/[\\/]+/).filter(Boolean).pop() || 'run';
+  return `skill-optimizer-mcp-${leaf}`;
+}
+
+/**
+ * Creates a Docker network with the given name.
+ * @throws Error when `docker network create` exits non-zero; the message carries any captured output.
+ */
+async function createDockerNetwork(networkName: string, repoRoot: string): Promise<void> {
+  const { exitCode, stdout, stderr } = await runShellCommand(
+    `docker network create ${shellQuote(networkName)}`,
+    { cwd: repoRoot },
+  );
+  if (exitCode === 0) {
+    return;
+  }
+
+  const details = [stdout.trim(), stderr.trim()].filter(Boolean);
+  throw new Error(['Failed to create MCP Docker network', ...details].join('\n\n'));
+}
+
+/**
+ * Removes the named Docker network. A missing name is a no-op, and the
+ * command's exit code is not inspected.
+ */
+async function removeDockerNetwork(networkName: string | undefined, repoRoot: string): Promise<void> {
+  if (networkName) {
+    const command = `docker network rm ${shellQuote(networkName)}`;
+    await runShellCommand(command, { cwd: repoRoot });
+  }
+}
+
+/**
+ * Starts one container per configured MCP service, sequentially.
+ *
+ * Each container is named `<network-name-for-tempDir>-<service>` and uses the
+ * service name as its alias. Container names are appended to
+ * `startedContainers` (when provided) as each one starts, so a caller that
+ * owns that array still knows which containers exist if a later service fails.
+ *
+ * @param params.runCommand Optional command runner; defaults to `runShellCommand`.
+ * @returns The array of started container names (the same array as `startedContainers` when given).
+ * @throws Error naming the first service whose start command exits non-zero.
+ */
+export async function startMcpServices(params: {
+  image: string;
+  networkName: string;
+  caseDir: string;
+  tempDir: string;
+  services: ResolvedWorkbenchCase['mcpServices'];
+  repoRoot: string;
+  startedContainers?: string[];
+  runCommand?: typeof runShellCommand;
+}): Promise<string[]> {
+  const started = params.startedContainers ?? [];
+  const exec = params.runCommand ?? runShellCommand;
+  const namePrefix = mcpNetworkName(params.tempDir);
+  const mcpDir = join(params.caseDir, 'mcp');
+
+  for (const [serviceName, service] of Object.entries(params.services)) {
+    const containerName = `${namePrefix}-${serviceName}`;
+    console.log(`Starting MCP service ${serviceName}...`);
+    const startCommand = buildDockerMcpServiceCommand({
+      image: params.image,
+      containerName,
+      networkName: params.networkName,
+      alias: serviceName,
+      mcpDir,
+      command: service.command,
+      args: service.args,
+    });
+    const { exitCode, stdout, stderr } = await exec(startCommand, { cwd: params.repoRoot });
+    if (exitCode !== 0) {
+      const details = [stdout.trim(), stderr.trim()].filter(Boolean);
+      throw new Error([`Failed to start MCP service ${serviceName}`, ...details].join('\n\n'));
+    }
+    started.push(containerName);
+  }
+
+  return started;
+}
+
+/**
+ * Probes each configured MCP service in turn and stops at the first one that is not ready.
+ * Every probe command runs with a 30 second timeout.
+ *
+ * @throws Error naming the first service whose probe exits non-zero.
+ */
+async function waitForMcpServices(params: {
+  image: string;
+  networkName: string;
+  workDir: string;
+  services: ResolvedWorkbenchCase['mcpServices'];
+  repoRoot: string;
+}): Promise<void> {
+  const { image, networkName, workDir, repoRoot } = params;
+
+  for (const serverName of Object.keys(params.services)) {
+    console.log(`Waiting for MCP service ${serverName}...`);
+    const probeCommand = buildDockerMcpServiceProbeCommand({ image, networkName, workDir, serverName });
+    const { exitCode, stdout, stderr } = await runShellCommand(probeCommand, { cwd: repoRoot, timeoutSeconds: 30 });
+    if (exitCode !== 0) {
+      const details = [stdout.trim(), stderr.trim()].filter(Boolean);
+      throw new Error([`MCP service ${serverName} did not become ready`, ...details].join('\n\n'));
+    }
+    console.log(`MCP service ${serverName} ready.`);
+  }
+}
+
+/** Copies the case's `bin` directory (when it exists) into the agent's work directory. */
+function copyAgentSupportDirs(sourceCaseDir: string, workDir: string): void {
+  const agentVisibleDir = 'bin';
+  copyCaseSupportDir(sourceCaseDir, workDir, agentVisibleDir);
+}
+
+/**
+ * Returns the case to run: the inline `case` when given, otherwise the case loaded from `casePath`.
+ * @throws Error when neither an inline case nor a case path is supplied.
+ */
+function resolveDockerWorkbenchCase(options: { casePath?: string; case?: ResolvedWorkbenchCase }): ResolvedWorkbenchCase {
+  const { case: inlineCase, casePath } = options;
+  if (inlineCase) {
+    return inlineCase;
+  }
+  if (!casePath) {
+    throw new Error('Workbench Docker run requires a casePath or inline case');
+  }
+  return loadWorkbenchCase(casePath);
+}
+
+/**
+ * Stages everything a Docker workbench run needs on the host.
+ *
+ * Creates a fresh temp directory containing a bundled `case/` directory
+ * (references, support directories and a generated `case.yml`) and a `work/`
+ * directory, ensures the results directory exists, and writes the MCP config
+ * into the work directory when the case needs one.
+ *
+ * @returns The staged paths plus a `cleanup` callback that deletes the temp
+ *   directory (the results directory is left in place).
+ */
+export function prepareDockerWorkbenchRun(
+  options: PrepareDockerWorkbenchRunOptions,
+): DockerWorkbenchRunResult {
+  const resolvedCase = resolveDockerWorkbenchCase(options);
+  const sourceCaseDir = resolvedCase.configDir;
+
+  // An explicit resultsDir wins; otherwise use a timestamped folder under outDir
+  // (which itself defaults to `.results` beside the case config).
+  const resultsBase = resolve(options.outDir ?? join(sourceCaseDir, '.results'));
+  const resultsDir = options.resultsDir
+    ? resolve(options.resultsDir)
+    : join(resultsBase, timestampSlug(options.now ?? new Date()));
+
+  const tempRoot = resolve(options.tempRoot ?? tmpdir());
+  mkdirSync(tempRoot, { recursive: true });
+  const tempDir = mkdtempSync(join(tempRoot, 'skill-optimizer-workbench-'));
+  const caseDir = join(tempDir, 'case');
+  const bundledReferencesDir = join(caseDir, 'references');
+  const bundledCasePath = join(caseDir, 'case.yml');
+  const workDir = join(tempDir, 'work');
+
+  for (const dir of [bundledReferencesDir, workDir, resultsDir]) {
+    mkdirSync(dir, { recursive: true });
+  }
+
+  copyDirectoryContents(resolvedCase.referencesDir, bundledReferencesDir);
+  copyCaseSupportDirs(sourceCaseDir, caseDir);
+  prepareWorkbenchDirectory({
+    referencesDir: resolvedCase.referencesDir,
+    workspaceDir: join(sourceCaseDir, 'workspace'),
+    workDir,
+  });
+  copyAgentSupportDirs(sourceCaseDir, workDir);
+
+  const bundledCase = buildBundledCaseFile({ source: resolvedCase, modelOverride: options.model });
+  writeFileSync(bundledCasePath, `${stringifyYaml(bundledCase)}`, 'utf-8');
+  const mcpConfigPath = writeWorkbenchMcpConfig(resolvedCase, workDir);
+
+  const prepared: DockerWorkbenchRunResult = {
+    tempDir,
+    caseDir,
+    bundledCasePath,
+    workDir,
+    resultsDir,
+    resultPath: join(resultsDir, 'result.json'),
+    tracePath: join(resultsDir, 'trace.jsonl'),
+    // Only present when an MCP config file was actually written.
+    ...(mcpConfigPath ? { mcpConfigPath } : {}),
+    cleanup: () => rmSync(tempDir, { recursive: true, force: true }),
+  };
+  return prepared;
+}
+
+/**
+ * Makes sure the runner image exists locally, building it when `docker image inspect` fails.
+ *
+ * The build uses `docker/workbench-runner.Dockerfile` under `repoRoot`, with `repoRoot` as build context.
+ *
+ * @throws Error when the Dockerfile is missing or the build exits non-zero.
+ */
+async function ensureDockerImage(image: string, repoRoot: string): Promise<void> {
+  const runOptions = { cwd: repoRoot };
+  const quotedImage = shellQuote(image);
+
+  const inspect = await runShellCommand(`docker image inspect ${quotedImage}`, runOptions);
+  if (inspect.exitCode === 0) {
+    return;
+  }
+
+  const dockerfilePath = join(repoRoot, 'docker', 'workbench-runner.Dockerfile');
+  if (!existsSync(dockerfilePath)) {
+    throw new Error(`Dockerfile not found: ${dockerfilePath}`);
+  }
+
+  const { exitCode, stdout, stderr } = await runShellCommand(
+    `docker build -t ${quotedImage} -f ${shellQuote(dockerfilePath)} .`,
+    runOptions,
+  );
+  if (exitCode !== 0) {
+    const details = [stdout.trim(), stderr.trim()].filter(Boolean);
+    throw new Error([`Failed to build Docker image ${image}`, ...details].join('\n\n'));
+  }
+}
+
+/**
+ * Writes a failing result file (`pass: false`, `score: 0`) for a run that
+ * could not proceed, stamped with the current time and the supplied evidence.
+ */
+function writeFatalResult(params: {
+  resultPath: string;
+  caseName: string;
+  model: string;
+  evidence: string[];
+}): void {
+  const { resultPath, caseName, model, evidence } = params;
+  const payload = {
+    caseName,
+    model,
+    endedAt: new Date().toISOString(),
+    pass: false,
+    score: 0,
+    evidence,
+  };
+  writeFileSync(resultPath, JSON.stringify(payload, null, 2), 'utf-8');
+}
+
+/**
+ * Reads the `pass` flag from a result file.
+ *
+ * @returns The truthiness of `pass`, or `undefined` when the file cannot be
+ *   read or parsed, is not a JSON object, or has no `pass` property.
+ */
+function readTrialPass(resultPath: string): boolean | undefined {
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(readFileSync(resultPath, 'utf-8'));
+  } catch {
+    // Missing or malformed result file: the outcome is unknown.
+    return undefined;
+  }
+
+  if (typeof parsed === 'object' && parsed !== null && 'pass' in parsed) {
+    return Boolean((parsed as { pass?: unknown }).pass);
+  }
+  return undefined;
+}
+
+/**
+ * Preserves the work directory as `<resultsDir>/workspace` when the caller
+ * asked to keep it or when the result file reports a failed trial.
+ *
+ * @returns `prepared` unchanged when nothing was copied, otherwise a copy of
+ *   it with `workspacePath` set.
+ */
+function copyWorkspaceIfRequested(
+  prepared: DockerWorkbenchRunResult,
+  keepWorkspace: boolean | undefined,
+): DockerWorkbenchRunResult {
+  // An unreadable result (undefined) is not treated as a failure here.
+  const trialFailed = readTrialPass(prepared.resultPath) === false;
+  if (keepWorkspace || trialFailed) {
+    const workspacePath = join(prepared.resultsDir, 'workspace');
+    rmSync(workspacePath, { recursive: true, force: true });
+    cpSync(prepared.workDir, workspacePath, { recursive: true });
+    return { ...prepared, workspacePath };
+  }
+  return prepared;
+}
+
+/**
+ * Runs one workbench case end to end inside Docker.
+ *
+ * Steps, in order:
+ * 1. Stage the run on the host and make sure the runner image exists.
+ * 2. Run the case's setup commands, if any. A setup failure is recorded as a
+ *    failing result (workspace preserved) instead of being thrown.
+ * 3. When the case declares MCP services, create a network, start the
+ *    services and wait for them to become ready.
+ * 4. Run the agent container and copy its results to the host.
+ * 5. Grade the run when the agent exited cleanly.
+ *
+ * Containers, the MCP network and the temp directory are always cleaned up.
+ *
+ * @param options Case selection plus optional overrides (image, model, output locations, ...).
+ * @returns The staged run description, with `workspacePath` set when the workspace was kept.
+ * @throws Error when the agent or grade step fails without producing a result
+ *   file, when results cannot be copied out of the container, or when any
+ *   Docker infrastructure step (image build, network, MCP services) fails.
+ */
+export async function runDockerWorkbenchCase(
+  options: RunDockerWorkbenchCaseOptions,
+): Promise<DockerWorkbenchRunResult> {
+  const repoRoot = packageRootFromModuleUrl(import.meta.url);
+  const image = options.image ?? DEFAULT_WORKBENCH_IMAGE;
+  const resolvedCase = resolveDockerWorkbenchCase(options);
+  const prepared = prepareDockerWorkbenchRun({ ...options, case: resolvedCase });
+  const containerName = agentContainerName(prepared.tempDir);
+  const networkName = Object.keys(resolvedCase.mcpServices).length > 0 ? mcpNetworkName(prepared.tempDir) : undefined;
+  const model = options.model ?? resolvedCase.model;
+  // Shared with startMcpServices, which appends to it as containers start, so
+  // services started before a mid-loop failure are still removed in `finally`.
+  const mcpServiceContainers: string[] = [];
+
+  try {
+    await ensureDockerImage(image, repoRoot);
+
+    // Only forward variables that are actually set in this process.
+    const envNames = resolvedCase.env.filter((name) => process.env[name] !== undefined);
+
+    if (resolvedCase.setup.length > 0) {
+      const setupCommand = buildDockerSetupCommand({
+        image,
+        caseDir: prepared.caseDir,
+        workDir: prepared.workDir,
+        envNames,
+      });
+      const setupRun = await runShellCommand(setupCommand, { cwd: repoRoot });
+      if (setupRun.exitCode !== 0) {
+        writeFatalResult({
+          resultPath: prepared.resultPath,
+          caseName: resolvedCase.name,
+          model,
+          evidence: [
+            'setup failed',
+            setupRun.stdout.trim(),
+            setupRun.stderr.trim(),
+          ].filter(Boolean),
+        });
+        return copyWorkspaceIfRequested(prepared, true);
+      }
+    }
+
+    if (networkName) {
+      await createDockerNetwork(networkName, repoRoot);
+      await startMcpServices({
+        image,
+        networkName,
+        caseDir: prepared.caseDir,
+        tempDir: prepared.tempDir,
+        services: resolvedCase.mcpServices,
+        repoRoot,
+        startedContainers: mcpServiceContainers,
+      });
+      await waitForMcpServices({
+        image,
+        networkName,
+        workDir: prepared.workDir,
+        services: resolvedCase.mcpServices,
+        repoRoot,
+      });
+    }
+
+    const agentCommand = buildDockerAgentCommand({
+      image,
+      containerName,
+      workDir: prepared.workDir,
+      caseName: resolvedCase.name,
+      model,
+      task: resolvedCase.task,
+      appendSystemPrompt: options.appendSystemPrompt,
+      mcpConfigPath: prepared.mcpConfigPath ? MCPORTER_CONFIG_CONTAINER_PATH : undefined,
+      networkName,
+      timeoutSeconds: resolvedCase.timeoutSeconds,
+      envNames,
+    });
+    const agentRun = await runShellCommand(agentCommand, { cwd: repoRoot });
+    const agentFailed = agentRun.exitCode !== 0;
+
+    try {
+      await copyAgentResults(containerName, prepared.resultsDir, repoRoot);
+    } catch (copyError: unknown) {
+      if (!agentFailed) {
+        throw copyError;
+      }
+      // The agent itself failed (for example the container never started), so
+      // the copy failure is only a symptom. Report the agent's output as the
+      // root cause and keep the copy error as trailing context.
+      throw new Error([
+        'Docker agent run failed',
+        agentRun.stdout.trim(),
+        agentRun.stderr.trim(),
+        copyError instanceof Error ? copyError.message : String(copyError),
+      ].filter(Boolean).join('\n\n'));
+    }
+
+    if (agentFailed) {
+      // A failed agent that still produced a result file is reported through that file.
+      if (!existsSync(prepared.resultPath)) {
+        throw new Error([
+          'Docker agent run failed',
+          agentRun.stdout.trim(),
+          agentRun.stderr.trim(),
+        ].filter(Boolean).join('\n\n'));
+      }
+    } else {
+      const gradeCommand = buildDockerGradeCommand({
+        image,
+        caseDir: prepared.caseDir,
+        workDir: prepared.workDir,
+        resultsDir: prepared.resultsDir,
+        envNames,
+      });
+      const gradeRun = await runShellCommand(gradeCommand, { cwd: repoRoot });
+
+      if (gradeRun.exitCode !== 0 && !existsSync(prepared.resultPath)) {
+        throw new Error([
+          'Docker grade run failed',
+          gradeRun.stdout.trim(),
+          gradeRun.stderr.trim(),
+        ].filter(Boolean).join('\n\n'));
+      }
+    }
+
+    return copyWorkspaceIfRequested(prepared, options.keepWorkspace);
+  } finally {
+    await removeContainer(containerName, repoRoot);
+    await Promise.all(mcpServiceContainers.map((name) => removeContainer(name, repoRoot)));
+    // The network can only be removed once its containers are gone.
+    await removeDockerNetwork(networkName, repoRoot);
+    prepared.cleanup();
+  }
+}
diff --git a/src/workbench/index.ts b/src/workbench/index.ts
new file mode 100644
index 0000000..6376574
--- /dev/null
+++ b/src/workbench/index.ts
@@ -0,0 +1,15 @@
+export * from './types.js';
+export * from './case-loader.js';
+export * from './process.js';
+export * from './check-runner.js';
+export * from './trace.js';
+export * from './models.js';
+export * from './mcp/index.js';
+export * from './pi-agent.js';
+export * from './docker-runner.js';
+export * from './run-case.js';
+export * from './suite-loader.js';
+export * from './run-suite.js';
+export * from './container-runner.js';
+export * from './trials.js';
+export * from './metrics.js';
diff --git a/src/workbench/mcp/config.ts b/src/workbench/mcp/config.ts
new file mode 100644
index 0000000..33945db
--- /dev/null
+++ b/src/workbench/mcp/config.ts
@@ -0,0 +1,33 @@
+import { chmodSync, mkdirSync, writeFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+import type { ResolvedWorkbenchCase } from '../types.js';
+
+/**
+ * Location of the mcporter config file as referenced from inside the
+ * container. The host-side file is written as `mcporter.json` in the work
+ * directory (presumably mounted at `/work` — confirm against the Docker
+ * command builders).
+ */
+export const MCPORTER_CONFIG_CONTAINER_PATH = '/work/mcporter.json';
+
+/**
+ * Writes `mcporter.json` (and the `bin/mcp` wrapper script) into the work
+ * directory when the case declares MCP servers.
+ *
+ * @returns The host path of the written config, or `undefined` when the case
+ *   has no MCP servers and nothing was written.
+ */
+export function writeWorkbenchMcpConfig(source: ResolvedWorkbenchCase, workDir: string): string | undefined {
+  const { mcpServers } = source;
+  const hasServers = Object.keys(mcpServers).length > 0;
+  if (!hasServers) {
+    return undefined;
+  }
+
+  const configPath = join(workDir, 'mcporter.json');
+  const configJson = JSON.stringify({ imports: [], mcpServers }, null, 2);
+  writeFileSync(configPath, `${configJson}\n`, 'utf-8');
+  writeWorkbenchMcpCommand(workDir);
+  return configPath;
+}
+
+/**
+ * Writes an executable `bin/mcp` shell wrapper into the work directory.
+ *
+ * The wrapper defaults `MCPORTER_CONFIG` to `/work/mcporter.json` and execs
+ * the bundled `mcporter` binary with that config and `/work` as its root,
+ * forwarding all arguments.
+ */
+function writeWorkbenchMcpCommand(workDir: string): void {
+  const binDir = join(workDir, 'bin');
+  const wrapperPath = join(binDir, 'mcp');
+  const script =
+    '#!/bin/sh\n'
+    + 'export MCPORTER_CONFIG="${MCPORTER_CONFIG:-/work/mcporter.json}"\n'
+    + 'exec /app/node_modules/.bin/mcporter --config "$MCPORTER_CONFIG" --root /work "$@"\n';
+
+  mkdirSync(binDir, { recursive: true });
+  writeFileSync(wrapperPath, script, 'utf-8');
+  chmodSync(wrapperPath, 0o755);
+}
diff --git a/src/workbench/mcp/index.ts b/src/workbench/mcp/index.ts
new file mode 100644
index 0000000..3b3c5bf
--- /dev/null
+++ b/src/workbench/mcp/index.ts
@@ -0,0 +1 @@
+export * from './config.js';
diff --git a/src/workbench/metrics.ts b/src/workbench/metrics.ts
new file mode 100644
index 0000000..6c37c67
--- /dev/null
+++ b/src/workbench/metrics.ts
@@ -0,0 +1,135 @@
+import type {
+  WorkbenchMetrics,
+  WorkbenchResult,
+  WorkbenchTrace,
+  WorkbenchTraceEntry,
+  WorkbenchTrialSummaryFile,
+} from './types.js';
+
+/** Creates a fresh metrics object with every counter, token total and cost total set to zero. */
+function emptyMetrics(): WorkbenchMetrics {
+  // Built per call so `tokens` and `cost` never share an object.
+  const zeroBreakdown = () => ({
+    input: 0,
+    output: 0,
+    cacheRead: 0,
+    cacheWrite: 0,
+    total: 0,
+  });
+
+  return {
+    durationMs: 0,
+    turns: 0,
+    toolCalls: 0,
+    toolResults: 0,
+    bashCalls: 0,
+    readCalls: 0,
+    writeCalls: 0,
+    editCalls: 0,
+    tokens: zeroBreakdown(),
+    cost: zeroBreakdown(),
+  };
+}
+
+/**
+ * Aggregates a trace into run metrics.
+ *
+ * - `durationMs` is `endedAt - startedAt`, clamped at zero, or `0` when either timestamp does not parse.
+ * - Each `message` entry counts as a turn, contributes its usage, and (when it
+ *   has a string `stopReason`) overwrites `metrics.stopReason`.
+ * - Each `tool_result` entry counts as a tool result.
+ * - Every other entry counts as a tool call, with per-tool counters for
+ *   `bash`, `read`, `write` and `edit`.
+ */
+export function buildWorkbenchMetrics(trace: WorkbenchTrace): WorkbenchMetrics {
+  const metrics = emptyMetrics();
+
+  const startMs = Date.parse(trace.startedAt);
+  const endMs = Date.parse(trace.endedAt);
+  if (Number.isFinite(startMs) && Number.isFinite(endMs)) {
+    metrics.durationMs = Math.max(0, endMs - startMs);
+  } else {
+    metrics.durationMs = 0;
+  }
+
+  for (const entry of trace.entries) {
+    switch (entry.type) {
+      case 'message':
+        metrics.turns += 1;
+        if (typeof entry.stopReason === 'string') {
+          metrics.stopReason = entry.stopReason;
+        }
+        addUsage(metrics, entry.usage);
+        break;
+      case 'tool_result':
+        metrics.toolResults += 1;
+        break;
+      default:
+        metrics.toolCalls += 1;
+        if (entry.name === 'bash') {
+          metrics.bashCalls += 1;
+        } else if (entry.name === 'read') {
+          metrics.readCalls += 1;
+        } else if (entry.name === 'write') {
+          metrics.writeCalls += 1;
+        } else if (entry.name === 'edit') {
+          metrics.editCalls += 1;
+        }
+        break;
+    }
+  }
+
+  return metrics;
+}
+
+/**
+ * Condenses a trial's trace and result into a summary.
+ *
+ * The summary carries the text, stop reason and error message of the last
+ * assistant message in the trace, the names of graders that did not pass, a
+ * copy of the result's evidence, every bash command the agent issued, and the
+ * metrics (taken from the result when present, otherwise computed from the trace).
+ */
+export function buildTrialSummary(params: {
+  trace: WorkbenchTrace;
+  result: WorkbenchResult;
+}): WorkbenchTrialSummaryFile {
+  const { trace, result } = params;
+
+  const failedGraders = result.graders
+    ?.filter((grader) => !grader.pass)
+    .map((grader) => grader.name) ?? [];
+  const metrics = result.metrics ?? buildWorkbenchMetrics(trace);
+
+  const isAssistantMessage = (
+    entry: WorkbenchTraceEntry,
+  ): entry is Extract<WorkbenchTraceEntry, { type: 'message' }> =>
+    entry.type === 'message' && entry.role === 'assistant';
+  // Search a reversed copy so the trace itself is left untouched.
+  const lastAssistantMessage = [...trace.entries].reverse().find(isAssistantMessage);
+  const lastStopReason = lastAssistantMessage?.stopReason;
+
+  return {
+    finalAssistantMessage: lastAssistantMessage?.text,
+    failedGraders,
+    evidence: [...result.evidence],
+    bashCommands: extractBashCommands(trace),
+    stopReason: typeof lastStopReason === 'string' ? lastStopReason : undefined,
+    errorMessage: lastAssistantMessage?.errorMessage,
+    metrics,
+  };
+}
+
+/**
+ * Collects, in trace order, the `command` string of every `bash` tool call.
+ * Calls whose arguments are not a plain object, or whose `command` is not a string, are skipped.
+ */
+function extractBashCommands(trace: WorkbenchTrace): string[] {
+  const commands: string[] = [];
+
+  for (const entry of trace.entries) {
+    if (entry.type !== 'tool_call' || entry.name !== 'bash') {
+      continue;
+    }
+
+    const callArgs = entry.arguments;
+    const isPlainObject = Boolean(callArgs) && typeof callArgs === 'object' && !Array.isArray(callArgs);
+    if (!isPlainObject) {
+      continue;
+    }
+
+    const candidate = (callArgs as Record<string, unknown>).command;
+    if (typeof candidate === 'string') {
+      commands.push(candidate);
+    }
+  }
+
+  return commands;
+}
+
+/**
+ * Adds one message's usage record to the running token and cost totals.
+ *
+ * `usage` is untrusted: anything that is not a plain object is ignored, and
+ * each field contributes only when it is a finite number. The token total is
+ * read from `totalTokens`; costs come from the nested `cost` object when present.
+ */
+function addUsage(metrics: WorkbenchMetrics, usage: unknown): void {
+  const isPlainObject = (value: unknown): boolean =>
+    Boolean(value) && typeof value === 'object' && !Array.isArray(value);
+
+  if (!isPlainObject(usage)) {
+    return;
+  }
+  const usageFields = usage as Record<string, unknown>;
+  const tokenTotals = metrics.tokens;
+  tokenTotals.input += readNumber(usageFields.input);
+  tokenTotals.output += readNumber(usageFields.output);
+  tokenTotals.cacheRead += readNumber(usageFields.cacheRead);
+  tokenTotals.cacheWrite += readNumber(usageFields.cacheWrite);
+  tokenTotals.total += readNumber(usageFields.totalTokens);
+
+  if (!isPlainObject(usageFields.cost)) {
+    return;
+  }
+  const costFields = usageFields.cost as Record<string, unknown>;
+  const costTotals = metrics.cost;
+  costTotals.input += readNumber(costFields.input);
+  costTotals.output += readNumber(costFields.output);
+  costTotals.cacheRead += readNumber(costFields.cacheRead);
+  costTotals.cacheWrite += readNumber(costFields.cacheWrite);
+  costTotals.total += readNumber(costFields.total);
+}
+
+/** Returns `value` when it is a finite number, otherwise `0`. */
+function readNumber(value: unknown): number {
+  if (typeof value !== 'number') {
+    return 0;
+  }
+  return Number.isFinite(value) ? value : 0;
+}
diff --git a/src/workbench/models.ts b/src/workbench/models.ts
new file mode 100644
index 0000000..96f3ef8
--- /dev/null
+++ b/src/workbench/models.ts
@@ -0,0 +1,28 @@
+/**
+ * Validates that a model ref targets OpenRouter.
+ *
+ * @returns The ref with surrounding whitespace removed.
+ * @throws Error when the trimmed ref does not start with `openrouter/`.
+ */
+export function ensureOpenRouterModelRef(modelRef: string): string {
+  const normalized = modelRef.trim();
+  if (normalized.startsWith('openrouter/')) {
+    return normalized;
+  }
+  throw new Error(`Workbench only supports OpenRouter model refs, got: ${modelRef}`);
+}
+
+/**
+ * Parses a comma-separated list of OpenRouter model refs.
+ *
+ * @returns The trimmed refs, in input order.
+ * @throws Error when the list holds no non-empty item, when any single item
+ *   is empty, or when an item is not an OpenRouter ref.
+ */
+export function parseModelList(raw: string): string[] {
+  const items = raw.split(',').map((item) => item.trim());
+  // `split` always yields at least one item, so "no models" means "all items blank".
+  const allBlank = items.every((item) => item === '');
+  if (allBlank) {
+    throw new Error('Expected at least one model');
+  }
+
+  return items.map((item, index) => {
+    if (item === '') {
+      throw new Error(`Model list item at index ${index} must be non-empty`);
+    }
+    return ensureOpenRouterModelRef(item);
+  });
+}
+
+/**
+ * Turns a model ref into a slug: every run of characters outside
+ * `A-Z a-z 0-9 . _ -` becomes a single `-`, and leading/trailing dashes are removed.
+ */
+export function slugModelRef(modelRef: string): string {
+  const dashed = modelRef.trim().replace(/[^A-Za-z0-9._-]+/g, '-');
+  return dashed.replace(/^-+|-+$/g, '');
+}
diff --git a/src/workbench/pi-agent.ts b/src/workbench/pi-agent.ts
new file mode 100644
index 0000000..60c99bb
--- /dev/null
+++ b/src/workbench/pi-agent.ts
@@ -0,0 +1,156 @@
+import {
+  createAgentSession,
+  createBashTool,
+  createEditTool,
+  createFindTool,
+  createGrepTool,
+  createLsTool,
+  createReadTool,
+  createWriteTool,
+  AuthStorage,
+  DefaultResourceLoader,
+  ModelRegistry,
+  SessionManager,
+  type ResourceLoader,
+} from '@mariozechner/pi-coding-agent';
+import type { AgentTool } from '@mariozechner/pi-agent-core';
+import { getModel, type Api, type Model } from '@mariozechner/pi-ai';
+import { resolve } from 'node:path';
+
+import { buildAgentSystemPrompt } from './sandbox.js';
+
+/**
+ * Returns a shallow copy of the given environment.
+ *
+ * NOTE(review): despite the name, no variable is removed here — every entry
+ * is passed through. Confirm whether filtering was intended.
+ */
+export function stripSensitiveEnv(env: NodeJS.ProcessEnv): NodeJS.ProcessEnv {
+  const copy: NodeJS.ProcessEnv = Object.assign({}, env);
+  return copy;
+}
+
+/**
+ * Creates the tool set handed to the agent, all rooted at `cwd`:
+ * read, bash, edit, write, grep, find and ls (in that order).
+ *
+ * The bash tool's spawn hook routes the child environment through `stripSensitiveEnv`.
+ */
+// eslint-disable-next-line @typescript-eslint/no-explicit-any -- tool parameter schemas differ per tool
+export function createWorkbenchPiTools(cwd: string): AgentTool<any>[] {
+  const readTool = createReadTool(cwd);
+  const bashTool = createBashTool(cwd, {
+    spawnHook: (context) => ({
+      ...context,
+      env: stripSensitiveEnv(context.env),
+    }),
+  });
+  const editTool = createEditTool(cwd);
+  const writeTool = createWriteTool(cwd);
+  const grepTool = createGrepTool(cwd);
+  const findTool = createFindTool(cwd);
+  const lsTool = createLsTool(cwd);
+
+  return [readTool, bashTool, editTool, writeTool, grepTool, findTool, lsTool];
+}
+
+/**
+ * Creates and loads the resource loader for an agent session.
+ *
+ * The appended system prompt is built from, in order: the agent system
+ * prompt, the MCP usage hint (only when an MCP config path is given) and the
+ * caller's `appendSystemPrompt`; blank parts are dropped and the rest joined
+ * with blank lines. The loader is created with extensions and skills disabled
+ * and `cwd` as an additional skill path (how those two skill options interact
+ * is defined by the loader — confirm against its documentation).
+ *
+ * @returns The loader, after `reload()` has completed.
+ */
+export async function createWorkbenchPiResourceLoader(params: {
+  cwd: string;
+  appendSystemPrompt?: string;
+  mcpConfigPath?: string;
+}): Promise<ResourceLoader> {
+  const cwd = resolve(params.cwd);
+
+  const promptParts: Array<string | undefined> = [
+    buildAgentSystemPrompt(),
+    buildMcpSystemPrompt(params.mcpConfigPath),
+    params.appendSystemPrompt,
+  ];
+  const nonBlankParts = promptParts.filter(
+    (part): part is string => typeof part === 'string' && part.trim().length > 0,
+  );
+  const appendSystemPrompt = nonBlankParts.join('\n\n');
+
+  const loader = new DefaultResourceLoader({
+    cwd,
+    noExtensions: true,
+    noSkills: true,
+    additionalSkillPaths: [cwd],
+    appendSystemPrompt,
+  });
+  await loader.reload();
+
+  return loader;
+}
+
+/**
+ * Creates an in-memory agent session for an OpenRouter model.
+ *
+ * The API key is read from the environment variable named by `apiKeyEnv`
+ * (default `OPENROUTER_API_KEY`) and registered as a runtime key when set.
+ * The model is looked up in the model registry, then via `getModel`, and
+ * finally synthesized with generic OpenRouter defaults.
+ *
+ * @param params.thinkingLevel Defaults to `medium`.
+ * @throws Error when the model ref is malformed or not an OpenRouter ref, when
+ *   no model can be resolved, or when credentials cannot be obtained.
+ */
+export async function createWorkbenchPiSession(params: {
+  cwd: string;
+  modelRef: string;
+  apiKeyEnv?: string;
+  appendSystemPrompt?: string;
+  mcpConfigPath?: string;
+  thinkingLevel?: 'off' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh';
+}) {
+  const { cwd, modelRef, appendSystemPrompt, mcpConfigPath } = params;
+
+  const { provider, model } = parseModelRef(modelRef);
+  if (provider !== 'openrouter') {
+    throw new Error(`Workbench only supports OpenRouter model refs, got: ${modelRef}`);
+  }
+
+  const authStorage = AuthStorage.create();
+  const apiKey = process.env[params.apiKeyEnv ?? 'OPENROUTER_API_KEY'];
+  if (apiKey) {
+    authStorage.setRuntimeApiKey('openrouter' as never, apiKey);
+  }
+
+  const modelRegistry = ModelRegistry.create(authStorage);
+  const resolvedModel = modelRegistry.find(provider, model)
+    ?? getModel(provider as never, model)
+    ?? synthesizeOpenRouterModel(provider, model);
+  if (!resolvedModel) {
+    throw new Error(`Could not resolve Pi model ${provider}/${model}`);
+  }
+
+  const credentials = await modelRegistry.getApiKeyAndHeaders(resolvedModel);
+  if (!credentials.ok) {
+    throw new Error(credentials.error);
+  }
+
+  const resourceLoader = await createWorkbenchPiResourceLoader({
+    cwd,
+    appendSystemPrompt,
+    mcpConfigPath,
+  });
+
+  return createAgentSession({
+    cwd,
+    model: resolvedModel,
+    thinkingLevel: params.thinkingLevel ?? 'medium',
+    authStorage,
+    modelRegistry,
+    resourceLoader,
+    tools: createWorkbenchPiTools(cwd),
+    sessionManager: SessionManager.inMemory(),
+  });
+}
+
+/**
+ * Returns the system-prompt section describing the `mcp` command, or
+ * `undefined` when no MCP config path is set.
+ */
+function buildMcpSystemPrompt(mcpConfigPath: string | undefined): string | undefined {
+  if (mcpConfigPath) {
+    const lines = [
+      'Additional command:',
+      '- `mcp` is available on PATH for configured MCP servers.',
+      '- Run `mcp list <server> --schema` to inspect available tools when needed.',
+      '- Run `mcp call <server.tool> key=value` to call a tool from bash.',
+    ];
+    return lines.join('\n');
+  }
+  return undefined;
+}
+
+/**
+ * Splits a `provider/model` ref at its first slash; the model part may itself contain slashes.
+ * @throws Error when the ref has no slash, an empty provider, or ends with its first slash.
+ */
+function parseModelRef(modelRef: string): { provider: string; model: string } {
+  const separatorAt = modelRef.indexOf('/');
+  const hasProvider = separatorAt > 0;
+  const firstSlashIsLast = separatorAt === modelRef.length - 1;
+  if (!hasProvider || firstSlashIsLast) {
+    throw new Error(`Invalid model ref: ${modelRef}`);
+  }
+
+  const provider = modelRef.slice(0, separatorAt);
+  const model = modelRef.slice(separatorAt + 1);
+  return { provider, model };
+}
+
+/**
+ * Builds a generic model description for an OpenRouter model.
+ *
+ * The capability values (text-only input, no reasoning, zero cost, 128000
+ * token context, 16384 max output tokens) are fixed defaults, not looked up
+ * per model.
+ *
+ * @returns The synthesized model, or `undefined` for any provider other than `openrouter`.
+ */
+function synthesizeOpenRouterModel(provider: string, modelName: string): Model<Api> | undefined {
+  if (provider === 'openrouter') {
+    const zeroCost = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 };
+    return {
+      id: modelName,
+      name: modelName,
+      api: 'openai-completions' as const,
+      provider: 'openrouter' as const,
+      baseUrl: 'https://openrouter.ai/api/v1',
+      reasoning: false,
+      input: ['text'],
+      cost: zeroCost,
+      contextWindow: 128000,
+      maxTokens: 16384,
+    };
+  }
+  return undefined;
+}
diff --git a/src/workbench/process.ts b/src/workbench/process.ts
new file mode 100644
index 0000000..c039ee3
--- /dev/null
+++ b/src/workbench/process.ts
@@ -0,0 +1,84 @@
+import { spawn } from 'node:child_process';
+
+/** Outcome of a shell command run through `runShellCommand`. */
+export interface ProcessResult {
+  /**
+   * Exit code reported when the process closed. It is `1` when the process
+   * could not be spawned and, after a timeout, `124` unless the process
+   * reported its own non-zero code.
+   */
+  exitCode: number | null;
+  /** Everything the process wrote to stdout. */
+  stdout: string;
+  /** Everything the process wrote to stderr, plus the spawn error message if spawning failed. */
+  stderr: string;
+  /** Present (and `true`) only when the timeout fired and the process was killed. */
+  timedOut?: boolean;
+}
+
+/**
+ * Runs a command line through the platform shell and captures its output.
+ *
+ * On Windows the command runs via `cmd.exe /d /s /c`; elsewhere via
+ * `/bin/sh -lc`. The promise never rejects: spawn failures are reported as
+ * exit code `1` with the error message appended to `stderr`.
+ *
+ * Output is decoded as UTF-8 by the streams themselves, so a multi-byte
+ * character that straddles two chunks is not corrupted.
+ *
+ * @param command Shell command line to execute.
+ * @param opts.cwd Working directory for the command.
+ * @param opts.env Extra environment variables, layered over `process.env`.
+ * @param opts.timeoutSeconds When a positive number, the shell process is sent
+ *   `SIGKILL` after this many seconds and the result is marked `timedOut`.
+ * @returns Exit code, captured stdout/stderr and the timeout marker. After a
+ *   timeout the exit code is the process's own non-zero code when it has one, otherwise `124`.
+ */
+export async function runShellCommand(
+  command: string,
+  opts: { cwd: string; env?: NodeJS.ProcessEnv; timeoutSeconds?: number },
+): Promise<ProcessResult> {
+  return await new Promise<ProcessResult>((resolve) => {
+    const isWindows = process.platform === 'win32';
+    const executable = isWindows ? 'cmd.exe' : '/bin/sh';
+    const args = isWindows ? ['/d', '/s', '/c', command] : ['-lc', command];
+
+    const child = spawn(executable, args, {
+      cwd: opts.cwd,
+      env: { ...process.env, ...opts.env },
+      stdio: ['ignore', 'pipe', 'pipe'],
+    });
+
+    let stdout = '';
+    let stderr = '';
+    let timedOut = false;
+    let resolved = false;
+
+    const timeoutMs =
+      typeof opts.timeoutSeconds === 'number' && opts.timeoutSeconds > 0
+        ? opts.timeoutSeconds * 1000
+        : undefined;
+
+    // NOTE: only the shell process itself is signalled; the result is delivered
+    // once the child's `close` event fires.
+    const timeout =
+      timeoutMs === undefined
+        ? undefined
+        : setTimeout(() => {
+            timedOut = true;
+            child.kill('SIGKILL');
+          }, timeoutMs);
+
+    // Both `error` and `close` can fire for one child; only the first one settles the promise.
+    const finish = (exitCode: number | null) => {
+      if (resolved) {
+        return;
+      }
+
+      resolved = true;
+      if (timeout !== undefined) {
+        clearTimeout(timeout);
+      }
+
+      const normalizedExitCode = timedOut
+        ? typeof exitCode === 'number' && exitCode !== 0
+          ? exitCode
+          : 124
+        : exitCode;
+
+      resolve({
+        exitCode: normalizedExitCode,
+        stdout,
+        stderr,
+        ...(timedOut ? { timedOut: true } : {}),
+      });
+    };
+
+    // Let the streams decode UTF-8 statefully instead of stringifying each
+    // Buffer chunk on its own, which would garble characters split across chunks.
+    child.stdout?.setEncoding('utf8');
+    child.stderr?.setEncoding('utf8');
+
+    child.stdout?.on('data', (chunk: string) => {
+      stdout += chunk;
+    });
+
+    child.stderr?.on('data', (chunk: string) => {
+      stderr += chunk;
+    });
+
+    child.on('error', (error) => {
+      stderr += `${error.message}\n`;
+      finish(1);
+    });
+
+    child.on('close', (code) => {
+      finish(code);
+    });
+  });
+}
diff --git a/src/workbench/run-case.ts b/src/workbench/run-case.ts
new file mode 100644
index 0000000..672ddb3
--- /dev/null
+++ b/src/workbench/run-case.ts
@@ -0,0 +1,222 @@
+import { mkdirSync } from 'node:fs';
+import { dirname, join, relative, resolve } from 'node:path';
+
+import { loadWorkbenchCase } from './case-loader.js';
+import { getFlag, positionals } from './cli-args.js';
+import { runDockerWorkbenchCase } from './docker-runner.js';
+import type { DockerWorkbenchRunResult, RunDockerWorkbenchCaseOptions } from './docker-runner.js';
+import { ensureOpenRouterModelRef, parseModelList, slugModelRef } from './models.js';
+import { aggregateTrials, formatTrialNumber, parseTrialsFlag, summarizeTrialAggregates } from './trials.js';
+import type { RunCaseAggregateResultFile, WorkbenchModelAggregateResult, WorkbenchTrialResultRef } from './types.js';
+import { readWorkbenchResultFile, timestampSlug, writeJsonFile } from './utils.js';
+
+/** Inputs for `runWorkbenchCase`. */
+export interface RunWorkbenchCaseParams {
+  /** Path to the case file to run. */
+  casePath: string;
+  /** Root directory for results; defaults to `.results` beside the case file. */
+  outDir?: string;
+  /** Single OpenRouter model ref overriding the case's model. */
+  model?: string;
+  /** OpenRouter model refs to run as a matrix; a non-empty list selects matrix mode. */
+  models?: string[];
+  /** Docker image to use, forwarded to the Docker runner. */
+  image?: string;
+  /** Forwarded to the Docker runner to keep the workspace after the run. */
+  keepWorkspace?: boolean;
+  /** Trials per model (default 1); a value above 1 selects matrix mode. */
+  trials?: number;
+  /** Maximum number of trials run in parallel in matrix mode (default 1). */
+  concurrency?: number;
+}
+
+/** Injectable dependencies for `runWorkbenchCase`, letting callers replace the runner and the clock. */
+export interface RunWorkbenchCaseDeps {
+  /** Replacement for the Docker runner; defaults to `runDockerWorkbenchCase`. */
+  runDockerWorkbenchCase?: (options: RunDockerWorkbenchCaseOptions) => Promise<DockerWorkbenchRunResult>;
+  /** Timestamp used to name the results directory in matrix mode; defaults to the current time. */
+  now?: Date;
+}
+
+/**
+ * Returns the timestamped results directory for a matrix run: a folder named
+ * after `now` under `outDir`, or under `.results` beside the case file when
+ * no `outDir` is given.
+ */
+function runResultsDir(params: RunWorkbenchCaseParams, now: Date): string {
+  const defaultRoot = join(dirname(resolve(params.casePath)), '.results');
+  const resultsRoot = resolve(params.outDir ?? defaultRoot);
+  return join(resultsRoot, timestampSlug(now));
+}
+
+/**
+ * Parses the `--concurrency` flag value.
+ *
+ * @returns `undefined` when the flag is absent, otherwise the parsed positive integer.
+ * @throws Error when the value is not a positive integer.
+ */
+function parseConcurrencyFlag(value: string | undefined): number | undefined {
+  if (value === undefined) {
+    return undefined;
+  }
+
+  const concurrency = Number(value);
+  const isPositiveInteger = Number.isInteger(concurrency) && concurrency > 0;
+  if (isPositiveInteger) {
+    return concurrency;
+  }
+  throw new Error(`--concurrency must be a positive integer, got: ${value}`);
+}
+
+/**
+ * Maps `items` through an async `worker`, running at most `concurrency`
+ * workers at a time and returning the results in input order.
+ *
+ * Every item is processed, including items whose value is `undefined`. A
+ * `concurrency` below 1 (or NaN) is treated as 1 so that work is never
+ * silently skipped.
+ *
+ * @param items Inputs to process.
+ * @param concurrency Maximum number of in-flight `worker` calls.
+ * @param worker Async function applied to each item.
+ * @returns Results positioned at the index of their input item.
+ */
+async function mapWithConcurrency<T, R>(
+  items: T[],
+  concurrency: number,
+  worker: (item: T) => Promise<R>,
+): Promise<R[]> {
+  const results = new Array<R>(items.length);
+  const requested = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency);
+  const workerCount = Math.min(Math.max(1, requested), items.length);
+
+  // One iterator shared by all workers: each worker pulls the next unclaimed
+  // [index, item] pair when it finishes its current one.
+  const queue = items.entries();
+
+  await Promise.all(Array.from({ length: workerCount }, async () => {
+    for (const [index, item] of queue) {
+      results[index] = await worker(item);
+    }
+  }));
+
+  return results;
+}
+
+/** Names a trial's results folder as `<model-slug>--<formatted-trial-number>`. */
+function trialDirName(model: string, trial: number): string {
+  const modelSlug = slugModelRef(model);
+  const trialLabel = formatTrialNumber(trial);
+  return [modelSlug, trialLabel].join('--');
+}
+
+/**
+ * Runs a case for every model × trial combination and writes an aggregate report.
+ *
+ * Each trial runs through the Docker runner into
+ * `<resultsDir>/trials/<model-slug>--<trial>`, at most `concurrency` at a
+ * time. Per-model aggregates and an overall summary are written to
+ * `<resultsDir>/run-result.json`, with trial paths stored relative to the
+ * results directory. Sets `process.exitCode = 1` when any trial failed.
+ */
+async function runWorkbenchCaseMatrix(
+  params: RunWorkbenchCaseParams & { models: string[] },
+  deps: RunWorkbenchCaseDeps,
+): Promise<void> {
+  const dockerRunner = deps.runDockerWorkbenchCase ?? runDockerWorkbenchCase;
+  const startedAt = new Date().toISOString();
+  const resultsDir = runResultsDir(params, deps.now ?? new Date());
+  const trialCount = params.trials ?? 1;
+  const workerLimit = params.concurrency && params.concurrency > 0
+    ? Math.floor(params.concurrency)
+    : 1;
+
+  mkdirSync(resultsDir, { recursive: true });
+
+  // Jobs are grouped by model, with trials numbered from 1.
+  const jobs: Array<{ model: string; trial: number }> = [];
+  for (const model of params.models) {
+    for (let trial = 1; trial <= trialCount; trial += 1) {
+      jobs.push({ model, trial });
+    }
+  }
+
+  const completedTrials = await mapWithConcurrency(jobs, workerLimit, async (job) => {
+    const run = await dockerRunner({
+      casePath: params.casePath,
+      resultsDir: join(resultsDir, 'trials', trialDirName(job.model, job.trial)),
+      model: job.model,
+      image: params.image,
+      keepWorkspace: params.keepWorkspace,
+    });
+    const { pass, score } = readWorkbenchResultFile(run.resultPath);
+    const trialResult: WorkbenchTrialResultRef = {
+      trial: job.trial,
+      pass,
+      score,
+      resultPath: relative(resultsDir, run.resultPath),
+      tracePath: relative(resultsDir, run.tracePath),
+      ...(run.summaryPath ? { summaryPath: relative(resultsDir, run.summaryPath) } : {}),
+    };
+
+    console.log(`${job.model} trial ${formatTrialNumber(job.trial)}: ${pass ? 'PASS' : 'FAIL'}`);
+    return { ...job, trialResult };
+  });
+
+  const results: WorkbenchModelAggregateResult[] = params.models.map((model) => {
+    const trials = completedTrials
+      .filter((completed) => completed.model === model)
+      .map((completed) => completed.trialResult)
+      .sort((left, right) => left.trial - right.trial);
+    const stats = aggregateTrials(trials);
+    return {
+      model,
+      totalTrials: stats.totalTrials,
+      passedTrials: stats.passedTrials,
+      failedTrials: stats.failedTrials,
+      trialPassRate: stats.trialPassRate,
+      meanScore: stats.meanScore,
+      passAtK: stats.passAtK,
+      passHatK: stats.passHatK,
+      trials,
+    };
+  });
+
+  const summary = summarizeTrialAggregates(results);
+  const report: RunCaseAggregateResultFile = {
+    name: 'run-case',
+    startedAt,
+    endedAt: new Date().toISOString(),
+    models: params.models,
+    summary,
+    results,
+  };
+  writeJsonFile(join(resultsDir, 'run-result.json'), report);
+
+  const anyFailed = summary.failedTrials > 0;
+  console.log(`Results: ${resultsDir}`);
+  console.log(`Grade: ${anyFailed ? 'FAIL' : 'PASS'}`);
+  if (anyFailed) {
+    process.exitCode = 1;
+  }
+}
+
+/**
+ * Runs a workbench case and prints its grade.
+ *
+ * When a non-empty `models` list is given or more than one trial is requested,
+ * the run is delegated to the matrix runner (using the listed models, else the
+ * single `model`, else the model declared in the case file). Otherwise a
+ * single Docker run is performed and its evidence is printed line by line.
+ * Sets `process.exitCode = 1` when the single run did not pass.
+ *
+ * @throws Error when `model` or any entry of `models` is not an OpenRouter ref.
+ */
+export async function runWorkbenchCase(
+  params: RunWorkbenchCaseParams,
+  deps: RunWorkbenchCaseDeps = {},
+): Promise<void> {
+  const model = params.model ? ensureOpenRouterModelRef(params.model) : undefined;
+  const models = params.models?.map((modelRef) => ensureOpenRouterModelRef(modelRef));
+
+  const wantsMatrix = (models && models.length > 0) || (params.trials ?? 1) > 1;
+  if (wantsMatrix) {
+    const matrixModels = models && models.length > 0
+      ? models
+      : [model ?? loadWorkbenchCase(params.casePath).model];
+    await runWorkbenchCaseMatrix({ ...params, model, models: matrixModels }, deps);
+    return;
+  }
+
+  // Single-run path: `models` is absent or empty here, so only `model` can apply.
+  const dockerRunner = deps.runDockerWorkbenchCase ?? runDockerWorkbenchCase;
+  const run = await dockerRunner({
+    casePath: params.casePath,
+    outDir: params.outDir,
+    model,
+    image: params.image,
+    keepWorkspace: params.keepWorkspace,
+  });
+
+  const { pass, evidence } = readWorkbenchResultFile(run.resultPath);
+  console.log(`Results: ${run.resultsDir}`);
+  console.log(`Grade: ${pass ? 'PASS' : 'FAIL'}`);
+
+  if (evidence.length === 0) {
+    console.log('- (no evidence)');
+  } else {
+    evidence.forEach((line) => {
+      console.log(`- ${line}`);
+    });
+  }
+
+  if (!pass) {
+    process.exitCode = 1;
+  }
+}
+
+/**
+ * CLI entry point for `run-case`: parses the arguments and runs the case.
+ *
+ * The first positional argument is the case path. Recognized flags are
+ * `--out`, `--model`, `--models`, `--trials`, `--concurrency`, `--image` and
+ * the boolean `--keep-workspace`.
+ *
+ * @throws Error when the case path is missing or a flag value is invalid.
+ */
+export async function runWorkbenchCaseFromCli(args: string[]): Promise<void> {
+  const [caseArg] = positionals(args, {
+    valueFlags: ['--out', '--model', '--models', '--image', '--trials', '--concurrency'],
+    booleanFlags: ['--keep-workspace'],
+  });
+  if (!caseArg) {
+    throw new Error('Missing case path. Usage: skill-optimizer run-case <case.yml> [--out <dir>] [--model <openrouter/...>] [--models <openrouter/...,openrouter/...>] [--trials <n>] [--concurrency <n>] [--image <name>] [--keep-workspace]');
+  }
+
+  const outFlag = getFlag(args, '--out');
+  const modelFlag = getFlag(args, '--model');
+  const modelsFlag = getFlag(args, '--models');
+  const image = getFlag(args, '--image');
+  const trials = parseTrialsFlag(getFlag(args, '--trials'));
+  const concurrency = parseConcurrencyFlag(getFlag(args, '--concurrency'));
+  const keepWorkspace = args.includes('--keep-workspace');
+
+  await runWorkbenchCase({
+    casePath: resolve(caseArg),
+    outDir: outFlag ? resolve(outFlag) : undefined,
+    model: modelFlag ? ensureOpenRouterModelRef(modelFlag) : undefined,
+    models: modelsFlag ? parseModelList(modelsFlag) : undefined,
+    trials,
+    concurrency,
+    image,
+    keepWorkspace,
+  });
+}
diff --git a/src/workbench/run-suite.ts b/src/workbench/run-suite.ts
new file mode 100644
index 0000000..6003523
--- /dev/null
+++ b/src/workbench/run-suite.ts
@@ -0,0 +1,194 @@
+import { mkdirSync } from 'node:fs';
+import { basename, dirname, extname, join, relative, resolve } from 'node:path';
+
+import { getFlag, positionals } from './cli-args.js';
+import { runDockerWorkbenchCase } from './docker-runner.js';
+import type { DockerWorkbenchRunResult, RunDockerWorkbenchCaseOptions } from './docker-runner.js';
+import { slugModelRef } from './models.js';
+import { loadWorkbenchSuite } from './suite-loader.js';
+import { aggregateTrials, formatTrialNumber, parseTrialsFlag, summarizeTrialAggregates } from './trials.js';
+import type { RunSuiteAggregateResultFile, WorkbenchCaseModelAggregateResult, WorkbenchTrialResultRef } from './types.js';
+import { readWorkbenchResultFile, slugPathSegment, timestampSlug, writeJsonFile } from './utils.js';
+
+/** Options accepted by `runWorkbenchSuite`. */
+export interface RunWorkbenchSuiteParams {
+  /** Path of the suite file handed to `loadWorkbenchSuite`. */
+  suitePath: string;
+  /** Base output directory; defaults to `.results` inside the suite's directory. A timestamped subdirectory is created beneath it. */
+  outDir?: string;
+  /** Forwarded unchanged to the Docker runner as `image`. */
+  image?: string;
+  /** Forwarded unchanged to the Docker runner as `keepWorkspace`. */
+  keepWorkspace?: boolean;
+  /** Number of trials per case/model pair; defaults to 1. */
+  trials?: number;
+  /** Maximum number of trials in flight at once; defaults to 1 and must be a positive integer when given. */
+  concurrency?: number;
+}
+
+/** Injectable collaborators for `runWorkbenchSuite`. */
+export interface RunWorkbenchSuiteDeps {
+  /** Replacement for the real `runDockerWorkbenchCase`; used when provided. */
+  runDockerWorkbenchCase?: (options: RunDockerWorkbenchCaseOptions) => Promise<DockerWorkbenchRunResult>;
+  /** Clock value used to name the timestamped results directory; defaults to the current time. */
+  now?: Date;
+}
+
+/**
+ * Derives a slug for a case from the path of its config file.
+ *
+ * The file name without its extension is used, except when that stem is
+ * exactly `case`, in which case the containing directory's name is used.
+ *
+ * @param casePath Path of the case file.
+ * @returns The chosen name passed through `slugPathSegment`.
+ */
+function caseSlugFromPath(casePath: string): string {
+  const fileName = basename(casePath);
+  const extension = extname(fileName);
+  const stem = fileName.slice(0, fileName.length - extension.length);
+  if (stem === 'case') {
+    // A generic `case.*` file is identified by its folder instead.
+    return slugPathSegment(basename(dirname(casePath)));
+  }
+  return slugPathSegment(stem);
+}
+
+/**
+ * Parses the raw `--concurrency` flag value.
+ *
+ * @param value Raw flag text, or `undefined` when the flag was not supplied.
+ * @returns `undefined` when the flag is absent, otherwise the validated number.
+ * @throws Error when the text does not convert to a positive integer.
+ */
+function parseConcurrencyFlag(value: string | undefined): number | undefined {
+  return value === undefined
+    ? undefined
+    : validateConcurrency(Number(value), `--concurrency must be a positive integer, got: ${value}`);
+}
+
+/**
+ * Ensures a concurrency value is a positive integer.
+ *
+ * @param value Candidate concurrency.
+ * @param errorMessage Optional message to throw instead of the default one.
+ * @returns `value` unchanged when it is valid.
+ * @throws Error when `value` is NaN, infinite, fractional, zero or negative.
+ */
+function validateConcurrency(value: number, errorMessage?: string): number {
+  // Number.isInteger already rejects NaN and the infinities.
+  if (Number.isInteger(value) && value > 0) {
+    return value;
+  }
+  throw new Error(errorMessage ?? `Field "concurrency" must be a positive integer, got: ${String(value)}`);
+}
+
+/**
+ * Maps `items` through an async `worker`, running at most `concurrency`
+ * invocations at a time while keeping results in input order.
+ *
+ * A fixed pool of loops pulls the next unclaimed index from a shared counter,
+ * so a slow item does not hold up the items queued behind it.
+ *
+ * @param items Inputs to process; every element is processed, including
+ *   elements whose value is `undefined`.
+ * @param concurrency Maximum number of simultaneous `worker` calls; must be at
+ *   least 1 when `items` is non-empty. Fractional values are rounded down.
+ * @param worker Async function applied to each item.
+ * @returns Results positioned at the same index as their input.
+ * @throws RangeError when there is work to do and `concurrency` is NaN or below 1.
+ */
+async function mapWithConcurrency<T, R>(
+  items: T[],
+  concurrency: number,
+  worker: (item: T) => Promise<R>,
+): Promise<R[]> {
+  const results = new Array<R>(items.length);
+  if (items.length === 0) {
+    return results;
+  }
+  // Without this guard, NaN/0/negative values would start zero workers and
+  // hand back an array of holes as if every item had been processed.
+  if (!(concurrency >= 1)) {
+    throw new RangeError(`concurrency must be at least 1, got: ${String(concurrency)}`);
+  }
+
+  let nextIndex = 0;
+  const workerCount = Math.min(Math.floor(concurrency), items.length);
+
+  const drainQueue = async (): Promise<void> => {
+    while (nextIndex < items.length) {
+      // Claim the index synchronously, before the first await, so no two loops share an item.
+      const index = nextIndex;
+      nextIndex += 1;
+      // `index` is always in bounds here; the assertion only removes the
+      // `undefined` that noUncheckedIndexedAccess adds, so legitimately
+      // undefined items are still passed to the worker.
+      results[index] = await worker(items[index] as T);
+    }
+  };
+
+  await Promise.all(Array.from({ length: workerCount }, () => drainQueue()));
+
+  return results;
+}
+
+/**
+ * Builds the directory name for one trial as
+ * `<caseName>--<model slug>--<trial number formatted by formatTrialNumber>`.
+ */
+function trialDirName(caseName: string, model: string, trial: number): string {
+  const modelSlug = slugModelRef(model);
+  const trialLabel = formatTrialNumber(trial);
+  return `${caseName}--${modelSlug}--${trialLabel}`;
+}
+
+/**
+ * Runs every case of a workbench suite against every suite model, `trials`
+ * times each, and writes an aggregate `suite-result.json`.
+ *
+ * Each (case, model, trial) job is handed to the Docker runner with its own
+ * directory under `<resultsDir>/trials/`, with at most `concurrency` jobs in
+ * flight. One PASS/FAIL line is logged per trial, followed by the results
+ * directory and the overall grade. `process.exitCode` is set to 1 when any
+ * trial failed.
+ *
+ * @param params Suite path plus optional output directory, image, trial count
+ *   (default 1), concurrency (default 1) and keep-workspace flag.
+ * @param deps Optional replacement Docker runner and a fixed `now` used only
+ *   for the timestamped results directory name.
+ * @throws Error when the suite has no models, or when `trials` or
+ *   `concurrency` is not a positive integer.
+ */
+export async function runWorkbenchSuite(
+  params: RunWorkbenchSuiteParams,
+  deps: RunWorkbenchSuiteDeps = {},
+): Promise<void> {
+  const suite = loadWorkbenchSuite(params.suitePath);
+  const models = suite.models;
+  const trials = params.trials ?? 1;
+  if (models.length === 0) {
+    throw new Error('Workbench suite requires at least one model in suite.yml via the suite "models" field');
+  }
+  // A zero or negative count would schedule no jobs, leave failedTrials at 0
+  // and report "Grade: PASS" without running anything, so reject it up front
+  // the same way the --trials flag parser does.
+  if (!Number.isInteger(trials) || trials <= 0) {
+    throw new Error(`Field "trials" must be a positive integer, got: ${String(trials)}`);
+  }
+
+  const dockerRunner = deps.runDockerWorkbenchCase ?? runDockerWorkbenchCase;
+  const startedAt = new Date().toISOString();
+  const resultsDir = join(resolve(params.outDir ?? join(suite.configDir, '.results')), timestampSlug(deps.now ?? new Date()));
+  const caseSlugs = suite.cases.map((suiteCase) => suiteCase.slug);
+  const concurrency = params.concurrency === undefined
+    ? 1
+    : validateConcurrency(params.concurrency);
+
+  mkdirSync(resultsDir, { recursive: true });
+
+  // Flatten the case x model x trial matrix into one ordered job list.
+  const jobs = suite.cases.flatMap((suiteCase) => models.flatMap((model) => (
+    Array.from({ length: trials }, (_, index) => ({
+      suiteCase,
+      caseName: suiteCase.slug,
+      model,
+      trial: index + 1,
+    }))
+  )));
+
+  const completedTrials = await mapWithConcurrency(jobs, concurrency, async (job) => {
+    const trialDir = join(resultsDir, 'trials', trialDirName(job.caseName, job.model, job.trial));
+    const run = await dockerRunner({
+      casePath: job.suiteCase.path,
+      case: job.suiteCase.case,
+      resultsDir: trialDir,
+      model: job.model,
+      image: params.image,
+      keepWorkspace: params.keepWorkspace,
+      appendSystemPrompt: suite.appendSystemPrompt,
+    });
+    const result = readWorkbenchResultFile(run.resultPath);
+    // Paths are stored relative to the results directory.
+    const trialResult: WorkbenchTrialResultRef = {
+      trial: job.trial,
+      pass: result.pass,
+      score: result.score,
+      resultPath: relative(resultsDir, run.resultPath),
+      tracePath: relative(resultsDir, run.tracePath),
+      ...(run.summaryPath ? { summaryPath: relative(resultsDir, run.summaryPath) } : {}),
+    };
+    console.log(`${job.caseName} ${job.model} trial ${formatTrialNumber(job.trial)}: ${result.pass ? 'PASS' : 'FAIL'}`);
+    return { ...job, trialResult };
+  });
+
+  // Regroup the flat trial list into one aggregate per (case, model) pair.
+  const results: WorkbenchCaseModelAggregateResult[] = [];
+  for (const suiteCase of suite.cases) {
+    for (const model of models) {
+      const trialResults = completedTrials
+        .filter((trial) => trial.caseName === suiteCase.slug && trial.model === model)
+        .map((trial) => trial.trialResult)
+        .sort((left, right) => left.trial - right.trial);
+      const aggregate = aggregateTrials(trialResults);
+      results.push({
+        caseName: suiteCase.slug,
+        model,
+        totalTrials: aggregate.totalTrials,
+        passedTrials: aggregate.passedTrials,
+        failedTrials: aggregate.failedTrials,
+        trialPassRate: aggregate.trialPassRate,
+        meanScore: aggregate.meanScore,
+        passAtK: aggregate.passAtK,
+        passHatK: aggregate.passHatK,
+        trials: trialResults,
+      });
+    }
+  }
+
+  const summary = summarizeTrialAggregates(results);
+  const aggregate: RunSuiteAggregateResultFile = {
+    name: suite.name,
+    startedAt,
+    endedAt: new Date().toISOString(),
+    models,
+    cases: caseSlugs,
+    summary,
+    results,
+  };
+
+  writeJsonFile(join(resultsDir, 'suite-result.json'), aggregate);
+  console.log(`Results: ${resultsDir}`);
+  console.log(`Grade: ${summary.failedTrials === 0 ? 'PASS' : 'FAIL'}`);
+
+  // Signal failure through the exit code without terminating the process here.
+  if (summary.failedTrials > 0) {
+    process.exitCode = 1;
+  }
+}
+
+/**
+ * CLI entry point for `run-suite`: turns the argument vector into
+ * `RunWorkbenchSuiteParams` and runs the suite.
+ *
+ * @param args Command-line arguments for the `run-suite` command.
+ * @throws Error when no positional suite path is given; errors raised by the
+ *   flag parsers propagate unchanged.
+ */
+export async function runWorkbenchSuiteFromCli(args: string[]): Promise<void> {
+  // The first token that is neither a flag nor a flag value is the suite file.
+  const [suiteArg] = positionals(args, {
+    valueFlags: ['--out', '--image', '--trials', '--concurrency'],
+    booleanFlags: ['--keep-workspace'],
+  });
+  if (!suiteArg) {
+    throw new Error('Missing suite path. Usage: skill-optimizer run-suite <suite.yml> [--out <dir>] [--trials <n>] [--concurrency <n>] [--image <name>] [--keep-workspace]');
+  }
+
+  const outFlag = getFlag(args, '--out');
+  const image = getFlag(args, '--image');
+  const trials = parseTrialsFlag(getFlag(args, '--trials'));
+  const concurrency = parseConcurrencyFlag(getFlag(args, '--concurrency'));
+
+  const suiteParams: RunWorkbenchSuiteParams = {
+    suitePath: resolve(suiteArg),
+    outDir: outFlag ? resolve(outFlag) : undefined,
+    trials,
+    concurrency,
+    image,
+    keepWorkspace: args.includes('--keep-workspace'),
+  };
+  await runWorkbenchSuite(suiteParams);
+}
diff --git a/src/workbench/sandbox.ts b/src/workbench/sandbox.ts
new file mode 100644
index 0000000..8cb0864
--- /dev/null
+++ b/src/workbench/sandbox.ts
@@ -0,0 +1,13 @@
+/**
+ * Builds the fixed "Operating environment" text for the agent's system prompt.
+ *
+ * @returns A heading line followed by one bullet per line, joined with `\n`
+ *   and with no trailing newline.
+ */
+export function buildAgentSystemPrompt(): string {
+  const promptLines: string[] = [
+    'Operating environment:',
+    '- Current working directory is /work.',
+    '- Write all outputs under /work.',
+    '- The Docker socket is not mounted.',
+    '- Internet access is available for task dependencies unless the network is unavailable.',
+    '- Node.js, npm, Python, pip, and venv are installed.',
+    '- Do not use global pip installs.',
+    '- If you need Python packages, run: python -m venv /work/.venv && /work/.venv/bin/pip install <packages>.',
+    '- Run Python scripts with /work/.venv/bin/python when using installed packages.',
+  ];
+  const prompt = promptLines.join('\n');
+  return prompt;
+}
diff --git a/src/workbench/suite-loader.ts b/src/workbench/suite-loader.ts
new file mode 100644
index 0000000..a6813b4
--- /dev/null
+++ b/src/workbench/suite-loader.ts
@@ -0,0 +1,263 @@
+import { existsSync, readFileSync } from 'node:fs';
+import { basename, dirname, extname, resolve } from 'node:path';
+
+import { parse as parseYaml } from 'yaml';
+
+import { readMcpServers, readMcpServices, resolveWorkbenchCaseConfig } from './case-loader.js';
+import { ensureOpenRouterModelRef } from './models.js';
+import type { ResolvedWorkbenchCase, WorkbenchMcpServersConfig, WorkbenchMcpServicesConfig } from './types.js';
+import { slugPathSegment } from './utils.js';
+
+/** One case of a suite after loading: either a file reference or an inline definition. */
+export interface ResolvedWorkbenchSuiteCase {
+  /** Slug derived from the case file path, or from the inline case's name. */
+  slug: string;
+  /** Case file path resolved against the suite directory; set for string entries. */
+  path?: string;
+  /** Resolved inline case; set for object entries. */
+  case?: ResolvedWorkbenchCase;
+}
+
+/** Result of `loadWorkbenchSuite`: the validated contents of a suite file. */
+export interface ResolvedWorkbenchSuite {
+  /** Resolved path of the suite file. */
+  configPath: string;
+  /** Directory containing the suite file. */
+  configDir: string;
+  /** Trimmed, non-empty suite name. */
+  name: string;
+  /** Optional `appendSystemPrompt` value from the suite file, trimmed. */
+  appendSystemPrompt?: string;
+  /** Paths of the cases that were given as file references (inline cases are omitted). */
+  casePaths: string[];
+  /** Every case in suite order, file-based and inline alike. */
+  cases: ResolvedWorkbenchSuiteCase[];
+  /** Model references after `ensureOpenRouterModelRef`; empty when the suite omits `models`. */
+  models: string[];
+}
+
+/**
+ * Reads, parses and validates a workbench suite file.
+ *
+ * @param configPath Path to a `.json`, `.yml` or `.yaml` suite file.
+ * @returns The suite with case entries resolved and model references normalized.
+ * @throws Error when the file is missing or unreadable, cannot be parsed, or
+ *   fails any of the field validations.
+ */
+export function loadWorkbenchSuite(configPath: string): ResolvedWorkbenchSuite {
+  const absolutePath = resolve(configPath);
+  if (!existsSync(absolutePath)) {
+    throw new Error(`Workbench suite file not found: ${absolutePath}`);
+  }
+
+  let contents: string;
+  try {
+    contents = readFileSync(absolutePath, 'utf-8');
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`Failed to read workbench suite file ${absolutePath}: ${reason}`);
+  }
+
+  const suiteDir = dirname(absolutePath);
+  const document = parseWorkbenchSuite(contents, absolutePath);
+
+  // Fields are validated in this fixed order, which decides which error is
+  // reported first for a suite with several problems.
+  const name = requireNonEmptyString(document, 'name', absolutePath);
+  const appendSystemPrompt = readOptionalString(document, 'appendSystemPrompt', absolutePath);
+  const defaults = readSuiteCaseDefaults(document, absolutePath);
+  const entries = readCaseEntries(document, absolutePath);
+  const cases = entries.map((entry, index) => resolveSuiteCase(entry, index, absolutePath, suiteDir, defaults));
+
+  // Only file-referenced cases carry a path.
+  const casePaths: string[] = [];
+  for (const suiteCase of cases) {
+    if (suiteCase.path) {
+      casePaths.push(suiteCase.path);
+    }
+  }
+
+  const rawModels = readStringArray(document, 'models', absolutePath, true);
+  const models = rawModels.map((model) => ensureOpenRouterModelRef(model));
+
+  return {
+    configPath: absolutePath,
+    configDir: suiteDir,
+    name,
+    appendSystemPrompt,
+    casePaths,
+    cases,
+    models,
+  };
+}
+
+/** Suite-level values that inline cases inherit unless they set the same field themselves. */
+interface SuiteCaseDefaults {
+  /** Suite `references` value, or `./references` when the suite does not set one. */
+  references: string;
+  /** Suite `env` list; empty when absent. */
+  env: string[];
+  /** Suite `setup` list; empty when absent. */
+  setup: string[];
+  /** Suite `cleanup` list; empty when absent. */
+  cleanup: string[];
+  /** Suite-level MCP servers as returned by `readMcpServers`. */
+  mcpServers: WorkbenchMcpServersConfig;
+  /** Suite-level MCP services as returned by `readMcpServices`. */
+  mcpServices: WorkbenchMcpServicesConfig;
+  /** Suite-level timeout; a positive finite number when present. */
+  timeoutSeconds?: number;
+}
+
+/**
+ * Collects the suite-level defaults that inline cases inherit.
+ *
+ * @param parsed Root object of the suite file.
+ * @param configPath Suite path, used in error messages.
+ * @throws Error when the suite sets the rejected `artifacts` field or any
+ *   default fails validation.
+ */
+function readSuiteCaseDefaults(parsed: Record<string, unknown>, configPath: string): SuiteCaseDefaults {
+  const hasArtifacts = parsed.artifacts !== undefined;
+  if (hasArtifacts) {
+    throw new Error(`Workbench suite ${configPath}: field "artifacts" is invalid; inspect outputs in the workspace or use --keep-workspace`);
+  }
+
+  // Read in declaration order so the first invalid field is the one reported.
+  const references = readOptionalString(parsed, 'references', configPath) ?? './references';
+  const env = readStringArray(parsed, 'env', configPath, true);
+  const setup = readStringArray(parsed, 'setup', configPath, true);
+  const cleanup = readStringArray(parsed, 'cleanup', configPath, true);
+  const mcpServers = readMcpServers(parsed, configPath);
+  const mcpServices = readMcpServices(parsed, configPath);
+  const timeoutSeconds = readOptionalTimeoutSeconds(parsed, configPath);
+
+  return { references, env, setup, cleanup, mcpServers, mcpServices, timeoutSeconds };
+}
+
+/**
+ * Resolves one `cases` entry of a suite.
+ *
+ * String entries are treated as case file paths relative to the suite
+ * directory. Object entries are inline cases: suite defaults are applied, then
+ * the result is resolved with `resolveWorkbenchCaseConfig`.
+ *
+ * @param entry Raw entry from the suite's `cases` array.
+ * @param index Position of the entry, used to label inline cases in errors.
+ * @param suitePath Resolved suite file path.
+ * @param suiteDir Directory containing the suite file.
+ * @param defaults Suite-level defaults for inline cases.
+ */
+function resolveSuiteCase(
+  entry: string | Record<string, unknown>,
+  index: number,
+  suitePath: string,
+  suiteDir: string,
+  defaults: SuiteCaseDefaults,
+): ResolvedWorkbenchSuiteCase {
+  if (typeof entry !== 'string') {
+    // Inline cases have no file of their own, so they are labelled by suite path and index.
+    const origin = `${suitePath}#cases[${index}]`;
+    const mergedConfig = applySuiteDefaults(entry, defaults, origin);
+    const resolvedCase = resolveWorkbenchCaseConfig(mergedConfig, origin, suiteDir);
+    return { slug: slugPathSegment(resolvedCase.name), case: resolvedCase };
+  }
+
+  const path = resolve(suiteDir, entry);
+  return { slug: caseSlugFromPath(path), path };
+}
+
+/**
+ * Layers an inline case entry on top of the suite-level defaults.
+ *
+ * Scalar and list fields (`references`, `env`, `setup`, `cleanup`,
+ * `timeoutSeconds`) are replaced wholesale by the entry when it sets them.
+ * `mcpServers` and `mcpServices` are merged key by key, with the entry's
+ * definitions overriding suite definitions of the same name.
+ *
+ * @param entry Raw inline case object from the suite's `cases` array.
+ * @param defaults Suite-level defaults.
+ * @param configPath Label used in MCP validation errors.
+ * @returns A new config object; `entry` is not mutated.
+ */
+function applySuiteDefaults(
+  entry: Record<string, unknown>,
+  defaults: SuiteCaseDefaults,
+  configPath: string,
+): Record<string, unknown> {
+  // Validate the entry's own MCP servers before merging them over the suite's.
+  const entryMcpServers = entry.mcpServers === undefined
+    ? {}
+    : readMcpServers({ mcpServers: entry.mcpServers }, configPath);
+  const mcpServers = {
+    ...defaults.mcpServers,
+    ...entryMcpServers,
+  };
+  // Same treatment for MCP services.
+  const entryMcpServices = entry.mcpServices === undefined
+    ? {}
+    : readMcpServices({ mcpServices: entry.mcpServices }, configPath);
+  const mcpServices = {
+    ...defaults.mcpServices,
+    ...entryMcpServices,
+  };
+
+  return {
+    references: defaults.references,
+    // Empty default lists are left out rather than emitted as empty arrays.
+    ...(defaults.env.length > 0 ? { env: defaults.env } : {}),
+    ...(defaults.setup.length > 0 ? { setup: defaults.setup } : {}),
+    ...(defaults.cleanup.length > 0 ? { cleanup: defaults.cleanup } : {}),
+    ...(Object.keys(mcpServers).length > 0 ? { mcpServers } : {}),
+    ...(Object.keys(mcpServices).length > 0 ? { mcpServices } : {}),
+    ...(defaults.timeoutSeconds !== undefined ? { timeoutSeconds: defaults.timeoutSeconds } : {}),
+    // The entry's own fields override the defaults above...
+    ...entry,
+    // ...but the merged MCP maps are re-applied so the entry's raw values do not discard the suite-level ones.
+    ...(Object.keys(mcpServers).length > 0 ? { mcpServers } : {}),
+    ...(Object.keys(mcpServices).length > 0 ? { mcpServices } : {}),
+  };
+}
+
+/**
+ * Picks a slug for a file-referenced case: the file name without extension,
+ * or the parent directory name when the file stem is exactly `case`.
+ *
+ * @param casePath Path of the case file.
+ * @returns The chosen name passed through `slugPathSegment`.
+ */
+function caseSlugFromPath(casePath: string): string {
+  const fileName = basename(casePath);
+  const extensionLength = extname(fileName).length;
+  const stem = fileName.slice(0, fileName.length - extensionLength);
+  const label = stem === 'case' ? basename(dirname(casePath)) : stem;
+  return slugPathSegment(label);
+}
+
+/**
+ * Parses suite file text as JSON or YAML, chosen by the file extension.
+ *
+ * @param raw File contents.
+ * @param configPath Suite path; its extension selects the parser and it is
+ *   quoted in error messages.
+ * @returns The parsed root object.
+ * @throws Error when the extension is not `.json`, `.yml` or `.yaml`, when the
+ *   text is syntactically invalid, or when the root value is not a plain object.
+ */
+function parseWorkbenchSuite(raw: string, configPath: string): Record<string, unknown> {
+  const extension = extname(configPath).toLowerCase();
+  const isJson = extension === '.json';
+  if (!isJson && extension !== '.yml' && extension !== '.yaml') {
+    throw new Error(
+      `Unsupported workbench suite file extension for ${configPath}. Expected .json, .yml, or .yaml.`,
+    );
+  }
+
+  let parsed: unknown;
+  try {
+    parsed = isJson ? (JSON.parse(raw) as unknown) : (parseYaml(raw) as unknown);
+  } catch (error) {
+    throw new Error(
+      `Invalid ${isJson ? 'JSON' : 'YAML'} in workbench suite file ${configPath}: ` +
+      `${error instanceof Error ? error.message : String(error)}`,
+    );
+  }
+
+  // The shape check stays outside the try block so that a well-formed document
+  // with a non-object root is not reported as a JSON/YAML syntax error.
+  ensurePlainObject(parsed, configPath);
+  return parsed;
+}
+
+/**
+ * Asserts that the parsed suite root is a non-null, non-array object.
+ *
+ * @throws Error naming `configPath` when the root is a primitive, `null` or an array.
+ */
+function ensurePlainObject(value: unknown, configPath: string): asserts value is Record<string, unknown> {
+  const isObjectLike = typeof value === 'object' && value !== null;
+  if (isObjectLike && !Array.isArray(value)) {
+    return;
+  }
+  throw new Error(`Workbench suite file ${configPath} must contain an object at the root`);
+}
+
+/**
+ * Reads a required string field from the suite root.
+ *
+ * @returns The field value with surrounding whitespace removed.
+ * @throws Error when the field is missing, not a string, or blank.
+ */
+function requireNonEmptyString(
+  parsed: Record<string, unknown>,
+  field: 'name',
+  configPath: string,
+): string {
+  const candidate = parsed[field];
+  // Non-strings are treated like blank strings so both share one error path.
+  const trimmed = typeof candidate === 'string' ? candidate.trim() : '';
+  if (trimmed === '') {
+    throw new Error(`Workbench suite ${configPath}: field "${field}" must be a non-empty string`);
+  }
+  return trimmed;
+}
+
+/**
+ * Reads a list-of-strings field from the suite root.
+ *
+ * @param parsed Root object of the suite file.
+ * @param field Field to read.
+ * @param configPath Suite path, used in error messages.
+ * @param optional When true, a missing field yields `[]` instead of an error.
+ * @returns The items with surrounding whitespace removed.
+ * @throws Error when the field is required but missing, is not an array, is an
+ *   empty array, or contains a non-string or blank item.
+ */
+function readStringArray(
+  parsed: Record<string, unknown>,
+  field: 'models' | 'env' | 'setup' | 'cleanup',
+  configPath: string,
+  optional = false,
+): string[] {
+  const value = parsed[field];
+  if (value === undefined) {
+    if (!optional) {
+      throw new Error(`Workbench suite ${configPath}: field "${field}" must be an array of non-empty strings`);
+    }
+    return [];
+  }
+  // A field that is present must carry at least one item, even when it is optional.
+  if (!Array.isArray(value) || value.length === 0) {
+    throw new Error(`Workbench suite ${configPath}: field "${field}" must be a non-empty array of strings`);
+  }
+
+  const normalizeItem = (item: unknown, index: number): string => {
+    const trimmed = typeof item === 'string' ? item.trim() : '';
+    if (trimmed === '') {
+      throw new Error(
+        `Workbench suite ${configPath}: field "${field}" item at index ${index} must be a non-empty string`,
+      );
+    }
+    return trimmed;
+  };
+  return value.map(normalizeItem);
+}
+
+/**
+ * Reads the suite's `cases` array without resolving the entries.
+ *
+ * @returns Each entry as a trimmed path string or as the raw inline case object.
+ * @throws Error when `cases` is missing, not an array, empty, or contains an
+ *   item that is neither a non-blank string nor a non-array object.
+ */
+function readCaseEntries(parsed: Record<string, unknown>, configPath: string): Array<string | Record<string, unknown>> {
+  const value = parsed.cases;
+  if (!Array.isArray(value) || value.length === 0) {
+    throw new Error(`Workbench suite ${configPath}: field "cases" must be a non-empty array`);
+  }
+
+  const classify = (item: unknown, index: number): string | Record<string, unknown> => {
+    if (typeof item === 'string') {
+      const trimmed = item.trim();
+      if (trimmed !== '') {
+        return trimmed;
+      }
+    } else if (typeof item === 'object' && item !== null && !Array.isArray(item)) {
+      return item as Record<string, unknown>;
+    }
+    throw new Error(
+      `Workbench suite ${configPath}: field "cases" item at index ${index} must be a non-empty string or object`,
+    );
+  };
+  return value.map(classify);
+}
+
+/**
+ * Reads an optional string field from the suite root.
+ *
+ * @returns `undefined` when the field is absent, otherwise the trimmed value.
+ * @throws Error when the field is present but not a string, or blank.
+ */
+function readOptionalString(
+  parsed: Record<string, unknown>,
+  field: 'references' | 'appendSystemPrompt',
+  configPath: string,
+): string | undefined {
+  const value = parsed[field];
+  if (value === undefined) {
+    return undefined;
+  }
+  if (typeof value === 'string') {
+    const trimmed = value.trim();
+    if (trimmed !== '') {
+      return trimmed;
+    }
+  }
+  throw new Error(`Workbench suite ${configPath}: field "${field}" must be a non-empty string when provided`);
+}
+
+/**
+ * Reads the optional suite-level `timeoutSeconds`.
+ *
+ * @returns `undefined` when absent, otherwise the value as given.
+ * @throws Error when present but not a finite number greater than zero.
+ */
+function readOptionalTimeoutSeconds(parsed: Record<string, unknown>, configPath: string): number | undefined {
+  const value = parsed.timeoutSeconds;
+  if (value === undefined) {
+    return undefined;
+  }
+  if (typeof value === 'number' && Number.isFinite(value) && value > 0) {
+    return value;
+  }
+  throw new Error(`Workbench suite ${configPath}: field "timeoutSeconds" must be a positive number when provided`);
+}
diff --git a/src/workbench/trace.ts b/src/workbench/trace.ts
new file mode 100644
index 0000000..3ea5916
--- /dev/null
+++ b/src/workbench/trace.ts
@@ -0,0 +1,290 @@
+import type { WorkbenchTrace, WorkbenchTraceEntry, WorkbenchTraceEvent } from './types.js';
+import { isRecord } from './utils.js';
+
+/** Accumulates normalized trace events and converts them into a `WorkbenchTrace`. */
+export interface TraceRecorder {
+  /** Events recorded so far, in arrival order. */
+  events: WorkbenchTraceEvent[];
+  /** Normalizes an arbitrary event value and appends it to `events`. */
+  record(event: unknown): void;
+  /**
+   * Builds the trace document from the recorded events.
+   * `messages` is optional session history used to fill in message entries.
+   */
+  toTrace(params: {
+    caseName: string;
+    model: string;
+    startedAt: string;
+    endedAt: string;
+    messages?: unknown[];
+  }): WorkbenchTrace;
+}
+
+/**
+ * Creates a minimal collector that stores events exactly as given.
+ *
+ * @returns An object whose `record` appends to its own `events` array.
+ */
+export function createTraceCollector(): { record(event: unknown): void; events: unknown[] } {
+  const collected: unknown[] = [];
+  const record = (event: unknown): void => {
+    collected.push(event);
+  };
+  return { events: collected, record };
+}
+
+/**
+ * Creates a recorder that timestamps and normalizes events as they arrive.
+ *
+ * @param options.now Source of the timestamp string stamped on each recorded
+ *   event; defaults to the current time in ISO-8601 form.
+ * @returns A `TraceRecorder` backed by a private event list.
+ */
+export function createTraceRecorder(options: { now?: () => string } = {}): TraceRecorder {
+  const currentTime = options.now ?? (() => new Date().toISOString());
+  const events: WorkbenchTraceEvent[] = [];
+
+  return {
+    events,
+    record(event: unknown) {
+      const stamped = normalizeTraceEvent(event, currentTime());
+      events.push(stamped);
+    },
+    toTrace(params) {
+      const sessionMessages = params.messages ?? [];
+      const eventEntries = normalizeEvents(events);
+      // With no usable events the session messages are the only source of
+      // entries; otherwise they only fill in messages the events did not report.
+      const entries = eventEntries.length === 0
+        ? normalizeMessages(sessionMessages)
+        : mergeSessionMessages(eventEntries, sessionMessages);
+      return {
+        schemaVersion: 1,
+        caseName: params.caseName,
+        model: params.model,
+        startedAt: params.startedAt,
+        endedAt: params.endedAt,
+        // Copy so that later record() calls do not alter an emitted trace.
+        events: events.slice(),
+        entries,
+      };
+    },
+  };
+}
+
+/**
+ * Builds a trace from session messages alone, without recorded events.
+ *
+ * @param params Case metadata plus the raw session messages.
+ * @returns A trace whose `entries` come from `normalizeMessages`.
+ */
+export function buildWorkbenchTrace(params: {
+  caseName: string;
+  model: string;
+  startedAt: string;
+  endedAt: string;
+  messages: unknown[];
+}): WorkbenchTrace {
+  const { caseName, model, startedAt, endedAt, messages } = params;
+  const entries = normalizeMessages(messages);
+  return { caseName, model, startedAt, endedAt, entries };
+}
+
+/**
+ * Converts an arbitrary event value into a JSON-safe `WorkbenchTraceEvent`.
+ *
+ * Values that are not records with a string `type` become an `unknown` event
+ * carrying the sanitized value. Otherwise every property other than `type` is
+ * copied after `toJsonSafe`, and properties that sanitize to `undefined` are dropped.
+ *
+ * @param event Raw event value.
+ * @param timestamp Timestamp assigned to the normalized event.
+ */
+function normalizeTraceEvent(event: unknown, timestamp: string): WorkbenchTraceEvent {
+  if (!isRecord(event) || typeof event.type !== 'string') {
+    return { type: 'unknown', timestamp, value: toJsonSafe(event) };
+  }
+
+  const normalized: WorkbenchTraceEvent = { type: event.type, timestamp };
+  for (const [key, rawValue] of Object.entries(event)) {
+    if (key !== 'type') {
+      const sanitized = toJsonSafe(rawValue);
+      if (sanitized !== undefined) {
+        normalized[key] = sanitized;
+      }
+    }
+  }
+  return normalized;
+}
+
+/**
+ * Turns recorded events into trace entries.
+ *
+ * Only three event types contribute: `message_end` (message entry, when the
+ * message is a record and is not filtered out), `tool_execution_start`
+ * (tool_call) and `tool_execution_end` (tool_result). All others are ignored.
+ */
+function normalizeEvents(events: WorkbenchTraceEvent[]): WorkbenchTraceEntry[] {
+  const entries: WorkbenchTraceEntry[] = [];
+
+  for (const event of events) {
+    switch (event.type) {
+      case 'message_end': {
+        if (isRecord(event.message)) {
+          const messageEntry = normalizeMessageOnly(event.message, event.timestamp);
+          if (messageEntry) {
+            entries.push(messageEntry);
+          }
+        }
+        break;
+      }
+      case 'tool_execution_start': {
+        entries.push({
+          type: 'tool_call',
+          id: typeof event.toolCallId === 'string' ? event.toolCallId : undefined,
+          name: typeof event.toolName === 'string' ? event.toolName : 'unknown',
+          arguments: event.args,
+          timestamp: event.timestamp,
+        });
+        break;
+      }
+      case 'tool_execution_end': {
+        entries.push({
+          type: 'tool_result',
+          id: typeof event.toolCallId === 'string' ? event.toolCallId : undefined,
+          name: typeof event.toolName === 'string' ? event.toolName : undefined,
+          text: extractToolEventText(event.result),
+          isError: typeof event.isError === 'boolean' ? event.isError : undefined,
+          timestamp: event.timestamp,
+        });
+        break;
+      }
+      default:
+        break;
+    }
+  }
+
+  return entries;
+}
+
+/**
+ * Prepends session messages that the event stream did not already report.
+ *
+ * @param eventEntries Entries derived from recorded events.
+ * @param messages Raw session messages.
+ * @returns The session message entries with no match among `eventEntries`
+ *   (per `sameMessageEntry`), followed by all of `eventEntries`.
+ */
+function mergeSessionMessages(eventEntries: WorkbenchTraceEntry[], messages: unknown[]): WorkbenchTraceEntry[] {
+  const missing: WorkbenchTraceEntry[] = [];
+  for (const entry of normalizeMessages(messages)) {
+    // Tool calls and results from the session history are ignored here.
+    if (entry.type !== 'message') {
+      continue;
+    }
+    const alreadyRecorded = eventEntries.some((recorded) => sameMessageEntry(recorded, entry));
+    if (!alreadyRecorded) {
+      missing.push(entry);
+    }
+  }
+  return missing.concat(eventEntries);
+}
+
+/**
+ * Reports whether `left` is a message entry equal to `right` on role, text,
+ * thinking, stop reason and error message. Timestamps and usage are not compared.
+ */
+function sameMessageEntry(left: WorkbenchTraceEntry, right: Extract<WorkbenchTraceEntry, { type: 'message' }>): boolean {
+  if (left.type !== 'message') {
+    return false;
+  }
+  if (left.role !== right.role) {
+    return false;
+  }
+  if (left.text !== right.text) {
+    return false;
+  }
+  if (left.thinking !== right.thinking) {
+    return false;
+  }
+  if (left.stopReason !== right.stopReason) {
+    return false;
+  }
+  return left.errorMessage === right.errorMessage;
+}
+
+/**
+ * Converts one message record into a message entry, ignoring any tool calls
+ * embedded in its content.
+ *
+ * @param message Message record taken from a `message_end` event.
+ * @param timestamp Timestamp to assign to the entry.
+ * @returns `undefined` for `toolResult` messages and for assistant messages
+ *   that have no text, no thinking and neither a string `stopReason` nor a
+ *   string `errorMessage`; otherwise the message entry.
+ */
+function normalizeMessageOnly(message: Record<string, unknown>, timestamp: string): WorkbenchTraceEntry | undefined {
+  const role = typeof message.role === 'string' ? message.role : 'unknown';
+  if (role === 'toolResult') {
+    return undefined;
+  }
+
+  const content = Array.isArray(message.content) ? message.content : [];
+  const text = extractContentByType(content, 'text', 'text');
+  const thinking = extractContentByType(content, 'thinking', 'thinking');
+
+  const isTerminal = typeof message.stopReason === 'string' || typeof message.errorMessage === 'string';
+  const hasBody = text.length > 0 || thinking.length > 0;
+  if (role === 'assistant' && !hasBody && !isTerminal) {
+    return undefined;
+  }
+
+  return {
+    type: 'message',
+    role,
+    text: text.length > 0 ? text : undefined,
+    thinking: thinking.length > 0 ? thinking : undefined,
+    timestamp,
+    usage: message.usage,
+    stopReason: message.stopReason,
+    errorMessage: typeof message.errorMessage === 'string' ? message.errorMessage : undefined,
+  };
+}
+
+/**
+ * Converts raw session messages into trace entries.
+ *
+ * Per message, in order:
+ * - non-record values are skipped;
+ * - `toolResult` messages become a single tool_result entry;
+ * - any other message becomes a message entry, unless it is an assistant
+ *   message with no text, no thinking and no string stop reason or error
+ *   message, followed by one tool_call entry per `toolCall` content item.
+ */
+function normalizeMessages(messages: unknown[]): WorkbenchTraceEntry[] {
+  const entries: WorkbenchTraceEntry[] = [];
+
+  for (const message of messages) {
+    if (!isRecord(message)) {
+      continue;
+    }
+
+    const timestamp = message.timestamp;
+    const role = typeof message.role === 'string' ? message.role : 'unknown';
+
+    if (role !== 'toolResult') {
+      const content = Array.isArray(message.content) ? message.content : [];
+      const text = extractContentByType(content, 'text', 'text');
+      const thinking = extractContentByType(content, 'thinking', 'thinking');
+      const isTerminal = typeof message.stopReason === 'string' || typeof message.errorMessage === 'string';
+
+      // An assistant turn that only issued tool calls produces no message entry.
+      const isEmptyAssistantTurn = role === 'assistant'
+        && text.length === 0
+        && thinking.length === 0
+        && !isTerminal;
+      if (!isEmptyAssistantTurn) {
+        entries.push({
+          type: 'message',
+          role,
+          text: text.length > 0 ? text : undefined,
+          thinking: thinking.length > 0 ? thinking : undefined,
+          timestamp,
+          usage: message.usage,
+          stopReason: message.stopReason,
+          errorMessage: typeof message.errorMessage === 'string' ? message.errorMessage : undefined,
+        });
+      }
+
+      for (const item of content) {
+        if (isRecord(item) && item.type === 'toolCall') {
+          entries.push({
+            type: 'tool_call',
+            id: typeof item.id === 'string' ? item.id : undefined,
+            name: typeof item.name === 'string' ? item.name : 'unknown',
+            arguments: item.arguments,
+            timestamp,
+          });
+        }
+      }
+      continue;
+    }
+
+    entries.push({
+      type: 'tool_result',
+      id: typeof message.toolCallId === 'string' ? message.toolCallId : undefined,
+      name: typeof message.toolName === 'string' ? message.toolName : undefined,
+      text: extractText(message.content),
+      isError: typeof message.isError === 'boolean' ? message.isError : undefined,
+      timestamp,
+    });
+  }
+
+  return entries;
+}
+
+/**
+ * Joins one string field across all content items of a given type.
+ *
+ * @param content Content items of a message.
+ * @param type Required value of each item's `type` property.
+ * @param field Property to read from matching items.
+ * @returns The non-empty string values joined with `\n`, or `''` when none match.
+ */
+function extractContentByType(content: unknown[], type: string, field: string): string {
+  const parts: string[] = [];
+  for (const item of content) {
+    if (!isRecord(item) || item.type !== type) {
+      continue;
+    }
+    const value = item[field];
+    if (typeof value === 'string' && value.length > 0) {
+      parts.push(value);
+    }
+  }
+  return parts.join('\n');
+}
+
+/**
+ * Extracts plain text from a content value.
+ *
+ * @returns The value itself when it is a string (even if empty); the joined
+ *   `text` items when it is an array with at least one non-empty text item;
+ *   otherwise `undefined`.
+ */
+function extractText(content: unknown): string | undefined {
+  if (typeof content === 'string') {
+    return content;
+  }
+  if (Array.isArray(content)) {
+    const joined = extractContentByType(content, 'text', 'text');
+    if (joined.length > 0) {
+      return joined;
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Extracts text from a tool execution result.
+ *
+ * @returns The text of `result.content` when `result` is a record, otherwise `undefined`.
+ */
+function extractToolEventText(result: unknown): string | undefined {
+  return isRecord(result) ? extractText(result.content) : undefined;
+}
+
+/**
+ * Produces a JSON-serializable copy of an arbitrary value.
+ *
+ * - strings, numbers, booleans and `null` are returned unchanged;
+ * - `undefined`, functions and symbols become `undefined`, and object
+ *   properties with such values are omitted;
+ * - `Date` values become their ISO-8601 string, or `null` when invalid,
+ *   matching what `JSON.stringify` emits for dates;
+ * - anything nested more than 8 levels deep becomes `'[MaxDepth]'`;
+ * - an array or object that contains itself along the current path becomes
+ *   `'[Circular]'`, while a value merely referenced twice is copied twice;
+ * - remaining primitives (bigint) are converted with `String`.
+ *
+ * @param value Value to sanitize.
+ * @param seen Containers on the current recursion path (internal).
+ * @param depth Current nesting depth (internal).
+ */
+function toJsonSafe(value: unknown, seen = new WeakSet<object>(), depth = 0): unknown {
+  if (value === null || typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') {
+    return value;
+  }
+  if (value === undefined || typeof value === 'function' || typeof value === 'symbol') {
+    return undefined;
+  }
+  if (value instanceof Date) {
+    // Dates have no enumerable own properties and would otherwise collapse to {}.
+    return Number.isNaN(value.getTime()) ? null : value.toISOString();
+  }
+  if (depth > 8) {
+    return '[MaxDepth]';
+  }
+  if (typeof value !== 'object') {
+    return String(value);
+  }
+
+  // Arrays are tracked as well as plain objects, so a self-referential array
+  // is cut off immediately instead of being expanded down to the depth limit.
+  if (seen.has(value)) {
+    return '[Circular]';
+  }
+  seen.add(value);
+
+  let safe: unknown;
+  if (Array.isArray(value)) {
+    safe = value.map((item) => toJsonSafe(item, seen, depth + 1));
+  } else {
+    const record: Record<string, unknown> = {};
+    for (const [key, item] of Object.entries(value)) {
+      const safeItem = toJsonSafe(item, seen, depth + 1);
+      if (safeItem !== undefined) {
+        record[key] = safeItem;
+      }
+    }
+    safe = record;
+  }
+
+  // Remove on the way out: `seen` tracks the current path only, so shared
+  // (non-cyclic) references are still serialized in full.
+  seen.delete(value);
+  return safe;
+}
diff --git a/src/workbench/trials.ts b/src/workbench/trials.ts
new file mode 100644
index 0000000..9dd34c0
--- /dev/null
+++ b/src/workbench/trials.ts
@@ -0,0 +1,74 @@
+import type { WorkbenchAggregateSummary } from './types.js';
+
+/** Minimal per-trial data consumed by `aggregateTrials`. */
+export interface TrialScoreInput {
+  /** Trial number. */
+  trial: number;
+  /** Whether the trial passed. */
+  pass: boolean;
+  /** Score of the trial; averaged into `meanScore`. */
+  score: number;
+}
+
+/** Statistics over a group of trials, as produced by `aggregateTrials`. */
+export interface TrialAggregate {
+  /** Number of trials in the group. */
+  totalTrials: number;
+  /** Number of trials with `pass === true`. */
+  passedTrials: number;
+  /** `totalTrials - passedTrials`. */
+  failedTrials: number;
+  /** `passedTrials / totalTrials`, or 0 for an empty group. */
+  trialPassRate: number;
+  /** Arithmetic mean of the trial scores, or 0 for an empty group. */
+  meanScore: number;
+  /** True when at least one trial passed. */
+  passAtK: boolean;
+  /** True when the group is non-empty and every trial passed. */
+  passHatK: boolean;
+}
+
+/**
+ * Formats a trial number as a string left-padded with zeros to at least three characters.
+ *
+ * @param trial Positive integer trial number.
+ * @returns For example `'007'` for 7; numbers above 999 are returned unpadded.
+ * @throws Error when `trial` is not a positive integer.
+ */
+export function formatTrialNumber(trial: number): string {
+  const isPositiveInteger = Number.isInteger(trial) && trial > 0;
+  if (!isPositiveInteger) {
+    throw new Error('Trial number must be a positive integer');
+  }
+  return `${trial}`.padStart(3, '0');
+}
+
+/**
+ * Parses the raw `--trials` flag value.
+ *
+ * @param value Raw flag text, or `undefined` when the flag was not supplied.
+ * @returns 1 when the flag is absent, otherwise the parsed count.
+ * @throws Error when the text does not convert to a positive integer.
+ */
+export function parseTrialsFlag(value: string | undefined): number {
+  if (value === undefined) {
+    return 1;
+  }
+
+  const parsed = Number(value);
+  if (Number.isInteger(parsed) && parsed > 0) {
+    return parsed;
+  }
+  throw new Error('Field "trials" must be a positive integer');
+}
+
+/**
+ * Computes pass counts, pass rate and mean score for one group of trials.
+ *
+ * @param trials Trials belonging to the same group.
+ * @returns The aggregate; rates and mean are 0 and both pass flags are false
+ *   for an empty group.
+ */
+export function aggregateTrials(trials: TrialScoreInput[]): TrialAggregate {
+  let passedTrials = 0;
+  let scoreTotal = 0;
+  for (const trial of trials) {
+    if (trial.pass) {
+      passedTrials += 1;
+    }
+    scoreTotal += trial.score;
+  }
+
+  const totalTrials = trials.length;
+  const hasTrials = totalTrials > 0;
+
+  return {
+    totalTrials,
+    passedTrials,
+    failedTrials: totalTrials - passedTrials,
+    trialPassRate: hasTrials ? passedTrials / totalTrials : 0,
+    meanScore: hasTrials ? scoreTotal / totalTrials : 0,
+    passAtK: hasTrials && passedTrials > 0,
+    passHatK: hasTrials && passedTrials === totalTrials,
+  };
+}
+
+/**
+ * Rolls per-group aggregates up into one overall summary.
+ *
+ * A group counts as passed only when its `passHatK` is true. The overall mean
+ * score weights each group's mean by its number of trials.
+ *
+ * @param results One aggregate per group.
+ * @returns Group-level and trial-level totals and rates; all rates are 0 when
+ *   there is nothing to summarize.
+ */
+export function summarizeTrialAggregates(results: TrialAggregate[]): WorkbenchAggregateSummary {
+  let totalTrials = 0;
+  let passedTrials = 0;
+  let scoreTotal = 0;
+  let passed = 0;
+  for (const result of results) {
+    totalTrials += result.totalTrials;
+    passedTrials += result.passedTrials;
+    scoreTotal += result.meanScore * result.totalTrials;
+    if (result.passHatK) {
+      passed += 1;
+    }
+  }
+
+  const total = results.length;
+
+  return {
+    total,
+    passed,
+    failed: total - passed,
+    passRate: total === 0 ? 0 : passed / total,
+    totalTrials,
+    passedTrials,
+    failedTrials: totalTrials - passedTrials,
+    trialPassRate: totalTrials === 0 ? 0 : passedTrials / totalTrials,
+    meanScore: totalTrials === 0 ? 0 : scoreTotal / totalTrials,
+  };
+}
diff --git a/src/workbench/types.ts b/src/workbench/types.ts
new file mode 100644
index 0000000..d80c25d
--- /dev/null
+++ b/src/workbench/types.ts
@@ -0,0 +1,233 @@
+export interface WorkbenchGraderConfig { // one entry of WorkbenchCaseConfig.graders
+  name: string;
+  command: string; // presumably the command run to produce the grade — confirm in the grader runner
+}
+
+export type WorkbenchMcpJsonValue = // recursive union: primitives, null, arrays of itself, string-keyed objects of itself
+  | string
+  | number
+  | boolean
+  | null
+  | WorkbenchMcpJsonValue[]
+  | { [key: string]: WorkbenchMcpJsonValue };
+
+export interface WorkbenchMcpServerConfig { // MCP server entry: a set of known optional keys plus a JSON-valued catch-all
+  description?: string;
+  baseUrl?: string; // NOTE(review): baseUrl / url / serverUrl look like alternative spellings — confirm which one consumers read
+  url?: string;
+  serverUrl?: string;
+  command?: string;
+  args?: string[];
+  env?: Record<string, string>;
+  headers?: Record<string, string>;
+  allowedTools?: string[];
+  allowed_tools?: string[]; // snake_case counterpart of allowedTools
+  blockedTools?: string[];
+  blocked_tools?: string[]; // snake_case counterpart of blockedTools
+  [key: string]: WorkbenchMcpJsonValue | undefined; // catch-all: every property must be JSON-compatible or undefined
+}
+
+export type WorkbenchMcpServersConfig = Record<string, WorkbenchMcpServerConfig>; // server configs keyed by string
+
+export interface WorkbenchMcpServiceConfig { // a command plus its arguments; presumably a process to launch — TODO confirm
+  command: string;
+  args: string[]; // required here, unlike the optional args on WorkbenchMcpServerConfig
+}
+
+export type WorkbenchMcpServicesConfig = Record<string, WorkbenchMcpServiceConfig>; // service configs keyed by string
+
+export interface WorkbenchCaseConfig { // case definition with optional fields; ResolvedWorkbenchCase is the all-required form
+  name: string;
+  references: string; // counterpart of ResolvedWorkbenchCase.referencesDir — presumably a directory path, confirm in the loader
+  task: string;
+  graders: WorkbenchGraderConfig[];
+  mcpServers?: WorkbenchMcpServersConfig;
+  mcpServices?: WorkbenchMcpServicesConfig;
+  env?: string[]; // NOTE(review): a list of strings, not a key/value map — confirm whether these are names or NAME=value pairs
+  setup?: string[];
+  cleanup?: string[];
+  model?: string;
+  timeoutSeconds?: number;
+}
+
+export interface ResolvedWorkbenchCase { // WorkbenchCaseConfig with every optional field required, plus the config's location
+  configPath: string;
+  configDir: string;
+  name: string;
+  referencesDir: string; // takes the place of WorkbenchCaseConfig.references
+  task: string;
+  graders: WorkbenchGraderConfig[];
+  mcpServers: WorkbenchMcpServersConfig;
+  mcpServices: WorkbenchMcpServicesConfig;
+  env: string[];
+  setup: string[];
+  cleanup: string[];
+  model: string;
+  timeoutSeconds: number;
+}
+
+export interface WorkbenchGrade { // grading outcome: pass, score and evidence are required, everything else is optional detail
+  pass: boolean;
+  score: number;
+  evidence: string[];
+  graders?: WorkbenchGraderResult[]; // per-grader results; WorkbenchGraderResult itself omits this field, so there is no nesting
+  metrics?: WorkbenchMetrics;
+  exitCode?: number | null;
+  command?: string;
+  stdout?: string;
+  stderr?: string;
+  durationMs?: number;
+}
+
+export interface WorkbenchGraderResult extends Omit<WorkbenchGrade, 'graders'> { // a grade without nested graders
+  name: string;
+  command: string; // tightens WorkbenchGrade.command from optional to required
+}
+
+export interface WorkbenchResult extends WorkbenchGrade { // a grade plus optional run metadata
+  caseName?: string;
+  model?: string;
+  trial?: number;
+  startedAt?: string; // NOTE(review): timestamp format is not visible here — confirm with the writer
+  endedAt?: string;
+  error?: string;
+}
+
+export interface WorkbenchTokenMetrics { // numeric token figures; same field names as WorkbenchCostMetrics
+  input: number;
+  output: number;
+  cacheRead: number;
+  cacheWrite: number;
+  total: number; // presumably the sum of the four fields above — TODO confirm where it is computed
+}
+
+export interface WorkbenchCostMetrics { // numeric cost figures; same field names as WorkbenchTokenMetrics (unit not visible here)
+  input: number;
+  output: number;
+  cacheRead: number;
+  cacheWrite: number;
+  total: number; // presumably the sum of the four fields above — TODO confirm where it is computed
+}
+
+export interface WorkbenchMetrics { // run metrics: duration, turn and tool-call counters, token and cost breakdowns
+  durationMs: number;
+  turns: number;
+  toolCalls: number;
+  toolResults: number;
+  bashCalls: number; // the four *Calls fields below are per-tool counters; presumably subsets of toolCalls — confirm
+  readCalls: number;
+  writeCalls: number;
+  editCalls: number;
+  stopReason?: string;
+  tokens: WorkbenchTokenMetrics;
+  cost: WorkbenchCostMetrics;
+}
+
+export interface WorkbenchTrialSummaryFile { // shape of a per-trial summary (cf. WorkbenchTrialResultRef.summaryPath)
+  finalAssistantMessage?: string;
+  failedGraders: string[]; // presumably grader names — confirm with the writer
+  evidence: string[];
+  bashCommands: string[];
+  stopReason?: string;
+  errorMessage?: string;
+  metrics: WorkbenchMetrics;
+}
+
+export interface WorkbenchAggregateSummary { // return shape of summarizeTrialAggregates
+  total: number; // number of aggregates summarized
+  passed: number; // aggregates whose passHatK flag is set
+  failed: number; // total - passed
+  passRate: number; // passed / total, or 0 when total is 0
+  totalTrials: number; // sum of totalTrials across aggregates
+  passedTrials: number; // sum of passedTrials across aggregates
+  failedTrials: number; // totalTrials - passedTrials
+  trialPassRate: number; // passedTrials / totalTrials, or 0 when there are no trials
+  meanScore: number; // trial-count-weighted mean of the aggregates' meanScore, or 0 when there are no trials
+}
+
+export interface WorkbenchTrialResultRef { // one trial's outcome plus paths to its stored artifacts
+  trial: number;
+  pass: boolean;
+  score: number;
+  resultPath: string;
+  tracePath: string;
+  summaryPath?: string; // the only optional path of the three
+}
+
+export interface WorkbenchModelAggregateResult { // per-model aggregate; numeric/flag fields share names with aggregateTrials' output
+  model: string;
+  totalTrials: number;
+  passedTrials: number;
+  failedTrials: number;
+  trialPassRate: number;
+  meanScore: number;
+  passAtK: boolean; // in aggregateTrials: true when at least one trial passed — presumably populated from it, confirm
+  passHatK: boolean; // in aggregateTrials: true when there is at least one trial and all passed — presumably populated from it
+  trials: WorkbenchTrialResultRef[];
+}
+
+export interface WorkbenchCaseModelAggregateResult extends WorkbenchModelAggregateResult { // model aggregate tagged with a case
+  caseName: string;
+}
+
+export interface RunCaseAggregateResultFile { // aggregate result document for one case: summary plus per-model results
+  name: string;
+  startedAt: string;
+  endedAt: string;
+  models: string[];
+  summary: WorkbenchAggregateSummary;
+  results: WorkbenchModelAggregateResult[];
+}
+
+export interface RunSuiteAggregateResultFile { // suite-level counterpart of RunCaseAggregateResultFile
+  name: string;
+  startedAt: string;
+  endedAt: string;
+  models: string[];
+  cases: string[]; // not present on the single-case file
+  summary: WorkbenchAggregateSummary;
+  results: WorkbenchCaseModelAggregateResult[]; // each result also carries its caseName
+}
+
+export type WorkbenchTraceEntry = // union discriminated by `type`: 'message' | 'tool_call' | 'tool_result'
+  | {
+      type: 'message';
+      role: string;
+      text?: string;
+      thinking?: string;
+      timestamp?: unknown; // left as unknown: consumers must narrow before use
+      usage?: unknown;
+      stopReason?: unknown;
+      errorMessage?: string;
+    }
+  | {
+      type: 'tool_call';
+      id?: string;
+      name: string; // required on a call, optional on a result
+      arguments?: unknown;
+      timestamp?: unknown;
+    }
+  | {
+      type: 'tool_result';
+      id?: string; // presumably matches the id of the originating tool_call — confirm with the trace writer
+      name?: string;
+      text?: string;
+      isError?: boolean;
+      timestamp?: unknown;
+    };
+
+export interface WorkbenchTraceEvent { // open-ended event: only type and timestamp are fixed
+  type: string;
+  timestamp: string;
+  [key: string]: unknown; // any additional payload keys are allowed and untyped
+}
+
+export interface WorkbenchTrace { // trace document for one case/model run
+  schemaVersion?: 1; // literal type: when present it can only be 1
+  caseName: string;
+  model: string;
+  startedAt: string;
+  endedAt: string;
+  events?: WorkbenchTraceEvent[]; // optional loosely-typed events
+  entries: WorkbenchTraceEntry[]; // required typed message / tool entries
+}
diff --git a/src/workbench/utils.ts b/src/workbench/utils.ts
new file mode 100644
index 0000000..14c1fe1
--- /dev/null
+++ b/src/workbench/utils.ts
@@ -0,0 +1,56 @@
+import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import { dirname } from 'node:path';
+
+import type { WorkbenchResult } from './types.js';
+
+/** Formats `date` in UTC as `<year><MM><DD>-<HH><mm><ss>`; every part except the year is zero-padded to 2 digits. */
+export function timestampSlug(date: Date): string {
+  const two = (part: number): string => `${part}`.padStart(2, '0');
+  const day = `${date.getUTCFullYear()}${two(date.getUTCMonth() + 1)}${two(date.getUTCDate())}`;
+  return `${day}-${two(date.getUTCHours())}${two(date.getUTCMinutes())}${two(date.getUTCSeconds())}`;
+}
+
+/** Writes `value` as 2-space-indented JSON plus a trailing newline; `ensureDir` first creates the parent directory. */
+export function writeJsonFile(filePath: string, value: unknown, options: { ensureDir?: boolean } = {}): void {
+  if (options.ensureDir) mkdirSync(dirname(filePath), { recursive: true });
+  const serialized = JSON.stringify(value, null, 2);
+  writeFileSync(filePath, `${serialized}\n`, 'utf-8');
+}
+
+export function readJsonFile(filePath: string): unknown { // UTF-8 read, then JSON.parse; the result is not validated
+  return JSON.parse(readFileSync(filePath, { encoding: 'utf-8' })) as unknown;
+}
+
+/** Loads a result file and validates `pass`, `score` and `evidence`; all other fields pass through unchecked. */
+export function readWorkbenchResultFile(filePath: string): WorkbenchResult {
+  const raw = readJsonFile(filePath);
+  if (!isRecord(raw)) {
+    throw new Error(`Workbench result must contain an object: ${filePath}`);
+  }
+  const { pass, score, evidence } = raw;
+  if (typeof pass !== 'boolean') {
+    throw new Error(`Workbench result pass must be boolean: ${filePath}`);
+  }
+  if (!(typeof score === 'number' && Number.isFinite(score))) {
+    throw new Error(`Workbench result score must be a finite number: ${filePath}`);
+  }
+  if (!Array.isArray(evidence) || !evidence.every((item) => typeof item === 'string')) {
+    throw new Error(`Workbench result evidence must be an array of strings: ${filePath}`);
+  }
+  return {
+    ...(raw as Partial<WorkbenchResult>),
+    pass, score, evidence,
+  };
+}
+
+export function shellQuote(value: string): string { // wraps `value` in single quotes for a POSIX shell
+  return `'${value.replace(/'/g, "'\\''")}'`; // each ' becomes '\'' — the doubled backslash survives; a lone \' cooked to just '
+}
+
+export function slugPathSegment(value: string): string { // trims, turns runs outside [A-Za-z0-9._-] into '-', strips edge dashes
+  return value.trim().replace(/[^A-Za-z0-9._-]+/g, '-').replace(/^-+|-+$/g, ''); // NOTE(review): can yield '' or dot-only names such as '..'
+}
+
+export function isRecord(value: unknown): value is Record<string, unknown> { // true for any non-null, non-array object
+  return typeof value === 'object' && value !== null && !Array.isArray(value);
+}
diff --git a/src/workbench/workspace.ts b/src/workbench/workspace.ts
new file mode 100644
index 0000000..eb86995
--- /dev/null
+++ b/src/workbench/workspace.ts
@@ -0,0 +1,55 @@
+import { cpSync, existsSync, mkdirSync, readdirSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+
+/** Returns the base env plus CASE / WORK / RESULTS, with PATH rebuilt as `<work>/bin[:<case>/bin][:<base PATH>]`. */
+export function buildWorkbenchEnv(params: {
+  caseDir: string;
+  workDir: string;
+  resultsDir: string;
+  baseEnv?: NodeJS.ProcessEnv;
+}): NodeJS.ProcessEnv {
+  const { caseDir, workDir, resultsDir } = params;
+  const inherited = params.baseEnv ?? process.env;
+  const caseBin = join(caseDir, 'bin');
+
+  // <work>/bin is always listed; <case>/bin only when it exists; the base PATH only when non-empty.
+  const searchPath: string[] = [join(workDir, 'bin')];
+  if (existsSync(caseBin)) searchPath.push(caseBin);
+  if (inherited.PATH) searchPath.push(inherited.PATH);
+
+  const env: NodeJS.ProcessEnv = { ...inherited };
+  env.PATH = searchPath.join(':');
+  env.CASE = caseDir;
+  env.WORK = workDir;
+  env.RESULTS = resultsDir;
+  return env;
+}
+
+/** Creates `dirPath` if it is missing, then removes every entry inside it; the directory itself is kept. */
+function ensureEmptyDirectory(dirPath: string): void {
+  mkdirSync(dirPath, { recursive: true });
+  readdirSync(dirPath).forEach((name) => {
+    rmSync(join(dirPath, name), { force: true, recursive: true });
+  });
+}
+
+/** Recursively copies each entry of `sourceDir` into `destinationDir`, creating the destination first if needed. */
+function copyDirectoryContents(sourceDir: string, destinationDir: string): void {
+  mkdirSync(destinationDir, { recursive: true });
+  readdirSync(sourceDir).forEach((name) => {
+    cpSync(join(sourceDir, name), join(destinationDir, name), { recursive: true });
+  });
+}
+
+/** Empties `workDir`, copies the references into it, then overlays `workspaceDir` when it is given and exists. */
+export function prepareWorkbenchDirectory(params: {
+  referencesDir: string;
+  workspaceDir?: string;
+  workDir: string;
+}): void {
+  const { referencesDir, workspaceDir, workDir } = params;
+  ensureEmptyDirectory(workDir);
+  copyDirectoryContents(referencesDir, workDir);
+  if (!workspaceDir || !existsSync(workspaceDir)) return;
+  copyDirectoryContents(workspaceDir, workDir); // copied second, so workspace files land on top of the references
+}
diff --git a/tests/fixtures/import-commands/argparse-sample.py b/tests/fixtures/import-commands/argparse-sample.py
deleted file mode 100644
index 64e08d1..0000000
--- a/tests/fixtures/import-commands/argparse-sample.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import argparse
-
-parser = argparse.ArgumentParser(description='CLI tool')
-subparsers = parser.add_subparsers(dest='command')
-
-create_parser = subparsers.add_parser('create', help='Create a new item')
-create_parser.add_argument('--name', help='Item name')
-create_parser.add_argument('--count', type=int, help='Count')
-
-list_parser = subparsers.add_parser('list', help='List all items')
-list_parser.add_argument('--limit', type=int, help='Max results')
-
-args = parser.parse_args()
diff --git a/tests/fixtures/import-commands/clap-sample.rs b/tests/fixtures/import-commands/clap-sample.rs
deleted file mode 100644
index 5ba1214..0000000
--- a/tests/fixtures/import-commands/clap-sample.rs
+++ /dev/null
@@ -1,17 +0,0 @@
-use clap::{Command, Arg};
-
-fn main() {
-    let matches = Command::new("mycli")
-        .subcommand(
-            Command::new("create")
-                .about("Create a new item")
-                .arg(Arg::new("name").long("name").help("Item name").required(false))
-                .arg(Arg::new("verbose").long("verbose").help("Verbose output").action(clap::ArgAction::SetTrue))
-        )
-        .subcommand(
-            Command::new("delete")
-                .about("Delete an item")
-                .arg(Arg::new("id").long("id").help("Item ID").required(true))
-        )
-        .get_matches();
-}
diff --git a/tests/fixtures/import-commands/click-sample.py b/tests/fixtures/import-commands/click-sample.py
deleted file mode 100644
index 368613b..0000000
--- a/tests/fixtures/import-commands/click-sample.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import click
-
-@click.group()
-def cli():
-    """CLI app."""
-    pass
-
-@cli.command()
-@click.option('--name', help='Item name')
-@click.option('--verbose', is_flag=True, help='Verbose output')
-def create(name, verbose):
-    """Create a new item."""
-    pass
-
-@cli.command()
-@click.argument('item_id')
-@click.option('--force', is_flag=True, help='Skip confirmation')
-def delete(item_id, force):
-    """Delete an item."""
-    pass
-
-if __name__ == '__main__':
-    cli()
diff --git a/tests/fixtures/import-commands/commander-sample.ts b/tests/fixtures/import-commands/commander-sample.ts
deleted file mode 100644
index 0b6fef0..0000000
--- a/tests/fixtures/import-commands/commander-sample.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import { Command } from 'commander';
-
-const program = new Command();
-
-program
-  .command('create')
-  .description('Create a new item')
-  .option('--name <value>', 'Item name')
-  .option('--dry-run', 'Preview only')
-  .action(() => {});
-
-program
-  .command('delete <id>')
-  .description('Delete an item by ID')
-  .action(() => {});
-
-program
-  .command('list')
-  .description('List all items')
-  .option('--limit <n>', 'Max results')
-  .action(() => {});
-
-program.parse(process.argv);
diff --git a/tests/fixtures/import-commands/help-output-account.txt b/tests/fixtures/import-commands/help-output-account.txt
deleted file mode 100644
index 59cebc3..0000000
--- a/tests/fixtures/import-commands/help-output-account.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-Usage: fast-cli account [options] [command]
-
-Commands:
-  create       Create a new account
-  list         List all accounts
-  delete <name>  Delete an account
-
-Options:
-  -h, --help  output usage information
diff --git a/tests/fixtures/import-commands/help-output-sample.txt b/tests/fixtures/import-commands/help-output-sample.txt
deleted file mode 100644
index abe2d05..0000000
--- a/tests/fixtures/import-commands/help-output-sample.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Usage: fast-cli [options] [command]
-
-Commands:
-  account     Account management
-  network     Network management
-  send        Send tokens
-  fund        Fund account
-  pay         Pay a URL
-  help [cmd]  display help for [cmd]
-
-Options:
-  -h, --help     output usage information
-  -V, --version  output the version number
diff --git a/tests/fixtures/import-commands/yargs-sample.ts b/tests/fixtures/import-commands/yargs-sample.ts
deleted file mode 100644
index 6be045d..0000000
--- a/tests/fixtures/import-commands/yargs-sample.ts
+++ /dev/null
@@ -1,18 +0,0 @@
-import yargs from 'yargs';
-
-yargs
-  .command(
-    'create',
-    'Create a new item',
-    (y) => y
-      .option('name', { describe: 'Item name', type: 'string' })
-      .option('verbose', { describe: 'Verbose output', type: 'boolean' }),
-    (_argv) => {},
-  )
-  .command(
-    'delete <id>',
-    'Delete an item',
-    (y) => y.positional('id', { describe: 'Item ID', type: 'string' }),
-    (_argv) => {},
-  )
-  .parse();
diff --git a/tests/fixtures/sample-skill.md b/tests/fixtures/sample-skill.md
deleted file mode 100644
index 7343437..0000000
--- a/tests/fixtures/sample-skill.md
+++ /dev/null
@@ -1,72 +0,0 @@
----
-name: deploy-service
-description: A skill for deploying microservices to Kubernetes
----
-
-# Deploy Service
-
-A comprehensive skill for deploying, validating, and monitoring microservice deployments.
-
-## Phase 1 — Requirements Discovery
-
-Ask clarifying questions about the deployment target until you have enough information:
-
-1. Which environment? (staging, production, canary)
-2. Which service name and version?
-3. Any special resource limits or scaling requirements?
-
-If the user specifies "production", require explicit confirmation before proceeding.
-When the user says "canary", ask for the traffic percentage split.
-
-## Phase 2 — Manifest Generation
-
-Generate the Kubernetes manifests based on gathered requirements:
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: <service-name>
-  namespace: <environment>
-spec:
-  replicas: <replica-count>
-  template:
-    spec:
-      containers:
-        - name: <service-name>
-          image: <registry>/<service-name>:<version>
-          resources:
-            limits:
-              cpu: <cpu-limit>
-              memory: <memory-limit>
-```
-
-1. Create the Deployment resource
-2. Create the Service resource
-3. Create the HorizontalPodAutoscaler if scaling is requested
-4. Apply namespace-specific overrides
-
-Do not include deprecated API versions.
-Never use `latest` as an image tag.
-
-## Phase 3 — Validation and Testing
-
-Run pre-deployment checks to ensure correctness:
-
-```bash
-kubectl apply --dry-run=server -f manifests/
-kubectl diff -f manifests/
-```
-
-Either run a smoke test suite or skip if the user explicitly opts out.
-
-Verify the following before proceeding:
-- All container images exist in the registry
-- Resource limits are within cluster quotas
-- No conflicting service names in the target namespace
-
-| Check | Tool | Pass Criteria |
-|-------|------|---------------|
-| Image exists | crane digest | Exit code 0 |
-| Quota check | kubectl describe quota | Used < Limit |
-| Name conflict | kubectl get svc | Not found |
diff --git a/tests/smoke-actions.ts b/tests/smoke-actions.ts
deleted file mode 100644
index 469bdb4..0000000
--- a/tests/smoke-actions.ts
+++ /dev/null
@@ -1,544 +0,0 @@
-import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
-import { join } from 'node:path';
-import { tmpdir } from 'node:os';
-
-import {
-  ACTION_SNAPSHOT_VERSION,
-  fromSurfaceSnapshot,
-  loadActionSnapshotFile,
-  toSurfaceSnapshot,
-  writeActionSnapshotFile,
-} from '../src/actions/snapshot.js';
-import { diffActionCatalog } from '../src/actions/diff.js';
-import { buildMcpToolDefinitionsFromSnapshot, buildSurfaceSnapshot, loadProjectConfig, loadSurfaceSnapshotFile } from '../src/project/index.js';
-import type { SurfaceSnapshot } from '../src/project/types.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (error: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${error.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-console.log('\n=== Action Core Smoke Tests ===\n');
-
-await test('snapshot write/load roundtrip includes artifact version', () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-actions-'));
-  try {
-    const snapshotPath = join(root, 'actions.snapshot.json');
-    writeActionSnapshotFile(snapshotPath, {
-      surface: 'mcp',
-      actions: [
-        {
-          key: 'wallet.create',
-          name: 'create_wallet',
-          args: [
-            { name: 'label', required: true, type: 'string' },
-            { name: 'network', required: false, type: 'string' },
-          ],
-        },
-      ],
-    });
-
-    const raw = JSON.parse(readFileSync(snapshotPath, 'utf-8')) as { version?: number };
-    assertEqual(raw.version, ACTION_SNAPSHOT_VERSION, 'snapshot file should include expected version field');
-
-    const loaded = loadActionSnapshotFile(snapshotPath);
-    assertEqual(loaded.version, ACTION_SNAPSHOT_VERSION, 'loaded snapshot version should match constant');
-    assertEqual(loaded.catalog.actions[0].key, 'wallet.create', 'action key should roundtrip');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('snapshot roundtrip preserves nested MCP arg schemas', () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-actions-schema-'));
-  try {
-    const snapshotPath = join(root, 'actions.snapshot.json');
-    writeActionSnapshotFile(snapshotPath, {
-      surface: 'mcp',
-      actions: [
-        {
-          key: 'folders.update',
-          name: 'folders.update',
-          args: [
-            {
-              name: 'folderIds',
-              required: true,
-              type: 'array',
-              schema: {
-                type: 'array',
-                items: { type: 'integer' },
-                description: 'Folder ids to update',
-              },
-            },
-          ],
-        },
-      ],
-    });
-
-    const loaded = loadActionSnapshotFile(snapshotPath);
-    const schema = loaded.catalog.actions[0].args[0].schema as { items?: { type?: string } };
-    assertEqual(schema.items?.type, 'integer', 'nested array items should survive snapshot roundtrip');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('diffActionCatalog ignores arg reordering and catches schema changes', () => {
-  const before = {
-    surface: 'mcp' as const,
-    actions: [
-      {
-        key: 'wallet.send',
-        name: 'send_tokens',
-        args: [
-          { name: 'amount', required: true, type: 'string' },
-          { name: 'to', required: true, type: 'string' },
-        ],
-      },
-    ],
-  };
-
-  const reordered = {
-    ...before,
-    actions: [
-      {
-        ...before.actions[0],
-        args: [...before.actions[0].args].reverse(),
-      },
-    ],
-  };
-
-  const noOpDiff = diffActionCatalog(before, reordered);
-  assertEqual(noOpDiff.changed.length, 0, 'arg reordering should not count as schema change');
-  assertEqual(noOpDiff.added.length, 0, 'arg reordering should not add actions');
-  assertEqual(noOpDiff.removed.length, 0, 'arg reordering should not remove actions');
-
-  const changed = {
-    ...before,
-    actions: [
-      {
-        ...before.actions[0],
-        args: [
-          { name: 'amount', required: false, type: 'string' },
-          { name: 'to', required: true, type: 'string' },
-        ],
-      },
-    ],
-  };
-
-  const changedDiff = diffActionCatalog(before, changed);
-  assertEqual(changedDiff.changed.length, 1, 'required-flag changes should count as schema changes');
-});
-
-await test('fromSurfaceSnapshot maps SurfaceSnapshot action names to ActionCatalog keys', () => {
-  const snapshot: SurfaceSnapshot = {
-    surface: 'cli',
-    actions: [
-      {
-        name: 'wallet create',
-        args: [
-          { name: 'label', required: true, type: 'string' },
-          { name: 'network', required: false, type: 'string' },
-        ],
-      },
-    ],
-  };
-
-  const catalog = fromSurfaceSnapshot(snapshot);
-  assertEqual(catalog.actions[0].key, 'wallet create', 'action name should map to canonical key');
-
-  const roundtrip = toSurfaceSnapshot(catalog);
-  assertEqual(roundtrip.actions[0].name, 'wallet create', 'action name should roundtrip via toSurfaceSnapshot');
-  assert(!('key' in roundtrip.actions[0]), 'SurfaceSnapshot actions should not include key field');
-});
-
-await test('fromSurfaceSnapshot trims whitespace from derived action keys', () => {
-  const snapshot: SurfaceSnapshot = {
-    surface: 'mcp',
-    actions: [
-      {
-        name: '  wallet.send  ',
-        args: [
-          { name: 'amount', required: true, type: 'string' },
-          { name: 'to', required: true, type: 'string' },
-        ],
-      },
-    ],
-  };
-
-  const converted = fromSurfaceSnapshot(snapshot);
-  assertEqual(converted.actions[0].key, 'wallet.send', 'action name whitespace should be trimmed to derive canonical key');
-
-  const normalized = {
-    surface: 'mcp' as const,
-    actions: [
-      {
-        key: 'wallet.send',
-        name: 'wallet.send',
-        args: [
-          { name: 'to', required: true, type: 'string' },
-          { name: 'amount', required: true, type: 'string' },
-        ],
-      },
-    ],
-  };
-
-  const diff = diffActionCatalog(converted, normalized);
-  assertEqual(diff.added.length, 0, 'trimmed key normalization should avoid false additions');
-  assertEqual(diff.removed.length, 0, 'trimmed key normalization should avoid false removals');
-  assertEqual(diff.changed.length, 0, 'trimmed key normalization should avoid false schema changes');
-});
-
-await test('diffActionCatalog reports added and removed actions', () => {
-  const before = {
-    surface: 'cli' as const,
-    actions: [
-      { key: 'wallet.create', name: 'wallet create', args: [] },
-      { key: 'wallet.balance', name: 'wallet balance', args: [] },
-    ],
-  };
-
-  const after = {
-    surface: 'cli' as const,
-    actions: [
-      { key: 'wallet.create', name: 'wallet create', args: [] },
-      { key: 'wallet.send', name: 'wallet send', args: [] },
-    ],
-  };
-
-  const diff = diffActionCatalog(before, after);
-  assertEqual(diff.added.length, 1, 'should report newly added actions');
-  assertEqual(diff.added[0].key, 'wallet.send', 'added action key should match');
-  assertEqual(diff.removed.length, 1, 'should report removed actions');
-  assertEqual(diff.removed[0].key, 'wallet.balance', 'removed action key should match');
-});
-
-await test('diffActionCatalog normalizes key whitespace during comparison', () => {
-  const before = {
-    surface: 'sdk' as const,
-    actions: [
-      {
-        key: '  FastWallet.send  ',
-        name: 'FastWallet.send',
-        args: [{ name: 'to', required: true, type: 'string' }],
-      },
-    ],
-  };
-  const after = {
-    surface: 'sdk' as const,
-    actions: [
-      {
-        key: 'FastWallet.send',
-        name: 'FastWallet.send',
-        args: [{ name: 'to', required: true, type: 'string' }],
-      },
-    ],
-  };
-
-  const diff = diffActionCatalog(before, after);
-  assertEqual(diff.added.length, 0, 'whitespace-only key differences should not add actions');
-  assertEqual(diff.removed.length, 0, 'whitespace-only key differences should not remove actions');
-  assertEqual(diff.changed.length, 0, 'whitespace-only key differences should not change schema');
-});
-
-await test('diffActionCatalog throws on duplicate keys', () => {
-  const before = {
-    surface: 'mcp' as const,
-    actions: [
-      { key: 'wallet.send', name: 'wallet.send', args: [] },
-      { key: 'wallet.send', name: 'wallet.send.v2', args: [] },
-    ],
-  };
-  const after = {
-    surface: 'mcp' as const,
-    actions: [{ key: 'wallet.send', name: 'wallet.send', args: [] }],
-  };
-
-  let threw = false;
-  try {
-    diffActionCatalog(before, after);
-  } catch (error: any) {
-    threw = true;
-    assert(error.message.includes('Duplicate action key'), 'error should mention duplicate action keys');
-    assert(error.message.includes('wallet.send'), 'error should include the offending key');
-    assert(error.message.includes('before'), 'error should include side context');
-  }
-
-  assert(threw, 'duplicate keys must throw instead of being silently collapsed');
-});
-
-await test('loadActionSnapshotFile fails clearly on malformed snapshot shape', () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-actions-malformed-'));
-  try {
-    const snapshotPath = join(root, 'actions.snapshot.json');
-    writeFileSync(snapshotPath, JSON.stringify({ version: ACTION_SNAPSHOT_VERSION, catalog: { surface: 'mcp' } }, null, 2), 'utf-8');
-
-    let threw = false;
-    try {
-      loadActionSnapshotFile(snapshotPath);
-    } catch (error: any) {
-      threw = true;
-      assert(error.message.includes('Invalid action snapshot file'), 'error should classify malformed snapshot shape');
-      assert(error.message.includes(snapshotPath), 'error should include file context');
-      assert(error.message.includes('catalog.actions'), 'error should include failing shape detail');
-    }
-
-    assert(threw, 'malformed snapshot should throw');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('loadActionSnapshotFile validates malformed action entries with context', () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-actions-malformed-action-'));
-  try {
-    const snapshotPath = join(root, 'actions.snapshot.json');
-    writeFileSync(snapshotPath, JSON.stringify({
-      version: ACTION_SNAPSHOT_VERSION,
-      catalog: {
-        surface: 'mcp',
-        actions: [
-          {
-            key: 'wallet.send',
-            name: 'wallet.send',
-            args: 'not-an-array',
-          },
-        ],
-      },
-    }, null, 2), 'utf-8');
-
-    let threw = false;
-    try {
-      loadActionSnapshotFile(snapshotPath);
-    } catch (error: any) {
-      threw = true;
-      assert(error.message.includes('Invalid action snapshot file'), 'error should classify malformed action shape');
-      assert(error.message.includes(snapshotPath), 'error should include file context');
-      assert(error.message.includes('catalog.actions[0].args'), 'error should include malformed field path');
-    }
-
-    assert(threw, 'malformed action entry should throw');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('loadActionSnapshotFile includes path on invalid JSON', () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-actions-invalid-json-'));
-  try {
-    const snapshotPath = join(root, 'actions.snapshot.json');
-    writeFileSync(snapshotPath, '{ "version": 1, "catalog": ', 'utf-8');
-
-    let threw = false;
-    try {
-      loadActionSnapshotFile(snapshotPath);
-    } catch (error: any) {
-      threw = true;
-      assert(error.message.includes('Invalid action snapshot file'), 'error should classify invalid JSON');
-      assert(error.message.includes(snapshotPath), 'error should include snapshot path');
-    }
-
-    assert(threw, 'invalid JSON should throw');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('buildSurfaceSnapshot returns surface snapshot from code-first discovery fixture', async () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-snapshot-bridge-'));
-  try {
-    const sourcePath = join(root, 'server.ts');
-    const configPath = join(root, 'skill-optimizer.json');
-
-    writeFileSync(
-      sourcePath,
-      [
-        'export const TOOLS = [',
-        '  {',
-        "    type: 'function',",
-        '    function: {',
-        "      name: 'create_wallet',",
-        "      parameters: {",
-        "        type: 'object',",
-        '        properties: {',
-        "          label: { type: 'string' },",
-        '        },',
-        "        required: ['label'],",
-        '      },',
-        '    },',
-        '  },',
-        '];',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    writeFileSync(configPath, JSON.stringify({
-      name: 'snapshot-bridge',
-      target: {
-        surface: 'mcp',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./server.ts'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const project = await loadProjectConfig(configPath);
-    const actual = buildSurfaceSnapshot(project);
-
-    assertEqual(actual.surface, 'mcp', 'snapshot surface should be mcp');
-    assertEqual(actual.actions.length, 1, 'snapshot should include discovered action');
-    assertEqual(actual.actions[0].name, 'create_wallet', 'action name should match discovered tool');
-    assertEqual(actual.actions[0].args.length, 1, 'snapshot should include tool args');
-    assertEqual(actual.actions[0].args[0].name, 'label', 'arg name should match discovered schema');
-    assertEqual(actual.actions[0].args[0].required, true, 'required arg flag should be preserved');
-    assert(!('key' in actual.actions[0]), 'SurfaceSnapshot action should not expose the key field');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('buildMcpToolDefinitionsFromSnapshot preserves nested arg schemas', () => {
-  const snapshot: SurfaceSnapshot = {
-    surface: 'mcp',
-    actions: [
-      {
-        name: 'folders.update',
-        args: [
-          {
-            name: 'folderIds',
-            required: true,
-            type: 'array',
-            schema: {
-              type: 'array',
-              items: { type: 'integer' },
-              description: 'Folder ids to update',
-            },
-          },
-          {
-            name: 'peers',
-            required: false,
-            type: 'array',
-            schema: {
-              type: 'array',
-              items: { type: 'string' },
-            },
-          },
-        ],
-      },
-    ],
-  };
-
-  const definitions = buildMcpToolDefinitionsFromSnapshot(snapshot);
-  const properties = definitions[0].function.parameters?.properties as Record<string, any>;
-  assertEqual(properties.folderIds.items.type, 'integer', 'array items should be preserved in rebuilt tool schema');
-  assertEqual(properties.peers.items.type, 'string', 'optional array items should be preserved in rebuilt tool schema');
-});
-
-await test('loadSurfaceSnapshotFile supports versioned action snapshot artifact', () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-snapshot-versioned-'));
-  try {
-    const snapshotPath = join(root, 'actions.snapshot.json');
-    writeActionSnapshotFile(snapshotPath, {
-      surface: 'cli',
-      actions: [
-        {
-          key: 'wallet create',
-          name: 'wallet create',
-          args: [{ name: '--label', required: true, type: 'string' }],
-        },
-      ],
-    });
-
-    const loaded = loadSurfaceSnapshotFile(snapshotPath);
-    assertEqual(loaded.surface, 'cli', 'versioned file should map to cli snapshot');
-    assertEqual(loaded.actions[0].name, 'wallet create', 'action name should map from action catalog');
-    assertEqual(loaded.actions[0].args[0].name, 'label', 'converted cli arg names should be normalized');
-    assert(!('key' in loaded.actions[0]), 'converted SurfaceSnapshot action should not expose key field');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('loadSurfaceSnapshotFile throws a clear error for unsupported plain snapshot format', () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-snapshot-unsupported-format-'));
-  try {
-    const snapshotPath = join(root, 'surface.snapshot.json');
-    writeFileSync(snapshotPath, JSON.stringify({
-      surface: 'cli',
-      actions: [
-        {
-          name: 'wallet create',
-          args: [{ name: '--label', required: true, type: 'string' }],
-        },
-      ],
-    }, null, 2), 'utf-8');
-
-    let threw = false;
-    try {
-      loadSurfaceSnapshotFile(snapshotPath);
-    } catch (error: any) {
-      threw = true;
-      assert(error.message.includes('not supported'), 'error should describe the format as unsupported');
-      assert(error.message.includes('.skill-optimizer/'), 'error should direct user to delete .skill-optimizer/');
-    }
-
-    assert(threw, 'unsupported plain snapshot format should throw');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('loadSurfaceSnapshotFile includes file context on invalid JSON', () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-snapshot-invalid-json-'));
-  try {
-    const snapshotPath = join(root, 'surface.snapshot.json');
-    writeFileSync(snapshotPath, '{ "surface": "mcp", "actions": ', 'utf-8');
-
-    let threw = false;
-    try {
-      loadSurfaceSnapshotFile(snapshotPath);
-    } catch (error: any) {
-      threw = true;
-      assert(error.message.includes('Invalid surface snapshot file'), 'error should classify invalid JSON');
-      assert(error.message.includes(snapshotPath), 'error should include snapshot path');
-    }
-
-    assert(threw, 'invalid JSON should throw');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-if (failed > 0) {
-  console.log(`\n${failed}/${passed + failed} tests failed`);
-  process.exit(1);
-}
-
-console.log(`\n${passed}/${passed + failed} tests passed`);
diff --git a/tests/smoke-changelog-coverage.ts b/tests/smoke-changelog-coverage.ts
deleted file mode 100644
index 4041e73..0000000
--- a/tests/smoke-changelog-coverage.ts
+++ /dev/null
@@ -1,87 +0,0 @@
-import { strict as assert } from 'node:assert';
-import { readFileSync, readdirSync } from 'node:fs';
-import { resolve } from 'node:path';
-
-/**
- * Parses the top version block of CHANGELOG.md and checks that every
- * item mentioned in Added/Fixed has at least one test file containing
- * a relevant token.
- *
- * Guards against "shipped feature, forgot the test" — the class that
- * let P1/P2/P3 escape in the first place.
- */
-
-const repoRoot = resolve(process.cwd());
-const changelog = readFileSync(resolve(repoRoot, 'CHANGELOG.md'), 'utf-8');
-
-// Grab the first ## block
-const blocks = changelog.split(/^##\s+/m).slice(1);
-assert.ok(blocks.length > 0, 'CHANGELOG.md must have at least one ## version heading');
-const topBlock = blocks[0]!;
-
-function extractSection(block: string, name: string): string[] {
-  // Split block on ### headings and find the named section
-  const parts = block.split(/^###\s+/m);
-  for (const part of parts) {
-    if (part.trimStart().toLowerCase().startsWith(name.toLowerCase())) {
-      return part
-        .split('\n')
-        .slice(1) // skip the section heading line
-        .map((l) => l.trim())
-        .filter((l) => l.startsWith('-'))
-        .map((l) => l.replace(/^-\s*/, ''));
-    }
-  }
-  return [];
-}
-
-const added = extractSection(topBlock, 'Added');
-const fixed = extractSection(topBlock, 'Fixed');
-const items = [...added, ...fixed];
-
-// If no items exist in the top block, skip the check (pre-release state).
-if (items.length === 0) {
-  console.log('SKIP: no Added/Fixed items in top CHANGELOG block');
-  process.exit(0);
-}
-
-const STOP = new Set([
-  'this', 'that', 'from', 'with', 'into', 'when', 'then',
-  'some', 'have', 'been', 'does', 'must', 'will', 'true', 'false',
-  'none', 'more', 'less', 'only', 'each', 'other', 'also',
-  'added', 'fixed', 'remove', 'removed', 'change', 'changed',
-  'every', 'their', 'where', 'which', 'about', 'bench', 'mark',
-]);
-
-const testFiles = readdirSync(resolve(repoRoot, 'tests'))
-  .filter((f) => f.startsWith('smoke-') && f.endsWith('.ts'))
-  .map((f) => readFileSync(resolve(repoRoot, 'tests', f), 'utf-8'));
-
-let failures = 0;
-for (const item of items) {
-  const tokens = item
-    .toLowerCase()
-    .split(/[^a-z0-9]+/)
-    .filter((t) => t.length >= 4 && !STOP.has(t))
-    .slice(0, 8);
-  if (tokens.length === 0) continue;
-  // Require at least 2 tokens to co-occur in a single test file (whole-word match).
-  // Prevents false-passes where a lone generic word like "prompt" or "coverage"
-  // appears somewhere in the corpus but no test actually covers the claimed behavior.
-  const minMatch = Math.min(2, tokens.length);
-  const hit = testFiles.some((content) => {
-    const matched = tokens.filter((t) => new RegExp(`\\b${t}\\b`, 'i').test(content));
-    return matched.length >= minMatch;
-  });
-  if (!hit) {
-    console.error(`[FAIL] CHANGELOG entry has no test reference: "${item}"`);
-    console.error(`       searched for tokens: ${tokens.join(', ')}`);
-    failures += 1;
-  }
-}
-
-assert.strictEqual(failures, 0,
-  `${failures} CHANGELOG item(s) have no matching test file — ` +
-  `either add a test or remove the CHANGELOG claim`);
-
-console.log(`PASS: smoke-changelog-coverage (${items.length} items, all with tests)`);
diff --git a/tests/smoke-cli-entry.ts b/tests/smoke-cli-entry.ts
deleted file mode 100644
index 06d3065..0000000
--- a/tests/smoke-cli-entry.ts
+++ /dev/null
@@ -1,50 +0,0 @@
-import { positionals } from '../src/cli.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (error: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${error.message}`);
-  }
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-console.log('\n=== CLI Entry Smoke Tests ===\n');
-
-await test('positionals keeps optimize command when boolean flag appears first', () => {
-  const result = positionals(['--skip-generation', 'optimize', '--config', './skill-optimizer.json']);
-  assertEqual(result[0], 'optimize', 'optimize command should remain positional');
-});
-
-await test('positionals keeps run command when boolean flag appears first', () => {
-  const result = positionals(['--no-cache', 'run', '--config', './skill-optimizer.json']);
-  assertEqual(result[0], 'run', 'run command should remain positional');
-});
-
-await test('positionals rejects unknown flags instead of swallowing the command', () => {
-  let threw = false;
-  try {
-    positionals(['--verbose', 'optimize', '--config', './skill-optimizer.json']);
-  } catch (error: any) {
-    threw = true;
-    assertEqual(error.message, 'Unknown flag: --verbose', 'unknown flag error should be explicit');
-  }
-  if (!threw) {
-    throw new Error('unknown flag should throw');
-  }
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-cli.ts b/tests/smoke-cli.ts
deleted file mode 100644
index bb2c375..0000000
--- a/tests/smoke-cli.ts
+++ /dev/null
@@ -1,313 +0,0 @@
-import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
-import { join } from 'node:path';
-import { tmpdir } from 'node:os';
-
-import { extractShellBlock, parseShellCommands, extractFromCliMarkdown } from '../src/benchmark/extractors/cli-extractor.js';
-import { extract } from '../src/benchmark/extractors/index.js';
-import { loadCliCommands } from '../src/benchmark/config.js';
-import type { BenchmarkConfig, LLMResponse } from '../src/benchmark/types.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(
-      `${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`,
-    );
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) {
-    throw new Error(`Assertion failed: ${message}`);
-  }
-}
-
-console.log('\n=== CLI Surface Smoke Tests ===\n');
-
-await test('shell block extraction success', () => {
-  const markdown = 'Before\n```bash\nfast wallet send --to abc --amount 10\n```\nAfter';
-  const shell = extractShellBlock(markdown);
-  assertEqual(shell, 'fast wallet send --to abc --amount 10', 'should extract shell block');
-});
-
-await test('shell block extraction fails when none', () => {
-  const shell = extractShellBlock('No shell block here');
-  assertEqual(shell, null, 'should return null without bash/sh block');
-});
-
-await test('shell block extraction fails when multiple shell blocks', () => {
-  const markdown = [
-    '```bash',
-    'fast wallet send --to a --amount 1',
-    '```',
-    '```sh',
-    'fast wallet send --to b --amount 2',
-    '```',
-  ].join('\n');
-  const shell = extractShellBlock(markdown);
-  assertEqual(shell, null, 'should return null for multiple shell blocks');
-});
-
-await test('simple command', () => {
-  const calls = parseShellCommands('fast status');
-  assertEqual(calls.length, 1, 'one command expected');
-  assertEqual(calls[0].method, 'fast status', 'method should include executable and subcommand');
-});
-
-await test('subcommands', () => {
-  const calls = parseShellCommands('fast wallet send --to abc --amount 10');
-  assertEqual(calls[0].method, 'fast wallet send', 'method should include subcommand path');
-});
-
-await test('long option with separate value', () => {
-  const calls = parseShellCommands('fast wallet send --to fast1abc --amount 10');
-  assertEqual(calls[0].args.to as string, 'fast1abc', 'to should be parsed');
-  assertEqual(calls[0].args.amount as string, '10', 'amount should be parsed');
-});
-
-await test('long option with equals value', () => {
-  const calls = parseShellCommands('fast wallet send --to=fast1abc --amount=10');
-  assertEqual(calls[0].args.to as string, 'fast1abc', 'to should be parsed from = form');
-  assertEqual(calls[0].args.amount as string, '10', 'amount should be parsed from = form');
-});
-
-await test('short option with value', () => {
-  const calls = parseShellCommands('fast wallet create -n testnet');
-  assertEqual(calls[0].args.n as string, 'testnet', 'short option value should be parsed');
-});
-
-await test('positional args', () => {
-  const calls = parseShellCommands('fast wallet send --to fast1abc -- 10 memo');
-  assertEqual(calls[0].args._positional_0 as string, '10', 'first positional should be parsed');
-  assertEqual(calls[0].args._positional_1 as string, 'memo', 'second positional should be parsed');
-});
-
-await test('multiple commands and line numbers', () => {
-  const calls = parseShellCommands([
-    'fast wallet create -n testnet',
-    'fast wallet balance --address fast1abc',
-  ].join('\n'));
-  assertEqual(calls.length, 2, 'two commands expected');
-  assertEqual(calls[0].line, 1, 'first command line should be 1');
-  assertEqual(calls[1].line, 2, 'second command line should be 2');
-});
-
-await test('quoted strings', () => {
-  const calls = parseShellCommands('fast wallet send --memo "hello world" --to fast1abc');
-  assertEqual(calls[0].args.memo as string, 'hello world', 'quoted value should be unwrapped');
-});
-
-await test('known commands keep trailing positional args out of method', async () => {
-  const config = {
-    name: 'test-cli',
-    surface: 'cli',
-    cli: {
-      commands: 'commands.json',
-      commandDefinitions: [
-        { command: 'fast logs' },
-      ],
-    },
-    tasks: 'tasks.json',
-    llm: {
-      baseUrl: '',
-      apiKeyEnv: 'OPENROUTER_API_KEY',
-      format: 'openai',
-      models: [],
-    },
-  } as BenchmarkConfig & {
-    surface: 'cli';
-    cli: { commands: string; commandDefinitions: Array<{ command: string }> };
-  };
-
-  const response: LLMResponse = {
-    content: '```bash\nfast logs my-service\n```',
-  };
-
-  const { calls } = await extract(response, config);
-  assertEqual(calls.length, 1, 'one call expected');
-  assertEqual(calls[0].method, 'fast logs', 'method should match known command path only');
-  assertEqual(calls[0].args._positional_0 as string, 'my-service', 'trailing token should be positional');
-});
-
-await test('multi-token runner prefix resolves to known command', async () => {
-  const config = {
-    name: 'test-cli',
-    surface: 'cli',
-    cli: {
-      commands: 'commands.json',
-      commandDefinitions: [{ command: 'doctor' }],
-    },
-    tasks: 'tasks.json',
-    llm: {
-      baseUrl: '',
-      apiKeyEnv: 'OPENROUTER_API_KEY',
-      format: 'openai',
-      models: [],
-    },
-  } as BenchmarkConfig & {
-    surface: 'cli';
-    cli: { commands: string; commandDefinitions: Array<{ command: string }> };
-  };
-
-  const response: LLMResponse = {
-    content: '```bash\nnpx skill-optimizer doctor --config ./foo.json\n```',
-  };
-
-  const { calls } = await extract(response, config);
-  assertEqual(calls.length, 1, 'one call expected');
-  assertEqual(calls[0].method, 'doctor', 'method should be the known command, not the prefix');
-  assertEqual(calls[0].args.config as string, './foo.json', 'config flag value should be parsed');
-});
-
-await test('flag value before subcommand does not match as method', async () => {
-  const config = {
-    name: 'test-cli',
-    surface: 'cli',
-    cli: {
-      commands: 'commands.json',
-      commandDefinitions: [{ command: 'run' }],
-    },
-    tasks: 'tasks.json',
-    llm: {
-      baseUrl: '',
-      apiKeyEnv: 'OPENROUTER_API_KEY',
-      format: 'openai',
-      models: [],
-    },
-  } as BenchmarkConfig & {
-    surface: 'cli';
-    cli: { commands: string; commandDefinitions: Array<{ command: string }> };
-  };
-
-  // Pathological shape: subcommand sits at index 4 (past the skip<=2 window).
-  // The fix caps skip at 2 so we don't reach past a flag-and-value pair and
-  // accidentally anchor on a value token. The heuristic fallback applies here.
-  const response: LLMResponse = {
-    content: '```bash\nnpx skill-optimizer --config ./foo.json run\n```',
-  };
-
-  const { calls } = await extract(response, config);
-  assertEqual(calls.length, 1, 'one call expected');
-  assert(
-    calls[0].method !== 'run',
-    `method should not spuriously match past the flag-value pair (got "${calls[0].method}")`,
-  );
-});
-
-await test('command with env assignment', () => {
-  const calls = parseShellCommands('FAST_NETWORK=testnet FAST_PROFILE=dev fast wallet status');
-  const env = calls[0].args.env as Record<string, string>;
-  assertEqual(env.FAST_NETWORK, 'testnet', 'env FAST_NETWORK parsed');
-  assertEqual(env.FAST_PROFILE, 'dev', 'env FAST_PROFILE parsed');
-});
-
-await test('multiline with trailing backslash', () => {
-  const markdown = [
-    '```bash',
-    'fast wallet send \\',
-    '  --to fast1abc \\',
-    '  --amount 10',
-    '```',
-  ].join('\n');
-  const calls = extractFromCliMarkdown(markdown);
-  assertEqual(calls.length, 1, 'one merged command expected');
-  assertEqual(calls[0].args.to as string, 'fast1abc', 'to should be parsed');
-  assertEqual(calls[0].args.amount as string, '10', 'amount should be parsed');
-});
-
-await test('chained commands split into multiple calls', () => {
-  const calls = parseShellCommands('fast auth login && fast wallet balance');
-  assertEqual(calls.length, 2, 'two chained commands expected');
-  assertEqual(calls[0].method, 'fast auth login', 'first chained command should be parsed');
-  assertEqual(calls[1].method, 'fast wallet balance', 'second chained command should be parsed');
-});
-
-await test('extract factory dispatches surface=cli', async () => {
-  const config = {
-    name: 'test-cli',
-    surface: 'cli',
-    cli: { commands: 'commands.json' },
-    tasks: 'tasks.json',
-    llm: {
-      baseUrl: '',
-      apiKeyEnv: 'OPENROUTER_API_KEY',
-      format: 'openai',
-      models: [],
-    },
-  } as BenchmarkConfig & {
-    surface: 'cli';
-    cli: { commands: string };
-  };
-
-  const response: LLMResponse = {
-    content: '```sh\nfast wallet send --to fast1abc --amount 10\n```',
-  };
-
-  const { calls, generatedCode } = await extract(response, config);
-  assertEqual(generatedCode, 'fast wallet send --to fast1abc --amount 10', 'generatedCode should preserve the shell block contents');
-  assertEqual(calls.length, 1, 'one call expected');
-  assertEqual(calls[0].method, 'fast wallet send', 'method should be parsed');
-});
-
-await test('loadCliCommands: accepts flat command schema', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-cli-'));
-  try {
-    const file = join(dir, 'commands.json');
-    writeFileSync(file, JSON.stringify([
-      {
-        command: 'fast wallet send',
-        options: [{ name: 'to', takesValue: true }],
-      },
-    ]), 'utf-8');
-
-    const commands = loadCliCommands(file);
-    assertEqual(commands.length, 1, 'one command expected');
-    assertEqual(commands[0].command, 'fast wallet send', 'command path should be loaded');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('loadCliCommands: rejects entries without command', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-cli-'));
-  try {
-    const file = join(dir, 'commands.json');
-    writeFileSync(file, JSON.stringify([
-      {
-        path: 'fast wallet send',
-      },
-    ]), 'utf-8');
-
-    let threw = false;
-    try {
-      loadCliCommands(file);
-    } catch (error: any) {
-      threw = true;
-      assert(
-        error.message.includes('command'),
-        'error should mention missing command field',
-      );
-    }
-
-    assert(threw, 'should reject command entries without command field');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-code.ts b/tests/smoke-code.ts
deleted file mode 100644
index d33b21e..0000000
--- a/tests/smoke-code.ts
+++ /dev/null
@@ -1,1021 +0,0 @@
-import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
-import { join } from 'node:path';
-import { tmpdir } from 'node:os';
-
-import { extractCodeBlock } from '../src/benchmark/extractors/code-extractor.js';
-import { extractSdkCodeBlock } from '../src/benchmark/extractors/code-extractor.js';
-import { extractAllFromCode } from '../src/benchmark/extractors/code-analyzer.js';
-import { extract } from '../src/benchmark/extractors/index.js';
-import { evaluateTask } from '../src/benchmark/evaluator.js';
-import { computeCoverage } from '../src/benchmark/coverage.js';
-import { initBenchmark } from '../src/benchmark/init.js';
-import { loadConfig } from '../src/benchmark/config.js';
-import type { ExtractedCall, TaskDefinition, ModelConfig, BenchmarkConfig, LLMResponse } from '../src/benchmark/types.js';
-
-// ── Test harness ───────────────────────────────────────────────────────────
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-// ── Helpers ────────────────────────────────────────────────────────────────
-
-const MODEL: ModelConfig = { id: 'test/model', name: 'TestModel', tier: 'flagship' as const };
-
-const KNOWN_METHODS = new Set([
-  'FastProvider.constructor',
-  'FastWallet.fromKeyfile',
-  'FastWallet.send',
-  'FastWallet.balance',
-]);
-
-function makeTask(id: string, methods: string[]): TaskDefinition {
-  return {
-    id,
-    prompt: `Task ${id}`,
-    expected_actions: methods.map((name) => ({ name })),
-  };
-}
-
-function makeCall(method: string, args: Record<string, unknown> = {}): ExtractedCall {
-  return { method, args, line: 1, raw: 'mock' };
-}
-
-// ── Tests ──────────────────────────────────────────────────────────────────
-
-console.log('\n=== SDK Surface Smoke Tests ===\n');
-
-// Group 1: extractCodeBlock
-
-await test('extractCodeBlock: finds typescript block', () => {
-  const md = '```typescript\nconst x = 1;\n```';
-  const result = extractCodeBlock(md);
-  assertEqual(result, 'const x = 1;', 'should extract typescript block content');
-});
-
-await test('extractCodeBlock: finds ts block', () => {
-  const md = '```ts\nconst x = 1;\n```';
-  const result = extractCodeBlock(md);
-  assertEqual(result, 'const x = 1;', 'should extract ts block content');
-});
-
-await test('extractCodeBlock: returns null on no block', () => {
-  const result = extractCodeBlock('Here is some text without any code blocks');
-  assertEqual(result, null, 'should return null when no code block present');
-});
-
-await test('extractCodeBlock: returns null on non-ts block', () => {
-  // The regex only matches typescript|ts|javascript|js or bare ```.
-  // A ```python block does NOT match, so it returns null.
-  const md = "```python\nprint('hello')\n```";
-  const result = extractCodeBlock(md);
-  assertEqual(result, null, 'should return null for python code block');
-});
-
-await test('extractSdkCodeBlock: finds python block', () => {
-  const md = '```python\nclient = FastClient()\n```';
-  const result = extractSdkCodeBlock(md, 'python');
-  assertEqual(result, 'client = FastClient()', 'should extract python block content');
-});
-
-await test('extractSdkCodeBlock: finds rust block', () => {
-  const md = '```rust\nlet client = FastClient::new();\n```';
-  const result = extractSdkCodeBlock(md, 'rust');
-  assertEqual(result, 'let client = FastClient::new();', 'should extract rust block content');
-});
-
-await test('loadConfig: rejects unsupported sdk language', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-lang-'));
-  try {
-    const configPath = join(dir, 'skill-optimizer.json');
-    writeFileSync(configPath, JSON.stringify({
-      name: 'bad-sdk',
-      target: {
-        surface: 'sdk',
-        repoPath: '.',
-        sdk: { language: 'java' },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        baseUrl: 'https://example.com',
-        format: 'openai',
-        models: [{ id: 'test/model', name: 'Test Model', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-    writeFileSync(join(dir, 'tasks.json'), JSON.stringify({ tasks: [] }, null, 2), 'utf-8');
-
-    let threw = false;
-    try {
-      await loadConfig(configPath);
-    } catch (error: any) {
-      threw = true;
-      assert(
-        error.message.includes('sdk.language'),
-        'error should mention sdk.language validation',
-      );
-    }
-
-    assert(threw, 'should reject unsupported sdk.language values');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-// Group 2: extractAllFromCode (tree-sitter, no config hints needed)
-
-await test('extractAllFromCode: constructor call', async () => {
-  const code = 'const provider = new FastProvider("testnet");';
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 1, 'should find 1 call');
-  assertEqual(calls[0].method, 'FastProvider.constructor', 'method should be FastProvider.constructor');
-  assertEqual(calls[0].args['_positional_0'] as string, 'testnet', 'first positional arg should be "testnet"');
-});
-
-await test('extractAllFromCode: variable tracking (raw calls)', async () => {
-  const code = [
-    'const provider = new FastProvider("testnet");',
-    'const wallet = await FastWallet.fromKeyfile(provider);',
-    'const balance = await wallet.balance();',
-  ].join('\n');
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 3, 'should find 3 calls');
-  // Raw extraction: wallet.balance is unresolved (resolution happens in evaluateTask with bindings)
-  assertEqual(calls[2].method, 'wallet.balance', 'third raw call method should be wallet.balance');
-});
-
-await test('extractAllFromCode: static method', async () => {
-  const code = 'const wallet = await FastWallet.fromKeyfile(provider, "merchant");';
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 1, 'should find 1 call');
-  assertEqual(calls[0].method, 'FastWallet.fromKeyfile', 'method should be FastWallet.fromKeyfile');
-  assertEqual(calls[0].args['_positional_1'] as string, 'merchant', 'second positional arg should be "merchant"');
-});
-
-await test('extractAllFromCode: object arguments', async () => {
-  const code = [
-    'const provider = new FastProvider("testnet");',
-    'const wallet = await FastWallet.fromKeyfile(provider);',
-    'await wallet.send({ to: "fast1abc", amount: "5", token: "FAST" });',
-  ].join('\n');
-  const { calls } = await extractAllFromCode(code);
-  // Raw extraction: wallet.send is unresolved
-  const sendCall = calls.find((c) => c.method === 'wallet.send');
-  assert(sendCall !== undefined, 'should find a wallet.send call (raw, unresolved)');
-  assertEqual(sendCall!.args['to'] as string, 'fast1abc', 'to arg should be "fast1abc"');
-  assertEqual(sendCall!.args['amount'] as string, '5', 'amount arg should be "5"');
-  assertEqual(sendCall!.args['token'] as string, 'FAST', 'token arg should be "FAST"');
-});
-
-await test('extractAllFromCode: empty code returns empty arrays', async () => {
-  const { calls, bindings } = await extractAllFromCode('');
-  assertEqual(calls.length, 0, 'should return empty array for empty code');
-  assertEqual(bindings.size, 0, 'should return empty bindings for empty code');
-});
-
-await test('extractAllFromCode: standalone function call', async () => {
-  const code = `const result = await x402Pay({ url: 'https://api.example.com', wallet: { type: 'evm' } });`;
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 1, 'should find 1 call');
-  assertEqual(calls[0].method, 'x402Pay', 'method should be x402Pay');
-  assertEqual(calls[0].args['url'] as string, 'https://api.example.com', 'url arg');
-});
-
-await test('extractAllFromCode: bindings capture factory returns', async () => {
-  const code = [
-    `const f = fast({ network: 'testnet' });`,
-    `await f.setup();`,
-    `const balance = await f.balance({ token: 'FAST' });`,
-  ].join('\n');
-  const { calls, bindings } = await extractAllFromCode(code);
-  assertEqual(calls.length, 3, 'should find 3 calls');
-  assertEqual(calls[0].method, 'fast', 'first call should be fast');
-  assertEqual(calls[0].args['network'] as string, 'testnet', 'network arg');
-  // Raw calls use variable names; bindings map f → fast
-  assertEqual(calls[1].method, 'f.setup', 'second raw call should be f.setup');
-  assertEqual(calls[2].method, 'f.balance', 'third raw call should be f.balance');
-  assertEqual(bindings.get('f'), 'fast', 'bindings should map f → fast');
-  assertEqual(calls[2].args['token'] as string, 'FAST', 'token arg');
-});
-
-await test('extractAllFromCode: mixed classes and functions', async () => {
-  const code = [
-    `const account = createEvmWallet('~/.evm/keys/default.json');`,
-    `const allset = new AllSetProvider({ network: 'testnet' });`,
-    `await allset.sendToFast({ chain: 'arbitrum', token: 'USDC', amount: '1000000' });`,
-  ].join('\n');
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 3, 'should find 3 calls');
-  assertEqual(calls[0].method, 'createEvmWallet', 'first call should be createEvmWallet');
-  assertEqual(calls[0].args['_positional_0'] as string, '~/.evm/keys/default.json', 'keyfile path arg');
-  assertEqual(calls[1].method, 'AllSetProvider.constructor', 'second call should be AllSetProvider.constructor');
-  // Raw: allset.sendToFast (unresolved)
-  assertEqual(calls[2].method, 'allset.sendToFast', 'third raw call should be allset.sendToFast');
-  assertEqual(calls[2].args['chain'] as string, 'arbitrum', 'chain arg');
-});
-
-await test('extractAllFromCode: standalone function with no classes', async () => {
-  const code = [
-    `const result = await x402Pay({`,
-    `  url: 'https://api.example.com/premium',`,
-    `  wallet: { type: 'evm', privateKey: '0x123', address: '0xabc' },`,
-    `  verbose: true,`,
-    `});`,
-  ].join('\n');
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 1, 'should find 1 call');
-  assertEqual(calls[0].method, 'x402Pay', 'method should be x402Pay');
-  assertEqual(calls[0].args['url'] as string, 'https://api.example.com/premium', 'url arg');
-  assertEqual(calls[0].args['verbose'] as boolean, true, 'verbose arg');
-});
-
-await test('extractAllFromCode: nested object arguments', async () => {
-  const code = [
-    `const result = await x402Pay({`,
-    `  url: 'https://api.example.com/premium',`,
-    `  wallet: { type: 'evm', privateKey: '0x123', address: '0xabc' },`,
-    `});`,
-  ].join('\n');
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 1, 'should find 1 call');
-  assertEqual((calls[0].args['wallet'] as Record<string, unknown>).type as string, 'evm', 'wallet.type arg');
-  assertEqual((calls[0].args['wallet'] as Record<string, unknown>).address as string, '0xabc', 'wallet.address arg');
-});
-
-await test('extractAllFromCode: resolves identifier-backed nested arguments', async () => {
-  const code = [
-    `const fastWallet = { type: 'fast', address: 'fast1abc', publicKey: 'pub', privateKey: 'priv' };`,
-    `const evmWallet = { type: 'evm', address: '0xabc', privateKey: '0x123' };`,
-    `const result = await x402Pay({`,
-    `  url: 'https://api.example.com/premium',`,
-    `  wallet: [fastWallet, evmWallet],`,
-    `});`,
-  ].join('\n');
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 1, 'should find 1 call');
-  const wallet = calls[0].args['wallet'] as unknown[];
-  assertEqual((wallet[0] as Record<string, unknown>).type as string, 'fast', 'wallet[0].type arg');
-  assertEqual((wallet[1] as Record<string, unknown>).type as string, 'evm', 'wallet[1].type arg');
-});
-
-await test('extract factory dispatches surface=sdk', async () => {
-  const config: BenchmarkConfig = {
-    name: 'test-sdk',
-    surface: 'sdk',
-    sdk: { language: 'typescript' },
-    tasks: 'tasks.json',
-    llm: {
-      baseUrl: '',
-      apiKeyEnv: 'OPENROUTER_API_KEY',
-      format: 'openai',
-      models: [],
-    },
-  };
-
-  const response: LLMResponse = {
-    content: '```ts\nconst provider = new FastProvider("testnet");\n```',
-  };
-
-  const { calls, generatedCode } = await extract(response, config);
-  assertEqual(generatedCode, 'const provider = new FastProvider("testnet");', 'should preserve extracted TypeScript block');
-  assertEqual(calls.length, 1, 'one call expected');
-  assertEqual(calls[0].method, 'FastProvider.constructor', 'method should be parsed');
-});
-
-await test('initBenchmark sdk: creates skill-optimizer.json with task generation enabled', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-init-'));
-  try {
-    initBenchmark(dir, 'sdk');
-    const config = JSON.parse(readFileSync(join(dir, '.skill-optimizer', 'skill-optimizer.json'), 'utf-8')) as {
-      target: { surface: string };
-      benchmark: { taskGeneration?: { enabled?: boolean }; tasks?: string };
-    };
-    assertEqual(config.target.surface, 'sdk', 'sdk scaffold should emit sdk surface');
-    assert(config.benchmark.taskGeneration?.enabled === true, 'scaffold should enable task generation');
-    assert(!config.benchmark.tasks, 'scaffold should not set benchmark.tasks when task generation is on');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('initBenchmark cli: creates cli-commands.json and sets target.cli.commands', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-init-'));
-  try {
-    initBenchmark(dir, 'cli');
-    const configPath = join(dir, '.skill-optimizer', 'skill-optimizer.json');
-    const commandsPath = join(dir, '.skill-optimizer', 'cli-commands.json');
-    const config = JSON.parse(readFileSync(configPath, 'utf-8')) as {
-      target: { surface: string; cli?: { commands?: string } };
-      benchmark: { taskGeneration?: { enabled?: boolean }; tasks?: string };
-    };
-    assertEqual(config.target.surface, 'cli', 'cli scaffold should emit cli surface');
-    assert(existsSync(commandsPath), 'cli scaffold should create cli-commands.json');
-    assert(typeof config.target.cli?.commands === 'string', 'cli scaffold should set target.cli.commands');
-    assert(config.benchmark.taskGeneration?.enabled === true, 'cli scaffold should enable task generation');
-    assert(!config.benchmark.tasks, 'cli scaffold should not set benchmark.tasks');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('initBenchmark mcp: creates tools.json and sets target.mcp.tools', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-init-'));
-  try {
-    initBenchmark(dir, 'mcp');
-    const configPath = join(dir, '.skill-optimizer', 'skill-optimizer.json');
-    const toolsPath = join(dir, '.skill-optimizer', 'tools.json');
-    const config = JSON.parse(readFileSync(configPath, 'utf-8')) as {
-      target: { surface: string; mcp?: { tools?: string } };
-      benchmark: { taskGeneration?: { enabled?: boolean }; tasks?: string };
-    };
-    assertEqual(config.target.surface, 'mcp', 'mcp scaffold should emit mcp surface');
-    assert(existsSync(toolsPath), 'mcp scaffold should create tools.json');
-    assert(typeof config.target.mcp?.tools === 'string', 'mcp scaffold should set target.mcp.tools');
-    assert(config.benchmark.taskGeneration?.enabled === true, 'mcp scaffold should enable task generation');
-    assert(!config.benchmark.tasks, 'mcp scaffold should not set benchmark.tasks');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-// Group 3: evaluateTask
-
-await test('evaluateTask: perfect match → taskPassed=true', () => {
-  const task = makeTask('t1', ['FastProvider.constructor', 'FastWallet.fromKeyfile']);
-  const extractedCalls: ExtractedCall[] = [
-    makeCall('FastProvider.constructor'),
-    makeCall('FastWallet.fromKeyfile'),
-  ];
-  const result = evaluateTask({
-    task,
-    model: MODEL,
-    surface: 'sdk',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods: KNOWN_METHODS,
-  });
-  assertEqual(result.metrics.taskPassed, true, 'taskPassed should be true');
-  assertEqual(result.metrics.toolSelectionAccuracy, 1.0, 'toolSelectionAccuracy should be 1.0');
-  assertEqual(result.metrics.toolRecall, 1.0, 'toolRecall should be 1.0');
-});
-
-await test('evaluateTask: hallucinated method → hallucinationRate > 0', () => {
-  const task = makeTask('t2', ['FastProvider.constructor']);
-  const extractedCalls: ExtractedCall[] = [
-    makeCall('FastProvider.constructor'),
-    makeCall('FastWallet.doSomethingFake'),
-  ];
-  const result = evaluateTask({
-    task,
-    model: MODEL,
-    surface: 'sdk',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods: KNOWN_METHODS,
-  });
-  assert(result.metrics.hallucinatedActions.length > 0, 'hallucinatedActions should be non-empty');
-  assert(result.metrics.hallucinationRate > 0, 'hallucinationRate should be > 0');
-});
-
-await test('evaluateTask: missing expected method → taskPassed=false', () => {
-  const task = makeTask('t3', ['FastProvider.constructor', 'FastWallet.fromKeyfile']);
-  // Only provide one of the two expected calls
-  const extractedCalls: ExtractedCall[] = [
-    makeCall('FastProvider.constructor'),
-  ];
-  const result = evaluateTask({
-    task,
-    model: MODEL,
-    surface: 'sdk',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods: KNOWN_METHODS,
-  });
-  assertEqual(result.metrics.taskPassed, false, 'taskPassed should be false');
-  assert(result.metrics.toolRecall < 1.0, 'toolRecall should be < 1.0');
-});
-
-await test('evaluateTask: nested expected args match recursively', () => {
-  const task: TaskDefinition = {
-    id: 'nested-args',
-    prompt: 'Task nested',
-    expected_actions: [
-      {
-        name: 'x402Pay',
-        args: {
-          url: 'https://api.example.com/premium',
-          wallet: {
-            type: 'evm',
-            address: '0xabc',
-          },
-        } as unknown as Record<string, string>,
-      },
-    ],
-  };
-  const extractedCalls: ExtractedCall[] = [
-    makeCall('x402Pay', {
-      url: 'https://api.example.com/premium',
-      wallet: {
-        type: 'evm',
-        privateKey: '0x123',
-        address: '0xabc',
-      },
-    }),
-  ];
-  const result = evaluateTask({
-    task,
-    model: MODEL,
-    surface: 'sdk',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods: new Set(['x402Pay']),
-  });
-  assertEqual(result.metrics.taskPassed, true, 'taskPassed should be true for nested args');
-  assertEqual(result.metrics.argAccuracy, 1.0, 'argAccuracy should be 1.0 for nested args');
-});
-
-await test('evaluateTask: nested expected args fail when nested field differs', () => {
-  const task: TaskDefinition = {
-    id: 'nested-args-fail',
-    prompt: 'Task nested fail',
-    expected_actions: [
-      {
-        name: 'x402Pay',
-        args: {
-          wallet: {
-            type: 'fast',
-          },
-        } as unknown as Record<string, string>,
-      },
-    ],
-  };
-  const extractedCalls: ExtractedCall[] = [
-    makeCall('x402Pay', {
-      wallet: {
-        type: 'evm',
-      },
-    }),
-  ];
-  const result = evaluateTask({
-    task,
-    model: MODEL,
-    surface: 'sdk',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods: new Set(['x402Pay']),
-  });
-  assertEqual(result.metrics.taskPassed, false, 'taskPassed should be false for mismatched nested args');
-  assert(result.metrics.argAccuracy < 1.0, 'argAccuracy should be < 1.0 for mismatched nested args');
-});
-
-// Group 4: extractAllFromCode (generic, no config hints)
-
-await test('extractAllFromCode: extracts all calls without hints', async () => {
-  const code = [
-    `const f = fast({ network: 'testnet' });`,
-    `await f.setup();`,
-    `await f.balance();`,
-    `const result = await x402Pay({ url: 'https://example.com' });`,
-    `console.log(result);`,
-  ].join('\n');
-  const { calls, bindings } = await extractAllFromCode(code);
-  const methods = calls.map(c => c.method);
-  assert(methods.includes('fast'), 'should find fast');
-  assert(methods.includes('f.setup'), 'should find f.setup');
-  assert(methods.includes('f.balance'), 'should find f.balance');
-  assert(methods.includes('x402Pay'), 'should find x402Pay');
-  assertEqual(bindings.get('f'), 'fast', 'f should be bound to fast');
-});
-
-await test('extractAllFromCode: extracts constructors and resolves via bindings', async () => {
-  const code = [
-    `const allset = new AllSetProvider({ network: 'testnet' });`,
-    `await allset.sendToFast({ to: 'fast1abc', amount: '1000000' });`,
-  ].join('\n');
-  const { calls, bindings } = await extractAllFromCode(code);
-  assertEqual(calls.length, 2, 'should find 2 calls');
-  assertEqual(calls[0].method, 'AllSetProvider.constructor', 'first call should be constructor');
-  assertEqual(calls[1].method, 'allset.sendToFast', 'second call raw is allset.sendToFast');
-  assertEqual(bindings.get('allset'), 'AllSetProvider', 'allset bound to AllSetProvider');
-});
-
-await test('extractAllFromCode: extracts static method calls', async () => {
-  const code = [
-    `const provider = new FastProvider('testnet');`,
-    `const wallet = await FastWallet.fromKeyfile(provider, 'merchant');`,
-    `await wallet.send({ to: 'fast1abc', amount: '5' });`,
-  ].join('\n');
-  const { calls, bindings } = await extractAllFromCode(code);
-  assertEqual(calls.length, 3, 'should find 3 calls');
-  assertEqual(calls[0].method, 'FastProvider.constructor', 'constructor call');
-  assertEqual(calls[1].method, 'FastWallet.fromKeyfile', 'static method call');
-  assertEqual(calls[2].method, 'wallet.send', 'raw member call on wallet');
-  assertEqual(bindings.get('provider'), 'FastProvider', 'provider bound to FastProvider');
-  assertEqual(bindings.get('wallet'), 'FastWallet', 'wallet bound to FastWallet');
-});
-
-await test('extractAllFromCode: resolves literal-backed nested args', async () => {
-  const code = [
-    `const fastWallet = { type: 'fast', address: 'fast1abc' };`,
-    `const evmWallet = { type: 'evm', address: '0xabc' };`,
-    `const result = await x402Pay({ url: 'https://example.com', wallet: [fastWallet, evmWallet] });`,
-  ].join('\n');
-  const { calls } = await extractAllFromCode(code);
-  assertEqual(calls.length, 1, 'should find 1 call');
-  const wallet = calls[0].args['wallet'] as unknown[];
-  assert(Array.isArray(wallet), 'wallet should be an array');
-  assertEqual((wallet[0] as Record<string, unknown>).type as string, 'fast', 'wallet[0].type');
-  assertEqual((wallet[1] as Record<string, unknown>).type as string, 'evm', 'wallet[1].type');
-});
-
-// Group 5: evaluateTask with bindings (task-driven resolution)
-
-await test('evaluateTask: resolves raw calls via bindings + task expectations', () => {
-  const task: TaskDefinition = {
-    id: 'resolve-test',
-    prompt: 'Test resolution',
-    expected_actions: [
-      { name: 'fast', args: { network: 'testnet' } },
-      { name: 'FastClient.setup' },
-      { name: 'FastClient.balance' },
-    ],
-  };
-  const extractedCalls: ExtractedCall[] = [
-    makeCall('fast', { network: 'testnet' }),
-    makeCall('f.setup', {}),
-    makeCall('f.balance', {}),
-  ];
-  const bindings = new Map([['f', 'fast']]);
-  const result = evaluateTask({
-    task,
-    model: MODEL,
-    surface: 'sdk',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods: new Set(['fast', 'FastClient.setup', 'FastClient.balance']),
-    bindings,
-  });
-  assertEqual(result.metrics.taskPassed, true, 'should pass after resolution');
-  assertEqual(result.metrics.toolRecall, 1.0, 'recall should be 1.0');
-});
-
-await test('evaluateTask: resolves constructor-based bindings', () => {
-  const task: TaskDefinition = {
-    id: 'constructor-resolve',
-    prompt: 'Test constructor',
-    expected_actions: [
-      { name: 'AllSetProvider.constructor', args: { network: 'testnet' } },
-      { name: 'AllSetProvider.sendToFast' },
-    ],
-  };
-  const extractedCalls: ExtractedCall[] = [
-    makeCall('AllSetProvider.constructor', { network: 'testnet' }),
-    makeCall('allset.sendToFast', { to: 'fast1abc' }),
-  ];
-  const bindings = new Map([['allset', 'AllSetProvider']]);
-  const result = evaluateTask({
-    task,
-    model: MODEL,
-    surface: 'sdk',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods: new Set(['AllSetProvider.constructor', 'AllSetProvider.sendToFast']),
-    bindings,
-  });
-  assertEqual(result.metrics.taskPassed, true, 'should pass constructor resolution');
-  assertEqual(result.metrics.toolRecall, 1.0, 'recall should be 1.0');
-});
-
-// Group 6: computeCoverage
-
-await test('computeCoverage: identifies covered and uncovered methods', () => {
-  const tasks: TaskDefinition[] = [
-    makeTask('task-a', ['FastProvider.constructor']),
-    makeTask('task-b', ['FastWallet.fromKeyfile']),
-  ];
-  const allMethods = ['FastProvider.constructor', 'FastWallet.fromKeyfile', 'FastWallet.send'];
-  const coverage = computeCoverage(tasks, allMethods);
-
-  assertEqual(coverage.length, 3, 'should return coverage for all 3 methods');
-
-  const providerCov = coverage.find((c) => c.method === 'FastProvider.constructor');
-  assert(providerCov !== undefined, 'should have coverage entry for FastProvider.constructor');
-  assertEqual(providerCov!.covered, true, 'FastProvider.constructor should be covered');
-
-  const walletCov = coverage.find((c) => c.method === 'FastWallet.fromKeyfile');
-  assert(walletCov !== undefined, 'should have coverage entry for FastWallet.fromKeyfile');
-  assertEqual(walletCov!.covered, true, 'FastWallet.fromKeyfile should be covered');
-
-  const sendCov = coverage.find((c) => c.method === 'FastWallet.send');
-  assert(sendCov !== undefined, 'should have coverage entry for FastWallet.send');
-  assertEqual(sendCov!.covered, false, 'FastWallet.send should NOT be covered');
-});
-
-console.log('\n=== Doctor / checkConfig Tests ===\n');
-
-await test('checkConfig: valid sdk config returns no errors', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-check-'));
-  try {
-    // Create real files so path-existence checks pass
-    writeFileSync(join(dir, 'index.ts'), '// entry', 'utf-8');
-    writeFileSync(join(dir, 'tasks.json'), JSON.stringify({ tasks: [] }), 'utf-8');
-    const configPath = join(dir, 'skill-optimizer.json');
-    const config = {
-      name: 'my-sdk',
-      target: { surface: 'sdk' as const, discovery: { sources: ['./index.ts'], language: 'typescript' as const } },
-      benchmark: {
-        format: 'pi' as const,
-        models: [{ id: 'openrouter/openai/gpt-4o', name: 'GPT-4o', tier: 'flagship' as const }],
-        tasks: './tasks.json',
-      },
-    };
-    const issues = await checkConfig(config as any, configPath);
-    // Filter out api-key-not-set warning (env-dependent) and focus on real errors
-    const errors = issues.filter(i => i.severity === 'error');
-    assert(errors.length === 0, `expected 0 errors, got: ${errors.map(i => i.message).join(', ')}`);
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('checkConfig: missing name returns error', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    target: { surface: 'sdk' as const },
-    benchmark: { models: [] },
-  };
-  const issues = await checkConfig(config as any, '/fake/path/skill-optimizer.json');
-  const err = issues.find(i => i.code === 'missing-name');
-  assert(err !== undefined, 'expected missing-name error');
-  assert(err!.severity === 'error', 'should be error severity');
-});
-
-await test('checkConfig: invalid surface returns error', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    name: 'test',
-    target: { surface: 'grpc' },
-    benchmark: { models: [{ id: 'openrouter/openai/gpt-4o', name: 'GPT-4o', tier: 'flagship' }] },
-  };
-  const issues = await checkConfig(config as any, '/fake/path/skill-optimizer.json');
-  const err = issues.find(i => i.code === 'invalid-surface');
-  assert(err !== undefined, 'expected invalid-surface issue');
-  assert(err!.severity === 'error', 'should be error severity');
-});
-
-await test('checkConfig: empty models array returns error', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    name: 'test',
-    target: { surface: 'sdk' as const },
-    benchmark: { models: [] },
-  };
-  const issues = await checkConfig(config as any, '/fake/path/skill-optimizer.json');
-  const err = issues.find(i => i.code === 'missing-models');
-  assert(err !== undefined, 'expected missing-models error for benchmark.models');
-});
-
-await test('checkConfig: model ID missing openrouter/ prefix → fixable error', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    name: 'test',
-    target: { surface: 'cli' as const, discovery: { sources: ['./src/cli.ts'] } },
-    benchmark: {
-      format: 'pi' as const,
-      models: [{ id: 'z-ai/glm-5.1', name: 'GLM', tier: 'mid' as const }],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  };
-  const issues = await checkConfig(config as any, '/fake/skill-optimizer.json');
-  const err = issues.find(i => i.code === 'model-id-missing-prefix');
-  assert(err !== undefined, 'expected model-id-missing-prefix issue');
-  assert(err!.fixable === true, 'model-id-missing-prefix should be fixable');
-  assert(err!.severity === 'error', 'model-id-missing-prefix should be error severity');
-  assert(err!.hint?.includes('openrouter/z-ai/glm-5.1'), `hint should show corrected ID, got: ${err!.hint}`);
-});
-
-await test('checkConfig: openrouter/ model ID with dot version → no warning (dots preserved)', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    name: 'test',
-    target: { surface: 'cli' as const, discovery: { sources: ['./src/cli.ts'] } },
-    benchmark: {
-      format: 'pi' as const,
-      models: [{ id: 'openrouter/anthropic/claude-sonnet-4.6', name: 'Claude', tier: 'flagship' as const }],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  };
-  const issues = await checkConfig(config as any, '/fake/skill-optimizer.json');
-  const warn = issues.find(i => i.code === 'model-id-bad-format');
-  assert(warn === undefined, 'openrouter/ model IDs with dots must not trigger model-id-bad-format — dots are preserved verbatim');
-});
-
-await test('checkConfig: direct openai model IDs do not get OpenRouter dot-version warning', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    name: 'test',
-    target: { surface: 'cli' as const, discovery: { sources: ['./src/cli.ts'] } },
-    benchmark: {
-      format: 'pi' as const,
-      models: [{ id: 'openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' as const }],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  };
-  const issues = await checkConfig(config as any, '/fake/skill-optimizer.json');
-  const warn = issues.find(i => i.code === 'model-id-bad-format');
-  assert(warn === undefined, 'direct openai model IDs should not be warned as OpenRouter dot versions');
-});
-
-await test('checkConfig: codex auth rejects non-openai benchmark models', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    name: 'test',
-    target: { surface: 'sdk' as const, discovery: { sources: ['./src/index.ts'], language: 'typescript' as const } },
-    benchmark: {
-      format: 'pi' as const,
-      authMode: 'codex' as const,
-      models: [
-        { id: 'openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' as const },
-        { id: 'openrouter/openai/gpt-5.4', name: 'OpenRouter GPT-5.4', tier: 'flagship' as const },
-      ],
-      tasks: './tasks.json',
-    },
-  };
-  const issues = await checkConfig(config as any, '/fake/skill-optimizer.json');
-  const err = issues.find(i => i.code === 'codex-auth-provider-mismatch' && i.field.includes('benchmark.models'));
-  assert(err !== undefined, 'expected codex-auth-provider-mismatch for benchmark.models');
-  assert(err!.severity === 'error', 'benchmark model/provider mismatch should be an error');
-});
-
-await test('checkConfig: codex auth rejects non-openai optimize model', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    name: 'test',
-    target: { surface: 'sdk' as const, discovery: { sources: ['./src/index.ts'], language: 'typescript' as const } },
-    benchmark: {
-      format: 'pi' as const,
-      models: [{ id: 'openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' as const }],
-      tasks: './tasks.json',
-    },
-    optimize: {
-      authMode: 'codex' as const,
-      model: 'openrouter/anthropic/claude-sonnet-4.6',
-      allowedPaths: ['SKILL.md'],
-    },
-  };
-  const issues = await checkConfig(config as any, '/fake/skill-optimizer.json');
-  const err = issues.find(i => i.code === 'codex-auth-provider-mismatch' && i.field === 'optimize.model');
-  assert(err !== undefined, 'expected codex-auth-provider-mismatch for optimize.model');
-  assert(err!.severity === 'error', 'optimize model/provider mismatch should be an error');
-});
-
-await test('checkConfig: inherited codex auth rejects non-openai optimize model', async () => {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const config = {
-    name: 'test',
-    target: { surface: 'sdk' as const, discovery: { sources: ['./src/index.ts'], language: 'typescript' as const } },
-    benchmark: {
-      format: 'pi' as const,
-      authMode: 'codex' as const,
-      models: [{ id: 'openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' as const }],
-      tasks: './tasks.json',
-    },
-    optimize: {
-      model: 'openrouter/anthropic/claude-sonnet-4.6',
-      allowedPaths: ['SKILL.md'],
-    },
-  };
-  const issues = await checkConfig(config as any, '/fake/skill-optimizer.json');
-  const err = issues.find(i => i.code === 'codex-auth-provider-mismatch' && i.field === 'optimize.model');
-  assert(err !== undefined, 'expected inherited codex-auth-provider-mismatch for optimize.model');
-  assert(err!.severity === 'error', 'inherited optimize model/provider mismatch should be an error');
-});
-
-await test('applyFixes: adds openrouter/ prefix to model IDs', async () => {
-  const { applyFixes } = await import('../src/project/fix.js');
-  const { checkConfig } = await import('../src/project/validate.js');
-  const rawJson = {
-    name: 'test',
-    target: { surface: 'cli', discovery: { sources: ['./src/cli.ts'] } },
-    benchmark: {
-      format: 'pi',
-      models: [
-        { id: 'z-ai/glm-5.1', name: 'GLM', tier: 'mid' },
-        { id: 'openrouter/openai/gpt-4o', name: 'GPT', tier: 'flagship' },
-      ],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  };
-  const issues = await checkConfig(rawJson as any, '/fake/skill-optimizer.json');
-  const fixed = applyFixes(rawJson as any, issues, '/fake');
-  const models = (fixed.benchmark as any).models as Array<{ id: string }>;
-  assertEqual(models[0]!.id, 'openrouter/z-ai/glm-5.1', 'prefix should be prepended');
-  assertEqual(models[1]!.id, 'openrouter/openai/gpt-4o', 'already-prefixed ID should be unchanged');
-});
-
-await test('applyFixes: preserves dots in openrouter/ model IDs (not rewritten)', async () => {
-  const { applyFixes } = await import('../src/project/fix.js');
-  const { checkConfig } = await import('../src/project/validate.js');
-  const rawJson = {
-    name: 'test',
-    target: { surface: 'cli', discovery: { sources: ['./src/cli.ts'] } },
-    benchmark: {
-      format: 'pi',
-      models: [{ id: 'openrouter/anthropic/claude-sonnet-4.6', name: 'Claude', tier: 'flagship' }],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  };
-  const issues = await checkConfig(rawJson as any, '/fake/skill-optimizer.json');
-  const fixed = applyFixes(rawJson as any, issues, '/fake');
-  const models = (fixed.benchmark as any).models as Array<{ id: string }>;
-  assertEqual(models[0]!.id, 'openrouter/anthropic/claude-sonnet-4.6', 'openrouter/ dots must be preserved — not rewritten');
-});
-
-await test('applyFixes: does not mutate input', async () => {
-  const { applyFixes } = await import('../src/project/fix.js');
-  const { checkConfig } = await import('../src/project/validate.js');
-  const rawJson = {
-    name: 'test',
-    target: { surface: 'cli', discovery: { sources: ['./src/cli.ts'] } },
-    benchmark: {
-      format: 'pi',
-      models: [{ id: 'z-ai/glm-5.1', name: 'GLM', tier: 'mid' }],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  };
-  const issues = await checkConfig(rawJson as any, '/fake/skill-optimizer.json');
-  applyFixes(rawJson as any, issues, '/fake');
-  assertEqual((rawJson.benchmark.models[0] as any).id, 'z-ai/glm-5.1', 'input should not be mutated');
-});
-
-console.log('\n=== Doctor command smoke tests ===\n');
-
-await test('doctor --static: exits 1 for config with model-id error', async () => {
-  const { runDoctor } = await import('../src/doctor/index.js');
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-doctor-'));
-  try {
-    const configPath = join(dir, 'skill-optimizer.json');
-    writeFileSync(configPath, JSON.stringify({
-      name: 'test-bad',
-      target: { surface: 'cli', discovery: { sources: ['./src/cli.ts'] } },
-      benchmark: {
-        format: 'pi',
-        models: [{ id: 'z-ai/glm-5.1', name: 'GLM', tier: 'mid' }],
-        taskGeneration: { enabled: true, maxTasks: 5 },
-      },
-    }, null, 2), 'utf-8');
-
-    const exitCode = await runDoctor(configPath, { staticOnly: true });
-    assertEqual(exitCode, 1, 'should exit 1 for model-id-missing-prefix error');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('doctor --static: exits 0 or 1 (not 2) for readable config', async () => {
-  const { runDoctor } = await import('../src/doctor/index.js');
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-doctor-'));
-  try {
-    const configPath = join(dir, 'skill-optimizer.json');
-    writeFileSync(configPath, JSON.stringify({
-      name: 'test-ok',
-      target: { surface: 'mcp', discovery: { sources: ['./src/server.ts'] } },
-      benchmark: {
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-4o', name: 'GPT-4o', tier: 'flagship' }],
-        taskGeneration: { enabled: true, maxTasks: 10 },
-      },
-    }, null, 2), 'utf-8');
-
-    const exitCode = await runDoctor(configPath, { staticOnly: true });
-    // discovery-source-missing fires (./src/server.ts doesn't exist in tmpdir)
-    // but config is readable JSON so it must not be exit code 2
-    assert(exitCode === 0 || exitCode === 1, `should exit 0 or 1 (config is readable JSON), got ${exitCode}`);
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('doctor --fix: corrects model ID in-place', async () => {
-  const { runDoctor } = await import('../src/doctor/index.js');
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-doctor-'));
-  try {
-    const configPath = join(dir, 'skill-optimizer.json');
-    writeFileSync(configPath, JSON.stringify({
-      name: 'test-fix',
-      target: { surface: 'cli', discovery: { sources: ['./src/cli.ts'] } },
-      benchmark: {
-        format: 'pi',
-        models: [{ id: 'z-ai/glm-5.1', name: 'GLM', tier: 'mid' }],
-        taskGeneration: { enabled: true, maxTasks: 5 },
-      },
-    }, null, 2), 'utf-8');
-
-    await runDoctor(configPath, { staticOnly: true, fix: true });
-
-    const fixed = JSON.parse(readFileSync(configPath, 'utf-8')) as { benchmark: { models: Array<{ id: string }> } };
-    // Fixed-point loop applies both prefix and dot-normalisation fixes in sequence
-    assertEqual(fixed.benchmark.models[0]!.id, 'openrouter/z-ai/glm-5.1', '--fix should add openrouter/ prefix but preserve dots');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('checkModelReachability: mixed list skips non-openrouter and probes only openrouter models', async () => {
-  const { checkModelReachability } = await import('../src/doctor/checks.js');
-
-  const project = {
-    configPath: '/fake/skill-optimizer.json',
-    configDir: '/fake',
-    name: 'test-mixed',
-    target: {
-      surface: 'mcp',
-      repoPath: '/fake',
-      scope: { include: ['.*'], exclude: [] },
-    },
-    benchmark: {
-      format: 'pi',
-      authMode: 'env',
-      timeout: 30000,
-      models: [
-        { id: 'anthropic/claude-sonnet-4-6', name: 'Claude', tier: 'flagship' as const },
-        { id: 'openrouter/openai/gpt-4o', name: 'GPT-4o', tier: 'flagship' as const },
-      ],
-      taskGeneration: { enabled: false, maxTasks: 10, useExisting: false },
-      output: { dir: '/fake/.results' },
-      verdict: { perModelFloor: 0.5, targetWeightedAverage: 0.6 },
-    },
-  } as unknown as ResolvedProjectConfig;
-
-  // Without OPENROUTER_API_KEY set the key resolution throws, so the function
-  // returns early after emitting reachability-skipped. We verify:
-  //   1. A reachability-skipped issue appears (for the 1 non-openrouter model)
-  //   2. No reachability-skipped with field 'benchmark.format' (wrong early-exit path)
-  //   3. No model-unreachable for the anthropic model
-  const savedKey = process.env['OPENROUTER_API_KEY'];
-  delete process.env['OPENROUTER_API_KEY'];
-  try {
-    const issues = await checkModelReachability(project);
-    const skipped = issues.filter((i) => i.code === 'reachability-skipped');
-    assert(skipped.length >= 1, 'should have at least one reachability-skipped issue');
-    const modelSkipped = skipped.find((i) => i.field === 'benchmark.models');
-    assert(modelSkipped !== undefined, 'reachability-skipped issue should reference benchmark.models field');
-    assert(
-      modelSkipped!.message.includes('1 non-OpenRouter'),
-      `message should count 1 skipped, got: ${modelSkipped!.message}`,
-    );
-    const anthropicUnreachable = issues.find(
-      (i) => i.code === 'model-unreachable' && i.message.includes('anthropic/'),
-    );
-    assert(anthropicUnreachable === undefined, 'anthropic model must not produce a model-unreachable issue');
-  } finally {
-    if (savedKey !== undefined) process.env['OPENROUTER_API_KEY'] = savedKey;
-  }
-});
-
-// ── Summary ────────────────────────────────────────────────────────────────
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-coverage.ts b/tests/smoke-coverage.ts
deleted file mode 100644
index 9df5146..0000000
--- a/tests/smoke-coverage.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-import { strict as assert } from 'node:assert';
-
-import type { ActionDefinition } from '../src/actions/types.js';
-import type { GeneratedTask } from '../src/tasks/types.js';
-import {
-  computeCoverage,
-  computeUncovered,
-  buildRetryPrompt,
-} from '../src/tasks/coverage.js';
-
-function mkAction(name: string): ActionDefinition {
-  return { key: name, name, args: [] };
-}
-
-function mkTask(id: string, actions: string[]): GeneratedTask {
-  return {
-    id,
-    prompt: `do ${id}`,
-    expected_actions: actions.map((name) => ({ name })),
-  };
-}
-
-function mkPromptTask(id: string, capabilityId: string): GeneratedTask {
-  return { id, prompt: `do ${id}`, expected_actions: [], capabilityId };
-}
-
-function testFullCoverage() {
-  const actions = [mkAction('Wallet.send'), mkAction('Wallet.receive')];
-  const tasks = [mkTask('t1', ['Wallet.send']), mkTask('t2', ['Wallet.receive'])];
-  const coverage = computeCoverage(actions, tasks);
-  assert.strictEqual(coverage.uncoveredActions.length, 0);
-  assert.strictEqual(coverage.coverageViolation, false);
-  assert.deepStrictEqual(Object.keys(coverage.tasksPerAction).sort(), ['Wallet.receive', 'Wallet.send']);
-  console.log('PASS: full coverage reports zero uncovered');
-}
-
-function testPartialCoverage() {
-  const actions = [mkAction('Wallet.send'), mkAction('Wallet.receive'), mkAction('Token.mint')];
-  const tasks = [mkTask('t1', ['Wallet.send'])];
-  const coverage = computeCoverage(actions, tasks);
-  assert.deepStrictEqual(coverage.uncoveredActions.sort(), ['Token.mint', 'Wallet.receive']);
-  assert.strictEqual(coverage.coverageViolation, true);
-  console.log('PASS: partial coverage flags uncovered');
-}
-
-function testUncoveredDriver() {
-  const actions = [mkAction('A'), mkAction('B'), mkAction('C')];
-  const tasks = [mkTask('t1', ['A'])];
-  const uncovered = computeUncovered(actions, tasks);
-  assert.deepStrictEqual(uncovered.sort(), ['B', 'C']);
-  console.log('PASS: computeUncovered returns action names');
-}
-
-function testRetryPromptMentionsActions() {
-  const prompt = buildRetryPrompt(['Wallet.receive', 'Token.mint']);
-  assert.ok(prompt.includes('Wallet.receive'));
-  assert.ok(prompt.includes('Token.mint'));
-  console.log('PASS: retry prompt names uncovered actions');
-}
-
-function testPromptCoverageFromCapabilityId() {
-  // Regression guard for Issue 2: prompt tasks have expected_actions:[] but
-  // must count as covering their declared capabilityId capability.
-  const actions = [mkAction('summarize'), mkAction('translate')];
-  const tasks = [mkPromptTask('t1', 'summarize'), mkPromptTask('t2', 'translate')];
-  const coverage = computeCoverage(actions, tasks);
-  assert.strictEqual(coverage.coverageViolation, false);
-  assert.strictEqual(coverage.uncoveredActions.length, 0);
-  assert.deepStrictEqual(coverage.coveredActions.sort(), ['summarize', 'translate']);
-  console.log('PASS: prompt tasks covered via capabilityId (Issue 2 guard)');
-}
-
-async function main() {
-  testFullCoverage();
-  testPartialCoverage();
-  testUncoveredDriver();
-  testRetryPromptMentionsActions();
-  testPromptCoverageFromCapabilityId();
-  console.log('\nALL PASS: smoke-coverage');
-}
-
-main().catch((err) => {
-  console.error('FAIL: smoke-coverage', err);
-  process.exit(1);
-});
diff --git a/tests/smoke-discovery-cli.ts b/tests/smoke-discovery-cli.ts
deleted file mode 100644
index 8c2736d..0000000
--- a/tests/smoke-discovery-cli.ts
+++ /dev/null
@@ -1,385 +0,0 @@
-import { existsSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join } from 'node:path';
-
-import { discoverCliSurfaceFromSources } from '../src/discovery/cli.js';
-import { discoverActions } from '../src/actions/index.js';
-import { readCliActionsFromSources } from '../src/actions/index.js';
-import { loadProjectConfig } from '../src/project/index.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  + ${name}`);
-  } catch (error: any) {
-    failed++;
-    console.log(`  - ${name}`);
-    console.log(`    ${error.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string): void {
-  if (!condition) {
-    throw new Error(`Assertion failed: ${message}`);
-  }
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string): void {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-console.log('\n=== CLI Discovery Smoke Tests ===\n');
-
-await test('discovers exported const command arrays', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'cli-discovery-'));
-  const sourcePath = join(dir, 'commands.ts');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'export const COMMANDS = [',
-        '  {',
-        "    command: 'tickets:list',",
-        "    description: 'List tickets',",
-        '    options: [],',
-        '  },',
-        '  {',
-        "    command: 'tickets:create',",
-        "    description: 'Create a ticket',",
-        '    options: [',
-        "      { name: '--title', takesValue: true },",
-        "      { name: '--quiet', takesValue: false },",
-        '    ],',
-        '  },',
-        '];',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const snapshot = discoverCliSurfaceFromSources([sourcePath]);
-    assertEqual(snapshot.surface, 'cli', 'surface should be cli');
-    assertEqual(snapshot.actions.length, 2, 'should discover two commands');
-
-    const names = snapshot.actions.map((action) => action.name).sort();
-    assertEqual(names.join(','), 'tickets:create,tickets:list', 'discovered command names should match');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('extracts command options including takesValue mappings', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'cli-discovery-'));
-  const sourcePath = join(dir, 'commands.ts');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'export const COMMANDS = [',
-        '  {',
-        "    command: 'tickets:create',",
-        "    description: 'Create a ticket',",
-        '    options: [',
-        "      { name: '--title', description: 'Ticket title', takesValue: true },",
-        "      { name: '--quiet', description: 'Suppress output', takesValue: false },",
-        '    ],',
-        '  },',
-        '];',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const snapshot = discoverCliSurfaceFromSources([sourcePath]);
-    const create = snapshot.actions.find((action) => action.name === 'tickets:create');
-    assert(create !== undefined, 'tickets:create should be discovered');
-
-    if (!create) {
-      throw new Error('tickets:create should be discovered');
-    }
-
-    const byName = new Map<string, (typeof create.args)[number]>(create.args.map((arg) => [arg.name, arg]));
-    assertEqual(byName.get('--title')?.type, 'string', '--title should be value-taking');
-    assertEqual(byName.get('--quiet')?.type, 'boolean', '--quiet should be a boolean flag');
-    assertEqual(byName.get('--title')?.description, 'Ticket title', '--title description should be discovered');
-    assertEqual(byName.get('--quiet')?.description, 'Suppress output', '--quiet description should be discovered');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovers default-exported command arrays', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'cli-discovery-default-export-'));
-  const sourcePath = join(dir, 'commands.ts');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'const COMMANDS = [',
-        '  {',
-        "    command: 'tickets:archive',",
-        "    description: 'Archive a ticket',",
-        '    options: [',
-        "      { name: '--id', takesValue: true },",
-        '    ],',
-        '  },',
-        '];',
-        'export default COMMANDS;',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const snapshot = discoverCliSurfaceFromSources([sourcePath]);
-    assertEqual(snapshot.actions.length, 1, 'should discover one default-exported command');
-    assertEqual(snapshot.actions[0].name, 'tickets:archive', 'default export should be discovered');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovery remains static and does not execute source file', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'cli-discovery-'));
-  const sourcePath = join(dir, 'commands.ts');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        "throw new Error('this file must never execute during discovery');",
-        'export const COMMANDS = [',
-        '  {',
-        "    command: 'safe:command',",
-        "    description: 'Safe command',",
-        '    options: [',
-        "      { name: '--force', takesValue: false },",
-        '    ],',
-        '  },',
-        '];',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const snapshot = discoverCliSurfaceFromSources([sourcePath]);
-    assertEqual(snapshot.actions.length, 1, 'should discover one command');
-    assertEqual(snapshot.actions[0].name, 'safe:command', 'should discover command from static source');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovers cli actions via public action discovery entrypoint', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'cli-discovery-actions-'));
-  const sourcePath = join(dir, 'commands.ts');
-  const configPath = join(dir, 'skill-optimizer.json');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'export const COMMANDS = [',
-        '  {',
-        "    command: 'tickets:create',",
-        '    options: [',
-        "      { name: '--title', takesValue: true },",
-        '    ],',
-        '  },',
-        '];',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    writeFileSync(configPath, JSON.stringify({
-      name: 'cli-actions-entrypoint',
-      target: {
-        surface: 'cli',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./commands.ts'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const project = await loadProjectConfig(configPath);
-    const catalog = discoverActions(project);
-    assertEqual(catalog.surface, 'cli', 'surface should be cli');
-    assertEqual(catalog.actions.length, 1, 'should discover one cli action');
-    assertEqual(catalog.actions[0].key, 'tickets:create', 'action key should match command name');
-    assertEqual(catalog.actions[0].args[0]?.name, 'title', 'cli option names should be normalized');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('reads cli actions via actions-layer reader export', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'cli-reader-actions-'));
-  const sourcePath = join(dir, 'commands.ts');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'export const COMMANDS = [',
-        '  {',
-        "    command: 'tickets:create',",
-        '    options: [',
-        "      { name: '--title', takesValue: true },",
-        '    ],',
-        '  },',
-        '];',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const actions = readCliActionsFromSources([sourcePath]);
-    assertEqual(actions.length, 1, 'reader should return discovered cli action');
-    assertEqual(actions[0].key, 'tickets:create', 'reader should map key from command name');
-    assertEqual(actions[0].args[0]?.name, 'title', 'reader should normalize option names');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discoverActions uses manifest commands when CLI discovery mode is manifest', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'cli-discovery-manifest-'));
-  const commandsPath = join(dir, 'commands.json');
-  const configPath = join(dir, 'skill-optimizer.json');
-
-  try {
-    writeFileSync(commandsPath, JSON.stringify([
-      {
-        command: 'tickets:list',
-        description: 'List tickets',
-        options: [
-          { name: '--limit', takesValue: true },
-        ],
-      },
-    ], null, 2), 'utf-8');
-
-    writeFileSync(configPath, JSON.stringify({
-      name: 'cli-actions-manifest-entrypoint',
-      target: {
-        surface: 'cli',
-        repoPath: '.',
-        discovery: {
-          mode: 'manifest',
-          fallbackManifest: './commands.json',
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const project = await loadProjectConfig(configPath);
-    const catalog = discoverActions(project);
-    assertEqual(catalog.surface, 'cli', 'surface should be cli');
-    assertEqual(catalog.actions.length, 1, 'manifest-backed discovery should return one action');
-    assertEqual(catalog.actions[0].key, 'tickets:list', 'action key should come from manifest command');
-    assertEqual(catalog.actions[0].args[0]?.name, 'limit', 'manifest options should be normalized');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-// ── Optique extractor tests ───────────────────────────────────────────────────
-
-const FAST_CLI = '/root/openclaw-workspace/fast-sdk/app/cli/src/cli.ts';
-const fastCliExists = existsSync(FAST_CLI);
-
-await test('optique: discovers all 20 leaf commands from fast-sdk cli.ts', () => {
-  if (!fastCliExists) { console.log('    (skipped — fast-sdk not present)'); return; }
-  const snapshot = discoverCliSurfaceFromSources([FAST_CLI]);
-  assertEqual(snapshot.surface, 'cli', 'surface should be cli');
-  assertEqual(snapshot.actions.length, 20, `expected 20 commands, got ${snapshot.actions.length}`);
-});
-
-await test('optique: command names include full hierarchical path', () => {
-  if (!fastCliExists) { console.log('    (skipped — fast-sdk not present)'); return; }
-  const snapshot = discoverCliSurfaceFromSources([FAST_CLI]);
-  const names = new Set(snapshot.actions.map((a) => a.name));
-  for (const expected of ['account create', 'account import', 'account list',
-    'account set-default', 'account export', 'account delete',
-    'network list', 'network add', 'network set-default', 'network remove',
-    'info status', 'info balance', 'info tx', 'info history',
-    'info bridge-tokens', 'info bridge-chains',
-    'send', 'fund fiat', 'fund crypto', 'pay']) {
-    assert(names.has(expected), `missing expected command: ${expected}`);
-  }
-});
-
-await test('optique: extracts descriptions from tagged template literals', () => {
-  if (!fastCliExists) { console.log('    (skipped — fast-sdk not present)'); return; }
-  const snapshot = discoverCliSurfaceFromSources([FAST_CLI]);
-  const send = snapshot.actions.find((a) => a.name === 'send');
-  assert(send !== undefined, 'send command should be discovered');
-  assert(send!.description !== undefined && send!.description.length > 0, 'send should have a description');
-});
-
-await test('optique: extracts named options with correct types', () => {
-  if (!fastCliExists) { console.log('    (skipped — fast-sdk not present)'); return; }
-  const snapshot = discoverCliSurfaceFromSources([FAST_CLI]);
-  const send = snapshot.actions.find((a) => a.name === 'send');
-  assert(send !== undefined, 'send command should be discovered');
-  const byName = new Map(send!.args.map((a) => [a.name, a]));
-  assertEqual(byName.get('--token')?.type, 'string', '--token should be string');
-  assertEqual(byName.get('--eip-7702')?.type, 'boolean', '--eip-7702 should be boolean');
-  assertEqual(byName.get('--from-chain')?.type, 'string', '--from-chain should be string');
-});
-
-await test('optique: positional arguments marked required and use property key as name', () => {
-  if (!fastCliExists) { console.log('    (skipped — fast-sdk not present)'); return; }
-  const snapshot = discoverCliSurfaceFromSources([FAST_CLI]);
-  const send = snapshot.actions.find((a) => a.name === 'send');
-  assert(send !== undefined, 'send command should be discovered');
-  const byName = new Map(send!.args.map((a) => [a.name, a]));
-  assert(byName.get('address')?.required === true, 'address should be required');
-  assert(byName.get('amount')?.required === true, 'amount should be required');
-});
-
-await test('optique: optional/withDefault wrappers produce non-required args', () => {
-  if (!fastCliExists) { console.log('    (skipped — fast-sdk not present)'); return; }
-  const snapshot = discoverCliSurfaceFromSources([FAST_CLI]);
-  const accountExport = snapshot.actions.find((a) => a.name === 'account export');
-  assert(accountExport !== undefined, 'account export should be discovered');
-  // `name` is optional(argument(...)) → required: false
-  const nameArg = accountExport!.args.find((a) => a.name === 'name');
-  assert(nameArg !== undefined, 'name arg should be present');
-  assert(nameArg!.required === false, 'optional argument should not be required');
-});
-
-await test('optique: static — does not execute source file', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'optique-static-'));
-  const sourcePath = join(dir, 'cli.ts');
-  try {
-    writeFileSync(sourcePath, [
-      "import { command, object, or } from '@optique/core/primitives';",
-      "throw new Error('must never execute');",
-      "const listParser = command('list', object({}), {});",
-      "export const parser = command('items', or(listParser), {});",
-    ].join('\n'), 'utf-8');
-    // discoverCliSurfaceFromSources uses static AST — no execution
-    // We just verify it doesn't throw (the file would throw if imported)
-    const snapshot = discoverCliSurfaceFromSources([sourcePath]);
-    assert(Array.isArray(snapshot.actions), 'should return actions array without executing the file');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-discovery-mcp.ts b/tests/smoke-discovery-mcp.ts
deleted file mode 100644
index 32a7512..0000000
--- a/tests/smoke-discovery-mcp.ts
+++ /dev/null
@@ -1,269 +0,0 @@
-import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join, resolve } from 'node:path';
-
-import { discoverMcpSurfaceFromSources } from '../src/discovery/mcp.js';
-import { discoverActions } from '../src/actions/index.js';
-import { loadProjectConfig, buildSurfaceSnapshot } from '../src/project/index.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  + ${name}`);
-  } catch (error: any) {
-    failed++;
-    console.log(`  - ${name}`);
-    console.log(`    ${error.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string): void {
-  if (!condition) {
-    throw new Error(`Assertion failed: ${message}`);
-  }
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string): void {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-console.log('\n=== MCP Discovery Smoke Tests ===\n');
-
-await test('discovers MCP actions from tracker mock server source', () => {
-  const sourcePath = resolve(process.cwd(), 'mock-repos/mcp-tracker-demo/src/server.ts');
-  const snapshot = discoverMcpSurfaceFromSources([sourcePath]);
-
-  assertEqual(snapshot.surface, 'mcp', 'surface should be mcp');
-  assertEqual(snapshot.actions.length, 4, 'should discover all tracker tools');
-
-  const actionNames = snapshot.actions.map((action) => action.name).sort();
-  assertEqual(actionNames.join(','), 'add_cmnt,get_tkt,tkt_new,update_tkt_state', 'discovered action names should match');
-});
-
-await test('extracts argument names and required flags for tkt_new', () => {
-  const sourcePath = resolve(process.cwd(), 'mock-repos/mcp-tracker-demo/src/server.ts');
-  const snapshot = discoverMcpSurfaceFromSources([sourcePath]);
-  const action = snapshot.actions.find((candidate) => candidate.name === 'tkt_new');
-
-  assert(action !== undefined, 'tkt_new should be discovered');
-  if (!action) {
-    throw new Error('tkt_new should be discovered');
-  }
-
-  const requiredByName = new Map(action.args.map((arg) => [arg.name, arg.required]));
-  assertEqual(requiredByName.get('t'), true, 't should be required');
-  assertEqual(requiredByName.get('d'), true, 'd should be required');
-  assertEqual(requiredByName.get('p'), true, 'p should be required');
-  assertEqual(requiredByName.get('usr'), false, 'usr should be optional');
-});
-
-await test('discovery remains static and does not execute source file', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'mcp-discovery-'));
-  const filePath = join(dir, 'server.ts');
-
-  try {
-    writeFileSync(
-      filePath,
-      [
-        "throw new Error('this file must never execute during discovery');",
-        'export const TOOLS = [',
-        '  {',
-        "    type: 'function',",
-        '    function: {',
-        "      name: 'safe_tool',",
-        "      description: 'safe',",
-        '      parameters: {',
-        "        type: 'object',",
-        '        properties: {',
-        "          id: { type: 'string' },",
-        '        },',
-        "        required: ['id'],",
-        '      },',
-        '    },',
-        '  },',
-        '];',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const snapshot = discoverMcpSurfaceFromSources([filePath]);
-    assertEqual(snapshot.actions.length, 1, 'should discover one tool from static source');
-    assertEqual(snapshot.actions[0].name, 'safe_tool', 'discovered tool should match exported literal');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('code-first MCP project config works without fallback manifest', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'mcp-project-config-'));
-  try {
-    const serverPath = join(dir, 'server.ts');
-    const configPath = join(dir, 'skill-optimizer.json');
-
-    writeFileSync(serverPath, readFileSync(resolve(process.cwd(), 'mock-repos/mcp-tracker-demo/src/server.ts'), 'utf-8'), 'utf-8');
-    writeFileSync(configPath, JSON.stringify({
-      name: 'code-first-mcp',
-      target: {
-        surface: 'mcp',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./server.ts'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const project = await loadProjectConfig(configPath);
-    const snapshot = buildSurfaceSnapshot(project);
-    assertEqual(snapshot.actions.length, 4, 'code-first config should discover tools without fallback manifest');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('code-first discovery fails fast when MCP source is missing', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'mcp-project-config-'));
-  try {
-    const configPath = join(dir, 'skill-optimizer.json');
-
-    writeFileSync(configPath, JSON.stringify({
-      name: 'missing-source-mcp',
-      target: {
-        surface: 'mcp',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./missing-server.ts'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    // loadProjectConfig now validates paths eagerly — expect it to throw when source is missing
-    let threw = false;
-    try {
-      await loadProjectConfig(configPath);
-    } catch (error: any) {
-      threw = true;
-      assert(
-        error.message.includes('does not exist'),
-        `missing source error should mention missing path, got: ${error.message}`,
-      );
-    }
-    assert(threw, 'missing discovery source should fail fast');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovers mcp actions via public action discovery entrypoint', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'mcp-discovery-actions-'));
-  const sourcePath = join(dir, 'server.ts');
-  const configPath = join(dir, 'skill-optimizer.json');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'export const TOOLS = [',
-        '  {',
-        "    type: 'function',",
-        '    function: {',
-        "      name: 'create_ticket',",
-        '      parameters: {',
-        "        type: 'object',",
-        '        properties: {',
-        "          title: { type: 'string' },",
-        '        },',
-        "        required: ['title'],",
-        '      },',
-        '    },',
-        '  },',
-        '];',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    writeFileSync(configPath, JSON.stringify({
-      name: 'mcp-actions-entrypoint',
-      target: {
-        surface: 'mcp',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./server.ts'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const project = await loadProjectConfig(configPath);
-    const catalog = discoverActions(project);
-    assertEqual(catalog.surface, 'mcp', 'surface should be mcp');
-    assertEqual(catalog.actions.length, 1, 'should discover one mcp action');
-    assertEqual(catalog.actions[0].key, 'create_ticket', 'action key should match tool name');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discoverActions fails fast when MCP discovery source is missing and no fallback exists', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'mcp-actions-missing-source-'));
-  const configPath = join(dir, 'skill-optimizer.json');
-
-  try {
-    writeFileSync(configPath, JSON.stringify({
-      name: 'missing-source-mcp-actions',
-      target: {
-        surface: 'mcp',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./missing-server.ts'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    let threw = false;
-    try {
-      await loadProjectConfig(configPath);
-    } catch (error: any) {
-      threw = true;
-      assert(
-        error.message.includes('does not exist'),
-        `missing source error should mention missing path, got: ${error.message}`,
-      );
-    }
-
-    assert(threw, 'missing discovery source should fail fast when no fallback exists');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-discovery-sdk.ts b/tests/smoke-discovery-sdk.ts
deleted file mode 100644
index bb9806b..0000000
--- a/tests/smoke-discovery-sdk.ts
+++ /dev/null
@@ -1,344 +0,0 @@
-import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join } from 'node:path';
-
-import { discoverSdkSurfaceFromSources } from '../src/discovery/sdk.js';
-import { discoverActions } from '../src/actions/index.js';
-import { buildSurfaceSnapshot, loadProjectConfig } from '../src/project/index.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  + ${name}`);
-  } catch (error: any) {
-    failed++;
-    console.log(`  - ${name}`);
-    console.log(`    ${error.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string): void {
-  if (!condition) {
-    throw new Error(`Assertion failed: ${message}`);
-  }
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string): void {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-console.log('\n=== SDK Discovery Smoke Tests ===\n');
-
-await test('discovers exported class constructor and methods', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-'));
-  const sourcePath = join(dir, 'client.ts');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'export class MyClient {',
-        '  constructor(network: string, retries = 3) {}',
-        '  public getBalance(accountId: string) { return accountId; }',
-        '  private sign(secret: string) { return secret; }',
-        '  protected refresh() {}',
-        '  static fromKey(key: string) { return new MyClient(key); }',
-        '}',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const snapshot = discoverSdkSurfaceFromSources([sourcePath]);
-    assertEqual(snapshot.surface, 'sdk', 'surface should be sdk');
-
-    const names = snapshot.actions.map((action) => action.name).sort();
-    assertEqual(
-      names.join(','),
-      'MyClient.constructor,MyClient.fromKey,MyClient.getBalance',
-      'should discover class callable actions',
-    );
-
-    const ctor = snapshot.actions.find((action) => action.name === 'MyClient.constructor');
-    assert(ctor !== undefined, 'constructor action should exist');
-    if (ctor) {
-      assertEqual(ctor.args.map((arg) => arg.name).join(','), 'network,retries', 'constructor arg names should match');
-      assertEqual(ctor.args[0].required, true, 'network should be required');
-      assertEqual(ctor.args[1].required, false, 'retries should be optional');
-    }
-
-    const getBalance = snapshot.actions.find((action) => action.name === 'MyClient.getBalance');
-    assert(getBalance !== undefined, 'getBalance action should exist');
-    if (getBalance) {
-      assertEqual(getBalance.args.map((arg) => arg.name).join(','), 'accountId', 'method arg names should match');
-    }
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovers exported standalone functions and default exports', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-'));
-  const sourcePath = join(dir, 'functions.ts');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'export function sendTokens(to: string, amount: number) {}',
-        'export default function createClient(network: string) { return network; }',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const snapshot = discoverSdkSurfaceFromSources([sourcePath]);
-    const names = snapshot.actions.map((action) => action.name).sort();
-
-    assertEqual(
-      names.join(','),
-      'createClient,sendTokens',
-      'should discover exported named and default functions',
-    );
-
-    const sendTokens = snapshot.actions.find((action) => action.name === 'sendTokens');
-    assert(sendTokens !== undefined, 'sendTokens should exist');
-    if (sendTokens) {
-      assertEqual(sendTokens.args.map((arg) => arg.name).join(','), 'to,amount', 'function arg names should match');
-    }
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovers re-exported SDK actions from a barrel file', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-'));
-  const barrelPath = join(dir, 'index.ts');
-  const clientPath = join(dir, 'client.ts');
-
-  try {
-    writeFileSync(
-      clientPath,
-      [
-        'export class MyClient {',
-        '  constructor(network: string) {}',
-        '  getBalance(accountId: string) { return accountId; }',
-        '}',
-      ].join('\n'),
-      'utf-8',
-    );
-    writeFileSync(barrelPath, 'export { MyClient } from "./client";\n', 'utf-8');
-
-    const snapshot = discoverSdkSurfaceFromSources([barrelPath]);
-    const names = snapshot.actions.map((action) => action.name).sort();
-    assertEqual(names.join(','), 'MyClient.constructor,MyClient.getBalance', 'barrel discovery should follow named re-exports');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovers re-exported SDK actions when barrel uses explicit file extensions', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-'));
-  const barrelPath = join(dir, 'index.ts');
-  const clientPath = join(dir, 'client.js');
-
-  try {
-    writeFileSync(
-      clientPath,
-      [
-        'export class MyClient {',
-        '  constructor(network) {}',
-        '  getBalance(accountId) { return accountId; }',
-        '}',
-      ].join('\n'),
-      'utf-8',
-    );
-    writeFileSync(barrelPath, 'export { MyClient } from "./client.js";\n', 'utf-8');
-
-    const snapshot = discoverSdkSurfaceFromSources([barrelPath]);
-    const names = snapshot.actions.map((action) => action.name).sort();
-    assertEqual(names.join(','), 'MyClient.constructor,MyClient.getBalance', 'explicit-extension re-export should be followed');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovers alias names for re-exported SDK actions', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-'));
-  const barrelPath = join(dir, 'index.ts');
-  const clientPath = join(dir, 'client.ts');
-
-  try {
-    writeFileSync(
-      clientPath,
-      [
-        'export class MyClient {',
-        '  constructor(network: string) {}',
-        '  getBalance(accountId: string) { return accountId; }',
-        '}',
-      ].join('\n'),
-      'utf-8',
-    );
-    writeFileSync(barrelPath, 'export { MyClient as Client } from "./client";\n', 'utf-8');
-
-    const snapshot = discoverSdkSurfaceFromSources([barrelPath]);
-    const names = snapshot.actions.map((action) => action.name).sort();
-    assertEqual(names.join(','), 'Client.constructor,Client.getBalance', 're-export alias should become the public SDK action prefix');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('project snapshot falls back to sdk.apiSurface when discovery returns zero actions', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-'));
-  const sourcePath = join(dir, 'index.ts');
-  const configPath = join(dir, 'skill-optimizer.json');
-
-  try {
-    writeFileSync(sourcePath, 'const sendTokens = () => {}; export { sendTokens };\n', 'utf-8');
-    writeFileSync(configPath, JSON.stringify({
-      name: 'sdk-fallback',
-      target: {
-        surface: 'sdk',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./index.ts'],
-          language: 'typescript',
-        },
-        sdk: {
-          language: 'typescript',
-          apiSurface: ['sendTokens'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const project = await loadProjectConfig(configPath);
-    const snapshot = buildSurfaceSnapshot(project);
-    assertEqual(snapshot.actions.length, 1, 'sdk apiSurface should be used as fallback');
-    assertEqual(snapshot.actions[0]?.name, 'sendTokens', 'fallback action should come from sdk.apiSurface');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('parses files statically and never executes source code', () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-'));
-  const sourcePath = join(dir, 'safe.ts');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        "throw new Error('this must never run during discovery');",
-        'export class Wallet {',
-        '  transfer(to: string) {}',
-        '}',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    const snapshot = discoverSdkSurfaceFromSources([sourcePath]);
-    assertEqual(snapshot.actions.length, 1, 'should discover action from static parse');
-    assertEqual(snapshot.actions[0].name, 'Wallet.transfer', 'discovered action should match exported class method');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discovers sdk actions via public action discovery entrypoint', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-actions-'));
-  const sourcePath = join(dir, 'client.ts');
-  const configPath = join(dir, 'skill-optimizer.json');
-
-  try {
-    writeFileSync(
-      sourcePath,
-      [
-        'export class Wallet {',
-        '  constructor(network: string) {}',
-        '  send(to: string, amount: number) {}',
-        '}',
-      ].join('\n'),
-      'utf-8',
-    );
-
-    writeFileSync(configPath, JSON.stringify({
-      name: 'sdk-actions-entrypoint',
-      target: {
-        surface: 'sdk',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./client.ts'],
-          language: 'typescript',
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const project = await loadProjectConfig(configPath);
-    const catalog = discoverActions(project);
-    assertEqual(catalog.surface, 'sdk', 'surface should be sdk');
-
-    const actionKeys = catalog.actions.map((action) => action.key).sort();
-    assertEqual(actionKeys.join(','), 'Wallet.constructor,Wallet.send', 'action keys should match discovered sdk methods');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('discoverActions falls back to sdk.apiSurface when discovery returns zero actions', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'sdk-discovery-actions-fallback-'));
-  const sourcePath = join(dir, 'index.ts');
-  const configPath = join(dir, 'skill-optimizer.json');
-
-  try {
-    writeFileSync(sourcePath, 'const sendTokens = () => {}; export { sendTokens };\n', 'utf-8');
-    writeFileSync(configPath, JSON.stringify({
-      name: 'sdk-actions-fallback-entrypoint',
-      target: {
-        surface: 'sdk',
-        repoPath: '.',
-        discovery: {
-          mode: 'auto',
-          sources: ['./index.ts'],
-          language: 'typescript',
-        },
-        sdk: {
-          language: 'typescript',
-          apiSurface: ['sendTokens'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const project = await loadProjectConfig(configPath);
-    const catalog = discoverActions(project);
-    assertEqual(catalog.surface, 'sdk', 'surface should be sdk');
-    assertEqual(catalog.actions.length, 1, 'sdk apiSurface should be used as fallback');
-    assertEqual(catalog.actions[0]?.key, 'sendTokens', 'fallback action key should come from sdk.apiSurface');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-dry-run.ts b/tests/smoke-dry-run.ts
deleted file mode 100644
index 707364c..0000000
--- a/tests/smoke-dry-run.ts
+++ /dev/null
@@ -1,77 +0,0 @@
-import { strict as assert } from 'node:assert';
-import { spawnSync } from 'node:child_process';
-import { mkdtempSync, writeFileSync, rmSync, readFileSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join, resolve } from 'node:path';
-
-const REPO_ROOT = resolve(import.meta.dirname, '..');
-const MOCK_CONFIG_REL = 'mock-repos/mcp-tracker-demo/skill-optimizer.json';
-const MOCK_CONFIG_ABS = resolve(REPO_ROOT, MOCK_CONFIG_REL);
-
-function run(args: string[]) {
-  return spawnSync('npx', ['tsx', 'src/cli.ts', ...args], {
-    encoding: 'utf-8',
-    env: {
-      ...process.env,
-      // Intentionally wipe API keys — dry-run must not need them.
-      OPENROUTER_API_KEY: '',
-    },
-    cwd: REPO_ROOT,
-  });
-}
-
-function testDryRunNoLLM() {
-  const result = run(['--dry-run', '--config', MOCK_CONFIG_REL]);
-  assert.strictEqual(result.status, 0, `dry-run failed: ${result.stderr}`);
-  assert.ok(result.stdout.includes('=== skill-optimizer dry run ==='));
-  assert.ok(result.stdout.includes('No LLM calls made'));
-  console.log('PASS: --dry-run succeeds with zero API keys, zero LLM calls');
-}
-
-function testDryRunMaxTasksTooSmall() {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-opt-dry-'));
-  try {
-    const base = JSON.parse(readFileSync(MOCK_CONFIG_ABS, 'utf-8')) as Record<string, unknown>;
-    const mockDir = resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo');
-    const baseTarget = base.target as Record<string, unknown>;
-    const baseDiscovery = (baseTarget.discovery ?? {}) as Record<string, unknown>;
-    const baseSources = (baseDiscovery.sources ?? []) as string[];
-    const baseBenchmark = base.benchmark as Record<string, unknown>;
-    const baseTaskGen = (baseBenchmark.taskGeneration ?? {}) as Record<string, unknown>;
-
-    // Resolve paths to absolute so they work from the temp dir
-    base.target = {
-      ...baseTarget,
-      repoPath: mockDir,
-      ...(baseTarget.skill && { skill: resolve(mockDir, String(baseTarget.skill)) }),
-      discovery: {
-        ...baseDiscovery,
-        sources: baseSources.map((s) => resolve(mockDir, s)),
-      },
-    };
-    base.benchmark = {
-      ...baseBenchmark,
-      taskGeneration: { ...baseTaskGen, enabled: true, maxTasks: 1 },
-    };
-    const cfgPath = join(dir, 'skill-optimizer.json');
-    writeFileSync(cfgPath, JSON.stringify(base, null, 2));
-
-    const result = run(['--dry-run', '--config', cfgPath]);
-    assert.ok(result.status !== null, `process was killed by signal: ${result.signal}`);
-    assert.notStrictEqual(result.status, 0, `expected non-zero exit, got: ${result.stdout}`);
-    const combined = result.stderr + result.stdout;
-    assert.ok(combined.includes('maxTasks'), `expected maxTasks in output, got: ${combined}`);
-    assert.ok(combined.includes('in-scope'), `expected in-scope in output, got: ${combined}`);
-    console.log('PASS: --dry-run rejects maxTasks < scope_size');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-async function main() {
-  testDryRunNoLLM();
-  testDryRunMaxTasksTooSmall();
-  console.log('\nALL PASS: smoke-dry-run');
-}
-
-main().catch((err) => { console.error('FAIL: smoke-dry-run', err); process.exit(1); });
diff --git a/tests/smoke-e2e.ts b/tests/smoke-e2e.ts
deleted file mode 100644
index 82ce0ae..0000000
--- a/tests/smoke-e2e.ts
+++ /dev/null
@@ -1,221 +0,0 @@
-/**
- * smoke-e2e.ts — end-to-end optimize loop smoke test with deterministic in-memory mocks.
- * No real LLM or git calls are made.
- */
-
-import { strict as assert } from 'node:assert';
-import { mkdirSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join } from 'node:path';
-
-import { runOptimizeLoop } from '../src/optimizer/loop.js';
-import type { BenchmarkReport } from '../src/benchmark/types.js';
-import type { OptimizeManifest } from '../src/optimizer/types.js';
-
-// ---------------------------------------------------------------------------
-// Helper: build a BenchmarkReport from a pass/fail matrix
-// matrix: { taskId -> { modelId -> passed } }
-// ---------------------------------------------------------------------------
-function buildReport(matrix: Record<string, Record<string, boolean>>): BenchmarkReport {
-  const tasks = Object.keys(matrix);
-  const modelIds = [...new Set(tasks.flatMap((t) => Object.keys(matrix[t]!)))];
-
-  const perModel: Record<
-    string,
-    {
-      passRate: number;
-      avgRecall: number;
-      avgPrecision: number;
-      avgToolSelectionAccuracy: number;
-      avgArgAccuracy: number;
-      avgHallucinationRate: number;
-      tasksRun: number;
-    }
-  > = {};
-
-  for (const m of modelIds) {
-    const passed = tasks.filter((t) => matrix[t]![m]).length;
-    perModel[m] = {
-      passRate: passed / tasks.length,
-      avgRecall: passed / tasks.length,
-      avgPrecision: 1,
-      avgToolSelectionAccuracy: 1,
-      avgArgAccuracy: 1,
-      avgHallucinationRate: 0,
-      tasksRun: tasks.length,
-    };
-  }
-
-  const overall =
-    modelIds.length > 0
-      ? modelIds.reduce((a, m) => a + perModel[m]!.passRate, 0) / modelIds.length
-      : 0;
-
-  return {
-    timestamp: new Date().toISOString(),
-    config: { name: 'e2e-smoke', surface: 'mcp' } as BenchmarkReport['config'],
-    skillVersion: {
-      source: 'local',
-      commitSha: 'local',
-      ref: 'file',
-      fetchedAt: new Date().toISOString(),
-    },
-    results: [],
-    coverage: [],
-    summary: {
-      totalTasks: tasks.length,
-      totalModels: modelIds.length,
-      totalEvaluations: tasks.length * modelIds.length,
-      overallPassRate: overall,
-      weightedAverage: overall,
-      avgToolRecall: 0,
-      avgToolPrecision: 0,
-      avgToolSelectionAccuracy: 0,
-      avgArgAccuracy: 0,
-      avgHallucinationRate: 0,
-      methodCoveragePercent: 1,
-      perModel,
-      perTask: {},
-      perTier: {
-        flagship: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 },
-        mid: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 },
-        low: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 },
-      },
-    },
-  };
-}
-
-// ---------------------------------------------------------------------------
-// Main test
-// ---------------------------------------------------------------------------
-async function testFullLoopReachesPass(): Promise<void> {
-  // Three benchmark results: baseline 0% -> 50% -> 100%
-  const reports: BenchmarkReport[] = [
-    buildReport({ a: { m1: false, m2: false }, b: { m1: false, m2: false } }), // 0%
-    buildReport({ a: { m1: true, m2: true }, b: { m1: false, m2: false } }),   // 50%
-    buildReport({ a: { m1: true, m2: true }, b: { m1: true, m2: true } }),     // 100%
-  ];
-
-  // benchmarkCallIndex tracks how many times benchmark.run has been called.
-  // Index 0 = baseline, 1 = iteration-1, 2 = iteration-2, ...
-  let benchmarkCallIndex = 0;
-
-  // Use a real temp directory so mkdirSync inside the loop doesn't fail.
-  const tmpDir = join(tmpdir(), `smoke-e2e-${Date.now()}`);
-  mkdirSync(tmpDir, { recursive: true });
-
-  // The manifest uses OptimizeManifest (unresolved) shape.
-  // requireCleanGit must not be false (loop.ts throws if it is exactly false).
-  const manifest: OptimizeManifest = {
-    benchmarkConfig: join(tmpDir, 'benchmark.json'), // must be a string path
-    targetRepo: {
-      path: tmpDir,
-      surface: 'mcp',
-      allowedPaths: ['SKILL.md'],
-      validation: [],
-      requireCleanGit: undefined as unknown as boolean, // omit = treated as true
-    },
-    optimizer: {
-      maxIterations: 3,
-      stabilityWindow: 3, // large window so we don't stop early on stable
-      minImprovement: 0.0,  // accept any improvement (even 0 delta)
-      perModelFloor: 0,
-      targetWeightedAverage: 0,
-      models: [
-        { id: 'm1', name: 'M1', tier: 'flagship' },
-        { id: 'm2', name: 'M2', tier: 'mid' },
-      ],
-      taskGeneration: {
-        enabled: false,
-        outputDir: join(tmpDir, '.skill-optimizer'),
-      },
-    },
-  };
-
-  const deps = {
-    repo: {
-      ensureReady: async (_targetRepo: unknown): Promise<string> => {
-        return tmpDir;
-      },
-      captureCheckpoint: async (_targetRepo: unknown): Promise<string> => {
-        return 'mock-checkpoint-sha';
-      },
-      restoreCheckpoint: async (_targetRepo: unknown, _checkpoint: string): Promise<void> => {
-        // no-op
-      },
-      updateAcceptedCheckpoint: async (
-        _targetRepo: unknown,
-        _prevCheckpoint: string,
-        _candidate: unknown,
-        _changedFiles?: string[],
-      ): Promise<string> => {
-        return 'mock-updated-sha';
-      },
-    },
-    benchmark: {
-      run: async (
-        _configPath: string,
-        _opts: { outputDir: string; label: string },
-      ): Promise<{ report: BenchmarkReport; reportPath: string }> => {
-        const idx = Math.min(benchmarkCallIndex, reports.length - 1);
-        const report = reports[idx]!;
-        benchmarkCallIndex += 1;
-        return { report, reportPath: join(tmpDir, `report-${benchmarkCallIndex}.json`) };
-      },
-    },
-    mutation: {
-      apply: async (_context: unknown): Promise<{
-        summary: string;
-        changedFiles: string[];
-        toolActivity?: string[];
-      }> => {
-        return {
-          summary: 'mock mutation applied',
-          changedFiles: ['SKILL.md'],
-          toolActivity: [],
-        };
-      },
-    },
-    validation: {
-      run: async (_targetRepo: unknown): Promise<{ ok: boolean; commands: unknown[] }> => {
-        return { ok: true, commands: [] };
-      },
-    },
-    ledger: {
-      record: async (_event: Record<string, unknown>): Promise<void> => {
-        // no-op
-      },
-    },
-  };
-
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  const result = await runOptimizeLoop(manifest, deps as any);
-
-  assert.ok(result.bestReport, 'bestReport must be present');
-  assert.strictEqual(
-    result.bestReport.summary.overallPassRate,
-    1.0,
-    `best report should reach 100% — got ${result.bestReport.summary.overallPassRate}`,
-  );
-
-  const validStopReasons: string[] = ['max-iterations', 'stable', 'target-hit'];
-  assert.ok(
-    validStopReasons.includes(result.stopReason),
-    `stopReason should be one of ${validStopReasons.join(', ')}, got ${result.stopReason}`,
-  );
-
-  console.log(`PASS: full optimize loop reached 100% pass (stopReason=${result.stopReason})`);
-}
-
-// ---------------------------------------------------------------------------
-// Entry point
-// ---------------------------------------------------------------------------
-async function main(): Promise<void> {
-  await testFullLoopReachesPass();
-  console.log('\nALL PASS: smoke-e2e');
-}
-
-main().catch((err) => {
-  console.error('FAIL: smoke-e2e', err);
-  process.exit(1);
-});
diff --git a/tests/smoke-errors.ts b/tests/smoke-errors.ts
deleted file mode 100644
index 7d538c4..0000000
--- a/tests/smoke-errors.ts
+++ /dev/null
@@ -1,144 +0,0 @@
-import { strict as assert } from 'node:assert';
-import { spawnSync } from 'node:child_process';
-import { mkdtempSync, writeFileSync, rmSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join, resolve } from 'node:path';
-
-const REPO_ROOT = resolve(import.meta.dirname, '..');
-
-function run(args: string[], env: Record<string, string | undefined> = {}) {
-  return spawnSync('npx', ['tsx', 'src/cli.ts', ...args], {
-    encoding: 'utf-8',
-    env: { ...process.env, ...env },
-    cwd: REPO_ROOT,
-  });
-}
-
-function writeTmpConfig(partial: Record<string, unknown>) {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-opt-err-'));
-  const path = join(dir, 'skill-optimizer.json');
-  writeFileSync(path, JSON.stringify(partial, null, 2));
-  return { dir, path };
-}
-
-function testConfigNotFound() {
-  const result = run(['run', '--config', '/nonexistent/skill-optimizer.json']);
-  assert.notStrictEqual(result.status, 0);
-  assert.ok(result.stderr.includes('Project config not found'), `got: ${result.stderr}`);
-  assert.ok(result.stderr.includes('skill-optimizer init'), `got: ${result.stderr}`);
-  console.log('PASS: config-not-found error has next step');
-}
-
-
-function testInvalidJson() {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-opt-inv-'));
-  try {
-    const p = join(dir, 'skill-optimizer.json');
-    writeFileSync(p, '{not json');
-    const result = run(['run', '--config', p]);
-    assert.notStrictEqual(result.status, 0);
-    assert.ok(result.stderr.includes('Invalid JSON'), `got: ${result.stderr}`);
-    console.log('PASS: invalid JSON error identifies file');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-function testMissingApiKeyOnRun() {
-  // Only run if a runnable mock config exists
-  const result = run(['run', '--config', 'mock-repos/mcp-tracker-demo/skill-optimizer.json'], { OPENROUTER_API_KEY: '' });
-  const combined = result.stderr + result.stdout;
-  if (result.status !== 0 && combined.includes('OPENROUTER_API_KEY')) {
-    console.log('PASS: missing API key error names env var');
-  } else if (result.status !== 0) {
-    // Accept other failures (e.g. network timeout) without asserting API key message
-    console.log('SKIP: missing API key test — CLI failed for another reason');
-  } else {
-    console.log('SKIP: missing API key test — CLI did not reach LLM stage');
-  }
-}
-
-function testEmptyScope() {
-  const { dir, path } = writeTmpConfig({
-    name: 'empty-scope',
-    target: {
-      surface: 'mcp',
-      repoPath: resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo'),
-      scope: { include: ['NONE.*'] },
-      mcp: { tools: resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo/tools.json') },
-      skill: resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo/SKILL.md'),
-    },
-    benchmark: {
-      models: [{ id: 'openrouter/test/mock', name: 'Mock', tier: 'mid' }],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  });
-  try {
-    const result = run(['--dry-run', '--config', path]);
-    assert.notStrictEqual(result.status, 0);
-    assert.ok((result.stderr + result.stdout).match(/zero in-scope actions/), `got: ${result.stderr + result.stdout}`);
-    console.log('PASS: empty scope error');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-function testMaxTasksTooSmall() {
-  const { dir, path } = writeTmpConfig({
-    name: 'too-few-tasks',
-    target: {
-      surface: 'mcp',
-      repoPath: resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo'),
-      mcp: { tools: resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo/tools.json') },
-      skill: resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo/SKILL.md'),
-    },
-    benchmark: {
-      models: [{ id: 'openrouter/test/mock', name: 'Mock', tier: 'mid' }],
-      taskGeneration: { enabled: true, maxTasks: 1 },
-    },
-  });
-  try {
-    const result = run(['--dry-run', '--config', path]);
-    assert.notStrictEqual(result.status, 0);
-    assert.ok((result.stderr + result.stdout).includes('maxTasks'), `got: ${result.stderr + result.stdout}`);
-    console.log('PASS: maxTasks-too-small preflight error');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-function testRepoPathMissing() {
-  const { dir, path } = writeTmpConfig({
-    name: 'no-repo',
-    target: {
-      surface: 'mcp',
-      repoPath: '/nonexistent/repo/at/all',
-      mcp: { tools: resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo/tools.json') },
-    },
-    benchmark: {
-      models: [{ id: 'openrouter/test/mock', name: 'Mock', tier: 'mid' }],
-      tasks: resolve(REPO_ROOT, 'mock-repos/mcp-tracker-demo/tasks.json'),
-    },
-  });
-  try {
-    const result = run(['run', '--config', path]);
-    assert.notStrictEqual(result.status, 0);
-    const combined = (result.stderr + result.stdout).toLowerCase();
-    assert.ok(combined.includes('repopath') || combined.includes('not found') || combined.includes('does not exist'), `got: ${result.stderr + result.stdout}`);
-    console.log('PASS: repoPath-missing error reported');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-async function main() {
-  testConfigNotFound();
-  testInvalidJson();
-  testMissingApiKeyOnRun();
-  testEmptyScope();
-  testMaxTasksTooSmall();
-  testRepoPathMissing();
-  console.log('\nALL PASS: smoke-errors');
-}
-
-main().catch((err) => { console.error('FAIL: smoke-errors', err); process.exit(1); });
diff --git a/tests/smoke-feedback.ts b/tests/smoke-feedback.ts
deleted file mode 100644
index 6384268..0000000
--- a/tests/smoke-feedback.ts
+++ /dev/null
@@ -1,111 +0,0 @@
-import { strict as assert } from 'node:assert';
-
-import type { TaskResult } from '../src/benchmark/types.js';
-import { extractFailureDetails } from '../src/optimizer/feedback/failure-details.js';
-import { detectPatterns } from '../src/optimizer/feedback/patterns.js';
-import { buildPassingFailingDiff } from '../src/optimizer/feedback/passing-failing-diff.js';
-
-function mkResult(opts: {
-  taskId: string;
-  modelId: string;
-  modelName: string;
-  passed: boolean;
-  expected: { name: string; args?: Record<string, unknown> };
-  extracted?: Array<{ name: string; args?: Record<string, unknown> }>;
-  error?: string;
-}): TaskResult {
-  return {
-    task: { id: opts.taskId, prompt: 'p', expected_actions: [opts.expected] },
-    model: { id: opts.modelId, name: opts.modelName, tier: 'mid' },
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls: (opts.extracted ?? []).map((c) => ({ method: c.name, args: c.args ?? {} })) as any,
-    actionMatches: [{
-      expected: opts.expected,
-      found: opts.passed ? ({ method: opts.expected.name, args: opts.expected.args ?? {} } as any) : null,
-      methodFound: opts.passed,
-      argsCorrect: opts.passed,
-      matched: opts.passed,
-    }],
-    metrics: {
-      toolPrecision: 0,
-      toolRecall: 0,
-      taskPassed: opts.passed,
-      toolSelectionAccuracy: opts.passed ? 1 : 0,
-      argAccuracy: opts.passed ? 1 : 0,
-      unnecessaryActions: [],
-      hallucinatedActions: opts.extracted?.filter((c) => c.name !== opts.expected.name).map((c) => c.name) ?? [],
-      hallucinationRate: 0,
-    },
-    llmLatencyMs: 0,
-    error: opts.error,
-  };
-}
-
-function testMissingToolKind() {
-  const result = mkResult({
-    taskId: 't1', modelId: 'm1', modelName: 'M1', passed: false,
-    expected: { name: 'Wallet.send', args: { amount: 10 } },
-    extracted: [{ name: 'Wallet.transfer' }],
-  });
-  const [detail] = extractFailureDetails([result]);
-  assert.strictEqual(detail.kind, 'missing-tool');
-  assert.ok(detail.mismatch_detail.includes('Wallet.transfer'));
-  console.log('PASS: missing-tool detail');
-}
-
-function testBadArgsKind() {
-  const result = mkResult({
-    taskId: 't2', modelId: 'm1', modelName: 'M1', passed: false,
-    expected: { name: 'Wallet.send', args: { amount: 10 } },
-    extracted: [{ name: 'Wallet.send', args: { amount: 'ten' } }],
-  });
-  const [detail] = extractFailureDetails([result]);
-  assert.strictEqual(detail.kind, 'bad-args');
-  console.log('PASS: bad-args detail');
-}
-
-function testErrorKind() {
-  const result = mkResult({
-    taskId: 't3', modelId: 'm1', modelName: 'M1', passed: false,
-    expected: { name: 'Wallet.send' },
-    error: 'rate limited',
-  });
-  const [detail] = extractFailureDetails([result]);
-  assert.strictEqual(detail.kind, 'error');
-  assert.ok(detail.mismatch_detail.includes('rate limited'));
-  console.log('PASS: error detail');
-}
-
-function testPatternDetection() {
-  const details = [
-    { task_id: 't1', model_id: 'a', kind: 'missing-tool' as const, expected_action: 'Wallet.send', expected_args: {}, actual_calls: [{ action: 'Wallet.transfer', args: {} }], mismatch_detail: '' },
-    { task_id: 't1', model_id: 'b', kind: 'missing-tool' as const, expected_action: 'Wallet.send', expected_args: {}, actual_calls: [{ action: 'Wallet.transfer', args: {} }], mismatch_detail: '' },
-    { task_id: 't2', model_id: 'c', kind: 'missing-tool' as const, expected_action: 'Wallet.send', expected_args: {}, actual_calls: [{ action: 'Wallet.transfer', args: {} }], mismatch_detail: '' },
-  ];
-  const patterns = detectPatterns(details);
-  assert.ok(patterns.some((p) => p.kind === 'systematic-hallucination' && p.summary.includes('Wallet.transfer')));
-  console.log('PASS: systematic hallucination pattern detected');
-}
-
-function testPassingFailingDiff() {
-  const passing = mkResult({ taskId: 't1', modelId: 'a', modelName: 'A', passed: true, expected: { name: 'Wallet.send' }, extracted: [{ name: 'Wallet.send' }] });
-  const failing = mkResult({ taskId: 't1', modelId: 'b', modelName: 'B', passed: false, expected: { name: 'Wallet.send' }, extracted: [{ name: 'Wallet.transfer' }] });
-  const diff = buildPassingFailingDiff([passing, failing]);
-  const t1 = diff.find((d) => d.task_id === 't1');
-  assert.ok(t1);
-  assert.deepStrictEqual(t1!.passing_models.sort(), ['A']);
-  assert.deepStrictEqual(t1!.failing_models.sort(), ['B']);
-  console.log('PASS: passing/failing diff split by model');
-}
-
-async function main() {
-  testMissingToolKind();
-  testBadArgsKind();
-  testErrorKind();
-  testPatternDetection();
-  testPassingFailingDiff();
-  console.log('\nALL PASS: smoke-feedback');
-}
-
-main().catch((err) => { console.error('FAIL: smoke-feedback', err); process.exit(1); });
diff --git a/tests/smoke-gen-docs.ts b/tests/smoke-gen-docs.ts
deleted file mode 100644
index b059d29..0000000
--- a/tests/smoke-gen-docs.ts
+++ /dev/null
@@ -1,77 +0,0 @@
-import { strict as assert } from 'node:assert';
-import { readFileSync, existsSync } from 'node:fs';
-import { spawnSync } from 'node:child_process';
-import { resolve } from 'node:path';
-
-const REPO_ROOT = resolve(import.meta.dirname, '..');
-
-async function test(name: string, fn: () => void | Promise<void>): Promise<void> {
-  try {
-    await fn();
-    console.log(`PASS: ${name}`);
-  } catch (e) {
-    const message = e instanceof Error ? e.message : String(e);
-    console.error(`FAIL: ${name} — ${message}`);
-    process.exit(1);
-  }
-}
-
-const ERRORS_MD = resolve(REPO_ROOT, 'docs/reference/errors.md');
-const CONFIG_SCHEMA_MD = resolve(REPO_ROOT, 'docs/reference/config-schema.md');
-
-function assertContainsAllErrorCodes(content: string, contextMsg: string): Promise<void> {
-  return import('../src/errors.js').then(({ ERRORS }) => {
-    for (const key of Object.keys(ERRORS)) {
-      assert.ok(content.includes(`\`${key}\``), `${contextMsg}: missing error code \`${key}\``);
-    }
-  });
-}
-
-async function main() {
-  await test('gen-docs: errors.md exists and contains AUTO-GENERATED header', () => {
-    assert.ok(existsSync(ERRORS_MD), `errors.md should exist at ${ERRORS_MD}`);
-    const content = readFileSync(ERRORS_MD, 'utf-8');
-    assert.ok(content.includes('AUTO-GENERATED'), 'errors.md should contain AUTO-GENERATED header comment');
-    assert.ok(content.includes('# Error Reference'), 'errors.md should contain "# Error Reference" heading');
-    assert.ok(
-      content.includes('| Code | Description | Quick fix |'),
-      'errors.md should contain summary table header'
-    );
-  });
-
-  await test('gen-docs: errors.md contains all ERRORS codes', async () => {
-    await assertContainsAllErrorCodes(readFileSync(ERRORS_MD, 'utf-8'), 'errors.md');
-  });
-
-  await test('gen-docs: config-schema.md exists and contains expected fields', () => {
-    assert.ok(existsSync(CONFIG_SCHEMA_MD), `config-schema.md should exist at ${CONFIG_SCHEMA_MD}`);
-    const content = readFileSync(CONFIG_SCHEMA_MD, 'utf-8');
-    assert.ok(content.includes('AUTO-GENERATED'), 'config-schema.md should contain AUTO-GENERATED header comment');
-    assert.ok(content.includes('# Config Schema Reference'), 'config-schema.md should contain "# Config Schema Reference" heading');
-    assert.ok(content.includes('`target.surface`'), 'config-schema.md should contain `target.surface` field');
-    assert.ok(content.includes('`benchmark.models`'), 'config-schema.md should contain `benchmark.models` field');
-    assert.ok(
-      content.includes('| Field | Type | Default | Description |'),
-      'config-schema.md should contain table header'
-    );
-  });
-
-  await test('gen-docs: script runs cleanly and output is stable', async () => {
-    const result = spawnSync('npm', ['run', 'gen-docs'], {
-      cwd: REPO_ROOT,
-      encoding: 'utf-8',
-      shell: true,
-    });
-    assert.strictEqual(
-      result.status,
-      0,
-      `gen-docs should exit 0, got ${result.status}. stderr: ${result.stderr}`
-    );
-
-    await assertContainsAllErrorCodes(readFileSync(ERRORS_MD, 'utf-8'), 'errors.md after re-run');
-  });
-
-  console.log('\nALL PASS: smoke-gen-docs');
-}
-
-main().catch((err) => { console.error('FAIL: smoke-gen-docs', err); process.exit(1); });
diff --git a/tests/smoke-generation.ts b/tests/smoke-generation.ts
deleted file mode 100644
index f786705..0000000
--- a/tests/smoke-generation.ts
+++ /dev/null
@@ -1,695 +0,0 @@
-import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
-import { join, resolve } from 'node:path';
-import { tmpdir } from 'node:os';
-
-import type { BenchmarkConfig, LLMResponse } from '../src/benchmark/types.js';
-import { evaluateTask } from '../src/benchmark/evaluator.js';
-import { extract } from '../src/benchmark/extractors/index.js';
-import {
-  discoverTaskSurface,
-  freezeTaskArtifacts,
-  generateCandidateTasks,
-  generateTasksForProject,
-  groundTasks,
-} from '../src/tasks/index.js';
-import type { GeneratedTask, TaskGeneratorDeps } from '../src/tasks/types.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-function makeFixture(): {
-  root: string;
-  benchmarkConfigPath: string;
-  skillPath: string;
-  sourcePath: string;
-} {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-generation-'));
-  const skillPath = join(root, 'SKILL.md');
-  const sourcePath = join(root, 'server.ts');
-  const benchmarkConfigPath = join(root, 'skill-optimizer.json');
-
-  writeFileSync(skillPath, '# Wallet skill\nUse MCP tools only.\n', 'utf-8');
-  writeFileSync(sourcePath, [
-    'export const TOOLS = [',
-    '  {',
-    "    type: 'function',",
-    '    function: {',
-    "      name: 'create_wallet',",
-    "      description: 'Create wallet',",
-    '      parameters: {',
-    "        type: 'object',",
-    '        properties: {',
-    "          label: { type: 'string' },",
-    '        },',
-    "        required: ['label'],",
-    '      },',
-    '    },',
-    '  },',
-    '  {',
-    "    type: 'function',",
-    '    function: {',
-    "      name: 'get_balance',",
-    "      description: 'Get balance',",
-    '      parameters: {',
-    "        type: 'object',",
-    '        properties: {',
-    "          address: { type: 'string' },",
-    '        },',
-    "        required: ['address'],",
-    '      },',
-    '    },',
-    '  },',
-    '] as const;',
-  ].join('\n'), 'utf-8');
-
-  writeFileSync(benchmarkConfigPath, JSON.stringify({
-    name: 'gen-smoke',
-    target: {
-      surface: 'mcp',
-      repoPath: '.',
-      skill: './SKILL.md',
-      discovery: {
-        mode: 'auto',
-        sources: ['./server.ts'],
-      },
-    },
-    benchmark: {
-      tasks: './tasks.json',
-      format: 'pi',
-      models: [{ id: 'openai/test', name: 'Test', tier: 'flagship' }],
-    },
-  }, null, 2), 'utf-8');
-
-  return { root, benchmarkConfigPath, skillPath, sourcePath };
-}
-
-console.log('\n=== Task Generation Smoke Tests ===\n');
-
-await test('discoverTaskSurface: resolves and loads skill/snapshot', async () => {
-  const fixture = makeFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    assertEqual(surface.skillPath, resolve(fixture.skillPath), 'skill path should resolve absolute');
-    assert(surface.skillMarkdown.includes('Wallet skill'), 'skill markdown should be loaded');
-    assertEqual(surface.snapshot.surface, 'mcp', 'surface should be mcp');
-    assertEqual(surface.snapshot.actions.length, 2, 'should load discovered actions');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('generateCandidateTasks: parses strict JSON response', async () => {
-  const fixture = makeFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({
-          tasks: [
-            {
-              id: 'task-create',
-              prompt: 'Create a wallet named alpha.',
-              expected_actions: [
-                { name: 'create_wallet', args: { label: 'alpha' } },
-              ],
-            },
-          ],
-        });
-      },
-    };
-
-    const generated = await generateCandidateTasks(surface, { maxTasks: 5, seed: 7 }, deps);
-    assertEqual(generated.length, 1, 'should parse one task');
-    // ID is derived from action names (content-stable hash), not the LLM-supplied id field
-    assert(/^[0-9a-f]{12}$/.test(generated[0].id), 'task id should be a 12-char hex hash of action names');
-    assertEqual(generated[0].prompt, 'Create a wallet named alpha.', 'task prompt should match');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('generateCandidateTasks: enforces maxTasks cap after parsing', async () => {
-  const fixture = makeFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({
-          tasks: [
-            { id: 't1', prompt: 'one', expected_actions: [{ name: 'create_wallet', args: { label: 'one' } }] },
-            { id: 't2', prompt: 'two', expected_actions: [{ name: 'create_wallet', args: { label: 'two' } }] },
-            { id: 't3', prompt: 'three', expected_actions: [{ name: 'create_wallet', args: { label: 'three' } }] },
-          ],
-        });
-      },
-    };
-
-    const generated = await generateCandidateTasks(surface, { maxTasks: 2, seed: 7 }, deps);
-    assertEqual(generated.length, 2, 'generator output should be capped to maxTasks');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('generateCandidateTasks: sanitizes unsafe task ids instead of throwing', async () => {
-  const fixture = makeFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({
-          tasks: [
-            { id: '../../escape', prompt: 'bad', expected_actions: [{ name: 'create_wallet', args: { label: 'bad' } }] },
-          ],
-        });
-      },
-    };
-    // Should not throw — sanitizes the id instead
-    const tasks = await generateCandidateTasks(surface, { maxTasks: 2, seed: 7 }, deps);
-    assert(tasks.length === 1, 'should return one task');
-    assert(!tasks[0].id.includes('/'), 'sanitized id must not contain path separators');
-    assert(tasks[0].id !== '..' && tasks[0].id !== '.', 'sanitized id must not be a dot-segment');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('generateCandidateTasks: sanitizes dot-segment task ids instead of throwing', async () => {
-  const fixture = makeFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({
-          tasks: [
-            { id: '..', prompt: 'bad', expected_actions: [{ name: 'create_wallet', args: { label: 'bad' } }] },
-          ],
-        });
-      },
-    };
-    // Should not throw — falls back to index-based id
-    const tasks = await generateCandidateTasks(surface, { maxTasks: 2, seed: 7 }, deps);
-    assert(tasks.length === 1, 'should return one task');
-    assert(tasks[0].id !== '..', 'dot-segment id must be replaced');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('generateCandidateTasks: rejects malformed top-level shape', async () => {
-  const fixture = makeFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({ not_tasks: [] });
-      },
-    };
-
-    let threw = false;
-    try {
-      await generateCandidateTasks(surface, { maxTasks: 3, seed: 1 }, deps);
-    } catch (error: any) {
-      threw = true;
-      assert(error.message.includes('top-level "tasks" array'), 'error should mention tasks array shape');
-    }
-
-    assert(threw, 'malformed response should throw');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('groundTasks: rejects unknown methods and invalid args', async () => {
-  const fixture = makeFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const tasks: GeneratedTask[] = [
-      {
-        id: 'ok-task',
-        prompt: 'Create a wallet.',
-        expected_actions: [{ name: 'create_wallet', args: { label: 'alpha' } }],
-      },
-      {
-        id: 'bad-method',
-        prompt: 'Call made up method.',
-        expected_actions: [{ name: 'delete_wallet', args: { id: 'x' } }],
-      },
-      {
-        id: 'bad-arg',
-        prompt: 'Use unknown arg key.',
-        expected_actions: [{ name: 'get_balance', args: { walletId: 'w1' } }],
-      },
-    ];
-
-    const result = groundTasks(tasks, surface.snapshot);
-    assertEqual(result.kept.length, 1, 'only valid task should remain');
-    assertEqual(result.rejected.length, 2, 'two tasks should be rejected');
-    assert(result.rejected.some((entry) => entry.reason.includes('unknown method')), 'should include unknown method rejection');
-    assert(result.rejected.some((entry) => entry.reason.includes('unknown arg key')), 'should include unknown arg key rejection');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('freezeGeneratedBenchmark: writes artifacts and absolute paths', async () => {
-  const fixture = makeFixture();
-  try {
-    writeFileSync(fixture.benchmarkConfigPath, JSON.stringify({
-      name: 'gen-smoke',
-      target: {
-        surface: 'mcp',
-        repoPath: '.',
-        skill: { source: './SKILL.md', cache: false },
-        discovery: {
-          mode: 'auto',
-          sources: ['./server.ts'],
-        },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openai/test', name: 'Test', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const outDir = join(fixture.root, 'generated');
-    const kept: GeneratedTask[] = [
-      {
-        id: 'frozen-task',
-        prompt: 'Create wallet and check balance.',
-        expected_actions: [
-          { name: 'create_wallet', args: { label: 'frozen' } },
-          { name: 'get_balance', args: { address: '<dynamic>' } },
-        ],
-      },
-    ];
-    const rejected = [{ task: kept[0], reason: 'example reason' }];
-
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const frozen = freezeTaskArtifacts({
-      project: surface.project,
-      snapshot: surface.snapshot,
-      outputDir: outDir,
-      kept,
-      rejected,
-    });
-
-    assert(existsSync(frozen.tasksPath), 'tasks.generated.json should exist');
-    assert(existsSync(frozen.benchmarkPath), 'benchmark.generated.json should exist');
-    assert(existsSync(frozen.logPath), 'generation.log.json should exist');
-    assert(existsSync(frozen.snapshotPath), 'surface.snapshot.json should exist');
-
-    const benchmark = JSON.parse(readFileSync(frozen.benchmarkPath, 'utf-8')) as {
-      target: { skill: { source: string; cache: boolean } };
-      benchmark: { authMode?: string; tasks: string; surfaceSnapshot: string };
-      optimize?: unknown;
-    };
-
-    assert(benchmark.target.skill.source.startsWith('/'), 'target.skill.source should be absolute');
-    assertEqual(benchmark.target.skill.cache, false, 'target.skill.cache should be preserved');
-    assertEqual(benchmark.benchmark.authMode, 'env', 'benchmark authMode should be preserved in generated config');
-    assertEqual(benchmark.benchmark.tasks, frozen.tasksPath, 'tasks should point at generated tasks path');
-    assertEqual(benchmark.benchmark.surfaceSnapshot, frozen.snapshotPath, 'surface snapshot should be pinned in generated config');
-    assertEqual(benchmark.optimize, undefined, 'generated benchmark config should omit optimize-only settings');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('generateTasksForProject: runs discover -> generate -> ground -> freeze', async () => {
-  const fixture = makeFixture();
-  try {
-    const outDir = join(fixture.root, 'end-to-end');
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({
-          tasks: [
-            {
-              id: 'kept-task',
-              prompt: 'Create wallet called beta.',
-              expected_actions: [{ name: 'create_wallet', args: { label: 'beta' } }],
-            },
-            {
-              id: 'balance-task',
-              prompt: 'Get balance for address 0x1.',
-              expected_actions: [{ name: 'get_balance', args: { address: '0x1' } }],
-            },
-            {
-              id: 'rejected-task',
-              prompt: 'Use unknown method.',
-              expected_actions: [{ name: 'delete_wallet', args: { id: 'x' } }],
-            },
-          ],
-        });
-      },
-    };
-
-    const result = await generateTasksForProject({
-      configPath: fixture.benchmarkConfigPath,
-      maxTasks: 10,
-      seed: 1,
-      outputDir: outDir,
-      deps,
-    });
-
-    assertEqual(result.kept.length, 2, 'two tasks should remain after grounding');
-    // IDs are now content-based hashes; verify by the action the task covers
-    assert(result.kept.some((t) => t.expected_actions.some(a => a.name === 'create_wallet')), 'task covering create_wallet should be kept');
-    assert(result.kept.some((t) => t.expected_actions.some(a => a.name === 'get_balance')), 'task covering get_balance should be kept');
-    assert(result.rejected.length >= 1, 'at least one rejected task expected');
-    assert(existsSync(result.artifacts.benchmarkPath), 'generated benchmark config should exist');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('discoverTaskSurface: supports sdk code-first projects', async () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-sdk-generation-'));
-  try {
-    writeFileSync(join(root, 'SKILL.md'), '# SDK skill\nUse SDK methods.\n', 'utf-8');
-    writeFileSync(join(root, 'index.ts'), 'export class Client { constructor(key: string) {} getBalance(accountId: string) {} }\n', 'utf-8');
-    writeFileSync(join(root, 'skill-optimizer.json'), JSON.stringify({
-      name: 'sdk-gen-smoke',
-      target: {
-        surface: 'sdk',
-        repoPath: '.',
-        skill: './SKILL.md',
-        discovery: { mode: 'auto', sources: ['./index.ts'], language: 'typescript' },
-        sdk: { language: 'typescript' },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openai/test', name: 'Test', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const surface = await discoverTaskSurface(join(root, 'skill-optimizer.json'));
-    assertEqual(surface.snapshot.surface, 'sdk', 'surface should be sdk');
-    assert(surface.snapshot.actions.some((action) => action.name === 'Client.getBalance'), 'sdk action should be discovered');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('discoverTaskSurface: supports cli code-first projects', async () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-cli-generation-'));
-  try {
-    writeFileSync(join(root, 'SKILL.md'), '# CLI skill\nUse commands.\n', 'utf-8');
-    writeFileSync(join(root, 'commands.ts'), [
-      'export const COMMANDS = [',
-      '  {',
-      "    command: 'wallet:create',",
-      '    options: [',
-      "      { name: '--label', takesValue: true },",
-      '    ],',
-      '  },',
-      '];',
-    ].join('\n'), 'utf-8');
-    writeFileSync(join(root, 'skill-optimizer.json'), JSON.stringify({
-      name: 'cli-gen-smoke',
-      target: {
-        surface: 'cli',
-        repoPath: '.',
-        skill: './SKILL.md',
-        discovery: { mode: 'auto', sources: ['./commands.ts'] },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openai/test', name: 'Test', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const surface = await discoverTaskSurface(join(root, 'skill-optimizer.json'));
-    assertEqual(surface.snapshot.surface, 'cli', 'surface should be cli');
-    assert(surface.snapshot.actions.some((action) => action.name === 'wallet:create'), 'cli action should be discovered');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-await test('cli discovery/task generation canonicalizes option keys for extraction and evaluation', async () => {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-cli-eval-'));
-  try {
-    writeFileSync(join(root, 'SKILL.md'), '# CLI skill\nUse commands.\n', 'utf-8');
-    writeFileSync(join(root, 'commands.ts'), [
-      'export const COMMANDS = [',
-      '  {',
-      "    command: 'wallet:create',",
-      '    options: [',
-      "      { name: '--label', takesValue: true },",
-      '    ],',
-      '  },',
-      '];',
-    ].join('\n'), 'utf-8');
-    writeFileSync(join(root, 'skill-optimizer.json'), JSON.stringify({
-      name: 'cli-eval-smoke',
-      target: {
-        surface: 'cli',
-        repoPath: '.',
-        skill: './SKILL.md',
-        discovery: { mode: 'auto', sources: ['./commands.ts'] },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openai/test', name: 'Test', tier: 'flagship' }],
-      },
-    }, null, 2), 'utf-8');
-
-    const surface = await discoverTaskSurface(join(root, 'skill-optimizer.json'));
-    assertEqual(surface.snapshot.actions[0]?.args[0]?.name, 'label', 'CLI arg names should be canonicalized without dashes');
-
-    const grounded = groundTasks([
-      {
-        id: 'cli-task',
-        prompt: 'Create a wallet.',
-        expected_actions: [{ name: 'wallet:create', args: { label: 'demo' } }],
-      },
-    ], surface.snapshot);
-    assertEqual(grounded.kept.length, 1, 'CLI task should ground against canonical arg name');
-
-    const config: BenchmarkConfig & { surface: 'cli'; cli: { commands: string; commandDefinitions: Array<{ command: string }> } } = {
-      name: 'cli-eval-smoke',
-      surface: 'cli',
-      cli: {
-        commands: 'commands.json',
-        commandDefinitions: [{ command: 'wallet:create' }],
-      },
-      tasks: 'tasks.json',
-      llm: {
-        baseUrl: '',
-        apiKeyEnv: 'OPENROUTER_API_KEY',
-        format: 'openai',
-        models: [{ id: 'openai/test', name: 'Test', tier: 'flagship' }],
-      },
-    };
-    const response: LLMResponse = {
-      content: '```bash\nwallet:create --label demo\n```',
-    };
-    const { calls } = await extract(response, config);
-    const result = evaluateTask({
-      task: grounded.kept[0],
-      model: config.llm.models[0],
-      surface: 'cli',
-      generatedCode: null,
-      rawResponse: response.content,
-      extractedCalls: calls,
-      llmLatencyMs: 0,
-      error: undefined,
-      knownMethods: new Set(['wallet:create']),
-    });
-    assertEqual(result.metrics.taskPassed, true, 'CLI canonical arg names should still match extracted command args');
-  } finally {
-    rmSync(root, { recursive: true, force: true });
-  }
-});
-
-// ── Prompt surface fixture ────────────────────────────────────────────────────
-
-function makePromptFixture(): {
-  root: string;
-  benchmarkConfigPath: string;
-  skillPath: string;
-  // Capability keys produced by the phase headings in the skill file.
-  capabilityKeys: { summarize: string; translate: string };
-} {
-  const root = mkdtempSync(join(tmpdir(), 'skill-optimizer-prompt-'));
-  const skillPath = join(root, 'SKILL.md');
-  const benchmarkConfigPath = join(root, 'skill-optimizer.json');
-
-  // Two ## Phase headings produce exactly two capabilities:
-  //   phase_1_summarize  and  phase_2_translate
-  // Body text avoids imperative verbs and decision-point words so no extra
-  // instruction / decision capabilities are generated alongside the phases.
-  writeFileSync(skillPath, [
-    '# Translation Service Skill',
-    '',
-    '## Phase 1: Summarize',
-    'Condenses long documents into brief summaries for quick reading.',
-    '',
-    '## Phase 2: Translate',
-    'Converts text from one language to another while preserving meaning.',
-  ].join('\n'), 'utf-8');
-
-  writeFileSync(benchmarkConfigPath, JSON.stringify({
-    name: 'prompt-smoke',
-    target: {
-      surface: 'prompt',
-      repoPath: '.',
-      skill: './SKILL.md',
-    },
-    benchmark: {
-      tasks: './tasks.json',
-      format: 'pi',
-      models: [{ id: 'openai/test', name: 'Test', tier: 'flagship' }],
-    },
-  }, null, 2), 'utf-8');
-
-  return {
-    root,
-    benchmarkConfigPath,
-    skillPath,
-    capabilityKeys: { summarize: 'phase_1_summarize', translate: 'phase_2_translate' },
-  };
-}
-
-// ── Prompt surface: capabilityId tagging ─────────────────────────────────────
-
-await test('prompt surface: generator tags tasks with capabilityId', async () => {
-  const fixture = makePromptFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    assertEqual(surface.snapshot.surface, 'prompt', 'surface should be prompt');
-
-    const { summarize, translate } = fixture.capabilityKeys;
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({
-          tasks: [
-            {
-              id: 'summarize_long_doc',
-              prompt: 'Summarize this long research paper into three bullet points.',
-              expected_actions: [],
-              capabilityId: summarize,
-            },
-            {
-              id: 'translate_spanish',
-              prompt: 'Translate the following paragraph from English to Spanish.',
-              expected_actions: [],
-              capabilityId: translate,
-            },
-          ],
-        });
-      },
-    };
-
-    const generated = await generateCandidateTasks(surface, { maxTasks: 5, seed: 7 }, deps);
-    assertEqual(generated.length, 2, 'should produce 2 tasks');
-
-    const summarizeTask = generated.find((t) => t.capabilityId === summarize);
-    const translateTask = generated.find((t) => t.capabilityId === translate);
-
-    assert(summarizeTask !== undefined, `task with capabilityId "${summarize}" should exist`);
-    assert(translateTask !== undefined, `task with capabilityId "${translate}" should exist`);
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-// ── Prompt surface: grounding rejects unknown capabilityId ───────────────────
-
-await test('prompt surface: grounding rejects unknown capabilityId', async () => {
-  const fixture = makePromptFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const { summarize } = fixture.capabilityKeys;
-
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({
-          tasks: [
-            {
-              id: 'bad_task',
-              prompt: 'Use an unknown capability.',
-              expected_actions: [],
-              capabilityId: 'not-real',
-            },
-            {
-              id: 'good_task',
-              prompt: 'Summarize this document into bullet points.',
-              expected_actions: [],
-              capabilityId: summarize,
-            },
-          ],
-        });
-      },
-    };
-
-    const generated = await generateCandidateTasks(surface, { maxTasks: 5, seed: 7 }, deps);
-
-    // Only the task with a valid capabilityId passes grounding.
-    const grounded = groundTasks(generated, surface.snapshot);
-    assertEqual(grounded.kept.length, 1, 'only the valid capabilityId task should be kept');
-    assertEqual(grounded.rejected.length, 1, 'task with unknown capabilityId should be rejected');
-    assert(
-      grounded.rejected[0].reason.includes('unknown capabilityId'),
-      `rejection reason should mention unknown capabilityId, got: ${grounded.rejected[0].reason}`,
-    );
-    assertEqual(grounded.kept[0].capabilityId, summarize, 'kept task should have the valid capabilityId');
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-await test('prompt surface: grounding rejects task with missing capabilityId', async () => {
-  const fixture = makePromptFixture();
-  try {
-    const surface = await discoverTaskSurface(fixture.benchmarkConfigPath);
-    const deps: TaskGeneratorDeps = {
-      async complete() {
-        return JSON.stringify({
-          tasks: [
-            // capabilityId field is entirely absent from this task
-            { id: 't1', prompt: 'Do something.', expected_actions: [] },
-          ],
-        });
-      },
-    };
-    const generated = await generateCandidateTasks(surface, { maxTasks: 5, seed: 7 }, deps);
-    const result = groundTasks(generated, surface.snapshot);
-    assertEqual(result.kept.length, 0, 'task without capabilityId must be rejected');
-    assertEqual(result.rejected.length, 1, 'rejected list must have one entry');
-    assert(result.rejected[0]!.reason.includes('capabilityId'),
-      `rejection reason must mention capabilityId, got: "${result.rejected[0]!.reason}"`);
-  } finally {
-    rmSync(fixture.root, { recursive: true, force: true });
-  }
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-import.ts b/tests/smoke-import.ts
deleted file mode 100644
index b485c8a..0000000
--- a/tests/smoke-import.ts
+++ /dev/null
@@ -1,201 +0,0 @@
-import { strict as assert } from 'node:assert';
-import { mkdtempSync, existsSync, readFileSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join } from 'node:path';
-import type { CliCommandDefinition } from '../src/import/types.js';
-import { writeOutput } from '../src/import/output.js';
-import { parseHelpOutput } from '../src/import/extractors/help-scraper.js';
-import { extractCommander } from '../src/import/extractors/ts-commander.js';
-import { extractYargs } from '../src/import/extractors/ts-yargs.js';
-import { extractClick } from '../src/import/extractors/py-click.js';
-import { extractArgparse } from '../src/import/extractors/py-argparse.js';
-import { extractClap } from '../src/import/extractors/rs-clap.js';
-import { detectFramework } from '../src/import/detect.js';
-import { importCommands } from '../src/import/index.js';
-
-// === Types check ===
-const _typeCheck: CliCommandDefinition = { command: 'create', description: 'Create item' };
-assert.strictEqual(typeof _typeCheck.command, 'string');
-
-// === writeOutput ===
-{
-  const dir = mkdtempSync(join(tmpdir(), 'import-test-'));
-  const outPath = join(dir, 'cli-commands.json');
-  const commands: CliCommandDefinition[] = [
-    { command: 'create', description: 'Create item', options: [{ name: '--name', takesValue: true }] },
-  ];
-  writeOutput(commands, outPath);
-  assert.strictEqual(existsSync(outPath), true);
-  const written = JSON.parse(readFileSync(outPath, 'utf-8')) as CliCommandDefinition[];
-  assert.strictEqual(written.length, 1);
-  assert.strictEqual(written[0]!.command, 'create');
-}
-
-// === help-scraper: parseHelpOutput (root) ===
-{
-  const rootHelp = readFileSync('tests/fixtures/import-commands/help-output-sample.txt', 'utf-8');
-  const commands = parseHelpOutput(rootHelp, []);
-  assert.ok(commands.length >= 5, `Expected >=5 commands, got ${commands.length}: ${commands.map(c => c.command).join(', ')}`);
-  const accountCmd = commands.find(c => c.command === 'account');
-  assert.ok(accountCmd !== undefined, 'Should find "account" command');
-  assert.strictEqual(accountCmd?.description, 'Account management');
-}
-
-// === help-scraper: parseHelpOutput (subcommand with prefix) ===
-{
-  const subHelp = readFileSync('tests/fixtures/import-commands/help-output-account.txt', 'utf-8');
-  const commands = parseHelpOutput(subHelp, ['account']);
-  // 3 subcommands + 1 synthetic parent entry with options
-  assert.ok(commands.length >= 3, `Expected >=3, got ${commands.length}: ${commands.map(c => c.command).join(', ')}`);
-  assert.ok(commands.find(c => c.command === 'account create') !== undefined);
-  assert.ok(commands.find(c => c.command === 'account delete') !== undefined);
-  // Parent entry includes options from the page
-  const parent = commands.find(c => c.command === 'account');
-  assert.ok(parent !== undefined, 'Should find synthetic parent "account" entry with options');
-}
-
-// === ts-commander extractor ===
-{
-  const fixturePath = 'tests/fixtures/import-commands/commander-sample.ts';
-  const commands = extractCommander(fixturePath);
-  assert.strictEqual(commands.length, 3, `Expected 3 commands, got ${commands.length}: ${commands.map(c => c.command).join(', ')}`);
-
-  const create = commands.find(c => c.command === 'create');
-  assert.ok(create !== undefined, 'Should find "create" command');
-  assert.strictEqual(create?.description, 'Create a new item');
-
-  const nameOpt = create?.options?.find(o => o.name === '--name <value>');
-  assert.ok(nameOpt !== undefined, 'Should find --name option');
-  assert.strictEqual(nameOpt?.takesValue, true);
-
-  const dryRun = create?.options?.find(o => o.name === '--dry-run');
-  assert.ok(dryRun !== undefined, 'Should find --dry-run option');
-  assert.strictEqual(dryRun?.takesValue, false);
-
-  const deleteCmd = commands.find(c => c.command === 'delete');
-  assert.ok(deleteCmd !== undefined, 'Should find "delete" command (positional stripped)');
-}
-
-// === ts-yargs extractor ===
-{
-  const fixturePath = 'tests/fixtures/import-commands/yargs-sample.ts';
-  const commands = extractYargs(fixturePath);
-  assert.strictEqual(commands.length, 2, `Expected 2 commands, got ${commands.length}: ${commands.map(c => c.command).join(', ')}`);
-
-  const create = commands.find(c => c.command === 'create');
-  assert.ok(create !== undefined, 'Should find "create" command');
-  assert.strictEqual(create?.description, 'Create a new item');
-
-  const nameOpt = create?.options?.find(o => o.name === '--name');
-  assert.ok(nameOpt !== undefined, 'Should find --name option');
-  assert.strictEqual(nameOpt?.takesValue, true);
-
-  const verboseOpt = create?.options?.find(o => o.name === '--verbose');
-  assert.ok(verboseOpt !== undefined, 'Should find --verbose option');
-  assert.strictEqual(verboseOpt?.takesValue, false);
-
-  const deleteCmd = commands.find(c => c.command === 'delete');
-  assert.ok(deleteCmd !== undefined, 'Should find "delete" command (positional stripped)');
-}
-
-// === py-click extractor ===
-{
-  const fixturePath = 'tests/fixtures/import-commands/click-sample.py';
-  const commands = await extractClick(fixturePath);
-  assert.strictEqual(commands.length, 2, `Expected 2 commands, got ${commands.length}: ${commands.map(c => c.command).join(', ')}`);
-
-  const create = commands.find(c => c.command === 'create');
-  assert.ok(create !== undefined, 'Should find "create" command');
-  assert.strictEqual(create?.description, 'Create a new item.');
-
-  const nameOpt = create?.options?.find(o => o.name === '--name');
-  assert.ok(nameOpt !== undefined, 'Should find --name option');
-  assert.strictEqual(nameOpt?.takesValue, true);
-
-  const verboseOpt = create?.options?.find(o => o.name === '--verbose');
-  assert.ok(verboseOpt !== undefined, 'Should find --verbose option');
-  assert.strictEqual(verboseOpt?.takesValue, false);
-
-  const deleteCmd = commands.find(c => c.command === 'delete');
-  assert.ok(deleteCmd !== undefined, 'Should find "delete" command');
-}
-
-// === py-argparse extractor ===
-{
-  const fixturePath = 'tests/fixtures/import-commands/argparse-sample.py';
-  const commands = await extractArgparse(fixturePath);
-  assert.strictEqual(commands.length, 2, `Expected 2 commands, got ${commands.length}: ${commands.map(c => c.command).join(', ')}`);
-
-  const create = commands.find(c => c.command === 'create');
-  assert.ok(create !== undefined, 'Should find "create" command');
-  assert.strictEqual(create?.description, 'Create a new item');
-
-  const nameOpt = create?.options?.find(o => o.name === '--name');
-  assert.ok(nameOpt !== undefined, 'Should find --name option');
-  assert.strictEqual(nameOpt?.takesValue, true);
-
-  const listCmd = commands.find(c => c.command === 'list');
-  assert.ok(listCmd !== undefined, 'Should find "list" command');
-  assert.strictEqual(listCmd?.description, 'List all items');
-}
-
-// === rs-clap extractor ===
-{
-  const fixturePath = 'tests/fixtures/import-commands/clap-sample.rs';
-  const commands = await extractClap(fixturePath);
-  assert.ok(commands.length >= 2, `Expected >=2 commands, got ${commands.length}`);
-
-  const create = commands.find(c => c.command === 'create');
-  assert.ok(create !== undefined, 'Should find "create" command');
-  assert.strictEqual(create?.description, 'Create a new item');
-
-  const nameOpt = create?.options?.find(o => o.name === '--name');
-  assert.ok(nameOpt !== undefined, 'Should find --name option');
-  assert.strictEqual(nameOpt?.takesValue, true);
-
-  const verboseOpt = create?.options?.find(o => o.name === '--verbose');
-  assert.ok(verboseOpt !== undefined, 'Should find --verbose option');
-  assert.strictEqual(verboseOpt?.takesValue, false);
-
-  const deleteCmd = commands.find(c => c.command === 'delete');
-  assert.ok(deleteCmd !== undefined, 'Should find "delete" command');
-}
-
-// === detectFramework ===
-{
-  const result = detectFramework('tests/fixtures/import-commands/click-sample.py', process.cwd());
-  assert.strictEqual(result.kind, 'click');
-}
-
-{
-  const result = detectFramework('tests/fixtures/import-commands/argparse-sample.py', process.cwd());
-  assert.strictEqual(result.kind, 'argparse');
-}
-
-{
-  const result = detectFramework('tests/fixtures/import-commands/commander-sample.ts', process.cwd());
-  assert.strictEqual(result.kind, 'commander');
-}
-
-{
-  const result = detectFramework('tests/fixtures/import-commands/clap-sample.rs', process.cwd());
-  assert.strictEqual(result.kind, 'clap');
-}
-
-// === importCommands orchestration ===
-{
-  const dir = mkdtempSync(join(tmpdir(), 'import-orch-'));
-  const outPath = join(dir, 'cli-commands.json');
-  await importCommands({
-    from: 'tests/fixtures/import-commands/commander-sample.ts',
-    out: outPath,
-    scrape: false,
-    depth: 2,
-    cwd: process.cwd(),
-  });
-  assert.strictEqual(existsSync(outPath), true);
-  const written = JSON.parse(readFileSync(outPath, 'utf-8')) as CliCommandDefinition[];
-  assert.strictEqual(written.length, 3);
-}
-
-console.log('smoke-import: all tests passed');
diff --git a/tests/smoke-init.ts b/tests/smoke-init.ts
deleted file mode 100644
index ebadf10..0000000
--- a/tests/smoke-init.ts
+++ /dev/null
@@ -1,375 +0,0 @@
-import { strict as assert } from 'node:assert';
-import { mkdirSync, mkdtempSync, rmSync, writeFileSync, existsSync, readFileSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join } from 'node:path';
-import { detectProject, detectedToPreseed } from '../src/init/detect-project.js';
-import type { WizardAnswers } from '../src/init/answers.js';
-import { buildDefaultAnswers, readAnswersFile } from '../src/init/answers.js';
-import { scaffoldInit } from '../src/init/scaffold.js';
-
-// Type check
-const _a: WizardAnswers = {
-  surface: 'sdk',
-  repoPath: '/tmp/repo',
-  models: ['openrouter/openai/gpt-4o'],
-  maxTasks: 20,
-  maxIterations: 5,
-  targetPassRate: 0.8,
-};
-assert.strictEqual(typeof _a.surface, 'string');
-
-// buildDefaultAnswers
-{
-  const defaults = buildDefaultAnswers('cli');
-  assert.strictEqual(defaults.surface, 'cli');
-  assert.ok(defaults.models.length >= 1, 'should have at least one default model');
-  assert.strictEqual(typeof defaults.maxTasks, 'number');
-  assert.strictEqual(typeof defaults.maxIterations, 'number');
-}
-
-// readAnswersFile
-{
-  const dir = mkdtempSync(join(tmpdir(), 'answers-test-'));
-  try {
-    const answers: WizardAnswers = {
-      surface: 'mcp',
-      repoPath: '/tmp/myrepo',
-      models: ['openrouter/openai/gpt-4o'],
-      maxTasks: 15,
-      maxIterations: 3,
-      entryFile: 'src/server.ts',
-    };
-    const file = join(dir, 'answers.json');
-    writeFileSync(file, JSON.stringify(answers), 'utf-8');
-    const loaded = readAnswersFile(file);
-    assert.strictEqual(loaded.surface, 'mcp');
-    assert.strictEqual(loaded.entryFile, 'src/server.ts');
-    assert.strictEqual(loaded.maxIterations, 3);
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// readAnswersFile error: missing surface
-{
-  const dir = mkdtempSync(join(tmpdir(), 'answers-err-'));
-  try {
-    const bad = join(dir, 'bad.json');
-    writeFileSync(bad, JSON.stringify({ repoPath: '/tmp', models: ['openrouter/openai/gpt-4o'], maxTasks: 5, maxIterations: 1 }), 'utf-8');
-    let threw = false;
-    try { readAnswersFile(bad); } catch { threw = true; }
-    assert.ok(threw, 'readAnswersFile should throw on missing surface');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// scaffoldInit sdk
-{
-  const dir = mkdtempSync(join(tmpdir(), 'scaffold-sdk-'));
-  try {
-    await scaffoldInit({
-      surface: 'sdk',
-      repoPath: dir,
-      models: ['openrouter/openai/gpt-4o'],
-      maxTasks: 10,
-      maxIterations: 3,
-    }, dir);
-    const configPath = join(dir, '.skill-optimizer', 'skill-optimizer.json');
-    assert.ok(existsSync(configPath), 'sdk scaffold should create .skill-optimizer/skill-optimizer.json');
-    const config = JSON.parse(readFileSync(configPath, 'utf-8')) as {
-      target: { surface: string; repoPath: string };
-      benchmark: { models: Array<{ id: string }>; taskGeneration: { maxTasks: number } };
-      optimize: { maxIterations: number };
-    };
-    assert.strictEqual(config.target.surface, 'sdk');
-    assert.strictEqual(config.benchmark.models[0]?.id, 'openrouter/openai/gpt-4o');
-    assert.strictEqual(config.benchmark.taskGeneration.maxTasks, 10);
-    assert.strictEqual(config.optimize.maxIterations, 3);
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// scaffoldInit cli — no entryFile, writes template
-{
-  const dir = mkdtempSync(join(tmpdir(), 'scaffold-cli-'));
-  try {
-    await scaffoldInit({
-      surface: 'cli',
-      repoPath: dir,
-      models: ['openrouter/openai/gpt-4o'],
-      maxTasks: 15,
-      maxIterations: 2,
-    }, dir);
-    const configPath = join(dir, '.skill-optimizer', 'skill-optimizer.json');
-    const commandsPath = join(dir, '.skill-optimizer', 'cli-commands.json');
-    assert.ok(existsSync(configPath), 'cli scaffold should create .skill-optimizer/skill-optimizer.json');
-    assert.ok(existsSync(commandsPath), 'cli scaffold should create .skill-optimizer/cli-commands.json');
-    const config = JSON.parse(readFileSync(configPath, 'utf-8')) as {
-      target: { surface: string; cli?: { commands?: string } };
-    };
-    assert.strictEqual(config.target.surface, 'cli');
-    assert.ok(config.target.cli?.commands?.includes('cli-commands.json'), 'config should reference cli-commands.json');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// scaffoldInit mcp — writes template tools.json
-{
-  const dir = mkdtempSync(join(tmpdir(), 'scaffold-mcp-'));
-  try {
-    await scaffoldInit({
-      surface: 'mcp',
-      repoPath: dir,
-      models: ['openrouter/openai/gpt-4o'],
-      maxTasks: 5,
-      maxIterations: 1,
-    }, dir);
-    const toolsPath = join(dir, '.skill-optimizer', 'tools.json');
-    assert.ok(existsSync(toolsPath), 'mcp scaffold should create .skill-optimizer/tools.json');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// --answers equivalent: readAnswersFile + scaffoldInit mcp
-{
-  const dir = mkdtempSync(join(tmpdir(), 'scaffold-answers-'));
-  try {
-    const answersObj = {
-      surface: 'mcp',
-      repoPath: dir,
-      models: ['openrouter/openai/gpt-4o'],
-      maxTasks: 5,
-      maxIterations: 1,
-    };
-    const answersFile = join(dir, 'answers.json');
-    writeFileSync(answersFile, JSON.stringify(answersObj), 'utf-8');
-    const answers = readAnswersFile(answersFile);
-    await scaffoldInit(answers, dir);
-    const toolsPath = join(dir, '.skill-optimizer', 'tools.json');
-    assert.ok(existsSync(toolsPath), 'mcp scaffold via readAnswersFile should create tools.json');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// --yes equivalent test (buildDefaultAnswers + scaffoldInit)
-{
-  const dir = mkdtempSync(join(tmpdir(), 'scaffold-yes-'));
-  try {
-    const answers = buildDefaultAnswers('sdk', dir);
-    await scaffoldInit(answers, dir);
-    const configPath = join(dir, '.skill-optimizer', 'skill-optimizer.json');
-    assert.ok(existsSync(configPath), '--yes sdk should create .skill-optimizer/skill-optimizer.json');
-    const config = JSON.parse(readFileSync(configPath, 'utf-8')) as { target: { surface: string }; optimize: { maxIterations: number }; benchmark: { taskGeneration: { maxTasks: number } } };
-    assert.strictEqual(config.target.surface, 'sdk');
-    assert.strictEqual(config.optimize.maxIterations, 5);
-    assert.strictEqual(config.benchmark.taskGeneration.maxTasks, 20);
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// MODEL_PRESETS count check
-{
-  const { MODEL_PRESETS } = await import('../src/init/wizard.js');
-  assert.strictEqual(MODEL_PRESETS.length, 23, `Expected 23 presets, got ${MODEL_PRESETS.length}`);
-  // openrouter/ IDs use slugs from the OpenRouter catalog verbatim — dots in version segments
-  // are valid and must not be rewritten to hyphens. validate.ts exempts openrouter/ from the
-  // 'model-id-bad-format' check.
-  assert.ok(MODEL_PRESETS.every(p => p.value.startsWith('openrouter/')), 'All presets must be openrouter/ IDs');
-}
-
-// detectProject: TypeScript SDK (package.json with main, no bin)
-{
-  const dir = mkdtempSync(join(tmpdir(), 'detect-ts-sdk-'));
-  try {
-    writeFileSync(join(dir, 'package.json'), JSON.stringify({
-      name: 'my-sdk',
-      main: './dist/index.js',
-    }), 'utf-8');
-    mkdirSync(join(dir, 'src'), { recursive: true });
-    writeFileSync(join(dir, 'src', 'index.ts'), '');
-    const result = detectProject(dir);
-    assert.strictEqual(result.surface, 'sdk');
-    assert.strictEqual(result.name, 'my-sdk');
-    assert.ok(result.entryFile.includes('index'), `entryFile should reference index, got: ${result.entryFile}`);
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// detectProject: TypeScript CLI (package.json with bin)
-{
-  const dir = mkdtempSync(join(tmpdir(), 'detect-ts-cli-'));
-  try {
-    writeFileSync(join(dir, 'package.json'), JSON.stringify({
-      name: 'my-cli',
-      bin: { 'my-cli': './dist/cli.js' },
-    }), 'utf-8');
-    mkdirSync(join(dir, 'src'), { recursive: true });
-    writeFileSync(join(dir, 'src', 'cli.ts'), '');
-    const result = detectProject(dir);
-    assert.strictEqual(result.surface, 'cli');
-    assert.strictEqual(result.name, 'my-cli');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// detectProject: MCP server (package.json with @modelcontextprotocol/sdk dep)
-{
-  const dir = mkdtempSync(join(tmpdir(), 'detect-mcp-'));
-  try {
-    writeFileSync(join(dir, 'package.json'), JSON.stringify({
-      name: 'my-mcp',
-      dependencies: { '@modelcontextprotocol/sdk': '^1.0.0' },
-    }), 'utf-8');
-    mkdirSync(join(dir, 'src'), { recursive: true });
-    writeFileSync(join(dir, 'src', 'server.ts'), '');
-    const result = detectProject(dir);
-    assert.strictEqual(result.surface, 'mcp');
-    assert.strictEqual(result.name, 'my-mcp');
-    assert.ok(result.entryFile.includes('server'), `entryFile should reference server.ts, got: ${result.entryFile}`);
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// detectProject: unknown dir (no manifest) → defaults to sdk with low confidence
-{
-  const dir = mkdtempSync(join(tmpdir(), 'detect-empty-'));
-  try {
-    const result = detectProject(dir);
-    assert.strictEqual(result.surface, 'sdk');
-    assert.strictEqual(result.confidence, 'low');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// detectProject: SKILL.md found → skillFile set
-{
-  const dir = mkdtempSync(join(tmpdir(), 'detect-skill-'));
-  try {
-    writeFileSync(join(dir, 'package.json'), JSON.stringify({ name: 'x' }), 'utf-8');
-    writeFileSync(join(dir, 'SKILL.md'), '# skill');
-    const result = detectProject(dir);
-    assert.strictEqual(result.skillFile, 'SKILL.md');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// SkillOptimizerError — structured error with fix hints
-{
-  const { ERRORS, SkillOptimizerError, printError } = await import('../src/errors.js');
-
-  // Basic throw/catch
-  let caught: InstanceType<typeof SkillOptimizerError> | undefined;
-  try {
-    throw new SkillOptimizerError(ERRORS.E_MISSING_API_KEY);
-  } catch (err) {
-    if (err instanceof SkillOptimizerError) caught = err;
-  }
-  assert.ok(caught, 'should have caught SkillOptimizerError');
-  assert.strictEqual(caught.name, 'E_MISSING_API_KEY');
-  assert.ok(caught.message.includes('API key'), `message should mention API key, got: ${caught.message}`);
-
-  // Detail appended
-  const withDetail = new SkillOptimizerError(ERRORS.E_MAXTASKS_TOO_LOW, 'scope has 5 actions, maxTasks is 3');
-  assert.ok(withDetail.message.includes('scope has 5'), `detail should be appended, got: ${withDetail.message}`);
-
-  // ERRORS registry: all entries have code, message, fix array
-  for (const [key, def] of Object.entries(ERRORS)) {
-    assert.strictEqual(def.code, key, `code mismatch for ${key}`);
-    assert.ok(typeof def.message === 'string' && def.message.length > 0, `${key} needs a message`);
-    assert.ok(Array.isArray(def.fix) && def.fix.length > 0, `${key} needs at least one fix step`);
-  }
-
-  // printError is callable (won't throw)
-  const orig = console.error;
-  let printed = '';
-  console.error = (...args: unknown[]) => { printed += args.join(' '); };
-  printError(new SkillOptimizerError(ERRORS.E_DIRTY_GIT));
-  console.error = orig;
-  assert.ok(printed.includes('E_DIRTY_GIT'), `printError should include code, got: ${printed}`);
-}
-
-// detectedToPreseed maps DetectedProject to Partial<WizardAnswers>
-{
-  const dir = mkdtempSync(join(tmpdir(), 'preseed-'));
-  try {
-    writeFileSync(join(dir, 'package.json'), JSON.stringify({
-      name: 'preseed-cli',
-      bin: { 'preseed-cli': './dist/cli.js' },
-    }), 'utf-8');
-    mkdirSync(join(dir, 'src'), { recursive: true });
-    writeFileSync(join(dir, 'src', 'cli.ts'), '');
-    const detected = detectProject(dir);
-    const preseed = detectedToPreseed(detected);
-    assert.strictEqual(preseed.surface, 'cli');
-    assert.strictEqual(preseed.repoPath, dir);
-    assert.ok(typeof preseed.name === 'string' && preseed.name.length > 0, 'preseed.name should be set');
-    assert.ok(preseed.entryFile !== undefined, 'preseed.entryFile should be set for cli');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// --auto --yes high confidence path: scaffoldInit called without wizard
-{
-  const dir = mkdtempSync(join(tmpdir(), 'auto-yes-'));
-  try {
-    writeFileSync(join(dir, 'package.json'), JSON.stringify({
-      name: 'auto-test',
-      dependencies: { '@modelcontextprotocol/sdk': '^1.0.0' },
-    }), 'utf-8');
-    mkdirSync(join(dir, 'src'), { recursive: true });
-    writeFileSync(join(dir, 'src', 'server.ts'), '');
-    const detected = detectProject(dir);
-    assert.strictEqual(detected.confidence, 'high', 'mcp with dep should be high confidence');
-    assert.strictEqual(detected.surface, 'mcp');
-    const answers = {
-      ...buildDefaultAnswers(detected.surface, detected.repoPath),
-      ...detectedToPreseed(detected),
-    };
-    await scaffoldInit(answers, dir);
-    const configPath = join(dir, '.skill-optimizer', 'skill-optimizer.json');
-    assert.ok(existsSync(configPath), '--auto --yes should scaffold config');
-    const config = JSON.parse(readFileSync(configPath, 'utf-8')) as { target: { surface: string } };
-    assert.strictEqual(config.target.surface, 'mcp');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-// --auto --yes low confidence: verifies error infrastructure (E_INIT_AUTO_LOW_CONFIDENCE exists and can be thrown)
-// Note: this simulates the guard logic in cli.ts rather than calling through the CLI handler,
-// since the full CLI path requires process.argv and process.exit mocking.
-{
-  const { detectProject } = await import('../src/init/detect-project.js');
-  const { ERRORS, SkillOptimizerError } = await import('../src/errors.js');
-  const dir = mkdtempSync(join(tmpdir(), 'auto-low-'));
-  try {
-    const detected = detectProject(dir);
-    assert.strictEqual(detected.confidence, 'low');
-    let threw = false;
-    try {
-      if (detected.confidence !== 'high') {
-        throw new SkillOptimizerError(ERRORS.E_INIT_AUTO_LOW_CONFIDENCE, `confidence is ${detected.confidence}`);
-      }
-    } catch (err) {
-      if (err instanceof SkillOptimizerError && err.def.code === 'E_INIT_AUTO_LOW_CONFIDENCE') threw = true;
-      else throw err;
-    }
-    assert.ok(threw, 'low confidence with --yes should throw E_INIT_AUTO_LOW_CONFIDENCE');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-
-console.log('smoke-init: all tests passed');
diff --git a/tests/smoke-llm.ts b/tests/smoke-llm.ts
deleted file mode 100644
index 9ac7eae..0000000
--- a/tests/smoke-llm.ts
+++ /dev/null
@@ -1,1434 +0,0 @@
-/**
- * Smoke tests for LLM format handlers (openai-format.ts, anthropic-format.ts, index.ts).
- * No test framework — uses simple assertion helpers.
- */
-
-import { createLLMClient } from '../src/benchmark/llm/index.js';
-import { __setPiImplementationsForTest } from '../src/benchmark/llm/pi-format.js';
-import { resolveApiCredential, resolveApiKey } from '../src/runtime/pi/auth.js';
-import type { LLMConfig, McpToolDefinition } from '../src/benchmark/types.js';
-
-// ---------------------------------------------------------------------------
-// Assertion helpers
-// ---------------------------------------------------------------------------
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(
-      `${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`,
-    );
-  }
-}
-
-// ---------------------------------------------------------------------------
-// Fetch mock helpers
-// ---------------------------------------------------------------------------
-
-type MockFetch = (url: string | URL | Request, init?: RequestInit) => Promise<Response>;
-
-function mockFetch(
-  handler: (url: string, init: RequestInit) => { status: number; body: any },
-): MockFetch {
-  return async (url: string | URL | Request, init?: RequestInit) => {
-    const urlStr = typeof url === 'string' ? url : url.toString();
-    const result = handler(urlStr, init as RequestInit);
-    return new Response(JSON.stringify(result.body), {
-      status: result.status,
-      headers: { 'Content-Type': 'application/json' },
-    });
-  };
-}
-
-// ---------------------------------------------------------------------------
-// Shared test data
-// ---------------------------------------------------------------------------
-
-// Set a test API key in the environment before creating any client
-process.env.__TEST_API_KEY__ = 'test-key-12345';
-
-const openaiConfig: LLMConfig = {
-  format: 'openai',
-  baseUrl: 'https://api.test.com/v1',
-  apiKeyEnv: '__TEST_API_KEY__',
-  timeout: 5_000,
-  models: [],
-};
-
-const anthropicConfig: LLMConfig = {
-  format: 'anthropic',
-  baseUrl: 'https://api.anthropic.com',
-  apiKeyEnv: '__TEST_API_KEY__',
-  timeout: 5_000,
-  models: [],
-};
-
-const piConfig: LLMConfig = {
-  format: 'pi',
-  apiKeyEnv: '__TEST_API_KEY__',
-  timeout: 5_000,
-  models: [],
-};
-
-const sampleTools: McpToolDefinition[] = [
-  {
-    type: 'function',
-    function: {
-      name: 'get_weather',
-      description: 'Get the weather for a city',
-      parameters: {
-        type: 'object',
-        properties: { city: { type: 'string' } },
-        required: ['city'],
-      },
-    },
-  },
-];
-
-const dottedSampleTools: McpToolDefinition[] = [
-  {
-    type: 'function',
-    function: {
-      name: 'auth.status',
-      description: 'Check session auth state',
-      parameters: {
-        type: 'object',
-        properties: {},
-      },
-    },
-  },
-];
-
-// ---------------------------------------------------------------------------
-// Save original fetch so we can restore it after all tests
-// ---------------------------------------------------------------------------
-
-const originalFetch = globalThis.fetch;
-
-// ---------------------------------------------------------------------------
-// Tests
-// ---------------------------------------------------------------------------
-
-console.log('\n=== LLM Format Handler Smoke Tests ===\n');
-
-// --- Group 1: createLLMClient factory ---
-
-await test('createLLMClient: creates client for pi format', () => {
-  const client = createLLMClient(piConfig);
-  assert(typeof client.chat === 'function', 'client.chat should be a function');
-  assert(typeof client.chatWithTools === 'function', 'client.chatWithTools should be a function');
-});
-
-await test('createLLMClient: creates client for openai format', () => {
-  const client = createLLMClient(openaiConfig);
-  assert(typeof client.chat === 'function', 'client.chat should be a function');
-  assert(typeof client.chatWithTools === 'function', 'client.chatWithTools should be a function');
-});
-
-await test('createLLMClient: creates client for anthropic format', () => {
-  const client = createLLMClient(anthropicConfig);
-  assert(typeof client.chat === 'function', 'client.chat should be a function');
-  assert(typeof client.chatWithTools === 'function', 'client.chatWithTools should be a function');
-});
-
-await test('createLLMClient: throws when apiKeyEnv is set but env var is missing', async () => {
-  let threw = false;
-  try {
-    const client = createLLMClient({ ...openaiConfig, apiKeyEnv: '__MISSING_ENV_VAR_XYZ__' });
-    await client.chat('gpt-5.4', 'sys', 'user');
-  } catch (e: any) {
-    threw = true;
-    assert(
-      e.message.includes('__MISSING_ENV_VAR_XYZ__'),
-      'error message should mention the missing env var name',
-    );
-  }
-  assert(threw, 'should have thrown for missing env var');
-});
-
-await test('pi format: uses provider/model id and runtime auth override for text chat', async () => {
-  let capturedModel: string | null = null;
-  let capturedAuthMode: string | undefined;
-  let capturedApiKeyEnv: string | undefined;
-  let capturedApiKey: string | undefined;
-
-  __setPiImplementationsForTest({
-    async resolve(modelId, authOptions) {
-      capturedModel = modelId;
-      capturedAuthMode = authOptions?.authMode;
-      capturedApiKeyEnv = authOptions?.apiKeyEnv;
-      capturedApiKey = authOptions?.apiKeyOverride;
-      return {
-        model: { id: 'openai/gpt-5.4', provider: 'openrouter', api: 'openai-completions', name: 'GPT-5.4' } as any,
-        auth: { apiKey: 'test-key-12345', headers: { 'x-test-header': 'yes' } },
-      };
-    },
-    async completeSimple(_model, context, options) {
-      assertEqual(context.systemPrompt, 'sys', 'system prompt passed through');
-      assertEqual((context.messages[0] as any).content, 'user', 'user content passed through');
-      assertEqual(options?.apiKey, 'test-key-12345', 'api key override should be forwarded');
-      assertEqual(options?.headers?.['x-test-header'], 'yes', 'resolved headers should be forwarded');
-      return {
-        role: 'assistant',
-        api: 'openai-completions',
-        provider: 'openrouter',
-        model: 'openai/gpt-5.4',
-        stopReason: 'stop',
-        timestamp: Date.now(),
-        usage: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0, totalTokens: 3, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [{ type: 'text', text: 'hello from pi' }],
-      } as any;
-    },
-    async complete() {
-      throw new Error('unexpected complete() call');
-    },
-  });
-
-  const client = createLLMClient(piConfig);
-  const result = await client.chat('openrouter/openai/gpt-5.4', 'sys', 'user');
-
-  assertEqual(capturedModel, 'openrouter/openai/gpt-5.4', 'model id should be resolved through pi');
-  assertEqual(capturedAuthMode, undefined, 'default auth mode should be passed through as undefined');
-  assertEqual(capturedApiKeyEnv, '__TEST_API_KEY__', 'api key env should be forwarded to pi resolution');
-  assertEqual(capturedApiKey, undefined, 'pi resolution should now read the configured env var itself');
-  assertEqual(result.content, 'hello from pi', 'pi text response should be surfaced');
-
-  __setPiImplementationsForTest(null);
-});
-
-await test('pi format: converts tool calls for MCP chat', async () => {
-  __setPiImplementationsForTest({
-    async resolve() {
-      return {
-        model: { id: 'openai/gpt-5.4', provider: 'openrouter', api: 'openai-completions', name: 'GPT-5.4' } as any,
-        auth: { apiKey: 'test-key-12345', headers: {} },
-      };
-    },
-    async completeSimple() {
-      throw new Error('unexpected completeSimple() call');
-    },
-    async complete(_model, context, options) {
-      assert(Array.isArray(context.tools), 'tools should be passed to pi complete()');
-      assertEqual(context.tools?.[0]?.name, 'get_weather', 'tool name should be preserved');
-      assertEqual(options?.apiKey, 'test-key-12345', 'resolved auth should be forwarded');
-      return {
-        role: 'assistant',
-        api: 'openai-completions',
-        provider: 'openrouter',
-        model: 'openai/gpt-5.4',
-        stopReason: 'toolUse',
-        timestamp: Date.now(),
-        usage: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0, totalTokens: 2, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [
-          { type: 'toolCall', id: 'call-1', name: 'get_weather', arguments: { city: 'NYC' } },
-        ],
-      } as any;
-    },
-  });
-
-  const client = createLLMClient(piConfig);
-  const result = await client.chatWithTools('openrouter/openai/gpt-5.4', 'sys', 'user', sampleTools);
-
-  assertEqual(result.toolCalls?.[0]?.name, 'get_weather', 'pi tool call name should be surfaced');
-  assertEqual((result.toolCalls?.[0]?.arguments as any).city, 'NYC', 'pi tool call args should be surfaced');
-
-  __setPiImplementationsForTest(null);
-});
-
-await test('pi format: sanitizes dotted MCP tool names and maps them back', async () => {
-  __setPiImplementationsForTest({
-    async resolve() {
-      return {
-        model: { id: 'gpt-5.4', provider: 'openai-codex', api: 'openai-codex-responses', name: 'GPT-5.4' } as any,
-        auth: { apiKey: 'codex-access-token', headers: {} },
-      };
-    },
-    async completeSimple() {
-      throw new Error('unexpected completeSimple() call');
-    },
-    async complete(_model, context) {
-      assert(Array.isArray(context.tools), 'tools should be passed to pi complete()');
-      assertEqual(context.tools?.[0]?.name, 'auth_status', 'tool name should be sanitized for provider request');
-      return {
-        role: 'assistant',
-        api: 'openai-codex-responses',
-        provider: 'openai-codex',
-        model: 'gpt-5.4',
-        stopReason: 'toolUse',
-        timestamp: Date.now(),
-        usage: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0, totalTokens: 2, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [
-          { type: 'toolCall', id: 'call-1', name: 'auth_status', arguments: {} },
-        ],
-      } as any;
-    },
-  });
-
-  const client = createLLMClient({ ...piConfig, authMode: 'codex' });
-  const result = await client.chatWithTools('openai/gpt-5.4', 'sys', 'user', dottedSampleTools);
-
-  assertEqual(result.toolCalls?.[0]?.name, 'auth.status', 'tool call name should map back to canonical form');
-
-  __setPiImplementationsForTest(null);
-});
-
-await test('pi format: throws provider-side errors instead of returning empty output', async () => {
-  __setPiImplementationsForTest({
-    async resolve() {
-      return {
-        model: { id: 'gpt-5.4', provider: 'openai-codex', api: 'openai-codex-responses', name: 'GPT-5.4' } as any,
-        auth: { apiKey: 'codex-access-token', headers: {} },
-      };
-    },
-    async completeSimple() {
-      throw new Error('unexpected completeSimple() call');
-    },
-    async complete() {
-      return {
-        role: 'assistant',
-        api: 'openai-codex-responses',
-        provider: 'openai-codex',
-        model: 'gpt-5.4',
-        stopReason: 'error',
-        errorMessage: 'Invalid tool schema',
-        timestamp: Date.now(),
-        usage: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, totalTokens: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [],
-      } as any;
-    },
-  });
-
-  const client = createLLMClient({ ...piConfig, authMode: 'codex' });
-  let threw = false;
-  try {
-    await client.chatWithTools('openai/gpt-5.4', 'sys', 'user', dottedSampleTools);
-  } catch (e: any) {
-    threw = true;
-    assert(
-      e.message.includes('Invalid tool schema'),
-      'provider-side error message should surface to the caller',
-    );
-  }
-  assert(threw, 'chatWithTools should throw when the provider reports an error');
-
-  __setPiImplementationsForTest(null);
-});
-
-await test('resolveApiKey: codex auth reads browser-login access token from ~/.codex/auth.json', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-auth-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  const futureExp = Math.floor(Date.now() / 1000) + 3600;
-  const jwt = [
-    Buffer.from(JSON.stringify({ alg: 'none', typ: 'JWT' })).toString('base64url'),
-    Buffer.from(JSON.stringify({ exp: futureExp })).toString('base64url'),
-    'sig',
-  ].join('.');
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ auth_mode: 'chatgpt', tokens: { access_token: jwt } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-
-  try {
-    const result = resolveApiKey({ provider: 'openai', authMode: 'codex' });
-    assertEqual(result, jwt, 'browser-login access token should be returned');
-  } finally {
-    if (originalHome === undefined) {
-      delete process.env.HOME;
-    } else {
-      process.env.HOME = originalHome;
-    }
-  }
-});
-
-await test('resolveApiCredential: static OPENAI_API_KEY in tokens object returns source env', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-static-tokens-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ tokens: { OPENAI_API_KEY: 'sk-static-key-tokens' } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-  try {
-    const result = resolveApiCredential({ provider: 'openai', authMode: 'codex' });
-    assertEqual(result.apiKey, 'sk-static-key-tokens', 'static key from tokens object should be returned');
-    assertEqual(result.source, 'env', 'static key should have source env, not codex');
-  } finally {
-    if (originalHome === undefined) { delete process.env.HOME; } else { process.env.HOME = originalHome; }
-  }
-});
-
-await test('resolveApiCredential: static OPENAI_API_KEY at root level returns source env', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-static-root-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ OPENAI_API_KEY: 'sk-static-key-root' }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-  try {
-    const result = resolveApiCredential({ provider: 'openai', authMode: 'codex' });
-    assertEqual(result.apiKey, 'sk-static-key-root', 'static key from root object should be returned');
-    assertEqual(result.source, 'env', 'static key should have source env, not codex');
-  } finally {
-    if (originalHome === undefined) { delete process.env.HOME; } else { process.env.HOME = originalHome; }
-  }
-});
-
-await test('resolveApiCredential: browser-login JWT still returns source codex (regression)', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-jwt-regression-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  const futureExp = Math.floor(Date.now() / 1000) + 3600;
-  const jwt = [
-    Buffer.from(JSON.stringify({ alg: 'none', typ: 'JWT' })).toString('base64url'),
-    Buffer.from(JSON.stringify({ exp: futureExp })).toString('base64url'),
-    'sig',
-  ].join('.');
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ auth_mode: 'chatgpt', tokens: { access_token: jwt } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-  try {
-    const result = resolveApiCredential({ provider: 'openai', authMode: 'codex' });
-    assertEqual(result.apiKey, jwt, 'JWT access token should be returned');
-    assertEqual(result.source, 'codex', 'JWT should keep source codex so openai-codex transport is used');
-  } finally {
-    if (originalHome === undefined) { delete process.env.HOME; } else { process.env.HOME = originalHome; }
-  }
-});
-
-await test('resolveApiCredential: malformed access_token falls through to static OPENAI_API_KEY', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-malformed-token-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ tokens: { access_token: 'not-a-jwt', OPENAI_API_KEY: 'sk-static-fallback' } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-  try {
-    const result = resolveApiCredential({ provider: 'openai', authMode: 'codex' });
-    assertEqual(result.apiKey, 'sk-static-fallback', 'malformed access_token should fall through to static key');
-    assertEqual(result.source, 'env', 'static fallback key should have source env');
-  } finally {
-    if (originalHome === undefined) { delete process.env.HOME; } else { process.env.HOME = originalHome; }
-  }
-});
-
-await test('openai format: codex auth bridges through pi with openai provider refs', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-bridge-auth-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  const futureExp = Math.floor(Date.now() / 1000) + 3600;
-  const jwt = [
-    Buffer.from(JSON.stringify({ alg: 'none', typ: 'JWT' })).toString('base64url'),
-    Buffer.from(JSON.stringify({ exp: futureExp })).toString('base64url'),
-    'sig',
-  ].join('.');
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ auth_mode: 'chatgpt', tokens: { access_token: jwt } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-
-  let capturedModel: string | null = null;
-
-  __setPiImplementationsForTest({
-    async resolve(modelId) {
-      capturedModel = modelId;
-      return {
-        model: { id: 'gpt-5.4', provider: 'openai-codex', api: 'openai-codex-responses', name: 'GPT-5.4' } as any,
-        auth: { apiKey: 'codex-access-token', headers: {} },
-      };
-    },
-    async completeSimple(_model, context, options) {
-      assertEqual(context.systemPrompt, 'sys', 'system prompt passed through');
-      assertEqual((context.messages[0] as any).content, 'user', 'user content passed through');
-      assertEqual(options?.apiKey, 'codex-access-token', 'codex token should flow through pi');
-      return {
-        role: 'assistant',
-        api: 'openai-codex-responses',
-        provider: 'openai-codex',
-        model: 'gpt-5.4',
-        stopReason: 'stop',
-        timestamp: Date.now(),
-        usage: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0, totalTokens: 2, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [{ type: 'text', text: 'hello from codex auth' }],
-      } as any;
-    },
-    async complete() {
-      throw new Error('unexpected complete() call');
-    },
-  });
-
-  try {
-    const client = createLLMClient({
-      format: 'openai',
-      baseUrl: 'https://api.openai.com/v1',
-      authMode: 'codex',
-      models: [],
-    });
-    const result = await client.chat('gpt-5.4', 'sys', 'user');
-    assertEqual(capturedModel, 'openai/gpt-5.4', 'openai-format codex auth should bridge to pi using provider/model form');
-    assertEqual(result.content, 'hello from codex auth', 'codex-auth bridged response should be returned');
-  } finally {
-    if (originalHome === undefined) {
-      delete process.env.HOME;
-    } else {
-      process.env.HOME = originalHome;
-    }
-    __setPiImplementationsForTest(null);
-  }
-});
-
-await test('openai format: codex bridge passes authMode:codex (not apiKeyOverride) to pi call', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-override-auth-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  const futureExp = Math.floor(Date.now() / 1000) + 3600;
-  const jwt = [
-    Buffer.from(JSON.stringify({ alg: 'none', typ: 'JWT' })).toString('base64url'),
-    Buffer.from(JSON.stringify({ exp: futureExp })).toString('base64url'),
-    'sig',
-  ].join('.');
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ auth_mode: 'chatgpt', tokens: { access_token: jwt } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-
-  let capturedAuthMode: string | undefined = 'NOT_SET';
-  let capturedApiKeyEnv: string | undefined = 'NOT_SET';
-  let capturedApiKeyOverride: string | undefined = 'NOT_SET';
-
-  __setPiImplementationsForTest({
-    async resolve(_modelId, authOptions) {
-      capturedAuthMode = authOptions?.authMode;
-      capturedApiKeyEnv = authOptions?.apiKeyEnv;
-      capturedApiKeyOverride = authOptions?.apiKeyOverride;
-      return {
-        model: { id: 'gpt-5.4', provider: 'openai-codex', api: 'openai-codex-responses', name: 'GPT-5.4' } as any,
-        auth: { apiKey: jwt, headers: {} },
-      };
-    },
-    async completeSimple() {
-      return {
-        role: 'assistant',
-        api: 'openai-codex-responses',
-        provider: 'openai-codex',
-        model: 'gpt-5.4',
-        stopReason: 'stop',
-        timestamp: Date.now(),
-        usage: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0, totalTokens: 2, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [{ type: 'text', text: 'ok' }],
-      } as any;
-    },
-    async complete() {
-      throw new Error('unexpected complete() call');
-    },
-  });
-
-  try {
-    const client = createLLMClient({
-      format: 'openai',
-      baseUrl: 'https://api.openai.com/v1',
-      authMode: 'codex',
-      models: [],
-    });
-    await client.chat('gpt-5.4', 'sys', 'user');
-
-    assertEqual(capturedAuthMode, 'codex', 'codex bridge should pass authMode:codex to pi for correct provider routing');
-    assert(capturedApiKeyOverride === undefined, `codex bridge should NOT pass apiKeyOverride to pi (would break source:'codex' routing signal) (got: ${capturedApiKeyOverride})`);
-  } finally {
-    if (originalHome === undefined) {
-      delete process.env.HOME;
-    } else {
-      process.env.HOME = originalHome;
-    }
-    __setPiImplementationsForTest(null);
-  }
-});
-
-await test('openai format: codex bridge passes authMode:codex (not apiKeyOverride) to pi call for chatWithTools', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-override-tools-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  const futureExp = Math.floor(Date.now() / 1000) + 3600;
-  const jwt = [
-    Buffer.from(JSON.stringify({ alg: 'none', typ: 'JWT' })).toString('base64url'),
-    Buffer.from(JSON.stringify({ exp: futureExp })).toString('base64url'),
-    'sig',
-  ].join('.');
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ auth_mode: 'chatgpt', tokens: { access_token: jwt } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-
-  let capturedAuthMode: string | undefined = 'NOT_SET';
-  let capturedApiKeyEnv: string | undefined = 'NOT_SET';
-  let capturedApiKeyOverride: string | undefined = 'NOT_SET';
-
-  __setPiImplementationsForTest({
-    async resolve(_modelId, authOptions) {
-      capturedAuthMode = authOptions?.authMode;
-      capturedApiKeyEnv = authOptions?.apiKeyEnv;
-      capturedApiKeyOverride = authOptions?.apiKeyOverride;
-      return {
-        model: { id: 'gpt-5.4', provider: 'openai-codex', api: 'openai-codex-responses', name: 'GPT-5.4' } as any,
-        auth: { apiKey: jwt, headers: {} },
-      };
-    },
-    async completeSimple() {
-      throw new Error('unexpected completeSimple() call');
-    },
-    async complete() {
-      return {
-        role: 'assistant',
-        api: 'openai-codex-responses',
-        provider: 'openai-codex',
-        model: 'gpt-5.4',
-        stopReason: 'stop',
-        timestamp: Date.now(),
-        usage: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0, totalTokens: 2, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [{ type: 'text', text: 'ok' }],
-      } as any;
-    },
-  });
-
-  try {
-    const client = createLLMClient({
-      format: 'openai',
-      baseUrl: 'https://api.openai.com/v1',
-      authMode: 'codex',
-      models: [],
-    });
-    await client.chatWithTools('gpt-5.4', 'sys', 'user', sampleTools);
-
-    assertEqual(capturedAuthMode, 'codex', 'codex bridge should pass authMode:codex to pi for correct provider routing (chatWithTools)');
-    assert(capturedApiKeyOverride === undefined, `codex bridge should NOT pass apiKeyOverride to pi (chatWithTools) (got: ${capturedApiKeyOverride})`);
-  } finally {
-    if (originalHome === undefined) {
-      delete process.env.HOME;
-    } else {
-      process.env.HOME = originalHome;
-    }
-    __setPiImplementationsForTest(null);
-  }
-});
-
-await test('openai format: codex bridge passes authMode:codex (not apiKeyOverride) to pi call for chatAgentLoop', async () => {
-  const originalHome = process.env.HOME;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-override-agent-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  const futureExp = Math.floor(Date.now() / 1000) + 3600;
-  const jwt = [
-    Buffer.from(JSON.stringify({ alg: 'none', typ: 'JWT' })).toString('base64url'),
-    Buffer.from(JSON.stringify({ exp: futureExp })).toString('base64url'),
-    'sig',
-  ].join('.');
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ auth_mode: 'chatgpt', tokens: { access_token: jwt } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-
-  let capturedAuthMode: string | undefined = 'NOT_SET';
-  let capturedApiKeyEnv: string | undefined = 'NOT_SET';
-  let capturedApiKeyOverride: string | undefined = 'NOT_SET';
-
-  __setPiImplementationsForTest({
-    async resolve(_modelId, authOptions) {
-      capturedAuthMode = authOptions?.authMode;
-      capturedApiKeyEnv = authOptions?.apiKeyEnv;
-      capturedApiKeyOverride = authOptions?.apiKeyOverride;
-      return {
-        model: { id: 'gpt-5.4', provider: 'openai-codex', api: 'openai-codex-responses', name: 'GPT-5.4' } as any,
-        auth: { apiKey: jwt, headers: {} },
-      };
-    },
-    async completeSimple() {
-      throw new Error('unexpected completeSimple() call');
-    },
-    async complete() {
-      return {
-        role: 'assistant',
-        api: 'openai-codex-responses',
-        provider: 'openai-codex',
-        model: 'gpt-5.4',
-        stopReason: 'stop',
-        timestamp: Date.now(),
-        usage: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0, totalTokens: 2, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [{ type: 'text', text: 'ok' }],
-      } as any;
-    },
-  });
-
-  try {
-    const client = createLLMClient({
-      format: 'openai',
-      baseUrl: 'https://api.openai.com/v1',
-      authMode: 'codex',
-      models: [],
-    });
-    const dummyExecutor = async () => 'result';
-    await client.chatAgentLoop('gpt-5.4', 'sys', 'user', sampleTools, dummyExecutor);
-
-    assertEqual(capturedAuthMode, 'codex', 'codex bridge should pass authMode:codex to pi for correct provider routing (chatAgentLoop)');
-    assert(capturedApiKeyOverride === undefined, `codex bridge should NOT pass apiKeyOverride to pi (chatAgentLoop) (got: ${capturedApiKeyOverride})`);
-  } finally {
-    if (originalHome === undefined) {
-      delete process.env.HOME;
-    } else {
-      process.env.HOME = originalHome;
-    }
-    __setPiImplementationsForTest(null);
-  }
-});
-
-await test('resolveApiKey: auto mode prefers env var over codex token when both present', async () => {
-  const originalHome = process.env.HOME;
-  const originalEnv = process.env.OPENAI_API_KEY;
-  const dir = await import('node:fs/promises').then(({ mkdtemp, mkdir, writeFile }) => ({ mkdtemp, mkdir, writeFile }));
-  const os = await import('node:os');
-  const path = await import('node:path');
-  const tmpHome = await dir.mkdtemp(path.join(os.tmpdir(), 'codex-auto-priority-'));
-  const codexDir = path.join(tmpHome, '.codex');
-  await dir.mkdir(codexDir, { recursive: true });
-  const futureExp = Math.floor(Date.now() / 1000) + 3600;
-  const jwt = [
-    Buffer.from(JSON.stringify({ alg: 'none', typ: 'JWT' })).toString('base64url'),
-    Buffer.from(JSON.stringify({ exp: futureExp })).toString('base64url'),
-    'sig',
-  ].join('.');
-  await dir.writeFile(
-    path.join(codexDir, 'auth.json'),
-    JSON.stringify({ auth_mode: 'chatgpt', tokens: { access_token: jwt } }),
-    'utf-8',
-  );
-  process.env.HOME = tmpHome;
-  process.env.OPENAI_API_KEY = 'env-api-key';
-  try {
-    const result = resolveApiKey({ provider: 'openai', authMode: 'auto' });
-    assertEqual(result, 'env-api-key', 'auto mode should prefer env var over codex token');
-  } finally {
-    if (originalHome === undefined) { delete process.env.HOME; } else { process.env.HOME = originalHome; }
-    if (originalEnv === undefined) { delete process.env.OPENAI_API_KEY; } else { process.env.OPENAI_API_KEY = originalEnv; }
-  }
-});
-
-await test('pi format: agent loop feeds tool results back with original tool call ids', async () => {
-  const toolCallIds: string[] = [];
-  let callCount = 0;
-
-  __setPiImplementationsForTest({
-    async resolve() {
-      return {
-        model: { id: 'openai/gpt-5.4', provider: 'openrouter', api: 'openai-completions', name: 'GPT-5.4' } as any,
-        auth: { apiKey: 'test-key-12345', headers: {} },
-      };
-    },
-    async completeSimple() {
-      throw new Error('unexpected completeSimple() call');
-    },
-    async complete(_model, context) {
-      callCount += 1;
-      if (callCount === 1) {
-        return {
-          role: 'assistant',
-          api: 'openai-completions',
-          provider: 'openrouter',
-          model: 'openai/gpt-5.4',
-          stopReason: 'toolUse',
-          timestamp: Date.now(),
-          usage: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0, totalTokens: 2, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-          content: [
-            { type: 'toolCall', id: 'tool-123', name: 'get_weather', arguments: { city: 'NYC' } },
-          ],
-        } as any;
-      }
-
-      const toolResult = context.messages[context.messages.length - 1] as any;
-      toolCallIds.push(toolResult.toolCallId);
-      return {
-        role: 'assistant',
-        api: 'openai-completions',
-        provider: 'openrouter',
-        model: 'openai/gpt-5.4',
-        stopReason: 'stop',
-        timestamp: Date.now(),
-        usage: { input: 1, output: 1, cacheRead: 0, cacheWrite: 0, totalTokens: 2, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-        content: [{ type: 'text', text: 'done' }],
-      } as any;
-    },
-  });
-
-  const client = createLLMClient(piConfig);
-  const result = await client.chatAgentLoop('openrouter/openai/gpt-5.4', 'sys', 'user', sampleTools, async () => 'sunny', 3);
-
-  assertEqual(toolCallIds[0], 'tool-123', 'tool result should reference original tool call id');
-  assertEqual(result.content, 'done', 'final agent-loop text should be returned');
-
-  __setPiImplementationsForTest(null);
-});
-
-// --- Group 2: OpenAI format handler ---
-
-await test('openai format: sends correct request body', async () => {
-  let capturedBody: any = null;
-  let capturedHeaders: Record<string, string> = {};
-
-  globalThis.fetch = mockFetch((url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    capturedHeaders = (init.headers as Record<string, string>) ?? {};
-    return {
-      status: 200,
-      body: {
-        choices: [{ message: { content: 'ok' } }],
-        usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 },
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(openaiConfig);
-  await client.chat('gpt-4o', 'You are a helper.', 'Hello');
-
-  assert(capturedBody !== null, 'fetch should have been called');
-  assertEqual(capturedBody.model, 'gpt-4o', 'model field');
-  assert(Array.isArray(capturedBody.messages), 'messages should be an array');
-  const roles = capturedBody.messages.map((m: any) => m.role);
-  assert(roles.includes('system'), 'messages should contain a system role');
-  assert(roles.includes('user'), 'messages should contain a user role');
-  const systemMsg = capturedBody.messages.find((m: any) => m.role === 'system');
-  assertEqual(systemMsg.content, 'You are a helper.', 'system message content');
-  const userMsg = capturedBody.messages.find((m: any) => m.role === 'user');
-  assertEqual(userMsg.content, 'Hello', 'user message content');
-  assertEqual(
-    capturedHeaders['Authorization'],
-    'Bearer test-key-12345',
-    'Authorization header',
-  );
-}) ;
-
-await test('openai format: posts to /chat/completions endpoint', async () => {
-  let capturedUrl = '';
-
-  globalThis.fetch = mockFetch((url) => {
-    capturedUrl = url;
-    return {
-      status: 200,
-      body: {
-        choices: [{ message: { content: 'ok' } }],
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(openaiConfig);
-  await client.chat('gpt-4o', 'sys', 'user');
-
-  assert(
-    capturedUrl.endsWith('/chat/completions'),
-    `URL should end with /chat/completions, got: ${capturedUrl}`,
-  );
-});
-
-await test('openai format: parses chat response', async () => {
-  globalThis.fetch = mockFetch(() => ({
-    status: 200,
-    body: {
-      choices: [{ message: { content: 'Hello from OpenAI' } }],
-      usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 },
-    },
-  })) as any;
-
-  const client = createLLMClient(openaiConfig);
-  const result = await client.chat('gpt-4o', 'sys', 'user');
-
-  assertEqual(result.content, 'Hello from OpenAI', 'content');
-  assert(result.usage !== undefined, 'usage should be present');
-  assertEqual(result.usage!.prompt, 10, 'usage.prompt');
-  assertEqual(result.usage!.completion, 5, 'usage.completion');
-  assertEqual(result.usage!.total, 15, 'usage.total');
-});
-
-await test('openai format: parses tool_calls response', async () => {
-  globalThis.fetch = mockFetch(() => ({
-    status: 200,
-    body: {
-      choices: [
-        {
-          message: {
-            content: '',
-            tool_calls: [
-              {
-                function: {
-                  name: 'get_weather',
-                  arguments: '{"city":"NYC"}',
-                },
-              },
-            ],
-          },
-        },
-      ],
-    },
-  })) as any;
-
-  const client = createLLMClient(openaiConfig);
-  const result = await client.chatWithTools('gpt-4o', 'sys', 'user', sampleTools);
-
-  assert(result.toolCalls !== undefined, 'toolCalls should be present');
-  assert(result.toolCalls!.length === 1, 'should have exactly one tool call');
-  assertEqual(result.toolCalls![0].name, 'get_weather', 'tool call name');
-  assertEqual(
-    (result.toolCalls![0].arguments as any).city,
-    'NYC',
-    'tool call argument city',
-  );
-});
-
-await test('openai format: sanitizes dotted tool names in requests and maps them back in responses', async () => {
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        choices: [
-          {
-            message: {
-              content: '',
-              tool_calls: [
-                {
-                  function: {
-                    name: 'auth_status',
-                    arguments: '{}',
-                  },
-                },
-              ],
-            },
-          },
-        ],
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(openaiConfig);
-  const result = await client.chatWithTools('gpt-4o', 'sys', 'user', dottedSampleTools);
-
-  assertEqual(
-    capturedBody.tools[0].function.name,
-    'auth_status',
-    'dotted tool name should be sanitized in the request body',
-  );
-  assertEqual(
-    result.toolCalls?.[0]?.name,
-    'auth.status',
-    'sanitized tool call names should map back to canonical form',
-  );
-});
-
-await test('openai format: handles non-200 response', async () => {
-  // The retry logic retries once after 3s on any non-AbortError.
-  // To avoid a 3s delay in tests, we make both attempts fail immediately.
-  let callCount = 0;
-  globalThis.fetch = mockFetch(() => {
-    callCount++;
-    return {
-      status: 429,
-      body: { error: { message: 'Rate limit exceeded' } },
-    };
-  }) as any;
-
-  const client = createLLMClient(openaiConfig);
-  let threw = false;
-  try {
-    // Override the retry delay by patching setTimeout — instead, just catch the error.
-    // The retry waits 3s; we accept the delay here since it's a single retry.
-    // To keep tests fast, we mock fetch to succeed on the second call.
-    await client.chat('gpt-4o', 'sys', 'user');
-  } catch (e: any) {
-    threw = true;
-    assert(
-      e.message.includes('429'),
-      `error message should include status code 429, got: ${e.message}`,
-    );
-  }
-  assert(threw, 'should have thrown on non-200 response');
-}, );
-
-// --- Group 3: Anthropic format handler ---
-
-await test('anthropic format: sends correct headers', async () => {
-  let capturedHeaders: Record<string, string> = {};
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedHeaders = (init.headers as Record<string, string>) ?? {};
-    return {
-      status: 200,
-      body: {
-        content: [{ type: 'text', text: 'ok' }],
-        usage: { input_tokens: 1, output_tokens: 1 },
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  await client.chat('claude-3-5-sonnet-20241022', 'sys', 'user');
-
-  assertEqual(
-    capturedHeaders['x-api-key'],
-    'test-key-12345',
-    'x-api-key header',
-  );
-  assert(
-    capturedHeaders['anthropic-version'] !== undefined,
-    'anthropic-version header should be present',
-  );
-  assert(
-    capturedHeaders['Authorization'] === undefined,
-    'should NOT have Authorization header (Anthropic uses x-api-key)',
-  );
-});
-
-await test('anthropic format: correct request body format', async () => {
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        content: [{ type: 'text', text: 'ok' }],
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  await client.chat('claude-3-5-sonnet-20241022', 'You are a helper.', 'Hello');
-
-  assert(capturedBody !== null, 'fetch should have been called');
-  assertEqual(capturedBody.model, 'claude-3-5-sonnet-20241022', 'model field');
-  // Anthropic puts system as a top-level field, NOT inside messages
-  assertEqual(capturedBody.system, 'You are a helper.', 'system field at top level');
-  assert(Array.isArray(capturedBody.messages), 'messages should be an array');
-  // Only user message in messages array (no system role)
-  const roles = capturedBody.messages.map((m: any) => m.role);
-  assert(!roles.includes('system'), 'messages array should NOT contain a system role');
-  assert(roles.includes('user'), 'messages array should contain a user role');
-  const userMsg = capturedBody.messages.find((m: any) => m.role === 'user');
-  assertEqual(userMsg.content, 'Hello', 'user message content');
-});
-
-await test('anthropic format: posts to /v1/messages endpoint', async () => {
-  let capturedUrl = '';
-
-  globalThis.fetch = mockFetch((url) => {
-    capturedUrl = url;
-    return {
-      status: 200,
-      body: { content: [{ type: 'text', text: 'ok' }] },
-    };
-  }) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  await client.chat('claude-3-5-sonnet-20241022', 'sys', 'user');
-
-  assert(
-    capturedUrl.endsWith('/v1/messages'),
-    `URL should end with /v1/messages, got: ${capturedUrl}`,
-  );
-});
-
-await test('anthropic format: parses response', async () => {
-  globalThis.fetch = mockFetch(() => ({
-    status: 200,
-    body: {
-      content: [{ type: 'text', text: 'Hello from Anthropic' }],
-      usage: { input_tokens: 10, output_tokens: 5 },
-    },
-  })) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  const result = await client.chat('claude-3-5-sonnet-20241022', 'sys', 'user');
-
-  assertEqual(result.content, 'Hello from Anthropic', 'content');
-  assert(result.usage !== undefined, 'usage should be present');
-  // Anthropic maps input_tokens -> prompt, output_tokens -> completion, sum -> total
-  assertEqual(result.usage!.prompt, 10, 'usage.prompt (input_tokens)');
-  assertEqual(result.usage!.completion, 5, 'usage.completion (output_tokens)');
-  assertEqual(result.usage!.total, 15, 'usage.total (input + output)');
-});
-
-await test('anthropic format: parses tool_use response', async () => {
-  globalThis.fetch = mockFetch(() => ({
-    status: 200,
-    body: {
-      content: [
-        {
-          type: 'tool_use',
-          id: '123',
-          name: 'get_weather',
-          input: { city: 'NYC' },
-        },
-      ],
-    },
-  })) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  const result = await client.chatWithTools(
-    'claude-3-5-sonnet-20241022',
-    'sys',
-    'user',
-    sampleTools,
-  );
-
-  assert(result.toolCalls !== undefined, 'toolCalls should be present');
-  assert(result.toolCalls!.length === 1, 'should have exactly one tool call');
-  assertEqual(result.toolCalls![0].name, 'get_weather', 'tool call name');
-  assertEqual(
-    (result.toolCalls![0].arguments as any).city,
-    'NYC',
-    'tool call argument city',
-  );
-});
-
-await test('anthropic format: converts McpToolDefinition to Anthropic tool format', async () => {
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: { content: [{ type: 'text', text: 'ok' }] },
-    };
-  }) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  await client.chatWithTools('claude-3-5-sonnet-20241022', 'sys', 'user', sampleTools);
-
-  assert(Array.isArray(capturedBody.tools), 'tools should be an array in request body');
-  const tool = capturedBody.tools[0];
-  assertEqual(tool.name, 'get_weather', 'tool name');
-  assertEqual(tool.description, 'Get the weather for a city', 'tool description');
-  assert(tool.input_schema !== undefined, 'tool should have input_schema (not parameters)');
-  assertEqual(tool.input_schema.type, 'object', 'input_schema.type');
-  assert(
-    tool.parameters === undefined,
-    'Anthropic format should use input_schema, not parameters',
-  );
-});
-
-await test('anthropic format: sanitizes dotted tool names in requests and maps them back in responses', async () => {
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        content: [
-          {
-            type: 'tool_use',
-            id: 'toolu_01',
-            name: 'auth_status',
-            input: {},
-          },
-        ],
-        usage: { input_tokens: 5, output_tokens: 2 },
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  const result = await client.chatWithTools('claude-3-5-sonnet-20241022', 'sys', 'user', dottedSampleTools);
-
-  assertEqual(
-    capturedBody.tools[0].name,
-    'auth_status',
-    'dotted tool name should be sanitized in the request body (Anthropic uses tools[].name)',
-  );
-  assertEqual(
-    result.toolCalls?.[0]?.name,
-    'auth.status',
-    'sanitized tool call names should map back to canonical form',
-  );
-});
-
-// ---------------------------------------------------------------------------
-// Provider prefix stripping for direct API formats
-// ---------------------------------------------------------------------------
-
-await test('anthropic format: strips provider prefix from model ID', async () => {
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        content: [{ type: 'text', text: 'ok' }],
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  // Pass a prefixed model ID like the config validation requires
-  await client.chat('anthropic/claude-sonnet-4-6', 'system', 'user');
-
-  assert(capturedBody !== null, 'fetch should have been called');
-  assertEqual(
-    capturedBody.model,
-    'claude-sonnet-4-6',
-    'model field should have provider prefix stripped for direct Anthropic API',
-  );
-});
-
-await test('anthropic format: strips provider prefix in chatWithTools', async () => {
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        content: [{ type: 'text', text: 'no tools needed' }],
-        stop_reason: 'end_turn',
-      },
-    };
-  }) as any;
-
-  const tools: McpToolDefinition[] = [
-    {
-      type: 'function',
-      function: {
-        name: 'test_tool',
-        description: 'A test tool',
-        parameters: { type: 'object', properties: {} },
-      },
-    },
-  ];
-
-  const client = createLLMClient(anthropicConfig);
-  await client.chatWithTools('anthropic/claude-sonnet-4-6', 'system', 'user', tools);
-
-  assert(capturedBody !== null, 'fetch should have been called');
-  assertEqual(
-    capturedBody.model,
-    'claude-sonnet-4-6',
-    'model field should have provider prefix stripped in chatWithTools',
-  );
-});
-
-await test('anthropic format: strips provider prefix in chatAgentLoop', async () => {
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        content: [{ type: 'text', text: 'no tools needed' }],
-        stop_reason: 'end_turn',
-      },
-    };
-  }) as any;
-
-  const tools: McpToolDefinition[] = [
-    {
-      type: 'function',
-      function: {
-        name: 'test_tool',
-        description: 'A test tool',
-        parameters: { type: 'object', properties: {} },
-      },
-    },
-  ];
-
-  const client = createLLMClient(anthropicConfig);
-  await client.chatAgentLoop('anthropic/claude-sonnet-4-6', 'system', 'user', tools, async () => 'result');
-
-  assert(capturedBody !== null, 'fetch should have been called');
-  assertEqual(
-    capturedBody.model,
-    'claude-sonnet-4-6',
-    'model field should have provider prefix stripped in chatAgentLoop',
-  );
-});
-
-await test('anthropic format: does not strip when no prefix present', async () => {
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        content: [{ type: 'text', text: 'ok' }],
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  await client.chat('claude-sonnet-4-6', 'system', 'user');
-
-  assert(capturedBody !== null, 'fetch should have been called');
-  assertEqual(
-    capturedBody.model,
-    'claude-sonnet-4-6',
-    'model field should remain unchanged when no prefix',
-  );
-});
-
-await test('openai format: strips provider prefix from model ID', async () => {
-  let capturedBody: any = null;
-
-  const openaiConfig: LLMConfig = {
-    format: 'openai',
-    baseUrl: 'https://api.openai.com',
-    apiKeyEnv: 'OPENAI_API_KEY',
-    timeout: 5000,
-    models: [],
-  };
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        choices: [{ message: { role: 'assistant', content: 'ok' } }],
-      },
-    };
-  }) as any;
-
-  // Temporarily set the env var so createLLMClient doesn't throw.
-  // Capture and restore the original value so we don't clobber it in CI.
-  const hadOpenAiApiKey = Object.prototype.hasOwnProperty.call(process.env, 'OPENAI_API_KEY');
-  const previousOpenAiApiKey = process.env.OPENAI_API_KEY;
-  process.env.OPENAI_API_KEY = 'test-key';
-  const client = createLLMClient(openaiConfig);
-  try {
-    await client.chat('openai/gpt-4o', 'system', 'user');
-  } finally {
-    if (hadOpenAiApiKey) {
-      process.env.OPENAI_API_KEY = previousOpenAiApiKey;
-    } else {
-      delete process.env.OPENAI_API_KEY;
-    }
-  }
-
-  assert(capturedBody !== null, 'fetch should have been called');
-  assertEqual(
-    capturedBody.model,
-    'gpt-4o',
-    'model field should have provider prefix stripped for direct OpenAI API',
-  );
-});
-
-await test('anthropic format: leaves openrouter/-prefixed model ID unchanged', async () => {
-  // openrouter/ IDs belong to format:'pi'. If one appears with format:'anthropic',
-  // the config is misconfigured — we pass it through unchanged so the Anthropic API
-  // returns a fast, visible error rather than silently misrouting the request.
-  let capturedBody: any = null;
-
-  globalThis.fetch = mockFetch((_url, init) => {
-    capturedBody = JSON.parse(init.body as string);
-    return {
-      status: 200,
-      body: {
-        content: [{ type: 'text', text: 'ok' }],
-      },
-    };
-  }) as any;
-
-  const client = createLLMClient(anthropicConfig);
-  await client.chat('openrouter/anthropic/claude-sonnet-4.6', 'system', 'user');
-
-  assert(capturedBody !== null, 'fetch should have been called');
-  assertEqual(
-    capturedBody.model,
-    'openrouter/anthropic/claude-sonnet-4.6',
-    'openrouter/ prefix should be left intact — not silently stripped',
-  );
-});
-
-// NOTE: pi format prefix preservation is already covered by existing
-// "pi format: uses provider/model id" tests above. The prefix stripping
-// only applies to anthropic and openai direct formats.
-
-// ---------------------------------------------------------------------------
-// Restore original fetch
-// ---------------------------------------------------------------------------
-
-globalThis.fetch = originalFetch;
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-mcp.ts b/tests/smoke-mcp.ts
deleted file mode 100644
index e165abc..0000000
--- a/tests/smoke-mcp.ts
+++ /dev/null
@@ -1,250 +0,0 @@
-import { extractFromToolCalls } from '../src/benchmark/extractors/mcp-extractor.js';
-import { extract } from '../src/benchmark/extractors/index.js';
-import { evaluateTask, matchActions } from '../src/benchmark/evaluator.js';
-import type {
-  ExtractedCall,
-  LLMResponse,
-  BenchmarkConfig,
-  TaskDefinition,
-  ExpectedAction,
-  ModelConfig,
-  ActionMatch,
-} from '../src/benchmark/types.js';
-
-// ── Test harness ──────────────────────────────────────────────────────────
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(
-      `${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`,
-    );
-  }
-}
-
-// ── Tests ─────────────────────────────────────────────────────────────────
-
-console.log('\n=== MCP Surface Smoke Tests ===\n');
-
-// ── Group 1: extractFromToolCalls ─────────────────────────────────────────
-
-await test('extractFromToolCalls: basic extraction', () => {
-  const response: LLMResponse = {
-    content: '',
-    toolCalls: [
-      { name: 'send_tokens', arguments: { to: 'addr1', amount: '5' } },
-    ],
-  };
-
-  const calls = extractFromToolCalls(response);
-
-  assertEqual(calls.length, 1, 'should return 1 ExtractedCall');
-  assertEqual(calls[0].method, 'send_tokens', 'method should be send_tokens');
-  assertEqual(calls[0].args.to as string, 'addr1', 'args.to should be addr1');
-  assertEqual(calls[0].args.amount as string, '5', 'args.amount should be "5"');
-});
-
-await test('extractFromToolCalls: multiple tool calls', () => {
-  const response: LLMResponse = {
-    content: '',
-    toolCalls: [
-      { name: 'create_wallet', arguments: {} },
-      { name: 'get_balance', arguments: { address: 'addr2' } },
-      { name: 'send_tokens', arguments: { to: 'addr3', amount: '10' } },
-    ],
-  };
-
-  const calls = extractFromToolCalls(response);
-
-  assertEqual(calls.length, 3, 'should return 3 ExtractedCalls');
-  assertEqual(calls[0].method, 'create_wallet', 'first call method');
-  assertEqual(calls[1].method, 'get_balance', 'second call method');
-  assertEqual(calls[2].method, 'send_tokens', 'third call method');
-});
-
-await test('extractFromToolCalls: empty toolCalls', () => {
-  const response: LLMResponse = {
-    content: 'some text',
-    toolCalls: [],
-  };
-
-  const calls = extractFromToolCalls(response);
-
-  assertEqual(calls.length, 0, 'should return empty array for empty toolCalls');
-});
-
-await test('extractFromToolCalls: undefined toolCalls', () => {
-  const response: LLMResponse = {
-    content: 'some text',
-  };
-
-  const calls = extractFromToolCalls(response);
-
-  assertEqual(calls.length, 0, 'should return empty array when toolCalls is undefined');
-});
-
-// ── Group 2: matchActions (arg validation) ──────────────────────────────────
-
-await test('matchActions: regex arg matching', () => {
-  const expectedTools: ExpectedAction[] = [
-    { name: 'send_tokens', args: { to: '/fast1.+/' } },
-  ];
-
-  const extractedCalls: ExtractedCall[] = [
-    { method: 'send_tokens', args: { to: 'fast1abc123def' }, line: 0, raw: '' },
-  ];
-
-  const matches = matchActions(expectedTools, extractedCalls);
-
-  assertEqual(matches.length, 1, 'should return 1 ActionMatch');
-  assert(matches[0].methodFound === true, 'methodFound should be true');
-  assert(matches[0].argsCorrect === true, 'argsCorrect should be true — regex /fast1.+/ should match "fast1abc123def"');
-});
-
-await test('matchActions: dynamic sentinel passes', () => {
-  const expectedTools: ExpectedAction[] = [
-    { name: 'get_balance', args: { address: '<dynamic>' } },
-  ];
-
-  const extractedCalls: ExtractedCall[] = [
-    { method: 'get_balance', args: { address: 'literally_anything' }, line: 0, raw: '' },
-  ];
-
-  const matches = matchActions(expectedTools, extractedCalls);
-
-  assertEqual(matches.length, 1, 'should return 1 ActionMatch');
-  assert(matches[0].methodFound === true, 'methodFound should be true');
-  // <dynamic> in expected acts as a wildcard — any value from the LLM is acceptable.
-  assert(
-    matches[0].argsCorrect === true,
-    'argsCorrect should be true: <dynamic> sentinel in expected means any value matches',
-  );
-});
-
-// ── Group 3: evaluateTask (MCP surface) ──────────────────────────────────
-
-const mockModel: ModelConfig = {
-  id: 'test/model',
-  name: 'TestModel',
-  tier: 'flagship' as const,
-};
-
-const knownMethods = new Set<string>(['send_tokens', 'get_balance', 'create_wallet']);
-
-await test('evaluateTask: MCP perfect match', () => {
-  const task: TaskDefinition = {
-    id: 'task-perfect',
-    prompt: 'Create a wallet and check its balance',
-    expected_actions: [
-      { name: 'create_wallet' },
-      { name: 'get_balance' },
-    ],
-  };
-
-  const extractedCalls: ExtractedCall[] = [
-    { method: 'create_wallet', args: {}, line: 0, raw: '{"name":"create_wallet","arguments":{}}' },
-    { method: 'get_balance', args: { address: 'addr1' }, line: 1, raw: '{"name":"get_balance","arguments":{"address":"addr1"}}' },
-  ];
-
-  const result = evaluateTask({
-    task,
-    model: mockModel,
-    surface: 'mcp',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods,
-  });
-
-  assert(result.metrics.taskPassed === true, 'taskPassed should be true');
-  assertEqual(result.metrics.toolSelectionAccuracy, 1.0, 'toolSelectionAccuracy should be 1.0');
-  assertEqual(result.metrics.hallucinationRate, 0, 'hallucinationRate should be 0');
-});
-
-await test('evaluateTask: MCP hallucinated tool', () => {
-  const task: TaskDefinition = {
-    id: 'task-hallucination',
-    prompt: 'Send some tokens',
-    expected_actions: [
-      { name: 'send_tokens' },
-    ],
-  };
-
-  const extractedCalls: ExtractedCall[] = [
-    { method: 'send_tokens', args: { to: 'addr1', amount: '5' }, line: 0, raw: '{"name":"send_tokens","arguments":{"to":"addr1","amount":"5"}}' },
-    { method: 'delete_everything', args: {}, line: 1, raw: '{"name":"delete_everything","arguments":{}}' },
-  ];
-
-  const result = evaluateTask({
-    task,
-    model: mockModel,
-    surface: 'mcp',
-    generatedCode: null,
-    rawResponse: '',
-    extractedCalls,
-    llmLatencyMs: 0,
-    error: undefined,
-    knownMethods,
-  });
-
-  assert(
-    result.metrics.hallucinatedActions.includes('delete_everything'),
-    'hallucinatedActions should include "delete_everything"',
-  );
-  assert(result.metrics.hallucinationRate > 0, 'hallucinationRate should be > 0');
-});
-
-// ── Group 4: extract factory (MCP surface) ────────────────────────────────
-
-await test('extract factory: MCP surface returns null generatedCode', async () => {
-  const config: BenchmarkConfig = {
-    name: 'test-mcp-benchmark',
-    surface: 'mcp',
-    mcp: { tools: 'tools.json' },
-    tasks: 'tasks.json',
-    llm: {
-      baseUrl: '',
-      apiKeyEnv: 'OPENROUTER_API_KEY',
-      format: 'openai',
-      models: [],
-    },
-  };
-
-  const response: LLMResponse = {
-    content: '',
-    toolCalls: [
-      { name: 'send_tokens', arguments: { to: 'addr1', amount: '5' } },
-      { name: 'get_balance', arguments: { address: 'addr1' } },
-    ],
-  };
-
-  const { calls, generatedCode } = await extract(response, config);
-
-  assertEqual(generatedCode, null, 'generatedCode should be null in MCP surface');
-  assertEqual(calls.length, 2, 'calls should have 2 items matching the toolCalls');
-});
-
-// ── Summary ───────────────────────────────────────────────────────────────
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-mock-repos.ts b/tests/smoke-mock-repos.ts
deleted file mode 100644
index e90d021..0000000
--- a/tests/smoke-mock-repos.ts
+++ /dev/null
@@ -1,154 +0,0 @@
-import { existsSync, readFileSync, rmSync } from 'node:fs';
-import { mkdtempSync } from 'node:fs';
-import { join } from 'node:path';
-import { tmpdir } from 'node:os';
-
-import { loadConfig } from '../src/benchmark/config.js';
-import { loadOptimizeManifest } from '../src/optimizer/manifest.js';
-import { createValidationRunner } from '../src/optimizer/validation.js';
-import { getMockRepoTemplatePath, listMockRepoTemplates, materializeMockRepo } from '../src/optimizer/mock-repos.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-console.log('\n=== Mock Repo Smoke Tests ===\n');
-
-await test('listMockRepoTemplates: exposes tracked templates that exist in the worktree', () => {
-  const templates = listMockRepoTemplates();
-  assert(templates.length >= 1, 'should expose at least one template');
-  assert(templates.includes('mcp-tracker-demo'), 'should include mcp-tracker-demo');
-  assert(templates.includes('sdk-counter-demo'), 'should include sdk-counter-demo');
-  assert(templates.includes('cli-taskfile-demo'), 'should include cli-taskfile-demo');
-});
-
-for (const name of listMockRepoTemplates()) {
-  await test(`materializeMockRepo: ${name} becomes a standalone git repo`, async () => {
-    const destRoot = mkdtempSync(join(tmpdir(), 'skill-optimizer-mock-'));
-    try {
-      const materializedPath = await materializeMockRepo(name, destRoot);
-      const projectConfigPath = join(materializedPath, 'skill-optimizer.json');
-
-      assert(existsSync(join(materializedPath, '.git')), 'materialized mock repo should be git-initialized');
-      assert(existsSync(projectConfigPath), 'unified project config should exist');
-
-      const { config: benchmarkConfig } = await loadConfig(projectConfigPath);
-
-      if (name === 'mcp-tracker-demo') {
-        assertEqual(benchmarkConfig.surface, 'mcp', 'tracker demo should materialize an MCP benchmark');
-        const projectConfigRaw = JSON.parse(readFileSync(projectConfigPath, 'utf-8')) as {
-          target?: { repoPath?: string };
-          benchmark?: { taskGeneration?: { outputDir?: string } };
-          optimize?: { validation?: string[] };
-        };
-        assert(Array.isArray(projectConfigRaw.optimize?.validation), 'tracker demo optimize config should define validation array');
-        assertEqual(projectConfigRaw.optimize?.validation?.length, 0, 'tracker demo should allow empty validation commands');
-        assertEqual(projectConfigRaw.target?.repoPath, '.', 'tracker demo config should keep repoPath relative');
-        assertEqual(
-          projectConfigRaw.benchmark?.taskGeneration?.outputDir,
-          './.skill-optimizer',
-          'tracker demo should declare task generation output directory',
-        );
-        assert(existsSync(join(materializedPath, 'SKILL.md')), 'tracker demo should include SKILL.md');
-        assert(existsSync(join(materializedPath, 'tools.json')), 'tracker demo should include tools.json');
-        const optimizeManifest = await loadOptimizeManifest(projectConfigPath);
-        assertEqual(optimizeManifest.targetRepo.path, materializedPath, 'optimize target should point at the materialized repo');
-
-        const validation = await createValidationRunner().run(optimizeManifest.targetRepo);
-        assert(validation.ok, 'materialized mock repo validation should pass');
-      }
-
-      if (name === 'sdk-counter-demo') {
-        assertEqual(benchmarkConfig.surface, 'sdk', 'sdk-counter-demo should materialize an SDK benchmark');
-        const projectConfigRaw = JSON.parse(readFileSync(projectConfigPath, 'utf-8')) as {
-          target?: { surface?: string; scope?: { include?: string[] } };
-          benchmark?: { verdict?: { perModelFloor?: number } };
-        };
-        assert(projectConfigRaw.target?.surface === 'sdk', 'sdk-counter-demo should have sdk surface');
-        assert(Array.isArray(projectConfigRaw.target?.scope?.include), 'sdk-counter-demo should define scope.include');
-        assert(typeof projectConfigRaw.benchmark?.verdict?.perModelFloor === 'number', 'sdk-counter-demo should define verdict.perModelFloor');
-        assert(existsSync(join(materializedPath, 'SKILL.md')), 'sdk-counter-demo should include SKILL.md');
-        assert(existsSync(join(materializedPath, 'src', 'counter.ts')), 'sdk-counter-demo should include src/counter.ts');
-      }
-
-      if (name === 'cli-taskfile-demo') {
-        assertEqual(benchmarkConfig.surface, 'cli', 'cli-taskfile-demo should materialize a CLI benchmark');
-        const projectConfigRaw = JSON.parse(readFileSync(projectConfigPath, 'utf-8')) as {
-          target?: { surface?: string; scope?: { include?: string[] } };
-          benchmark?: { verdict?: { perModelFloor?: number } };
-        };
-        assert(projectConfigRaw.target?.surface === 'cli', 'cli-taskfile-demo should have cli surface');
-        assert(Array.isArray(projectConfigRaw.target?.scope?.include), 'cli-taskfile-demo should define scope.include');
-        assert(typeof projectConfigRaw.benchmark?.verdict?.perModelFloor === 'number', 'cli-taskfile-demo should define verdict.perModelFloor');
-        assert(existsSync(join(materializedPath, 'SKILL.md')), 'cli-taskfile-demo should include SKILL.md');
-        assert(existsSync(join(materializedPath, 'src', 'commands.ts')), 'cli-taskfile-demo should include src/commands.ts');
-      }
-    } finally {
-      rmSync(destRoot, { recursive: true, force: true });
-    }
-  });
-}
-
-await test('materializeMockRepo: replacing an existing destination stays deterministic', async () => {
-  const destRoot = mkdtempSync(join(tmpdir(), 'skill-optimizer-mock-'));
-  try {
-    const template = listMockRepoTemplates()[0]!;
-    const materializedPath = await materializeMockRepo(template, destRoot);
-    const staleFilePath = join(materializedPath, 'stale.txt');
-    await import('node:fs/promises').then(({ writeFile }) => writeFile(staleFilePath, 'stale\n', 'utf-8'));
-
-    const rematerializedPath = await materializeMockRepo(template, destRoot);
-    assertEqual(rematerializedPath, materializedPath, 'materialized path should be stable');
-    assert(!existsSync(staleFilePath), 'stale files should be removed before re-materializing');
-  } finally {
-    rmSync(destRoot, { recursive: true, force: true });
-  }
-});
-
-await test('materializeMockRepo: rejects destinations that overlap the tracked template path', async () => {
-  const template = listMockRepoTemplates()[0]!;
-  const templatePath = getMockRepoTemplatePath(template);
-  const destinationRoot = join(templatePath, '..');
-
-  let threw = false;
-  try {
-    await materializeMockRepo(template, destinationRoot);
-  } catch (error: any) {
-    threw = true;
-    assert(error.message.includes('overlaps'), 'error should explain the overlap');
-  }
-
-  assert(threw, 'should reject overlapping destination paths');
-});
-
-await test('mock repo templates keep benchmark and target files together', () => {
-  for (const name of listMockRepoTemplates()) {
-    const readmePath = join(process.cwd(), 'mock-repos', name, 'README.md');
-    const readme = readFileSync(readmePath, 'utf-8');
-    assert(readme.includes('skill-optimizer.json'), `${name} README should mention skill-optimizer.json`);
-  }
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-model-ids.ts b/tests/smoke-model-ids.ts
deleted file mode 100644
index 6ca1ba8..0000000
--- a/tests/smoke-model-ids.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-import { test } from 'node:test';
-import assert from 'node:assert/strict';
-
-// Helper: run validate+fix pipeline on a minimal config with the given model ID.
-async function pipeline(modelId: string): Promise<string> {
-  const { checkConfig } = await import('../src/project/validate.js');
-  const { applyFixes } = await import('../src/project/fix.js');
-  const raw = {
-    name: 'test',
-    target: { surface: 'mcp' as const, repoPath: '.' },
-    benchmark: {
-      format: 'pi',
-      models: [{ id: modelId, name: 'Test Model', tier: 'mid' as const }],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  };
-  const issues = await checkConfig(raw as never, '/tmp/fake.json');
-  const fixed = applyFixes(raw as never, issues, '/tmp');
-  const models = (fixed as { benchmark: { models: Array<{ id: string }> } }).benchmark.models;
-  return models[0]!.id;
-}
-
-await test('openrouter/ model IDs with dots are NOT rewritten by validate+fix', async () => {
-  const OPENROUTER_IDS = [
-    'openrouter/deepseek/deepseek-v3.2',
-    'openrouter/anthropic/claude-sonnet-4.6',
-    'openrouter/anthropic/claude-opus-4.6',
-    'openrouter/minimax/minimax-m2.7',
-    'openrouter/minimax/minimax-m2.5',
-    'openrouter/qwen/qwen3.5-397b-a17b',
-    'openrouter/qwen/qwen3.6-plus',
-    'openrouter/moonshotai/kimi-k2.5',
-    'openrouter/x-ai/grok-4.1-fast',
-    'openrouter/openai/gpt-5.4',
-    'openrouter/z-ai/glm-5.1',
-    'openrouter/google/gemini-2.5-flash',
-    'openrouter/google/gemini-2.5-flash-lite',
-    'openrouter/google/gemini-3.1-pro-preview',
-  ];
-
-  for (const id of OPENROUTER_IDS) {
-    const result = await pipeline(id);
-    assert.equal(result, id, `openrouter/ ID "${id}" was rewritten to "${result}" — must be preserved`);
-  }
-});
-
-await test('anthropic/ direct-API IDs with dots ARE rewritten to hyphens', async () => {
-  const result = await pipeline('anthropic/claude-sonnet-4.6');
-  assert.equal(result, 'anthropic/claude-sonnet-4-6',
-    'anthropic/ direct API dots should be rewritten to hyphens (Anthropic API convention)');
-});
-
-await test('openrouter/ model IDs without dots are preserved as-is', async () => {
-  const result = await pipeline('openrouter/google/gemini-3-flash-preview');
-  assert.equal(result, 'openrouter/google/gemini-3-flash-preview');
-});
-
-await test('openai/ direct-API IDs with dots are NOT rewritten', async () => {
-  const result = await pipeline('openai/gpt-5.4');
-  assert.equal(result, 'openai/gpt-5.4',
-    'openai/ direct API dots must be preserved (OpenAI uses gpt-5.4 not gpt-5-4)');
-});
-
-await test('applyFixes directly: openai/ IDs are NOT rewritten even if model-id-bad-format issue is present', async () => {
-  const { applyFixes } = await import('../src/project/fix.js');
-  const raw = {
-    name: 'test',
-    target: { surface: 'mcp' as const, repoPath: '.' },
-    benchmark: {
-      format: 'pi',
-      models: [{ id: 'openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' as const }],
-      taskGeneration: { enabled: true, maxTasks: 5 },
-    },
-  };
-  // Manufacture the issue that validate.ts would normally never emit for openai/,
-  // so we exercise fix.ts's defense-in-depth exemption directly.
-  // fixable: true is required so the filter in applyFixes actually processes it.
-  const issues = [
-    { code: 'model-id-bad-format' as const, field: 'benchmark.models[0].id', message: 'synthetic', severity: 'warning' as const, fixable: true },
-  ];
-  const fixed = applyFixes(raw as never, issues as never, '/tmp');
-  const id = (fixed as { benchmark: { models: Array<{ id: string }> } }).benchmark.models[0]!.id;
-  assert.equal(id, 'openai/gpt-5.4',
-    'fix.ts must exempt openai/ from dot→hyphen rewrite (defense-in-depth; OpenAI API uses dots)');
-});
diff --git a/tests/smoke-optimize.ts b/tests/smoke-optimize.ts
deleted file mode 100644
index eb0325a..0000000
--- a/tests/smoke-optimize.ts
+++ /dev/null
@@ -1,1041 +0,0 @@
-import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
-import { execFileSync } from 'node:child_process';
-import { join } from 'node:path';
-import { tmpdir } from 'node:os';
-
-import { loadOptimizeManifest } from '../src/optimizer/manifest.js';
-import { analyzeFailures } from '../src/optimizer/failure-analysis.js';
-import { runOptimizeLoop } from '../src/optimizer/loop.js';
-import { createJsonLedger } from '../src/optimizer/ledger.js';
-import { createRepoStateManager } from '../src/optimizer/repo-state.js';
-import { collectGitChangedFiles } from '../src/optimizer/mutation/git-changes.js';
-import type { BenchmarkReport } from '../src/benchmark/types.js';
-import type { OptimizeManifest, OptimizeLoopDependencies } from '../src/optimizer/types.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-function makeReport(overallPassRate: number, kinds: Array<'missing' | 'args' | 'hallucination' | 'error'> = []): BenchmarkReport {
-  const baseResult = {
-    task: {
-      id: `task-${overallPassRate}-${kinds.join('-') || 'ok'}`,
-      prompt: 'Do the thing',
-      expected_actions: [{ name: 'Client.doThing', args: { id: '123' } }],
-    },
-    model: {
-      id: 'openai/test',
-      name: 'Test Model',
-      tier: 'flagship' as const,
-    },
-    generatedCode: 'client.doThing({ id: "123" })',
-    rawResponse: 'ok',
-    extractedCalls: [],
-    actionMatches: [],
-    metrics: {
-      toolPrecision: 1,
-      toolRecall: 1,
-      taskPassed: true,
-      toolSelectionAccuracy: 1,
-      argAccuracy: 1,
-      unnecessaryActions: [],
-      hallucinatedActions: [],
-      hallucinationRate: 0,
-    },
-    llmLatencyMs: 10,
-    error: undefined as string | undefined,
-  };
-
-  const results = kinds.length === 0
-    ? [baseResult]
-    : kinds.map((kind, index) => {
-        const result = {
-          ...baseResult,
-          task: {
-            ...baseResult.task,
-            id: `task-${kind}-${index}`,
-          },
-          actionMatches: [
-            {
-              expected: { name: 'Client.doThing', args: { id: '123' } },
-              found: kind === 'missing' ? null : { method: 'Client.doThing', args: { id: kind === 'args' ? '999' : '123' }, line: 1, raw: 'mock' },
-              methodFound: kind !== 'missing',
-              argsCorrect: kind !== 'args',
-              matched: kind !== 'missing' && kind !== 'args',
-            },
-          ],
-          metrics: {
-            ...baseResult.metrics,
-            taskPassed: false,
-            toolSelectionAccuracy: kind === 'missing' ? 0 : 1,
-            argAccuracy: kind === 'args' ? 0 : 1,
-            hallucinatedActions: kind === 'hallucination' ? ['Client.deleteEverything'] : [],
-            hallucinationRate: kind === 'hallucination' ? 1 : 0,
-          },
-          error: kind === 'error' ? 'provider failed' : undefined,
-        };
-
-        return result;
-      });
-
-  return {
-    timestamp: '2026-04-09T12:00:00.000Z',
-    config: { name: 'demo', surface: 'sdk' },
-    skillVersion: {
-      source: 'local',
-      commitSha: 'local',
-      ref: 'file',
-      fetchedAt: '2026-04-09T12:00:00.000Z',
-    },
-    results,
-    coverage: [],
-    summary: {
-      totalTasks: results.length,
-      totalModels: 1,
-      totalEvaluations: results.length,
-      overallPassRate,
-      avgToolRecall: overallPassRate,
-      avgToolPrecision: overallPassRate,
-      avgToolSelectionAccuracy: overallPassRate,
-      avgArgAccuracy: overallPassRate,
-      avgHallucinationRate: 0,
-      methodCoveragePercent: 1,
-      weightedAverage: overallPassRate,
-      perModel: {},
-      perTask: {},
-      perTier: {
-        flagship: { passRate: overallPassRate, avgRecall: overallPassRate, avgToolSelectionAccuracy: overallPassRate, avgArgAccuracy: overallPassRate },
-        mid: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 },
-        low: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 },
-      },
-    },
-  };
-}
-
-function makeBenchmarkRunResult(
-  report: BenchmarkReport,
-  opts: { outputDir: string; label: string },
-  persist = false,
-): { report: BenchmarkReport; reportPath: string } {
-  const reportDir = join(opts.outputDir, opts.label);
-  const reportPath = join(reportDir, 'report.json');
-  if (persist) {
-    mkdirSync(reportDir, { recursive: true });
-    writeFileSync(reportPath, JSON.stringify(report, null, 2), 'utf-8');
-  }
-  return { report, reportPath };
-}
-
-function makeManifest(): OptimizeManifest {
-  return {
-    benchmarkConfig: '/tmp/benchmark.config.json',
-    targetRepo: {
-      path: '/tmp/target-repo',
-      surface: 'sdk',
-      allowedPaths: ['src', 'README.md'],
-      validation: ['npm test'],
-      requireCleanGit: true,
-    },
-    optimizer: {
-      mode: 'stable-surface' as any,
-      maxIterations: 5,
-      stabilityWindow: 2,
-      minImprovement: 0.01,
-      taskGeneration: {
-        enabled: false,
-        maxGenerated: 10,
-        seed: 1,
-        outputDir: '/tmp/skill-optimizer',
-      },
-    },
-  };
-}
-
-console.log('\n=== Optimizer Smoke Tests ===\n');
-
-await test('loadOptimizeManifest: applies defaults', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-'));
-  try {
-    const repoDir = join(dir, 'sdk');
-    mkdirSync(join(repoDir, 'src'), { recursive: true });
-    const file = join(dir, 'skill-optimizer.json');
-    writeFileSync(file, JSON.stringify({
-      name: 'opt-defaults',
-      target: {
-        surface: 'sdk',
-        repoPath: './sdk',
-        sdk: { language: 'typescript', apiSurface: ['Client.doThing'] },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-      optimize: {
-        model: 'openrouter/openai/gpt-5.4',
-        allowedPaths: ['./sdk/src'],
-        validation: ['npm test'],
-      },
-    }), 'utf-8');
-
-    const manifest = await loadOptimizeManifest(file);
-    assertEqual(manifest.optimizer.maxIterations, 5, 'maxIterations default');
-    assertEqual(manifest.optimizer.stabilityWindow, 2, 'stabilityWindow default');
-    assertEqual(manifest.optimizer.taskGeneration.enabled, false, 'taskGeneration.enabled default');
-    assertEqual(manifest.optimizer.taskGeneration.maxGenerated, 10, 'taskGeneration.maxGenerated default');
-    assertEqual(manifest.optimizer.taskGeneration.outputDir, join(dir, '.skill-optimizer'), 'taskGeneration.outputDir default');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('loadOptimizeManifest: defaults optimize.model to the first benchmark model', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-'));
-  try {
-    const repoDir = join(dir, 'sdk');
-    mkdirSync(join(repoDir, 'src'), { recursive: true });
-    const file = join(dir, 'skill-optimizer.json');
-    writeFileSync(file, JSON.stringify({
-      name: 'opt-default-model',
-      target: {
-        surface: 'sdk',
-        repoPath: './sdk',
-        sdk: { language: 'typescript', apiSurface: ['Client.doThing'] },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-      optimize: {
-        allowedPaths: ['./sdk/src'],
-        validation: ['npm test'],
-      },
-    }), 'utf-8');
-
-    const manifest = await loadOptimizeManifest(file);
-    assertEqual(manifest.mutation?.provider, 'openrouter', 'mutation provider should default from the first benchmark model');
-    assertEqual(manifest.mutation?.model, 'openai/gpt-5.4', 'mutation model should default from the first benchmark model');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('loadOptimizeManifest: allows empty target validation commands', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-'));
-  try {
-    const repoDir = join(dir, 'sdk');
-    mkdirSync(join(repoDir, 'src'), { recursive: true });
-    const file = join(dir, 'skill-optimizer.json');
-    writeFileSync(file, JSON.stringify({
-      name: 'opt-validation',
-      target: {
-        surface: 'sdk',
-        repoPath: './sdk',
-        sdk: { language: 'typescript', apiSurface: ['Client.doThing'] },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-      optimize: {
-        model: 'openrouter/openai/gpt-5.4',
-        allowedPaths: ['./sdk/src'],
-        validation: [],
-      },
-    }), 'utf-8');
-
-    const manifest = await loadOptimizeManifest(file);
-    assertEqual(manifest.targetRepo.validation.length, 0, 'empty validation array should be preserved');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('loadOptimizeManifest: rejects requireCleanGit=false', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-'));
-  try {
-    const file = join(dir, 'skill-optimizer.json');
-    writeFileSync(file, JSON.stringify({
-      name: 'opt-clean-git',
-      target: {
-        surface: 'sdk',
-        repoPath: '../sdk',
-        sdk: { language: 'typescript', apiSurface: ['Client.doThing'] },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-      },
-      optimize: {
-        model: 'openrouter/openai/gpt-5.4',
-        allowedPaths: ['src'],
-        validation: ['npm test'],
-        requireCleanGit: false,
-      },
-    }), 'utf-8');
-
-    let threw = false;
-    try {
-      await loadOptimizeManifest(file);
-    } catch (error: any) {
-      threw = true;
-      assert(error.message.includes('requireCleanGit'), 'error should mention requireCleanGit');
-    }
-
-    assert(threw, 'should reject requireCleanGit=false');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('loadOptimizeManifest: rejects invalid optimizer numeric values', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-'));
-  try {
-    const file = join(dir, 'skill-optimizer.json');
-    writeFileSync(file, JSON.stringify({
-      name: 'opt-invalid-values',
-      target: {
-        surface: 'sdk',
-        repoPath: '../sdk',
-        sdk: { language: 'typescript', apiSurface: ['Client.doThing'] },
-      },
-      benchmark: {
-        tasks: './tasks.json',
-        format: 'pi',
-        models: [{ id: 'openrouter/openai/gpt-5.4', name: 'GPT-5.4', tier: 'flagship' }],
-        taskGeneration: {
-          enabled: false,
-          maxTasks: 0,
-          seed: -1,
-          outputDir: '',
-        },
-      },
-      optimize: {
-        model: 'openrouter/openai/gpt-5.4',
-        allowedPaths: ['src'],
-        validation: ['npm test'],
-        maxIterations: 0,
-        stabilityWindow: 0,
-        minImprovement: -0.1,
-        reportContextMaxBytes: 0,
-      },
-    }), 'utf-8');
-
-    let threw = false;
-    try {
-      await loadOptimizeManifest(file);
-    } catch (error: any) {
-        threw = true;
-        assert(
-        error.message.includes('maxIterations') || error.message.includes('stabilityWindow') || error.message.includes('minImprovement') || error.message.includes('maxTasks') || error.message.includes('seed') || error.message.includes('outputDir') || error.message.includes('reportContextMaxBytes'),
-        'error should mention invalid optimizer numeric field',
-      );
-    }
-
-    assert(threw, 'should reject invalid optimizer numeric values');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('analyzeFailures: ranks buckets by count', () => {
-  const report = makeReport(0.2, ['missing', 'missing', 'args', 'hallucination', 'error']);
-  const buckets = analyzeFailures(report);
-  assertEqual(buckets[0].kind, 'missing-tool', 'missing-tool should rank first');
-  assertEqual(buckets[0].count, 2, 'missing-tool count');
-  assertEqual(buckets[1].kind, 'bad-args', 'bad-args should rank second');
-  assertEqual(buckets[2].kind, 'hallucination', 'hallucination should rank third');
-  assertEqual(buckets[3].kind, 'error', 'error should rank fourth');
-});
-
-await test('runOptimizeLoop: stops after max iterations', async () => {
-  const manifest = makeManifest();
-  manifest.optimizer!.maxIterations = 3;
-  manifest.optimizer!.stabilityWindow = 10;
-
-  let runCount = 0;
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => {
-        runCount++;
-        return makeBenchmarkRunResult(makeReport(0.4), opts);
-      },
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => {},
-      updateAcceptedCheckpoint: async () => 'checkpoint-1',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'noop', changedFiles: ['src/index.ts'] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  const result = await runOptimizeLoop(manifest, deps);
-  assertEqual(result.iterations.length, 3, 'should run 3 iterations');
-  assertEqual(result.stopReason, 'max-iterations', 'should stop on max iterations');
-  assertEqual(runCount, 4, 'baseline plus one benchmark run per iteration');
-});
-
-await test('runOptimizeLoop: starts a new epoch baseline after accepted surface change', async () => {
-  const manifest = makeManifest() as OptimizeManifest & {
-    optimizer: OptimizeManifest['optimizer'] & { mode: 'surface-changing' };
-    targetRepo: OptimizeManifest['targetRepo'] & { surfacePaths: string[] };
-  };
-  manifest.optimizer.mode = 'surface-changing';
-  manifest.optimizer.maxIterations = 1;
-  manifest.optimizer.taskGeneration!.enabled = true;
-  manifest.targetRepo.surfacePaths = ['src/server.ts'];
-
-  const benchmarkLabels: string[] = [];
-  let generationCount = 0;
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => {
-        benchmarkLabels.push(opts.label);
-        const score = opts.label === 'baseline' ? 0.4 : opts.label === 'iteration-1' ? 0.6 : 0.55;
-        return makeBenchmarkRunResult(makeReport(score), opts);
-      },
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => {},
-      updateAcceptedCheckpoint: async () => 'checkpoint-2',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'rename tool', changedFiles: ['src/server.ts'] }),
-    },
-    taskGenerator: {
-      generate: async () => {
-        generationCount += 1;
-        return {
-          benchmarkConfigPath: `/tmp/generated-${generationCount}.json`,
-          taskCount: 3,
-          rejectedCount: 0,
-        };
-      },
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  const result = await runOptimizeLoop(manifest, deps);
-  assertEqual(generationCount, 2, 'surface-changing mode should regenerate tasks after accepted surface changes');
-  assertEqual(benchmarkLabels.join(','), 'baseline,epoch-2-baseline', 'should start a new epoch baseline after the surface change is accepted');
-  assertEqual(result.bestReport.summary.overallPassRate, 0.55, 'best report should become the new epoch baseline');
-});
-
-await test('runOptimizeLoop: rejects surface changes in stable-surface mode', async () => {
-  const manifest = makeManifest() as OptimizeManifest & {
-    targetRepo: OptimizeManifest['targetRepo'] & { surfacePaths: string[] };
-  };
-  manifest.optimizer!.maxIterations = 1;
-  manifest.targetRepo.surfacePaths = ['src/server.ts'];
-
-  let restoreCalls = 0;
-  let runCount = 0;
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => {
-        runCount += 1;
-        return makeBenchmarkRunResult(makeReport(runCount === 1 ? 0.4 : 0.9), opts);
-      },
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => { restoreCalls += 1; },
-      updateAcceptedCheckpoint: async () => 'checkpoint-2',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'rename tool anyway', changedFiles: ['src/server.ts'] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  const result = await runOptimizeLoop(manifest, deps);
-  assertEqual(result.iterations[0]?.accepted, false, 'stable-surface mode should reject surface changes');
-  assert(result.iterations[0]?.validation.commands[0]?.stderr.includes('stable-surface'), 'validation should explain why the change was rejected');
-  assertEqual(restoreCalls, 1, 'restoreCheckpoint should run after rejecting a surface change');
-  assertEqual(runCount, 1, 'benchmark rerun should be skipped when the callable surface changed');
-});
-
-await test('runOptimizeLoop: rejects surface-changing mode when task generation is disabled', async () => {
-  const manifest = makeManifest() as OptimizeManifest & {
-    optimizer: OptimizeManifest['optimizer'] & { mode: 'surface-changing' };
-  };
-  manifest.optimizer.mode = 'surface-changing';
-  manifest.optimizer.taskGeneration!.enabled = false;
-
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => makeBenchmarkRunResult(makeReport(0.4), opts),
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => {},
-      updateAcceptedCheckpoint: async () => 'checkpoint-2',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'noop', changedFiles: [] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  let threw = false;
-  try {
-    await runOptimizeLoop(manifest, deps);
-  } catch (error: any) {
-    threw = true;
-    assert(error.message.includes('surface-changing optimize mode requires task generation'), 'error should explain the invariant');
-  }
-  assert(threw, 'surface-changing mode without generation should throw');
-});
-
-await test('runOptimizeLoop: applies defaults to partially specified manifests', async () => {
-  const manifest: OptimizeManifest = {
-    benchmarkConfig: '/tmp/benchmark.config.json',
-    targetRepo: {
-      path: '/tmp/target-repo',
-      surface: 'sdk',
-      allowedPaths: ['src'],
-      validation: ['npm test'],
-    },
-    optimizer: {
-      maxIterations: 1,
-      taskGeneration: {
-        enabled: false,
-      },
-    },
-  };
-
-  const scores = [0.40, 0.50];
-  let index = 0;
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => makeBenchmarkRunResult(makeReport(scores[Math.min(index++, scores.length - 1)]!), opts),
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => {},
-      updateAcceptedCheckpoint: async () => 'checkpoint-2',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'improve names', changedFiles: ['src/client.ts'] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  const result = await runOptimizeLoop(manifest, deps);
-  assertEqual(result.iterations[0]?.accepted, true, 'default minImprovement should still allow acceptance');
-});
-
-await test('runOptimizeLoop: rejects requireCleanGit=false even for programmatic manifests', async () => {
-  const manifest = makeManifest();
-  manifest.targetRepo.requireCleanGit = false;
-
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => makeBenchmarkRunResult(makeReport(0.4), opts),
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => {},
-      updateAcceptedCheckpoint: async () => 'checkpoint-1',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'noop', changedFiles: ['src/index.ts'] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  let threw = false;
-  try {
-    await runOptimizeLoop(manifest, deps);
-  } catch (error: any) {
-    threw = true;
-    assert(error.message.includes('requireCleanGit'), 'error should mention requireCleanGit');
-  }
-
-  assert(threw, 'runOptimizeLoop should reject requireCleanGit=false');
-});
-
-await test('runOptimizeLoop: stops early when stable', async () => {
-  const manifest = makeManifest();
-  manifest.optimizer!.maxIterations = 5;
-  manifest.optimizer!.stabilityWindow = 2;
-
-  const scores = [0.40, 0.50, 0.50, 0.50];
-  let index = 0;
-
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => makeBenchmarkRunResult(makeReport(scores[Math.min(index++, scores.length - 1)]!), opts),
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => {},
-      updateAcceptedCheckpoint: async () => 'checkpoint-2',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'tweak sdk docs', changedFiles: ['README.md'] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  const result = await runOptimizeLoop(manifest, deps);
-  assertEqual(result.stopReason, 'stable', 'should stop once score is stable');
-  assertEqual(result.iterations.length, 3, 'one improvement followed by two stable iterations');
-});
-
-await test('runOptimizeLoop: rejects validation failures and restores checkpoint', async () => {
-  const manifest = makeManifest();
-  let restoreCalls = 0;
-
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => makeBenchmarkRunResult(makeReport(0.4), opts),
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => { restoreCalls++; },
-      updateAcceptedCheckpoint: async () => 'checkpoint-1',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'break it', changedFiles: ['src/index.ts'] }),
-    },
-    validation: {
-      run: async () => ({ ok: false, commands: [{ command: 'npm test', ok: false, exitCode: 1, stdout: '', stderr: 'failed' }] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  const result = await runOptimizeLoop(manifest, deps);
-  assertEqual(result.iterations[0]?.accepted, false, 'iteration should be rejected');
-  assert(restoreCalls > 0, 'restoreCheckpoint should be called after validation failure');
-});
-
-await test('runOptimizeLoop: restores checkpoint when benchmark rerun throws after mutation', async () => {
-  const manifest = makeManifest();
-  let restoreCalls = 0;
-  let runCount = 0;
-
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => {
-        runCount += 1;
-        if (runCount === 1) return makeBenchmarkRunResult(makeReport(0.4), opts);
-        throw new Error('rerun failed');
-      },
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => { restoreCalls++; },
-      updateAcceptedCheckpoint: async () => 'checkpoint-1',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'change sdk code', changedFiles: ['src/index.ts'] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  let threw = false;
-  try {
-    await runOptimizeLoop(manifest, deps);
-  } catch (error: any) {
-    threw = true;
-    assert(error.message.includes('rerun failed'), 'should surface benchmark rerun error');
-  }
-
-  assert(threw, 'loop should throw when benchmark rerun fails');
-  assertEqual(restoreCalls, 1, 'restoreCheckpoint should run before propagating error');
-});
-
-await test('runOptimizeLoop: restores checkpoint when mutation executor throws', async () => {
-  const manifest = makeManifest();
-  let restoreCalls = 0;
-
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => makeBenchmarkRunResult(makeReport(0.4), opts),
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => { restoreCalls++; },
-      updateAcceptedCheckpoint: async () => 'checkpoint-1',
-    },
-    mutation: {
-      apply: async () => {
-        throw new Error('mutation transport failed');
-      },
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  let threw = false;
-  try {
-    await runOptimizeLoop(manifest, deps);
-  } catch (error: any) {
-    threw = true;
-    assert(error.message.includes('mutation transport failed'), 'should surface mutation error');
-  }
-
-  assert(threw, 'loop should throw when mutation executor fails');
-  assertEqual(restoreCalls, 1, 'restoreCheckpoint should run on mutation errors too');
-});
-
-await test('runOptimizeLoop: rejects changes outside allowed paths', async () => {
-  const manifest = makeManifest();
-  manifest.optimizer!.maxIterations = 1;
-  let restoreCalls = 0;
-  let runCount = 0;
-
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => {
-        runCount += 1;
-        return makeBenchmarkRunResult(makeReport(0.4), opts);
-      },
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => { restoreCalls++; },
-      updateAcceptedCheckpoint: async () => 'checkpoint-1',
-    },
-    mutation: {
-      apply: async () => ({ summary: 'edit forbidden path', changedFiles: ['scripts/release.sh'] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  const result = await runOptimizeLoop(manifest, deps);
-  assertEqual(result.iterations[0]?.accepted, false, 'iteration should be rejected');
-  assert(result.iterations[0]?.validation.commands[0]?.stderr.includes('allowed paths'), 'validation message should explain the rejection');
-  assertEqual(restoreCalls, 1, 'restoreCheckpoint should run for out-of-scope edits');
-  assertEqual(runCount, 1, 'benchmark rerun should be skipped when changed files are out of scope');
-});
-
-await test('runOptimizeLoop: rejects validation side effects outside allowed paths', async () => {
-  const manifest = makeManifest();
-  manifest.optimizer!.maxIterations = 1;
-  let restoreCalls = 0;
-  let commitCalls = 0;
-  let benchmarkRuns = 0;
-
-  const deps: OptimizeLoopDependencies = {
-    benchmark: {
-      run: async (_configPath, opts) => {
-        benchmarkRuns += 1;
-        return makeBenchmarkRunResult(makeReport(0.4), opts);
-      },
-    },
-    repo: {
-      ensureReady: async () => 'clean',
-      captureCheckpoint: async () => 'checkpoint-1',
-      restoreCheckpoint: async () => { restoreCalls++; },
-      updateAcceptedCheckpoint: async () => {
-        commitCalls += 1;
-        return 'checkpoint-2';
-      },
-      listChangedFiles: async () => ['src/index.ts', 'scripts/generated-report.json'],
-    },
-    mutation: {
-      apply: async () => ({ summary: 'edit allowed path', changedFiles: ['src/index.ts'] }),
-    },
-    validation: {
-      run: async () => ({ ok: true, commands: [{ command: 'npm test', ok: true, exitCode: 0, stdout: '', stderr: '' }] }),
-    },
-    ledger: {
-      record: async () => {},
-    },
-  };
-
-  const result = await runOptimizeLoop(manifest, deps);
-  assertEqual(result.iterations[0]?.accepted, false, 'iteration should be rejected');
-  assertEqual(result.iterations[0]?.changedFiles.includes('scripts/generated-report.json'), true, 'iteration should record final changed files');
-  assert(result.iterations[0]?.validation.commands[0]?.stderr.includes('allowed paths'), 'validation should explain out-of-scope side effect');
-  assertEqual(restoreCalls, 1, 'restoreCheckpoint should run for validation side effects');
-  assertEqual(commitCalls, 0, 'out-of-scope side effects must not be committed');
-  assertEqual(benchmarkRuns, 1, 'benchmark rerun should be skipped when post-validation scope check fails');
-});
-
-await test('createJsonLedger: recovers from corrupted ledger file', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-ledger-'));
-  try {
-    const ledgerPath = join(dir, 'optimize-ledger.json');
-    writeFileSync(ledgerPath, '{not-json', 'utf-8');
-    const ledger = createJsonLedger(ledgerPath);
-    await ledger.record({ type: 'baseline', score: 0.5 });
-
-    const saved = JSON.parse(readFileSync(ledgerPath, 'utf-8')) as { version: number; events: Array<Record<string, unknown>> };
-    assertEqual(saved.version, 1, 'recovered ledger should reset version');
-    assertEqual(saved.events.length, 1, 'recovered ledger should record the new event');
-    assert(existsSync(`${ledgerPath}.corrupt`), 'corrupt ledger should be preserved with .corrupt suffix');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('createRepoStateManager: commits accepted changes without git identity config', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-repo-state-'));
-  const previousHome = process.env.HOME;
-  const previousGitConfigGlobal = process.env.GIT_CONFIG_GLOBAL;
-  try {
-    execFileSync('git', ['init'], { cwd: dir, encoding: 'utf-8' });
-    writeFileSync(join(dir, 'tracked.txt'), 'v1\n', 'utf-8');
-    execFileSync('git', ['add', 'tracked.txt'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['commit', '-m', 'init'], {
-      cwd: dir,
-      encoding: 'utf-8',
-      env: {
-        ...process.env,
-        HOME: dir,
-        GIT_CONFIG_GLOBAL: '/dev/null',
-        GIT_AUTHOR_NAME: 'Init User',
-        GIT_AUTHOR_EMAIL: 'init@example.com',
-        GIT_COMMITTER_NAME: 'Init User',
-        GIT_COMMITTER_EMAIL: 'init@example.com',
-      },
-    });
-
-    process.env.HOME = dir;
-    process.env.GIT_CONFIG_GLOBAL = '/dev/null';
-
-    writeFileSync(join(dir, 'tracked.txt'), 'v2\n', 'utf-8');
-    const manager = createRepoStateManager();
-    const targetRepo = {
-      path: dir,
-      surface: 'sdk' as const,
-      allowedPaths: ['tracked.txt'],
-      validation: ['true'],
-      requireCleanGit: true,
-    };
-
-    const headBefore = await manager.captureCheckpoint(targetRepo);
-    const headAfter = await manager.updateAcceptedCheckpoint(
-      targetRepo,
-      headBefore,
-      { summary: 'update tracked file', changedFiles: ['tracked.txt'] },
-      ['tracked.txt'],
-    );
-
-    assert(headAfter !== headBefore, 'commit should advance HEAD even without configured git identity');
-  } finally {
-    if (previousHome === undefined) {
-      delete process.env.HOME;
-    } else {
-      process.env.HOME = previousHome;
-    }
-    if (previousGitConfigGlobal === undefined) {
-      delete process.env.GIT_CONFIG_GLOBAL;
-    } else {
-      process.env.GIT_CONFIG_GLOBAL = previousGitConfigGlobal;
-    }
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('createRepoStateManager: ignores optimizer artifacts during clean-git check', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-clean-ignore-'));
-  try {
-    execFileSync('git', ['init'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['config', 'user.name', 'OpenCode'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['config', 'user.email', 'opencode@example.com'], { cwd: dir, encoding: 'utf-8' });
-
-    writeFileSync(join(dir, 'tracked.txt'), 'v1\n', 'utf-8');
-    execFileSync('git', ['add', '.'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['commit', '-m', 'init'], { cwd: dir, encoding: 'utf-8' });
-
-    mkdirSync(join(dir, '.skill-optimizer'), { recursive: true });
-    writeFileSync(join(dir, '.skill-optimizer', 'report.json'), '{}\n', 'utf-8');
-
-    const manager = createRepoStateManager();
-    const targetRepo = {
-      path: dir,
-      surface: 'sdk' as const,
-      allowedPaths: ['tracked.txt'],
-      validation: ['true'],
-      requireCleanGit: true,
-      cleanIgnorePaths: ['.skill-optimizer'],
-    } as any;
-
-    const result = await manager.ensureReady(targetRepo);
-    assertEqual(result, 'ready', 'optimizer artifacts should not block reruns');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('createRepoStateManager: restoreCheckpoint removes ignored files outside preserved paths', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-restore-ignore-'));
-  try {
-    execFileSync('git', ['init'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['config', 'user.name', 'OpenCode'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['config', 'user.email', 'opencode@example.com'], { cwd: dir, encoding: 'utf-8' });
-
-    writeFileSync(join(dir, '.gitignore'), '.skill-optimizer/\ndist/\n', 'utf-8');
-    writeFileSync(join(dir, 'tracked.txt'), 'v1\n', 'utf-8');
-    execFileSync('git', ['add', '.'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['commit', '-m', 'init'], { cwd: dir, encoding: 'utf-8' });
-
-    mkdirSync(join(dir, '.skill-optimizer'), { recursive: true });
-    mkdirSync(join(dir, 'dist'), { recursive: true });
-    writeFileSync(join(dir, '.skill-optimizer', 'report.json'), '{}\n', 'utf-8');
-    writeFileSync(join(dir, 'dist', 'leak.txt'), 'secret\n', 'utf-8');
-
-    const manager = createRepoStateManager();
-    const targetRepo = {
-      path: dir,
-      surface: 'sdk' as const,
-      allowedPaths: ['tracked.txt'],
-      validation: ['true'],
-      requireCleanGit: true,
-      cleanIgnorePaths: ['.skill-optimizer'],
-    } as any;
-
-    const checkpoint = await manager.captureCheckpoint(targetRepo);
-    await manager.restoreCheckpoint(targetRepo, checkpoint);
-
-    assert(!existsSync(join(dir, 'dist', 'leak.txt')), 'restoreCheckpoint should remove ignored files outside preserved paths');
-    assert(existsSync(join(dir, '.skill-optimizer', 'report.json')), 'restoreCheckpoint should preserve optimizer artifacts');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('collectGitChangedFiles: includes ignored files', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-git-ignored-'));
-  try {
-    execFileSync('git', ['init'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['config', 'user.name', 'OpenCode'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['config', 'user.email', 'opencode@example.com'], { cwd: dir, encoding: 'utf-8' });
-
-    writeFileSync(join(dir, '.gitignore'), 'dist/\n', 'utf-8');
-    writeFileSync(join(dir, 'tracked.txt'), 'v1\n', 'utf-8');
-    execFileSync('git', ['add', '.'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['commit', '-m', 'init'], { cwd: dir, encoding: 'utf-8' });
-
-    mkdirSync(join(dir, 'dist'), { recursive: true });
-    writeFileSync(join(dir, 'dist', 'leak.txt'), 'oops\n', 'utf-8');
-
-    const files = await collectGitChangedFiles(dir);
-    assert(files.includes('dist/leak.txt'), 'should include ignored file changes for scope enforcement');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-await test('collectGitChangedFiles: includes unstaged, staged, and untracked files', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'skill-optimizer-git-'));
-  try {
-    execFileSync('git', ['init'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['config', 'user.name', 'OpenCode'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['config', 'user.email', 'opencode@example.com'], { cwd: dir, encoding: 'utf-8' });
-
-    writeFileSync(join(dir, 'tracked.txt'), 'v1\n', 'utf-8');
-    writeFileSync(join(dir, 'staged.txt'), 'v1\n', 'utf-8');
-    execFileSync('git', ['add', '.'], { cwd: dir, encoding: 'utf-8' });
-    execFileSync('git', ['commit', '-m', 'init'], { cwd: dir, encoding: 'utf-8' });
-
-    writeFileSync(join(dir, 'tracked.txt'), 'v2\n', 'utf-8');
-    writeFileSync(join(dir, 'staged.txt'), 'v2\n', 'utf-8');
-    writeFileSync(join(dir, 'untracked.txt'), 'v1\n', 'utf-8');
-    execFileSync('git', ['add', 'staged.txt'], { cwd: dir, encoding: 'utf-8' });
-
-    const files = await collectGitChangedFiles(dir);
-    assert(files.includes('tracked.txt'), 'should include unstaged file');
-    assert(files.includes('staged.txt'), 'should include staged file');
-    assert(files.includes('untracked.txt'), 'should include untracked file');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-prompt-criteria.ts b/tests/smoke-prompt-criteria.ts
deleted file mode 100644
index becd39f..0000000
--- a/tests/smoke-prompt-criteria.ts
+++ /dev/null
@@ -1,104 +0,0 @@
-import { strict as assert } from 'node:assert';
-import { resolveCriteriaForTask } from '../src/benchmark/prompt-criteria.js';
-import type { GeneratedTask } from '../src/tasks/types.js';
-import type { PromptCapabilityWithSection } from '../src/project/discover-prompt.js';
-import type { ActionDefinition } from '../src/actions/types.js';
-
-function cap(actionKey: string, section: string): PromptCapabilityWithSection {
-  const action: ActionDefinition = { key: actionKey, name: actionKey, args: [], source: 'prompt' };
-  return { action, section };
-}
-
-function task(id: string, capabilityId: string): GeneratedTask {
-  return { id, prompt: `do ${capabilityId}`, expected_actions: [], capabilityId };
-}
-
-function testResolvesCriteriaForMatchingCapability() {
-  const caps = [
-    cap('summarize', '## summarize\n\nInclude: date, author. Use a numbered list.'),
-    cap('translate', '## translate\n\nInclude: source language, target.'),
-  ];
-  const result = resolveCriteriaForTask(task('t1', 'summarize'), caps);
-  assert.ok(result.criteria, 'criteria must be returned for matched capability');
-  assert.strictEqual(result.noActiveCriteria, false);
-  console.log('PASS: resolves criteria for matching capability');
-}
-
-function testDistinctCriteriaPerCapability() {
-  const caps = [
-    cap('alpha', '## alpha\n\nInclude: x, y. Numbered list required.'),
-    cap('beta', '## beta\n\nInclude: totally-different-thing.'),
-  ];
-  const a = resolveCriteriaForTask(task('t1', 'alpha'), caps);
-  const b = resolveCriteriaForTask(task('t2', 'beta'), caps);
-  assert.notDeepStrictEqual(a.criteria, b.criteria,
-    'different capabilities must produce different criteria (caps[0]-collapse guard)');
-  console.log('PASS: distinct criteria per capability');
-}
-
-function testThrowsOnUnknownCapabilityId() {
-  const caps = [cap('known', '## known\n\nInclude: foo.')];
-  assert.throws(
-    () => resolveCriteriaForTask(task('t1', 'unknown'), caps),
-    /capabilityId "unknown"/,
-    'unknown capabilityId must throw loudly — no silent fallback',
-  );
-  console.log('PASS: throws on unknown capabilityId');
-}
-
-function testThrowsOnMissingCapabilityId() {
-  const caps = [cap('known', '## known\n\nInclude: foo.')];
-  const taskWithoutId: GeneratedTask = { id: 't1', prompt: 'test', expected_actions: [] };
-  assert.throws(
-    () => resolveCriteriaForTask(taskWithoutId, caps),
-    /missing capabilityId/,
-    'task without capabilityId must throw loudly',
-  );
-  console.log('PASS: throws on missing capabilityId');
-}
-
-function testNoActiveCriteriaFlag() {
-  const caps = [cap('empty', '')];
-  const result = resolveCriteriaForTask(task('t1', 'empty'), caps);
-  assert.strictEqual(result.noActiveCriteria, true,
-    'capability with no extractable criteria must set noActiveCriteria: true');
-  console.log('PASS: flags noActiveCriteria when criteria are empty');
-}
-
-function testOutputCapabilityWithCodeBlockSectionProducesCriteria() {
-  // Regression guard for Issue 1: _output capabilities store section: section.body
-  // (full markdown with fences), not section: snippet (stripped content without fences).
-  // generateCriteriaFromCapability requires fences to extract format patterns.
-  const outputSection = [
-    '## Output Format',
-    '',
-    'Respond with this structure:',
-    '',
-    '```json',
-    '{',
-    '  "name": "<string>",',
-    '  "count": <number>',
-    '}',
-    '```',
-  ].join('\n');
-  const caps = [cap('my_output', outputSection)];
-  const result = resolveCriteriaForTask(task('t1', 'my_output'), caps);
-  assert.strictEqual(result.noActiveCriteria, false,
-    'output capability whose section contains a fenced code block must produce non-empty criteria');
-  console.log('PASS: output capability with code block section produces criteria (Issue 1 guard)');
-}
-
-async function main() {
-  testResolvesCriteriaForMatchingCapability();
-  testDistinctCriteriaPerCapability();
-  testThrowsOnUnknownCapabilityId();
-  testThrowsOnMissingCapabilityId();
-  testNoActiveCriteriaFlag();
-  testOutputCapabilityWithCodeBlockSectionProducesCriteria();
-  console.log('\nALL PASS: smoke-prompt-criteria');
-}
-
-main().catch((err) => {
-  console.error('FAIL: smoke-prompt-criteria', err);
-  process.exit(1);
-});
diff --git a/tests/smoke-prompt-evaluator.ts b/tests/smoke-prompt-evaluator.ts
deleted file mode 100644
index 8024404..0000000
--- a/tests/smoke-prompt-evaluator.ts
+++ /dev/null
@@ -1,357 +0,0 @@
-/**
- * Smoke tests for the prompt surface evaluator (benchmark/prompt-evaluator.ts).
- * Mirrors the structure of smoke-scoring.ts and smoke-llm.ts.
- */
-
-import {
-  evaluatePromptResponse,
-  type PromptEvaluationCriteria,
-} from '../src/benchmark/prompt-evaluator.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  + ${name}`);
-  } catch (error: any) {
-    failed++;
-    console.log(`  - ${name}`);
-    console.log(`    ${error.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string): void {
-  if (!condition) {
-    throw new Error(`Assertion failed: ${message}`);
-  }
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string): void {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-function assertInRange(actual: number, min: number, max: number, message: string): void {
-  if (actual < min || actual > max) {
-    throw new Error(`${message}: expected ${actual} to be in [${min}, ${max}]`);
-  }
-}
-
-console.log('\n=== Prompt Evaluator Smoke Tests ===\n');
-
-// ---------------------------------------------------------------------------
-// Required sections
-// ---------------------------------------------------------------------------
-
-await test('required sections present -> high section score', () => {
-  const criteria: PromptEvaluationCriteria = {
-    requiredSections: ['Overview', 'Implementation'],
-  };
-
-  const response = `## Overview
-
-Here is the overview of the system.
-
-## Implementation
-
-The implementation uses a factory pattern.
-`;
-
-  const result = evaluatePromptResponse(response, criteria);
-  assertEqual(result.categoryScores.sections, 1.0, 'all required sections present should yield section score 1.0');
-  assertInRange(result.score, 0.8, 1.0, 'overall score should be high when sections are all present');
-});
-
-await test('required sections missing -> low section score', () => {
-  const criteria: PromptEvaluationCriteria = {
-    requiredSections: ['Overview', 'Implementation', 'Testing'],
-  };
-
-  const response = `## Overview
-
-Here is the overview.
-
-Some text about implementation without a heading.
-`;
-
-  const result = evaluatePromptResponse(response, criteria);
-  // Only 1 of 3 sections found
-  assertInRange(result.categoryScores.sections, 0.3, 0.4, 'only 1/3 sections found should yield ~0.33');
-  assert(result.details.some((d) => d.includes('MISSING')), 'details should mention missing sections');
-});
-
-// ---------------------------------------------------------------------------
-// Format patterns
-// ---------------------------------------------------------------------------
-
-await test('format pattern match -> positive format score', () => {
-  const criteria: PromptEvaluationCriteria = {
-    formatPatterns: [
-      { name: 'has-yaml-key', pattern: '^\\w+:\\s+' },
-      { name: 'has-heading', pattern: '^#+\\s+' },
-    ],
-  };
-
-  const response = `# Configuration
-
-name: my-service
-replicas: 3
-`;
-
-  const result = evaluatePromptResponse(response, criteria);
-  assertEqual(result.categoryScores.format, 1.0, 'both format patterns matched should yield 1.0');
-});
-
-await test('format pattern mismatch -> low format score', () => {
-  const criteria: PromptEvaluationCriteria = {
-    formatPatterns: [
-      { name: 'has-json', pattern: '\\{[\\s\\S]*"\\w+"\\s*:' },
-      { name: 'has-array', pattern: '\\[\\s*\\{' },
-    ],
-  };
-
-  const response = 'Just plain text without any JSON or arrays.';
-
-  const result = evaluatePromptResponse(response, criteria);
-  assertEqual(result.categoryScores.format, 0, 'no format patterns matched should yield 0');
-});
-
-// ---------------------------------------------------------------------------
-// Forbidden keywords
-// ---------------------------------------------------------------------------
-
-await test('forbidden keywords present -> keyword score penalty', () => {
-  const criteria: PromptEvaluationCriteria = {
-    forbiddenKeywords: ['deprecated', 'latest'],
-  };
-
-  const response = 'Use the deprecated API with the latest image tag.';
-
-  const result = evaluatePromptResponse(response, criteria);
-  assertEqual(result.categoryScores.keywords, 0, 'both forbidden keywords present should yield 0');
-  assert(
-    result.checks.filter((c) => c.check.startsWith('forbidden:') && !c.passed).length === 2,
-    'should have 2 failed forbidden checks',
-  );
-});
-
-await test('forbidden keywords absent -> keyword score 1.0', () => {
-  const criteria: PromptEvaluationCriteria = {
-    forbiddenKeywords: ['deprecated', 'hack', 'workaround'],
-  };
-
-  const response = 'A clean implementation using standard patterns.';
-
-  const result = evaluatePromptResponse(response, criteria);
-  assertEqual(result.categoryScores.keywords, 1.0, 'no forbidden keywords should yield 1.0');
-});
-
-// ---------------------------------------------------------------------------
-// Code blocks (structural)
-// ---------------------------------------------------------------------------
-
-await test('code blocks detected -> structural score boost', () => {
-  const criteria: PromptEvaluationCriteria = {
-    hasCodeBlocks: true,
-  };
-
-  const response = `Here is the implementation:
-
-\`\`\`go
-func main() {
-    fmt.Println("hello")
-}
-\`\`\`
-`;
-
-  const result = evaluatePromptResponse(response, criteria);
-  assertEqual(result.categoryScores.structure, 1.0, 'expected code blocks found should yield structure 1.0');
-});
-
-await test('code blocks expected but missing -> structural score drop', () => {
-  const criteria: PromptEvaluationCriteria = {
-    hasCodeBlocks: true,
-  };
-
-  const response = 'Just text, no code blocks at all.';
-
-  const result = evaluatePromptResponse(response, criteria);
-  assertEqual(result.categoryScores.structure, 0, 'expected code blocks missing should yield structure 0');
-});
-
-// ---------------------------------------------------------------------------
-// Empty response
-// ---------------------------------------------------------------------------
-
-await test('empty response -> score 0', () => {
-  const criteria: PromptEvaluationCriteria = {
-    requiredSections: ['Overview'],
-    forbiddenKeywords: ['error'],
-  };
-
-  // With required sections set, an empty string scores 0 on section checks.
-  const resultEmpty = evaluatePromptResponse('', criteria);
-  // Empty string: heading regex won't match, so sections score = 0
-  assertInRange(resultEmpty.categoryScores.sections, 0, 0, 'empty response should have section score 0');
-
-  const resultWhitespace = evaluatePromptResponse('   \n  \n  ', criteria);
-  assertInRange(resultWhitespace.categoryScores.sections, 0, 0, 'whitespace-only response should have section score 0');
-});
-
-// ---------------------------------------------------------------------------
-// All criteria met -> score 1.0
-// ---------------------------------------------------------------------------
-
-await test('all criteria met -> score 1.0', () => {
-  const criteria: PromptEvaluationCriteria = {
-    requiredSections: ['Setup', 'Deploy'],
-    formatPatterns: [{ name: 'has-heading', pattern: '^##\\s+' }],
-    requiredKeywords: ['kubernetes', 'namespace'],
-    forbiddenKeywords: ['deprecated'],
-    hasCodeBlocks: true,
-    hasNumberedList: true,
-  };
-
-  const response = `## Setup
-
-Configure the kubernetes cluster and target namespace.
-
-1. Create the namespace
-2. Apply RBAC rules
-
-## Deploy
-
-Deploy the service:
-
-\`\`\`bash
-kubectl apply -f manifests/
-\`\`\`
-`;
-
-  const result = evaluatePromptResponse(response, criteria);
-  assertEqual(result.categoryScores.sections, 1.0, 'all sections found');
-  assertEqual(result.categoryScores.format, 1.0, 'format pattern matched');
-  assertEqual(result.categoryScores.keywords, 1.0, 'all keywords present, no forbidden');
-  assertEqual(result.categoryScores.structure, 1.0, 'code blocks and numbered list found');
-  assertEqual(result.score, 1.0, 'overall score should be 1.0 when all criteria met');
-  assertEqual(result.noActiveCriteria, false, 'populated criteria must have noActiveCriteria: false');
-});
-
-// ---------------------------------------------------------------------------
-// Mixed criteria: partial scores
-// ---------------------------------------------------------------------------
-
-await test('mixed criteria produce weighted intermediate score', () => {
-  const criteria: PromptEvaluationCriteria = {
-    requiredSections: ['Setup', 'Missing Section'],
-    requiredKeywords: ['deploy'],
-    forbiddenKeywords: ['hack'],
-    hasCodeBlocks: true,
-  };
-
-  const response = `## Setup
-
-We will deploy the service.
-
-\`\`\`bash
-kubectl apply -f deploy.yaml
-\`\`\`
-`;
-
-  const result = evaluatePromptResponse(response, criteria);
-  // Sections: 1/2 = 0.5
-  assertInRange(result.categoryScores.sections, 0.49, 0.51, 'sections should be ~0.5 (1/2 found)');
-  // Keywords: deploy found + hack absent = 2/2 = 1.0
-  assertEqual(result.categoryScores.keywords, 1.0, 'keywords should be 1.0');
-  // Structure: code blocks present = 1.0
-  assertEqual(result.categoryScores.structure, 1.0, 'structure should be 1.0');
-  // Overall: weighted mix, should be between 0.5 and 1.0
-  assertInRange(result.score, 0.5, 1.0, 'overall score should be between 0.5 and 1.0');
-});
-
-// ---------------------------------------------------------------------------
-// Numbered list and table detection
-// ---------------------------------------------------------------------------
-
-await test('numbered list detection works', () => {
-  const criteria: PromptEvaluationCriteria = {
-    hasNumberedList: true,
-  };
-
-  const withList = `Steps:
-
-1. First step
-2. Second step
-3. Third step
-`;
-
-  const withoutList = 'No numbered items here, just prose.';
-
-  const resultWith = evaluatePromptResponse(withList, criteria);
-  assertEqual(resultWith.categoryScores.structure, 1.0, 'numbered list found should yield 1.0');
-
-  const resultWithout = evaluatePromptResponse(withoutList, criteria);
-  assertEqual(resultWithout.categoryScores.structure, 0, 'no numbered list should yield 0');
-});
-
-await test('table detection works', () => {
-  const criteria: PromptEvaluationCriteria = {
-    hasTable: true,
-  };
-
-  const withTable = `| Name | Value |
-|------|-------|
-| foo  | bar   |
-`;
-
-  const withoutTable = 'No table here.';
-
-  const resultWith = evaluatePromptResponse(withTable, criteria);
-  assertEqual(resultWith.categoryScores.structure, 1.0, 'table found should yield 1.0');
-
-  const resultWithout = evaluatePromptResponse(withoutTable, criteria);
-  assertEqual(resultWithout.categoryScores.structure, 0, 'no table should yield 0');
-});
-
-// ---------------------------------------------------------------------------
-// minLength check
-// ---------------------------------------------------------------------------
-
-await test('minLength enforced as format check', () => {
-  const criteria: PromptEvaluationCriteria = {
-    minLength: 100,
-  };
-
-  const shortResponse = 'Too short.';
-  const longResponse = 'A'.repeat(150) + ' — this response is well over the minimum length requirement and should pass the check.';
-
-  const resultShort = evaluatePromptResponse(shortResponse, criteria);
-  assertEqual(resultShort.categoryScores.format, 0, 'short response should fail minLength');
-
-  const resultLong = evaluatePromptResponse(longResponse, criteria);
-  assertEqual(resultLong.categoryScores.format, 1.0, 'long response should pass minLength');
-});
-
-// ---------------------------------------------------------------------------
-// No criteria -> noActiveCriteria + score 0 (regression guard for P3)
-// ---------------------------------------------------------------------------
-
-await test('no criteria specified -> noActiveCriteria: true, score: 0', () => {
-  const criteria: PromptEvaluationCriteria = {};
-  const result = evaluatePromptResponse('Any response at all.', criteria);
-  assertEqual(result.noActiveCriteria, true, 'empty criteria must set noActiveCriteria: true');
-  assertEqual(result.score, 0, 'empty criteria must score 0, not vacuously pass');
-});
-
-await test('non-empty criteria -> noActiveCriteria: false', () => {
-  const criteria: PromptEvaluationCriteria = { requiredKeywords: ['alpha'] };
-  const result = evaluatePromptResponse('alpha present here.', criteria);
-  assertEqual(result.noActiveCriteria, false, 'non-empty criteria must not set noActiveCriteria');
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-release.ts b/tests/smoke-release.ts
deleted file mode 100644
index df53ad3..0000000
--- a/tests/smoke-release.ts
+++ /dev/null
@@ -1,72 +0,0 @@
-import { existsSync, readFileSync } from 'node:fs';
-import { join } from 'node:path';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-console.log('\n=== Release Hygiene Smoke Tests ===\n');
-
-await test('package.json includes OSS metadata and publish guardrails', () => {
-  const packageJson = JSON.parse(readFileSync(join(process.cwd(), 'package.json'), 'utf-8')) as {
-    license?: string;
-    repository?: unknown;
-    bugs?: unknown;
-    homepage?: string;
-    keywords?: string[];
-    exports?: Record<string, unknown>;
-    bin?: Record<string, string>;
-    scripts?: Record<string, string>;
-  };
-
-  assert(packageJson.license === 'MIT', 'package.json should declare MIT license');
-  assert(typeof packageJson.repository === 'object' && packageJson.repository !== null, 'package.json should declare repository metadata');
-  assert(typeof packageJson.bugs === 'object' && packageJson.bugs !== null, 'package.json should declare bugs metadata');
-  assert(typeof packageJson.homepage === 'string' && packageJson.homepage.length > 0, 'package.json should declare homepage metadata');
-  assert(Array.isArray(packageJson.keywords) && packageJson.keywords.length >= 4, 'package.json should declare discoverable keywords');
-  assert(typeof packageJson.exports?.['.'] === 'object', 'package.json should constrain the public root export');
-  assert(typeof packageJson.bin?.['skill-optimizer'] === 'string', 'package.json should declare skill-optimizer bin entry');
-  assert(packageJson.bin?.['skill-optimizer'] === './dist/cli.js', 'skill-optimizer bin should point at ./dist/cli.js');
-  assert(typeof packageJson.scripts?.clean === 'string', 'package.json should include a clean script');
-  assert(typeof packageJson.scripts?.prepack === 'string', 'package.json should include a prepack script');
-});
-
-await test('repo includes a root LICENSE file', () => {
-  assert(existsSync(join(process.cwd(), 'LICENSE')), 'root LICENSE file should exist');
-});
-
-await test('mock-repos README matches the tracked templates', () => {
-  const readme = readFileSync(join(process.cwd(), 'mock-repos', 'README.md'), 'utf-8');
-  assert(readme.includes('mcp-tracker-demo'), 'mock-repos README should mention mcp-tracker-demo');
-  assert(readme.includes('cli-taskfile-demo'), 'mock-repos README should mention cli-taskfile-demo');
-  assert(!readme.includes('sdk-demo'), 'mock-repos README should not mention removed sdk-demo template');
-  assert(!readme.includes('cli-demo'), 'mock-repos README should not mention removed cli-demo template');
-  assert(!readme.includes('mcp-demo'), 'mock-repos README should not mention removed mcp-demo template');
-});
-
-// CHANGELOG must have a heading for current package version
-await test('CHANGELOG.md has a section for the current package version', () => {
-  const pkgVersion = (JSON.parse(readFileSync(join(process.cwd(), 'package.json'), 'utf-8')) as { version: string }).version;
-  const changelogContent = readFileSync(join(process.cwd(), 'CHANGELOG.md'), 'utf-8');
-  const versionHeaderRe = new RegExp(`^##\\s*\\[?${pkgVersion.replace(/\./g, '\\.')}\\]?`, 'm');
-  assert(versionHeaderRe.test(changelogContent), `CHANGELOG.md must have a section for version ${pkgVersion}`);
-  console.log(`PASS: CHANGELOG has section for v${pkgVersion}`);
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-scope.ts b/tests/smoke-scope.ts
deleted file mode 100644
index 81d5c39..0000000
--- a/tests/smoke-scope.ts
+++ /dev/null
@@ -1,68 +0,0 @@
-import { strict as assert } from 'node:assert';
-
-import { resolveScope, matchesGlob } from '../src/tasks/scope.js';
-import type { ActionDefinition } from '../src/actions/types.js';
-
-function mk(name: string): ActionDefinition {
-  return { key: name, name, args: [] };
-}
-
-function testDefaultIncludeEverything() {
-  const actions = [mk('Wallet.send'), mk('Wallet.receive'), mk('Token.mint')];
-  const { inScope, outOfScope } = resolveScope(actions, { include: ['*'], exclude: [] });
-  assert.strictEqual(inScope.length, 3);
-  assert.strictEqual(outOfScope.length, 0);
-  console.log('PASS: default ["*"] includes everything');
-}
-
-function testIncludeNarrowsToPrefix() {
-  const actions = [mk('Wallet.send'), mk('Wallet.receive'), mk('Token.mint')];
-  const { inScope, outOfScope } = resolveScope(actions, { include: ['Wallet.*'], exclude: [] });
-  assert.deepStrictEqual(inScope.map((a) => a.name).sort(), ['Wallet.receive', 'Wallet.send']);
-  assert.deepStrictEqual(outOfScope.map((a) => a.name), ['Token.mint']);
-  console.log('PASS: include narrows to prefix');
-}
-
-function testExcludeSubtracts() {
-  const actions = [mk('Wallet.send'), mk('Wallet.internalDebit'), mk('Token.mint')];
-  const { inScope } = resolveScope(actions, { include: ['*'], exclude: ['*.internal*'] });
-  assert.deepStrictEqual(inScope.map((a) => a.name).sort(), ['Token.mint', 'Wallet.send']);
-  console.log('PASS: exclude removes matches');
-}
-
-function testStarMatchesSeparators() {
-  assert.strictEqual(matchesGlob('Wallet.send', '*'), true);
-  assert.strictEqual(matchesGlob('Wallet.send', 'Wallet.*'), true);
-  assert.strictEqual(matchesGlob('Wallet.Inner.send', 'Wallet.*'), true); // * matches dots
-  console.log('PASS: * matches any sequence including separators');
-}
-
-function testEmptyScopeIsAnError() {
-  const actions = [mk('Wallet.send')];
-  const { inScope } = resolveScope(actions, { include: ['NoMatch.*'], exclude: [] });
-  assert.strictEqual(inScope.length, 0);
-  console.log('PASS: scope can resolve to empty (caller decides if that is an error)');
-}
-
-function testEmptyIncludeDefaultsToStar() {
-  const actions = [mk('Wallet.send'), mk('Token.mint')];
-  const { inScope, outOfScope } = resolveScope(actions, { include: [], exclude: [] });
-  assert.strictEqual(inScope.length, 2);
-  assert.strictEqual(outOfScope.length, 0);
-  console.log('PASS: empty include defaults to ["*"]');
-}
-
-async function main() {
-  testDefaultIncludeEverything();
-  testIncludeNarrowsToPrefix();
-  testExcludeSubtracts();
-  testStarMatchesSeparators();
-  testEmptyScopeIsAnError();
-  testEmptyIncludeDefaultsToStar();
-  console.log('\nALL PASS: smoke-scope');
-}
-
-main().catch((err) => {
-  console.error('FAIL: smoke-scope', err);
-  process.exit(1);
-});
diff --git a/tests/smoke-scoring.ts b/tests/smoke-scoring.ts
deleted file mode 100644
index 3b37f92..0000000
--- a/tests/smoke-scoring.ts
+++ /dev/null
@@ -1,159 +0,0 @@
-import { strict as assert } from 'node:assert';
-
-import {
-  computePerModelPassRates,
-  computeWeightedAverage,
-  computeVerdict,
-  accept,
-} from '../src/benchmark/scoring.js';
-import type { BenchmarkReport, ModelConfig } from '../src/benchmark/types.js';
-
-function syntheticReport(perModel: Record<string, number>, models: ModelConfig[]): BenchmarkReport {
-  const entries = Object.entries(perModel);
-  const summaryPerModel: Record<string, { passRate: number; avgRecall: number; avgPrecision: number; avgToolSelectionAccuracy: number; avgArgAccuracy: number; avgHallucinationRate: number; tasksRun: number }> = {};
-  for (const [id, rate] of entries) {
-    summaryPerModel[id] = { passRate: rate, avgRecall: 0, avgPrecision: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0, avgHallucinationRate: 0, tasksRun: 10 };
-  }
-  const overall = entries.reduce((a, [, r]) => a + r, 0) / Math.max(1, entries.length);
-  return {
-    timestamp: new Date().toISOString(),
-    config: { name: 'syn', surface: 'mcp' },
-    skillVersion: { source: 'local', commitSha: 'local', ref: 'file', fetchedAt: new Date().toISOString() },
-    results: [],
-    coverage: [],
-    summary: {
-      totalTasks: 10,
-      totalModels: entries.length,
-      totalEvaluations: 10 * entries.length,
-      overallPassRate: overall,
-      weightedAverage: 0, // filled in by scoring
-      avgToolRecall: 0,
-      avgToolPrecision: 0,
-      avgToolSelectionAccuracy: 0,
-      avgArgAccuracy: 0,
-      avgHallucinationRate: 0,
-      methodCoveragePercent: 1,
-      perModel: summaryPerModel,
-      perTask: {},
-      perTier: { flagship: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 }, mid: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 }, low: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 } },
-    },
-  };
-}
-
-function testEqualWeightsCollapseToMean() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const report = syntheticReport({ a: 0.6, b: 0.8 }, models);
-  const wavg = computeWeightedAverage(report, models);
-  assert.strictEqual(wavg, 0.7);
-  console.log('PASS: equal weights collapse to mean');
-}
-
-function testWeightedAverageWithExplicitWeights() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship', weight: 3 },
-    { id: 'b', name: 'B', tier: 'mid', weight: 1 },
-  ];
-  const report = syntheticReport({ a: 1.0, b: 0.0 }, models);
-  const wavg = computeWeightedAverage(report, models);
-  assert.strictEqual(wavg, 0.75);
-  console.log('PASS: weighted average honors explicit weights');
-}
-
-function testPerModelPassRates() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const report = syntheticReport({ a: 0.42, b: 0.99 }, models);
-  const rates = computePerModelPassRates(report);
-  assert.strictEqual(rates.a, 0.42);
-  assert.strictEqual(rates.b, 0.99);
-  console.log('PASS: per-model pass rates echo summary');
-}
-
-function testVerdictPassWhenAllAboveFloorAndTargetHit() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const report = syntheticReport({ a: 0.7, b: 0.75 }, models);
-  report.summary.weightedAverage = 0.725;
-  const verdict = computeVerdict(report, models, { perModelFloor: 0.6, targetWeightedAverage: 0.7 });
-  assert.strictEqual(verdict.result, 'PASS');
-  console.log('PASS: verdict PASS when all above floor and target hit');
-}
-
-function testVerdictFailWhenOneBelowFloor() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const report = syntheticReport({ a: 0.9, b: 0.5 }, models);
-  report.summary.weightedAverage = 0.7;
-  const verdict = computeVerdict(report, models, { perModelFloor: 0.6, targetWeightedAverage: 0.7 });
-  assert.strictEqual(verdict.result, 'FAIL');
-  assert.ok(verdict.reasons.some((r) => r.includes('b')));
-  console.log('PASS: verdict FAIL when one model below floor');
-}
-
-function testAcceptBelowFloorButImproving() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const before = syntheticReport({ a: 0.8, b: 0.3 }, models);
-  before.summary.weightedAverage = 0.55;
-  const after = syntheticReport({ a: 0.8, b: 0.4 }, models);
-  after.summary.weightedAverage = 0.6;
-  const result = accept(before, after, models, { perModelFloor: 0.6, targetWeightedAverage: 0.7, minImprovement: 0.02 });
-  assert.strictEqual(result, true);
-  console.log('PASS: accept below-floor but improving');
-}
-
-function testRejectCrossingBelowFloor() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const before = syntheticReport({ a: 0.8, b: 0.8 }, models);
-  before.summary.weightedAverage = 0.8;
-  const after = syntheticReport({ a: 0.8, b: 0.55 }, models);
-  after.summary.weightedAverage = 0.675;
-  const result = accept(before, after, models, { perModelFloor: 0.6, targetWeightedAverage: 0.7, minImprovement: 0.02 });
-  assert.strictEqual(result, false);
-  console.log('PASS: reject crossing below floor');
-}
-
-function testRejectNoMinImprovement() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const before = syntheticReport({ a: 0.7, b: 0.7 }, models);
-  before.summary.weightedAverage = 0.7;
-  const after = syntheticReport({ a: 0.71, b: 0.71 }, models);
-  after.summary.weightedAverage = 0.71;
-  const result = accept(before, after, models, { perModelFloor: 0.6, targetWeightedAverage: 0.7, minImprovement: 0.02 });
-  assert.strictEqual(result, false);
-  console.log('PASS: reject when weighted improvement below minImprovement');
-}
-
-async function main() {
-  testEqualWeightsCollapseToMean();
-  testWeightedAverageWithExplicitWeights();
-  testPerModelPassRates();
-  testVerdictPassWhenAllAboveFloorAndTargetHit();
-  testVerdictFailWhenOneBelowFloor();
-  testAcceptBelowFloorButImproving();
-  testRejectCrossingBelowFloor();
-  testRejectNoMinImprovement();
-  console.log('\nALL PASS: smoke-scoring');
-}
-
-main().catch((err) => {
-  console.error('FAIL: smoke-scoring', err);
-  process.exit(1);
-});
diff --git a/tests/smoke-sdk-python.ts b/tests/smoke-sdk-python.ts
deleted file mode 100644
index 49cc484..0000000
--- a/tests/smoke-sdk-python.ts
+++ /dev/null
@@ -1,135 +0,0 @@
-import { extractSdkCodeBlock } from '../src/benchmark/extractors/code-extractor.js';
-import { extractSdkFromCode } from '../src/benchmark/extractors/sdk/registry.js';
-import { extract } from '../src/benchmark/extractors/index.js';
-import { evaluateTask } from '../src/benchmark/evaluator.js';
-import type { BenchmarkConfig, LLMResponse } from '../src/benchmark/types.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-console.log('\n=== Python SDK Smoke Tests ===\n');
-
-await test('python block extraction', () => {
-  const result = extractSdkCodeBlock('```python\nclient = FastClient()\n```', 'python');
-  assertEqual(result, 'client = FastClient()', 'should extract python fenced block');
-});
-
-await test('python: constructor call', async () => {
-  const { calls } = await extractSdkFromCode('client = FastClient("testnet")', 'python');
-  assertEqual(calls.length, 1, 'one call expected');
-  assertEqual(calls[0].method, 'FastClient.constructor', 'constructor normalized');
-  assertEqual(calls[0].args._positional_0 as string, 'testnet', 'constructor arg preserved');
-});
-
-await test('python: class and instance methods', async () => {
-  const code = [
-    'wallet = FastWallet.from_keyfile(provider)',
-    'await wallet.send(to="fast1abc", amount="5")',
-  ].join('\n');
-  const { calls } = await extractSdkFromCode(code, 'python');
-  assertEqual(calls.length, 2, 'two calls expected');
-  assertEqual(calls[0].method, 'FastWallet.from_keyfile', 'class method normalized');
-  assertEqual(calls[1].method, 'FastWallet.send', 'instance method resolved through binding');
-  assertEqual(calls[1].args.to as string, 'fast1abc', 'keyword arg to preserved');
-  assertEqual(calls[1].args.amount as string, '5', 'keyword arg amount preserved');
-});
-
-await test('python: async assignment preserves instance binding', async () => {
-  const code = [
-    'wallet = await FastWallet.from_keyfile(provider)',
-    'await wallet.send(to="fast1abc")',
-  ].join('\n');
-  const { calls } = await extractSdkFromCode(code, 'python');
-  assertEqual(calls.length, 2, 'two calls expected');
-  assertEqual(calls[0].method, 'FastWallet.from_keyfile', 'async assignment source call preserved');
-  assertEqual(calls[1].method, 'FastWallet.send', 'instance call should resolve through async binding');
-});
-
-await test('python: factory methods do not mislabel returned instance type', async () => {
-  const code = [
-    'client = FastClient("testnet")',
-    'wallet = client.wallet()',
-    'await wallet.send(to="fast1abc")',
-  ].join('\n');
-  const extraction = await extractSdkFromCode(code, 'python');
-
-  const task = {
-    id: 'python-factory-binding',
-    prompt: 'send a payment',
-    expected_actions: [
-      { name: 'FastClient.constructor', args: { _positional_0: 'testnet' } },
-      { name: 'FastWallet.send', args: { to: 'fast1abc' } },
-    ],
-  };
-  const result = evaluateTask({
-    task,
-    model: { id: 'test/model', name: 'Test Model', tier: 'flagship' },
-    generatedCode: code,
-    rawResponse: code,
-    extractedCalls: extraction.calls,
-    llmLatencyMs: 1,
-    knownMethods: new Set(['FastClient.constructor', 'FastWallet.send']),
-    bindings: extraction.bindings,
-    surface: 'sdk',
-  });
-
-  assertEqual(result.metrics.taskPassed, true, 'factory-bound wallet.send should resolve to FastWallet.send');
-});
-
-await test('python: standalone function with nested literals', async () => {
-  const code = 'result = fast(network="testnet", wallets=[{"type": "fast"}])';
-  const { calls } = await extractSdkFromCode(code, 'python');
-  assertEqual(calls.length, 1, 'one function call expected');
-  assertEqual(calls[0].method, 'fast', 'standalone function kept');
-  assertEqual(calls[0].args.network as string, 'testnet', 'keyword arg preserved');
-  const wallets = calls[0].args.wallets as Array<Record<string, unknown>>;
-  assert(Array.isArray(wallets), 'wallets should be an array');
-  assertEqual(wallets[0].type as string, 'fast', 'nested dict value preserved');
-});
-
-await test('python: extract() dispatches sdk language', async () => {
-  const config: BenchmarkConfig = {
-    name: 'python-sdk',
-    surface: 'sdk',
-    sdk: { language: 'python' },
-    tasks: 'tasks.json',
-    llm: {
-      baseUrl: '',
-      apiKeyEnv: 'OPENROUTER_API_KEY',
-      format: 'openai',
-      models: [],
-    },
-  };
-  const response: LLMResponse = {
-    content: '```python\nwallet = FastWallet.from_keyfile(provider)\n```',
-  };
-  const { calls, generatedCode } = await extract(response, config);
-  assertEqual(generatedCode, 'wallet = FastWallet.from_keyfile(provider)', 'should preserve python code block');
-  assertEqual(calls.length, 1, 'one call expected');
-  assertEqual(calls[0].method, 'FastWallet.from_keyfile', 'python sdk call extracted');
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-sdk-rust.ts b/tests/smoke-sdk-rust.ts
deleted file mode 100644
index 66b8925..0000000
--- a/tests/smoke-sdk-rust.ts
+++ /dev/null
@@ -1,138 +0,0 @@
-import { extractSdkCodeBlock } from '../src/benchmark/extractors/code-extractor.js';
-import { extractSdkFromCode } from '../src/benchmark/extractors/sdk/registry.js';
-import { extract } from '../src/benchmark/extractors/index.js';
-import { evaluateTask } from '../src/benchmark/evaluator.js';
-import type { BenchmarkConfig, LLMResponse } from '../src/benchmark/types.js';
-
-let passed = 0;
-let failed = 0;
-
-async function test(name: string, fn: () => Promise<void> | void) {
-  try {
-    await fn();
-    passed++;
-    console.log(`  ✓ ${name}`);
-  } catch (e: any) {
-    failed++;
-    console.log(`  ✗ ${name}`);
-    console.log(`    ${e.message}`);
-  }
-}
-
-function assert(condition: boolean, message: string) {
-  if (!condition) throw new Error(`Assertion failed: ${message}`);
-}
-
-function assertEqual<T>(actual: T, expected: T, message: string) {
-  if (actual !== expected) {
-    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
-  }
-}
-
-console.log('\n=== Rust SDK Smoke Tests ===\n');
-
-await test('rust block extraction', () => {
-  const result = extractSdkCodeBlock('```rust\nlet client = FastClient::new("testnet");\n```', 'rust');
-  assertEqual(result, 'let client = FastClient::new("testnet");', 'should extract rust fenced block');
-});
-
-await test('rust: associated function call', async () => {
-  const { calls } = await extractSdkFromCode('let client = FastClient::new("testnet");', 'rust');
-  assertEqual(calls.length, 1, 'one call expected');
-  assertEqual(calls[0].method, 'FastClient.new', 'associated function normalized');
-  assertEqual(calls[0].args._positional_0 as string, 'testnet', 'associated function arg preserved');
-});
-
-await test('rust: instance method with struct arg', async () => {
-  const code = [
-    'let wallet = FastWallet::from_keyfile(provider);',
-    'wallet.send(SendArgs { to: "fast1abc".into(), amount: "5".into() })?;',
-  ].join('\n');
-  const { calls } = await extractSdkFromCode(code, 'rust');
-  assertEqual(calls.length, 2, 'two calls expected');
-  assertEqual(calls[0].method, 'FastWallet.from_keyfile', 'associated constructor-like method normalized');
-  assertEqual(calls[1].method, 'FastWallet.send', 'instance method resolved through binding');
-  assertEqual(calls[1].args.to as string, 'fast1abc', 'struct field to preserved');
-  assertEqual(calls[1].args.amount as string, '5', 'struct field amount preserved');
-});
-
-await test('rust: chained receiver preserves owner type', async () => {
-  const code = 'FastWallet::from_keyfile(provider).send(SendArgs { to: "fast1abc".into(), amount: "5".into() })?;';
-  const { calls } = await extractSdkFromCode(code, 'rust');
-  assertEqual(calls.length, 2, 'two calls expected for chained receiver');
-  assertEqual(calls[0].method, 'FastWallet.from_keyfile', 'inner associated call preserved');
-  assertEqual(calls[1].method, 'FastWallet.send', 'outer method should resolve from chained receiver');
-});
-
-await test('rust: try-wrapped binding preserves instance type', async () => {
-  const code = [
-    'let wallet = FastWallet::from_keyfile(provider)?;',
-    'wallet.send(SendArgs { to: "fast1abc".into(), amount: "5".into() })?;',
-  ].join('\n');
-  const { calls } = await extractSdkFromCode(code, 'rust');
-  assertEqual(calls.length, 2, 'two calls expected');
-  assertEqual(calls[0].method, 'FastWallet.from_keyfile', 'try-wrapped source call preserved');
-  assertEqual(calls[1].method, 'FastWallet.send', 'try-wrapped binding should resolve instance method');
-});
-
-await test('rust: factory methods on instances resolve through evaluator bindings', async () => {
-  const code = [
-    'let client = FastClient::new("testnet");',
-    'let wallet = client.wallet();',
-    'wallet.send(SendArgs { to: "fast1abc".into(), amount: "5".into() })?;',
-  ].join('\n');
-  const extraction = await extractSdkFromCode(code, 'rust');
-  const task = {
-    id: 'rust-factory-binding',
-    prompt: 'send a payment',
-    expected_actions: [
-      { name: 'FastClient.new', args: { _positional_0: 'testnet' } },
-      { name: 'FastWallet.send', args: { to: 'fast1abc', amount: '5' } },
-    ],
-  };
-  const result = evaluateTask({
-    task,
-    model: { id: 'test/model', name: 'Test Model', tier: 'flagship' },
-    generatedCode: code,
-    rawResponse: code,
-    extractedCalls: extraction.calls,
-    llmLatencyMs: 1,
-    knownMethods: new Set(['FastClient.new', 'FastWallet.send']),
-    bindings: extraction.bindings,
-    surface: 'sdk',
-  });
-
-  assertEqual(result.metrics.taskPassed, true, 'instance factory bindings should resolve to FastWallet.send');
-});
-
-await test('rust: standalone function call', async () => {
-  const { calls } = await extractSdkFromCode('let result = fast("testnet");', 'rust');
-  assertEqual(calls.length, 1, 'one function call expected');
-  assertEqual(calls[0].method, 'fast', 'standalone function kept');
-  assertEqual(calls[0].args._positional_0 as string, 'testnet', 'function arg preserved');
-});
-
-await test('rust: extract() dispatches sdk language', async () => {
-  const config: BenchmarkConfig = {
-    name: 'rust-sdk',
-    surface: 'sdk',
-    sdk: { language: 'rust' },
-    tasks: 'tasks.json',
-    llm: {
-      baseUrl: '',
-      apiKeyEnv: 'OPENROUTER_API_KEY',
-      format: 'openai',
-      models: [],
-    },
-  };
-  const response: LLMResponse = {
-    content: '```rust\nlet client = FastClient::new("testnet");\n```',
-  };
-  const { calls, generatedCode } = await extract(response, config);
-  assertEqual(generatedCode, 'let client = FastClient::new("testnet");', 'should preserve rust code block');
-  assertEqual(calls.length, 1, 'one call expected');
-  assertEqual(calls[0].method, 'FastClient.new', 'rust sdk call extracted');
-});
-
-console.log(`\n${passed} passed, ${failed} failed\n`);
-process.exit(failed > 0 ? 1 : 0);
diff --git a/tests/smoke-skill-distribution.ts b/tests/smoke-skill-distribution.ts
new file mode 100644
index 0000000..0594fea
--- /dev/null
+++ b/tests/smoke-skill-distribution.ts
@@ -0,0 +1,182 @@
+import assert from 'node:assert/strict';
+import { existsSync, readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { pathToFileURL } from 'node:url';
+import { test } from 'node:test';
+
+const root = process.cwd();
+const pluginDescription = 'Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs';
+
+function readJson(relativePath: string): any { // Parses the UTF-8 file at root/relativePath as JSON; the result is untyped and unvalidated.
+  return JSON.parse(readFileSync(join(root, relativePath), 'utf-8'));
+}
+
+function readText(relativePath: string): string { // Returns the UTF-8 contents of root/relativePath; throws if the file cannot be read.
+  return readFileSync(join(root, relativePath), 'utf-8');
+}
+
+test('canonical skill follows the portable agent skills contract', () => {
+  const skillPath = 'skills/skill-optimizer/SKILL.md';
+  assert.equal(existsSync(join(root, skillPath)), true);
+  // The front-matter pattern has no /m flag, so the opening '---' must be the first line of the file.
+  const body = readText(skillPath);
+  assert.match(body, /^---\n[\s\S]*?\n---\n/);
+  assert.match(body, /^name: skill-optimizer$/m);
+  assert.match(body, /^description: .+/m);
+  assert.doesNotMatch(body, /^description: .{1025,}$/m); // description text on that line may be at most 1024 characters
+});
+
+test('canonical skill documents current workbench command and live CLI patterns', () => {
+  const skill = readText('skills/skill-optimizer/SKILL.md');
+  const reference = readText('skills/skill-optimizer/references/workbench.md');
+  // Neither document may mention these two identifiers (presumably retired names; confirm against the current CLI).
+  for (const text of [skill, reference]) {
+    assert.doesNotMatch(text, /verify-suite/);
+    assert.doesNotMatch(text, /runWorkbenchReferenceSolutions/);
+  }
+  // The reference must contain each of the following phrases verbatim; rewording any of them fails the test.
+  assert.match(reference, /Live CLI\/API Skills/);
+  assert.match(reference, /Use dedicated test credentials/);
+  assert.match(reference, /Grade command names, flags, output files, and trace behavior/);
+  assert.match(reference, /Include a no-tool-needed control case/);
+  assert.match(reference, /Include a prompt-injection or unsafe-instruction case/);
+});
+
+test('workbench reference documents bin directory visibility accurately', () => {
+  const reference = readText('skills/skill-optimizer/references/workbench.md');
+  // Matches one exact pipe-separated fragment; any rewording of that text in workbench.md fails this test.
+  assert.match(
+    reference,
+    /`bin\/` \| yes, copied into `\/work\/bin` and mounted as `\/case\/bin` during setup and grading/,
+  );
+});
+
+test('packaged MCP example omits unsupported mcpService ports', () => {
+  const suite = readText('examples/workbench/mcp/suite.yml');
+  // The pattern is not scoped to mcpService: any 'port:' key preceded by whitespace at line start fails the test.
+  assert.doesNotMatch(suite, /^\s+port:/m);
+});
+
+test('package metadata exposes plugin and skill distribution files', () => {
+  const pkg = readJson('package.json');
+  // pkg.files entries are compared as exact strings, so e.g. 'skills' without the trailing slash would not satisfy these checks.
+  assert.equal(pkg.name, 'skill-optimizer');
+  assert.equal(pkg.description, pluginDescription);
+  assert.equal(pkg.main, './dist/index.js');
+  assert.equal(pkg.exports['.'].import, './dist/index.js');
+  assert.equal(pkg.exports['./server'].import, './.opencode/plugins/skill-optimizer.js');
+  assert.ok(pkg.files.includes('skills/'));
+  assert.ok(pkg.files.includes('.agents/plugins/marketplace.json'));
+  assert.ok(pkg.files.includes('.claude-plugin/'));
+  assert.ok(pkg.files.includes('.codex-plugin/'));
+  assert.ok(pkg.files.includes('.cursor-plugin/'));
+  assert.ok(pkg.files.includes('.opencode/plugins/skill-optimizer.js'));
+  assert.ok(pkg.files.includes('.opencode/INSTALL.md'));
+  assert.ok(pkg.files.includes('.codex/INSTALL.md'));
+  assert.ok(pkg.files.includes('.cursor/INSTALL.md'));
+  assert.ok(pkg.files.includes('AGENTS.md'));
+  assert.ok(pkg.files.includes('CLAUDE.md'));
+  assert.ok(pkg.files.includes('CONTRIBUTING.md'));
+  assert.ok(pkg.files.includes('docs/README.codex.md'));
+  assert.ok(pkg.files.includes('docs/README.opencode.md'));
+  assert.ok(pkg.files.includes('gemini-extension.json'));
+  assert.ok(pkg.files.includes('GEMINI.md'));
+});
+
+test('package metadata does not include broad example result directories', () => {
+  const pkg = readJson('package.json');
+  // examples/ must not be listed wholesale: only the specific workbench entries below, and nothing under firecrawl-search.
+  assert.equal(pkg.files.includes('examples/'), false);
+  assert.ok(pkg.files.includes('examples/workbench/README.md'));
+  assert.ok(pkg.files.includes('examples/workbench/pdf/README.md'));
+  assert.ok(pkg.files.includes('examples/workbench/pdf/suite.yml'));
+  assert.ok(pkg.files.includes('examples/workbench/pdf/checks/'));
+  assert.ok(pkg.files.includes('examples/workbench/pdf/references/'));
+  assert.equal(
+    pkg.files.some((entry: string) => entry.startsWith('examples/workbench/firecrawl-search')),
+    false,
+  );
+});
+
+test('Claude plugin and marketplace metadata point at the canonical skill', () => {
+  const pkg = readJson('package.json');
+  const plugin = readJson('.claude-plugin/plugin.json');
+  const marketplace = readJson('.claude-plugin/marketplace.json');
+  // Plugin manifest: version must equal package.json's version, and skills is the directory string './skills/'.
+  assert.equal(plugin.name, 'skill-optimizer');
+  assert.equal(plugin.description, pluginDescription);
+  assert.equal(plugin.version, pkg.version);
+  assert.equal(plugin.skills, './skills/');
+  // Marketplace manifest: exactly one plugin entry, whose skills is an array naming the single skill directory.
+  assert.equal(marketplace.name, 'skill-optimizer');
+  assert.equal(marketplace.description, pluginDescription);
+  assert.equal(marketplace.plugins.length, 1);
+  assert.equal(marketplace.plugins[0].name, 'skill-optimizer');
+  assert.equal(marketplace.plugins[0].description, pluginDescription);
+  assert.equal(marketplace.plugins[0].source, './');
+  assert.deepEqual(marketplace.plugins[0].skills, ['./skills/skill-optimizer']);
+});
+
+test('Codex and Cursor plugin metadata point at the canonical skill', () => {
+  const pkg = readJson('package.json');
+  const codex = readJson('.codex-plugin/plugin.json');
+  const cursor = readJson('.cursor-plugin/plugin.json');
+  // Both manifests are held to the same assertions; defaultPrompt only needs a non-zero length.
+  for (const manifest of [codex, cursor]) {
+    assert.equal(manifest.name, 'skill-optimizer');
+    assert.equal(manifest.description, pluginDescription);
+    assert.equal(manifest.version, pkg.version);
+    assert.equal(manifest.skills, './skills/');
+    assert.equal(manifest.interface.displayName, 'Skill Optimizer');
+    assert.equal(manifest.interface.shortDescription, pluginDescription);
+    assert.equal(manifest.interface.longDescription, pluginDescription);
+    assert.ok(manifest.interface.defaultPrompt.length > 0);
+  }
+});
+
+test('Codex marketplace metadata exposes the root plugin with install policy', () => {
+  const marketplace = readJson('.agents/plugins/marketplace.json');
+  // Top-level marketplace fields, then the single plugin entry it must list.
+  assert.equal(marketplace.name, 'skill-optimizer');
+  assert.equal(marketplace.interface.displayName, 'Skill Optimizer');
+  assert.equal(marketplace.interface.shortDescription, pluginDescription);
+  assert.equal(marketplace.interface.longDescription, pluginDescription);
+  assert.equal(marketplace.plugins.length, 1);
+  // Strict deepEqual on source and policy: an extra or missing key in either object fails the test.
+  const plugin = marketplace.plugins[0];
+  assert.equal(plugin.name, 'skill-optimizer');
+  assert.deepEqual(plugin.source, { source: 'local', path: './' });
+  assert.deepEqual(plugin.policy, {
+    installation: 'AVAILABLE',
+    authentication: 'ON_INSTALL',
+  });
+  assert.equal(plugin.category, 'Coding');
+});
+
+test('OpenCode plugin registers the canonical skills directory', async () => {
+  const pluginUrl = pathToFileURL(join(root, '.opencode', 'plugins', 'skill-optimizer.js')).href;
+  const mod = await import(`${pluginUrl}?cacheBust=${Date.now()}`); // unique query string so the module is loaded fresh rather than served from the ESM cache
+  const server = mod.default?.server ?? mod.SkillOptimizerPlugin;
+  assert.equal(typeof server, 'function');
+  // Call the exported function with an empty object, then run the returned config hook against a blank config.
+  const hooks = await server({});
+  const config: any = {};
+  await hooks.config(config);
+  // Expected path is built from process.cwd(), so this assumes the test runs from the repository root.
+  assert.deepEqual(config.skills.paths, [join(root, 'skills')]);
+});
+
+test('Gemini extension metadata points at the canonical context file', () => {
+  const pkg = readJson('package.json');
+  const extension = readJson('gemini-extension.json');
+  const geminiInstructions = readText('GEMINI.md');
+  // Manifest fields first; then GEMINI.md must carry each '@./…' reference on a line by itself (^…$ under /m).
+  assert.equal(extension.name, 'skill-optimizer');
+  assert.equal(extension.description, pluginDescription);
+  assert.equal(extension.version, pkg.version);
+  assert.equal(extension.contextFileName, 'GEMINI.md');
+  assert.match(geminiInstructions, /^@\.\/AGENTS\.md$/m);
+  assert.match(geminiInstructions, /^@\.\/README\.md$/m);
+  assert.match(geminiInstructions, /^@\.\/CONTRIBUTING\.md$/m);
+  assert.match(geminiInstructions, /^@\.\/skills\/skill-optimizer\/SKILL\.md$/m);
+});
diff --git a/tests/smoke-snapshot-prompt.ts b/tests/smoke-snapshot-prompt.ts
deleted file mode 100644
index 16ec4d6..0000000
--- a/tests/smoke-snapshot-prompt.ts
+++ /dev/null
@@ -1,25 +0,0 @@
-import { test } from 'node:test';
-import assert from 'node:assert/strict';
-import { writeFileSync, mkdtempSync, rmSync } from 'node:fs';
-import { tmpdir } from 'node:os';
-import { join } from 'node:path';
-import { loadActionSnapshotFile } from '../src/actions/snapshot.js';
-
-await test('loadActionSnapshotFile accepts prompt surface', async () => {
-  const dir = mkdtempSync(join(tmpdir(), 'smoke-snapshot-prompt-'));
-  try {
-    const p = join(dir, 'snapshot.json');
-    writeFileSync(p, JSON.stringify({
-      version: 1,
-      catalog: {
-        surface: 'prompt',
-        actions: [],
-      },
-    }));
-    // Must not throw — previously threw "catalog.surface must be one of sdk|cli|mcp"
-    const result = loadActionSnapshotFile(p);
-    assert.equal(result.catalog.surface, 'prompt');
-  } finally {
-    rmSync(dir, { recursive: true, force: true });
-  }
-});
diff --git a/tests/smoke-verdict-prompt.ts b/tests/smoke-verdict-prompt.ts
deleted file mode 100644
index f6b7112..0000000
--- a/tests/smoke-verdict-prompt.ts
+++ /dev/null
@@ -1,213 +0,0 @@
-import { strict as assert } from 'node:assert';
-import { computeVerdict } from '../src/benchmark/scoring.js';
-import type { BenchmarkReport, ModelConfig, BenchmarkSurface } from '../src/benchmark/types.js';
-import { resolveCriteriaForTask } from '../src/benchmark/prompt-criteria.js';
-import { evaluatePromptResponse } from '../src/benchmark/prompt-evaluator.js';
-import type { PromptCapabilityWithSection } from '../src/project/discover-prompt.js';
-
-function syntheticReport(
-  perModel: Record<string, number>,
-  surface: BenchmarkSurface,
-  opts?: { coverageViolation?: boolean; weightedAverage?: number },
-): BenchmarkReport {
-  const entries = Object.entries(perModel);
-  const summaryPerModel: Record<string, {
-    passRate: number; avgRecall: number; avgPrecision: number;
-    avgToolSelectionAccuracy: number; avgArgAccuracy: number;
-    avgHallucinationRate: number; tasksRun: number;
-  }> = {};
-  for (const [id, rate] of entries) {
-    summaryPerModel[id] = {
-      passRate: rate, avgRecall: 0, avgPrecision: 0,
-      avgToolSelectionAccuracy: 0, avgArgAccuracy: 0,
-      avgHallucinationRate: 0, tasksRun: 10,
-    };
-  }
-  const overall = entries.reduce((a, [, r]) => a + r, 0) / Math.max(1, entries.length);
-  const wavg = opts?.weightedAverage ?? overall;
-  return {
-    timestamp: new Date().toISOString(),
-    config: { name: 'syn', surface },
-    skillVersion: { source: 'local', commitSha: 'local', ref: 'file', fetchedAt: new Date().toISOString() },
-    results: [],
-    coverage: [],
-    scopeCoverage: opts?.coverageViolation
-      ? {
-          coverageViolation: true,
-          inScopeActions: ['a', 'b'],
-          outOfScopeActions: [],
-          coveredActions: ['a'],
-          uncoveredActions: ['b'],
-          tasksPerAction: { a: 3, b: 0 },
-        }
-      : undefined,
-    summary: {
-      totalTasks: 10, totalModels: entries.length, totalEvaluations: 10 * entries.length,
-      overallPassRate: overall, weightedAverage: wavg,
-      avgToolRecall: 0, avgToolPrecision: 0, avgToolSelectionAccuracy: 0,
-      avgArgAccuracy: 0, avgHallucinationRate: 0, methodCoveragePercent: 1,
-      perModel: summaryPerModel, perTask: {},
-      perTier: {
-        flagship: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 },
-        mid: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 },
-        low: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 },
-      },
-    },
-  };
-}
-
-function testPromptSurfaceIgnoresCoverageViolation() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const report = syntheticReport({ a: 0.9, b: 0.85 }, 'prompt', {
-    coverageViolation: true,
-    weightedAverage: 0.875,
-  });
-  const verdict = computeVerdict(report, models, { perModelFloor: 0.6, targetWeightedAverage: 0.7 });
-  assert.strictEqual(verdict.result, 'PASS',
-    'prompt surface with scores above floor must PASS despite coverageViolation=true');
-  assert.ok(!verdict.reasons.some(r => r.includes('coverage')),
-    'verdict reasons must not mention coverage for prompt surface');
-  console.log('PASS: prompt surface ignores coverage violation');
-}
-
-function testMcpSurfaceStillBlocksOnCoverageViolation() {
-  const models: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const report = syntheticReport({ a: 0.9, b: 0.85 }, 'mcp', {
-    coverageViolation: true,
-    weightedAverage: 0.875,
-  });
-  const verdict = computeVerdict(report, models, { perModelFloor: 0.6, targetWeightedAverage: 0.7 });
-  assert.strictEqual(verdict.result, 'FAIL',
-    'mcp surface must still FAIL on coverage violation (regression guard)');
-  assert.ok(verdict.reasons.some(r => r.includes('coverage')),
-    'verdict reasons must mention coverage for non-prompt surfaces');
-  console.log('PASS: mcp surface still blocks on coverage violation');
-}
-
-// ── Helpers for scenarios 6-7 ────────────────────────────────────────────────
-
-function makeCap(key: string, section: string): PromptCapabilityWithSection {
-  return {
-    action: {
-      key,
-      name: key,
-      description: `Capability: ${key}`,
-      args: [],
-    },
-    section,
-  };
-}
-
-// ── Scenario 6: distinct criteria per capability (caps[0]-collapse guard) ────
-
-function testDistinctCriteriaPerCapability() {
-  const caps: PromptCapabilityWithSection[] = [
-    makeCap('summarize', '## Summary\nProvide a summary section.\nInclude key points.'),
-    makeCap('translate', '## Translation\nList the translated output.\nSpecify the target language.'),
-    makeCap('classify', '## Classification\nShow a numbered list of categories.\nInclude confidence score.'),
-  ];
-
-  const tasks = caps.map((cap) => ({
-    id: `task_${cap.action.key}`,
-    prompt: `Perform ${cap.action.key} on the given text.`,
-    expected_actions: [] as Array<{ name: string; args?: Record<string, unknown> }>,
-    capabilityId: cap.action.key,
-  }));
-
-  const criteriaList = tasks.map((task) => resolveCriteriaForTask(task, caps).criteria);
-
-  // All 3 criteria must be mutually distinct — none should be equal to another.
-  for (let i = 0; i < criteriaList.length; i++) {
-    for (let j = i + 1; j < criteriaList.length; j++) {
-      const ci = JSON.stringify(criteriaList[i]);
-      const cj = JSON.stringify(criteriaList[j]);
-      assert.notStrictEqual(
-        ci,
-        cj,
-        `caps[${i}] and caps[${j}] criteria must be distinct (caps[0]-collapse guard): ` +
-          `got identical criteria ${ci}`,
-      );
-    }
-  }
-  console.log('PASS: distinct criteria per capability (caps[0] collapse guard)');
-}
-
-// ── Scenario 7: empty criteria → noActiveCriteria via evaluator (P3 guard) ───
-
-function testNoActiveCriteriaViaEvaluator() {
-  // A cap with empty section produces no extractable criteria.
-  const cap = makeCap('empty_cap', '');
-  const task = {
-    id: 'task_empty',
-    prompt: 'Do something with empty_cap.',
-    expected_actions: [] as Array<{ name: string; args?: Record<string, unknown> }>,
-    capabilityId: 'empty_cap',
-  };
-
-  const { criteria } = resolveCriteriaForTask(task, [cap]);
-  const result = evaluatePromptResponse('any response text', criteria);
-
-  assert.strictEqual(result.score, 0, 'empty criteria → score must be 0');
-  assert.strictEqual(result.noActiveCriteria, true, 'empty criteria → noActiveCriteria must be true');
-  console.log('PASS: empty criteria → noActiveCriteria via evaluator (P3 regression guard)');
-}
-
-// ── Scenario 8: mock-LLM verdict matrix (threshold off-by-one + weight math) ─
-
-function testVerdictMatrix() {
-  const models2: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship' },
-    { id: 'b', name: 'B', tier: 'mid' },
-  ];
-  const policy = { perModelFloor: 0.6, targetWeightedAverage: 0.7 };
-
-  // 8a: both 1.0 → PASS
-  const r8a = syntheticReport({ a: 1.0, b: 1.0 }, 'prompt', { weightedAverage: 1.0 });
-  assert.strictEqual(computeVerdict(r8a, models2, policy).result, 'PASS', 'both 1.0 → PASS');
-
-  // 8b: floor inclusive (0.60 == floor)
-  const r8b = syntheticReport({ a: 1.0, b: 0.60 }, 'prompt', { weightedAverage: 0.80 });
-  assert.strictEqual(computeVerdict(r8b, models2, policy).result, 'PASS', 'floor inclusive at 0.60');
-
-  // 8c: below floor (0.59)
-  const r8c = syntheticReport({ a: 1.0, b: 0.59 }, 'prompt', { weightedAverage: 0.795 });
-  assert.strictEqual(computeVerdict(r8c, models2, policy).result, 'FAIL', '0.59 < floor → FAIL');
-
-  // 8d: weights 2:1 → wavg 0.733 (a=0.80, b=0.60, weights 2:1)
-  const models2d: ModelConfig[] = [
-    { id: 'a', name: 'A', tier: 'flagship', weight: 2 },
-    { id: 'b', name: 'B', tier: 'mid', weight: 1 },
-  ];
-  const r8d = syntheticReport({ a: 0.80, b: 0.60 }, 'prompt', { weightedAverage: (0.80 * 2 + 0.60 * 1) / 3 });
-  assert.strictEqual(
-    computeVerdict(r8d, models2d, { perModelFloor: 0.6, targetWeightedAverage: 0.7 }).result,
-    'PASS',
-    'weight 2:1 wavg 0.733 > 0.7 → PASS',
-  );
-
-  // 8e: wavg below target (a=0.70, b=0.60, weights 1:1 → wavg 0.65)
-  const r8e = syntheticReport({ a: 0.70, b: 0.60 }, 'prompt', { weightedAverage: 0.65 });
-  assert.strictEqual(computeVerdict(r8e, models2, policy).result, 'FAIL', 'wavg 0.65 < target 0.70 → FAIL');
-
-  console.log('PASS: verdict matrix (threshold off-by-one + weight math guards)');
-}
-
-async function main() {
-  testPromptSurfaceIgnoresCoverageViolation();
-  testMcpSurfaceStillBlocksOnCoverageViolation();
-  testDistinctCriteriaPerCapability();
-  testNoActiveCriteriaViaEvaluator();
-  testVerdictMatrix();
-  console.log('\nALL PASS: smoke-verdict-prompt');
-}
-
-main().catch((err) => {
-  console.error('FAIL: smoke-verdict-prompt', err);
-  process.exit(1);
-});
diff --git a/tests/smoke-verdict.ts b/tests/smoke-verdict.ts
deleted file mode 100644
index af65ab0..0000000
--- a/tests/smoke-verdict.ts
+++ /dev/null
@@ -1,117 +0,0 @@
-import { strict as assert } from 'node:assert';
-import type { BenchmarkReport } from '../src/benchmark/types.js';
-import { generateRecommendations } from '../src/verdict/recommendations.js';
-import { renderVerdictConsole, renderVerdictMarkdown } from '../src/verdict/render.js';
-
-function syntheticFailReport(): BenchmarkReport {
-  return {
-    timestamp: new Date().toISOString(),
-    config: { name: 'syn', surface: 'mcp' },
-    skillVersion: { source: 'local', commitSha: 'local', ref: 'file', fetchedAt: new Date().toISOString() },
-    results: [],
-    coverage: [],
-    summary: {
-      totalTasks: 2, totalModels: 2, totalEvaluations: 4,
-      overallPassRate: 0.5, weightedAverage: 0.5,
-      avgToolRecall: 0, avgToolPrecision: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0, avgHallucinationRate: 0,
-      methodCoveragePercent: 1,
-      perModel: { a: { passRate: 0.4, avgRecall: 0, avgPrecision: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0, avgHallucinationRate: 0, tasksRun: 2 } },
-      perTask: {},
-      perTier: { flagship: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 }, mid: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 }, low: { passRate: 0, avgRecall: 0, avgToolSelectionAccuracy: 0, avgArgAccuracy: 0 } },
-    },
-    verdict: { result: 'FAIL', reasons: ['a below floor'], policy: { perModelFloor: 0.6, targetWeightedAverage: 0.7 } },
-  };
-}
-
-async function testPassSkipsCritic() {
-  const report = syntheticFailReport();
-  report.verdict!.result = 'PASS';
-  report.verdict!.reasons = [];
-  let called = 0;
-  const recs = await generateRecommendations(report, { complete: async () => { called += 1; return '[]'; } });
-  assert.strictEqual(called, 0);
-  assert.deepStrictEqual(recs, []);
-  console.log('PASS: PASS verdict skips critic call');
-}
-
-async function testFailInvokesCriticOnce() {
-  const report = syntheticFailReport();
-  let called = 0;
-  const recs = await generateRecommendations(report, {
-    complete: async () => {
-      called += 1;
-      return JSON.stringify([
-        { priority: 'high', area: 'docs', action: 'Document Wallet.send args', rationale: 'models consistently missing amount arg' },
-      ]);
-    },
-  });
-  assert.strictEqual(called, 1);
-  assert.strictEqual(recs.length, 1);
-  assert.strictEqual(recs[0]!.priority, 'high');
-  console.log('PASS: FAIL verdict invokes critic exactly once');
-}
-
-async function testMalformedOutputReturnsEmpty() {
-  const report = syntheticFailReport();
-  const recs = await generateRecommendations(report, { complete: async () => 'not json' });
-  assert.deepStrictEqual(recs, []);
-  console.log('PASS: malformed critic output returns empty list, not throw');
-}
-
-function testRenderVerdictConsolePass() {
-  const report = syntheticFailReport();
-  report.verdict!.result = 'PASS';
-  report.verdict!.reasons = [];
-  const out = renderVerdictConsole(report, []);
-  assert.ok(out.includes('=== Verdict ==='), 'console output has header');
-  assert.ok(out.includes('Result: PASS'), 'console output shows PASS');
-  assert.ok(!out.includes('Surface coverage:'), 'no coverage block without scopeCoverage');
-  console.log('PASS: renderVerdictConsole PASS verdict');
-}
-
-function testRenderVerdictConsoleWithCoverage() {
-  const report = syntheticFailReport();
-  report.scopeCoverage = {
-    inScopeActions: ['a', 'b'],
-    outOfScopeActions: ['c'],
-    coveredActions: ['a'],
-    uncoveredActions: ['b'],
-    tasksPerAction: { a: 1, b: 0 },
-    coverageViolation: true,
-  };
-  const out = renderVerdictConsole(report, []);
-  assert.ok(out.includes('Surface coverage:'), 'coverage block present when scopeCoverage set');
-  assert.ok(out.includes('Uncovered:'), 'uncovered list shown');
-  console.log('PASS: renderVerdictConsole shows coverage block');
-}
-
-function testRenderVerdictMarkdown() {
-  const report = syntheticFailReport();
-  const md = renderVerdictMarkdown(report, [{ priority: 'high', area: 'docs', action: 'Add examples', rationale: 'missing' }]);
-  assert.ok(md.includes('## Verdict'), 'markdown has Verdict heading');
-  assert.ok(md.includes('FAIL'), 'markdown shows FAIL');
-  assert.ok(md.includes('60.0%'), 'markdown shows perModelFloor as percentage');
-  assert.ok(md.includes('## Recommendations'), 'markdown has Recommendations section');
-  console.log('PASS: renderVerdictMarkdown contains expected sections');
-}
-
-function testRenderVerdictMarkdownNoVerdict() {
-  const report = syntheticFailReport();
-  delete (report as { verdict?: unknown }).verdict;
-  const md = renderVerdictMarkdown(report, []);
-  assert.strictEqual(md, '', 'empty string when no verdict');
-  console.log('PASS: renderVerdictMarkdown returns empty string when no verdict');
-}
-
-async function main() {
-  await testPassSkipsCritic();
-  await testFailInvokesCriticOnce();
-  await testMalformedOutputReturnsEmpty();
-  testRenderVerdictConsolePass();
-  testRenderVerdictConsoleWithCoverage();
-  testRenderVerdictMarkdown();
-  testRenderVerdictMarkdownNoVerdict();
-  console.log('\nALL PASS: smoke-verdict');
-}
-
-main().catch((err) => { console.error('FAIL: smoke-verdict', err); process.exit(1); });
diff --git a/tests/smoke-workbench-case.ts b/tests/smoke-workbench-case.ts
new file mode 100644
index 0000000..2b2d6c2
--- /dev/null
+++ b/tests/smoke-workbench-case.ts
@@ -0,0 +1,462 @@
+import assert from 'node:assert/strict';
+import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join, resolve } from 'node:path';
+import { test } from 'node:test';
+
+import { loadWorkbenchCase } from '../src/workbench/case-loader.js';
+import type { WorkbenchCaseConfig } from '../src/workbench/types.js';
+
+function makeTempCaseDir(prefix: string): string {
+  return mkdtempSync(join(tmpdir(), prefix));
+}
+
+function writeCaseFile(root: string, filename: string, body: string): string {
+  const casePath = join(root, filename);
+  writeFileSync(casePath, body, 'utf-8');
+  return casePath;
+}
+
+test('type supports minimal fields', () => {
+  const minimal: WorkbenchCaseConfig = {
+    name: 'merge-pdfs',
+    references: './references',
+    task: 'Merge files',
+    graders: [
+      { name: 'merged-output', command: 'node $CASE/check.js' },
+    ],
+  };
+
+  assert.equal(minimal.name, 'merge-pdfs');
+});
+
+test('YAML case loads and resolves relative references', () => {
+  const root = makeTempCaseDir('skill-workbench-yaml-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yaml', [
+      'name: merge-pdfs',
+      'references: ./references',
+      'task: Merge the PDFs in inputs/ into outputs/book.pdf.',
+      'graders:',
+      '  - name: merged-output',
+      '    command: node $CASE/checks/merge-pdfs.js',
+    ].join('\n'));
+
+    const loaded = loadWorkbenchCase(casePath);
+    assert.equal(loaded.name, 'merge-pdfs');
+    assert.equal(loaded.referencesDir, resolve(root, 'references'));
+    assert.equal(loaded.configPath, resolve(casePath));
+    assert.deepEqual(loaded.graders, [
+      { name: 'merged-output', command: 'node $CASE/checks/merge-pdfs.js' },
+    ]);
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('JSON case loads', () => {
+  const root = makeTempCaseDir('skill-workbench-json-');
+  try {
+    mkdirSync(join(root, 'refs'));
+    const casePath = writeCaseFile(root, 'case.json', JSON.stringify({
+      name: 'merge-pdfs-json',
+      references: './refs',
+      task: 'Merge the PDFs.',
+      graders: [
+        { name: 'merged-output', command: 'node $CASE/checks/merge-pdfs.js' },
+      ],
+      env: ['OPENROUTER_API_KEY'],
+    }, null, 2));
+
+    const loaded = loadWorkbenchCase(casePath);
+    assert.equal(loaded.name, 'merge-pdfs-json');
+    assert.deepEqual(loaded.env, ['OPENROUTER_API_KEY']);
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('YAML case loads MCP server definitions', () => {
+  const root = makeTempCaseDir('skill-workbench-mcp-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: mcp-docs',
+      'references: ./references',
+      'task: Use the configured MCP docs server.',
+      'graders:',
+      '  - name: output',
+      '    command: test -f answer.json',
+      'env:',
+      '  - CONTEXT7_API_KEY',
+      'mcpServers:',
+      '  context7:',
+      '    baseUrl: https://mcp.context7.com/mcp',
+      '    headers:',
+      '      Authorization: "Bearer ${CONTEXT7_API_KEY}"',
+      '  local-tools:',
+      '    command: node',
+      '    args:',
+      '      - mcp/local-server.mjs',
+      '    env:',
+      '      FIXTURE_TOKEN: "${FIXTURE_TOKEN}"',
+    ].join('\n'));
+
+    const loaded = loadWorkbenchCase(casePath);
+
+    assert.deepEqual(loaded.mcpServers, {
+      context7: {
+        baseUrl: 'https://mcp.context7.com/mcp',
+        headers: {
+          Authorization: 'Bearer ${CONTEXT7_API_KEY}',
+        },
+      },
+      'local-tools': {
+        command: 'node',
+        args: ['mcp/local-server.mjs'],
+        env: {
+          FIXTURE_TOKEN: '${FIXTURE_TOKEN}',
+        },
+      },
+    });
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('invalid MCP server without transport throws', () => {
+  const root = makeTempCaseDir('skill-workbench-invalid-mcp-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: mcp-docs',
+      'references: ./references',
+      'task: Use MCP.',
+      'graders:',
+      '  - name: output',
+      '    command: test -f answer.json',
+      'mcpServers:',
+      '  missing-transport:',
+      '    description: Missing URL or command',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "mcpServers" server "missing-transport" must define a non-empty url, baseUrl, serverUrl, or command/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('MCP service without matching MCP server throws', () => {
+  const root = makeTempCaseDir('skill-workbench-invalid-mcp-service-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: mcp-docs',
+      'references: ./references',
+      'task: Use MCP.',
+      'graders:',
+      '  - name: output',
+      '    command: test -f answer.json',
+      'mcpServices:',
+      '  calculator:',
+      '    command: node',
+      '    args:',
+      '      - calculator-server.mjs',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "mcpServices" service "calculator" requires a matching "mcpServers" entry/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('invalid MCP service command reports mcpServices field', () => {
+  const root = makeTempCaseDir('skill-workbench-invalid-mcp-service-command-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: mcp-docs',
+      'references: ./references',
+      'task: Use MCP.',
+      'graders:',
+      '  - name: output',
+      '    command: test -f answer.json',
+      'mcpServers:',
+      '  calculator:',
+      '    baseUrl: http://calculator:3000/mcp',
+      'mcpServices:',
+      '  calculator:',
+      '    command: ""',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "mcpServices" service "calculator" command must be a non-empty string/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('MCP service port is rejected because mcpServers URL owns the port', () => {
+  const root = makeTempCaseDir('skill-workbench-invalid-mcp-service-port-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: mcp-docs',
+      'references: ./references',
+      'task: Use MCP.',
+      'graders:',
+      '  - name: output',
+      '    command: test -f answer.json',
+      'mcpServers:',
+      '  calculator:',
+      '    baseUrl: http://calculator:3000/mcp',
+      'mcpServices:',
+      '  calculator:',
+      '    command: node',
+      '    args:',
+      '      - calculator-server.mjs',
+      '    port: 3000',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "mcpServices" service "calculator" port is not supported; set the port in the matching mcpServers URL/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('defaults are applied', () => {
+  const root = makeTempCaseDir('skill-workbench-defaults-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: merge-pdfs',
+      'references: ./references',
+      'task: Merge files',
+      'graders:',
+      '  - name: merged-output',
+      '    command: node $CASE/checks/merge-pdfs.js',
+    ].join('\n'));
+
+    const loaded = loadWorkbenchCase(casePath);
+    assert.equal(loaded.model, 'openrouter/google/gemini-2.5-flash');
+    assert.equal(loaded.timeoutSeconds, 600);
+    assert.deepEqual(loaded.env, []);
+    assert.deepEqual(loaded.setup, []);
+    assert.deepEqual(loaded.cleanup, []);
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('invalid case model ref is rejected while loading', () => {
+  const root = makeTempCaseDir('skill-workbench-invalid-model-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: merge-pdfs',
+      'references: ./references',
+      'task: Merge files',
+      'model: anthropic/claude-3-5-haiku-latest',
+      'graders:',
+      '  - name: merged-output',
+      '    command: node $CASE/checks/merge-pdfs.js',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /Workbench only supports OpenRouter model refs, got: anthropic\/claude-3-5-haiku-latest/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('invalid missing references throws', () => {
+  const root = makeTempCaseDir('skill-workbench-missing-refs-');
+  try {
+    const casePath = writeCaseFile(root, 'case.yaml', [
+      'name: merge-pdfs',
+      'task: Merge files',
+      'graders:',
+      '  - name: merged-output',
+      '    command: node $CASE/checks/merge-pdfs.js',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "references" must be a non-empty string/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('invalid non-array env throws', () => {
+  const root = makeTempCaseDir('skill-workbench-invalid-env-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.json', JSON.stringify({
+      name: 'merge-pdfs',
+      references: './references',
+      task: 'Merge files',
+      graders: [
+        { name: 'merged-output', command: 'node $CASE/checks/merge-pdfs.js' },
+      ],
+      env: 'OPENROUTER_API_KEY',
+    }, null, 2));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "env" must be an array of non-empty strings/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('invalid env variable names are rejected', () => { // every malformed name below must make the loader throw
+  const root = makeTempCaseDir('skill-workbench-invalid-env-name-');
+  try {
+    mkdirSync(join(root, 'references'));
+
+    for (const [index, envName] of ['OPENROUTER_API_KEY;touch /tmp/pwned', 'BAD-NAME', '1BAD'].entries()) {
+      const casePath = writeCaseFile(root, `case-${index}.json`, JSON.stringify({ // index-based name: unique even if two env names share a length
+        name: 'merge-pdfs',
+        references: './references',
+        task: 'Merge files',
+        graders: [
+          { name: 'merged-output', command: 'node $CASE/checks/merge-pdfs.js' },
+        ],
+        env: [envName],
+      }, null, 2));
+
+      assert.throws(
+        () => loadWorkbenchCase(casePath),
+        /field "env" item at index 0 must match \^\[A-Za-z_\]\[A-Za-z0-9_\]\*\$/,
+      );
+    }
+  } finally {
+    rmSync(root, { recursive: true, force: true }); // always remove the temp case directory
+  }
+});
+
+test('valid env variable names still load', () => {
+  const root = makeTempCaseDir('skill-workbench-valid-env-name-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.json', JSON.stringify({
+      name: 'merge-pdfs',
+      references: './references',
+      task: 'Merge files',
+      graders: [
+        { name: 'merged-output', command: 'node $CASE/checks/merge-pdfs.js' },
+      ],
+      env: ['OPENROUTER_API_KEY', '_TOKEN1'],
+    }, null, 2));
+
+    const loaded = loadWorkbenchCase(casePath);
+    assert.deepEqual(loaded.env, ['OPENROUTER_API_KEY', '_TOKEN1']);
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('invalid missing graders throws', () => {
+  const root = makeTempCaseDir('skill-workbench-missing-graders-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: merge-pdfs',
+      'references: ./references',
+      'task: Merge files',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "graders" must be a non-empty array/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('unsupported check field is rejected when graders are present', () => {
+  const root = makeTempCaseDir('skill-workbench-unsupported-check-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: merge-pdfs',
+      'references: ./references',
+      'task: Merge files',
+      'check: node $CASE/checks/old-check.js',
+      'graders:',
+      '  - name: merged-output',
+      '    command: node $CASE/checks/merge-pdfs.js',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "check" is invalid; define graders as a non-empty array of \{ name, command \} objects/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('unsupported artifacts field is rejected', () => {
+  const root = makeTempCaseDir('skill-workbench-unsupported-artifacts-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: merge-pdfs',
+      'references: ./references',
+      'task: Merge files',
+      'artifacts:',
+      '  - output.pdf',
+      'graders:',
+      '  - name: merged-output',
+      '    command: node $CASE/checks/merge-pdfs.js',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "artifacts" is invalid; inspect outputs in the workspace or use --keep-workspace/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('invalid grader command throws', () => {
+  const root = makeTempCaseDir('skill-workbench-invalid-grader-');
+  try {
+    mkdirSync(join(root, 'references'));
+    const casePath = writeCaseFile(root, 'case.yml', [
+      'name: merge-pdfs',
+      'references: ./references',
+      'task: Merge files',
+      'graders:',
+      '  - name: merged-output',
+      '    command: ""',
+    ].join('\n'));
+
+    assert.throws(
+      () => loadWorkbenchCase(casePath),
+      /field "graders" item at index 0 command must be a non-empty string/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
diff --git a/tests/smoke-workbench-checks.ts b/tests/smoke-workbench-checks.ts
new file mode 100644
index 0000000..357b149
--- /dev/null
+++ b/tests/smoke-workbench-checks.ts
@@ -0,0 +1,172 @@
+import { strict as assert } from 'node:assert';
+import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+import { normalizeCheckResult, runCheckCommand, runGraderCommands } from '../src/workbench/check-runner.js';
+import { runShellCommand } from '../src/workbench/process.js';
+
+let passed = 0;
+let failed = 0;
+
+async function test(name: string, fn: () => Promise<void> | void): Promise<void> { // runs one case; tallies the outcome instead of rethrowing
+  try {
+    await fn();
+    passed += 1;
+    console.log(`  ✓ ${name}`);
+  } catch (error: unknown) { // thrown values are not guaranteed to be Error instances
+    failed += 1;
+    console.log(`  ✗ ${name}`);
+    console.log(`    ${error instanceof Error ? error.message : String(error)}`);
+  }
+}
+
+console.log('\n=== Workbench Check Runner Smoke Tests ===\n');
+
+await test('normalizes valid JSON pass result', () => {
+  const grade = normalizeCheckResult({
+    exitCode: 0,
+    stdout: '{"pass":true,"score":1,"evidence":["ok"]}',
+    stderr: '',
+  });
+
+  assert.equal(grade.pass, true);
+  assert.equal(grade.score, 1);
+  assert.deepEqual(grade.evidence, ['ok']);
+});
+
+await test('normalizes valid JSON fail result', () => {
+  const grade = normalizeCheckResult({
+    exitCode: 0,
+    stdout: '{"pass":false,"score":0.2,"evidence":["missing output"]}',
+    stderr: '',
+  });
+
+  assert.equal(grade.pass, false);
+  assert.equal(grade.score, 0.2);
+  assert.deepEqual(grade.evidence, ['missing output']);
+});
+
+await test('treats exit 0 with plain stdout as pass', () => {
+  const grade = normalizeCheckResult({
+    exitCode: 0,
+    stdout: 'all checks looked good\n',
+    stderr: '',
+  });
+
+  assert.equal(grade.pass, true);
+  assert.equal(grade.score, 1);
+  assert.deepEqual(grade.evidence, ['all checks looked good']);
+});
+
+await test('treats non-zero without JSON as fail with stdout/stderr evidence', async () => {
+  const dir = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-check-'));
+  try {
+    const scriptPath = join(dir, 'nonzero.js');
+    writeFileSync(
+      scriptPath,
+      [
+        "process.stdout.write('stdout evidence\\n');",
+        "process.stderr.write('stderr evidence\\n');",
+        'process.exit(2);',
+      ].join('\n'),
+      'utf-8',
+    );
+
+    const grade = await runCheckCommand(`node "${scriptPath}"`, { cwd: dir });
+    assert.equal(grade.pass, false);
+    assert.equal(grade.score, 0);
+    assert.ok(grade.evidence.some((line) => line.includes('stderr evidence')));
+    assert.ok(grade.evidence.some((line) => line.includes('stdout evidence')));
+  } finally {
+    rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+await test('parses JSON object embedded in surrounding logs', () => { // JSON verdict sits between two plain log lines
+  const grade = normalizeCheckResult({
+    exitCode: 0,
+    stdout: 'start log\n{"pass":true,"score":0.8,"evidence":["found marker"]}\nend log', // real newlines, not a literal backslash-n
+    stderr: '',
+  });
+
+  assert.equal(grade.pass, true);
+  assert.equal(grade.score, 0.8);
+  assert.deepEqual(grade.evidence, ['found marker']);
+});
+
+await test('runShellCommand marks timeout and non-zero semantics', async () => {
+  const dir = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-timeout-'));
+  try {
+    const scriptPath = join(dir, 'timeout.js');
+    writeFileSync(scriptPath, 'setTimeout(() => process.exit(0), 2000);', 'utf-8');
+
+    const result = await runShellCommand(`node "${scriptPath}"`, {
+      cwd: dir,
+      timeoutSeconds: 0.1,
+    });
+
+    assert.equal(result.timedOut, true);
+    assert.notEqual(result.exitCode, 0);
+  } finally {
+    rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+await test('timed-out JSON pass check is forced to fail', async () => {
+  const dir = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-json-timeout-'));
+  try {
+    const scriptPath = join(dir, 'json-timeout.js');
+    writeFileSync(scriptPath, [
+      'process.stdout.write(JSON.stringify({ pass: true, score: 1, evidence: ["premature pass"] }));',
+      'setTimeout(() => process.exit(0), 2000);',
+    ].join('\n'), 'utf-8');
+
+    const grade = await runCheckCommand(`node "${scriptPath}"`, {
+      cwd: dir,
+      timeoutSeconds: 0.1,
+    });
+
+    assert.equal(grade.pass, false);
+    assert.equal(grade.score, 0);
+    assert.ok(grade.evidence.some((line) => line.includes('check command timed out')));
+  } finally {
+    rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+await test('runGraderCommands requires every grader to pass and scores passed graders', async () => {
+  const dir = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-graders-'));
+  try {
+    const passScript = join(dir, 'pass.js');
+    const failScript = join(dir, 'fail.js');
+    writeFileSync(passScript, 'process.stdout.write(JSON.stringify({ pass: true, evidence: ["ok"] }));', 'utf-8');
+    writeFileSync(failScript, 'process.stdout.write(JSON.stringify({ pass: false, evidence: ["missing output"] }));', 'utf-8');
+
+    const grade = await runGraderCommands([
+      { name: 'uses-tool', command: `node "${passScript}"` },
+      { name: 'saves-output', command: `node "${failScript}"` },
+    ], { cwd: dir });
+
+    assert.equal(grade.pass, false);
+    assert.equal(grade.score, 0.5);
+    assert.deepEqual(grade.evidence, [
+      'uses-tool: ok',
+      'saves-output: missing output',
+    ]);
+    assert.equal(grade.graders?.length, 2);
+    assert.equal(grade.graders?.[0]?.name, 'uses-tool');
+    assert.equal(grade.graders?.[0]?.pass, true);
+    assert.equal(grade.graders?.[1]?.name, 'saves-output');
+    assert.equal(grade.graders?.[1]?.pass, false);
+  } finally {
+    rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+if (failed > 0) {
+  console.log(`\nFAIL: ${failed} test(s) failed, ${passed} passed`);
+  process.exit(1);
+}
+
+console.log(`\nALL PASS: smoke-workbench-checks (${passed} tests)`);
diff --git a/tests/smoke-workbench-container.ts b/tests/smoke-workbench-container.ts
new file mode 100644
index 0000000..b6f9362
--- /dev/null
+++ b/tests/smoke-workbench-container.ts
@@ -0,0 +1,241 @@
+import assert from 'node:assert/strict';
+import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { test } from 'node:test';
+
+import {
+  buildAgentSystemPrompt,
+  buildContainerWorkbenchEnv,
+  parseContainerRunnerArgs,
+  prepareWorkbenchDirectory,
+  runContainerWorkbenchCase,
+  runAgentPromptWithTimeout,
+  writeBestEffortTrace,
+} from '../src/workbench/container-runner.js';
+import { createTraceRecorder } from '../src/workbench/trace.js';
+
+test('buildAgentSystemPrompt describes operating constraints without eval/sandbox hints', () => {
+  const prompt = buildAgentSystemPrompt();
+
+  assert.match(prompt, /Current working directory is \/work/);
+  assert.match(prompt, /Do not use global pip installs/);
+  assert.match(prompt, /python -m venv \/work\/\.venv/);
+  assert.match(prompt, /Write all outputs under \/work/);
+  assert.doesNotMatch(prompt, /sandbox/i);
+  assert.doesNotMatch(prompt, /skill\/reference/i);
+  assert.doesNotMatch(prompt, /grader/i);
+  assert.doesNotMatch(prompt, /expected answer/i);
+  assert.doesNotMatch(prompt, /suite metadata/i);
+  assert.doesNotMatch(prompt, /\/case/);
+  assert.doesNotMatch(prompt, /Task:/);
+});
+
+test('buildContainerWorkbenchEnv exposes CASE as the mounted case directory', () => {
+  const env = buildContainerWorkbenchEnv({
+    casePath: '/case/case.yml',
+    workDir: '/work',
+    resultsDir: '/results',
+    baseEnv: {},
+  });
+
+  assert.equal(env.CASE, '/case');
+  assert.equal(env.WORK, '/work');
+  assert.equal(env.RESULTS, '/results');
+});
+
+test('buildContainerWorkbenchEnv prepends work and case bin to PATH when present', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-env-'));
+  try {
+    const caseDir = join(root, 'case');
+    const workDir = join(root, 'work');
+    mkdirSync(join(caseDir, 'bin'), { recursive: true });
+    mkdirSync(workDir, { recursive: true });
+
+    const env = buildContainerWorkbenchEnv({
+      casePath: join(caseDir, 'case.yml'),
+      workDir,
+      resultsDir: join(root, 'results'),
+      baseEnv: { PATH: '/usr/bin' },
+    });
+
+    assert.equal(env.PATH, `${join(workDir, 'bin')}:${join(caseDir, 'bin')}:/usr/bin`);
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('prepareWorkbenchDirectory copies references then optional workspace seed', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-prepare-'));
+  try {
+    const referencesDir = join(root, 'references');
+    const workspaceDir = join(root, 'workspace');
+    const workDir = join(root, 'work');
+    mkdirSync(referencesDir, { recursive: true });
+    mkdirSync(workspaceDir, { recursive: true });
+    mkdirSync(workDir, { recursive: true });
+    writeFileSync(join(workDir, 'stale.txt'), 'stale\n', 'utf-8');
+    writeFileSync(join(referencesDir, 'SKILL.md'), '# Skill\n', 'utf-8');
+    writeFileSync(join(workspaceDir, 'seed.txt'), 'seed\n', 'utf-8');
+
+    prepareWorkbenchDirectory({ referencesDir, workspaceDir, workDir });
+
+    assert.equal(existsSync(join(workDir, 'stale.txt')), false);
+    assert.equal(readFileSync(join(workDir, 'SKILL.md'), 'utf-8'), '# Skill\n');
+    assert.equal(readFileSync(join(workDir, 'seed.txt'), 'utf-8'), 'seed\n');
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('parseContainerRunnerArgs reads optional MCP config path for agent mode', () => {
+  const parsed = parseContainerRunnerArgs([
+    '--agent',
+    '--case-name', 'mcp-case',
+    '--model', 'openrouter/google/gemini-2.5-flash',
+    '--task-base64', Buffer.from('Use MCP.', 'utf-8').toString('base64'),
+    '--timeout-seconds', '600',
+    '--work', '/work',
+    '--results', '/tmp/workbench-results',
+    '--mcp-config', '/work/mcporter.json',
+  ]);
+
+  assert.equal(parsed.mode, 'agent');
+  assert.equal(parsed.mcpConfigPath, '/work/mcporter.json');
+});
+
+test('runContainerWorkbenchCase restores global WORK/RESULTS/MCPORTER_CONFIG after agent mode failure', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-env-restore-'));
+  const workDir = join(root, 'work');
+  const resultsDir = join(root, 'results');
+  mkdirSync(workDir, { recursive: true });
+  mkdirSync(resultsDir, { recursive: true });
+
+  const previousWork = process.env.WORK;
+  const previousResults = process.env.RESULTS;
+  const previousMcporterConfig = process.env.MCPORTER_CONFIG;
+
+  process.env.WORK = 'existing-work';
+  process.env.RESULTS = 'existing-results';
+  delete process.env.MCPORTER_CONFIG;
+
+  try {
+    const exitCode = await runContainerWorkbenchCase([
+      '--agent',
+      '--case-name', 'env-restore',
+      '--model', 'openrouter/google/gemini-2.5-flash',
+      '--task-base64', Buffer.from('test', 'utf-8').toString('base64'),
+      '--timeout-seconds', '1',
+      '--work', workDir,
+      '--results', resultsDir,
+      '--mcp-config', join(root, 'missing-mcporter-config.json'),
+    ]);
+
+    assert.equal(exitCode, 1);
+    assert.equal(process.env.WORK, 'existing-work');
+    assert.equal(process.env.RESULTS, 'existing-results');
+    assert.equal(process.env.MCPORTER_CONFIG, undefined);
+  } finally {
+    if (previousWork === undefined) {
+      delete process.env.WORK;
+    } else {
+      process.env.WORK = previousWork;
+    }
+
+    if (previousResults === undefined) {
+      delete process.env.RESULTS;
+    } else {
+      process.env.RESULTS = previousResults;
+    }
+
+    if (previousMcporterConfig === undefined) {
+      delete process.env.MCPORTER_CONFIG;
+    } else {
+      process.env.MCPORTER_CONFIG = previousMcporterConfig;
+    }
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('runAgentPromptWithTimeout rejects when agent exceeds timeout', async () => { // the stub prompt never settles
+  await assert.rejects(
+    runAgentPromptWithTimeout({ prompt: () => new Promise(() => {}) }, 'task', 0.001),
+    /Agent timed out after 0\.001 seconds/, // dot escaped so the duration is matched literally
+  );
+});
+
+test('runAgentPromptWithTimeout rejects when agent ends with provider error', async () => {
+  await assert.rejects(
+    runAgentPromptWithTimeout({
+      prompt: async () => undefined,
+      state: {
+        messages: [
+          { role: 'assistant', content: [], stopReason: 'error', errorMessage: 'Upstream request failed' },
+        ],
+      },
+    }, 'task', 1),
+    /Upstream request failed/,
+  );
+});
+
+test('writeBestEffortTrace writes trace from available session messages', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-trace-'));
+  try {
+    const tracePath = join(root, 'trace.jsonl');
+    const wrote = writeBestEffortTrace({
+      tracePath,
+      caseName: 'partial-trace',
+      model: 'openrouter/test/model',
+      startedAt: '2026-04-27T10:11:12.000Z',
+      endedAt: '2026-04-27T10:11:13.000Z',
+      session: {
+        state: {
+          messages: [
+            { role: 'user', content: [{ type: 'text', text: 'hello' }] },
+          ],
+        },
+      },
+    });
+
+    assert.equal(wrote, true);
+    const lines = readFileSync(tracePath, 'utf-8').trim().split('\n').map((line) => JSON.parse(line) as { type: string; caseName?: string });
+    assert.equal(lines[0]?.caseName, 'partial-trace');
+    assert.equal(lines.filter((line) => line.type === 'message').length, 1);
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('writeBestEffortTrace prefers recorded Pi events when available', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-event-trace-'));
+  try {
+    const tracePath = join(root, 'trace.jsonl');
+    const recorder = createTraceRecorder({ now: () => '2026-04-27T10:11:12.500Z' });
+    recorder.record({
+      type: 'tool_execution_start',
+      toolCallId: 'call-1',
+      toolName: 'bash',
+      args: { command: 'npm test' },
+    });
+
+    const wrote = writeBestEffortTrace({
+      tracePath,
+      caseName: 'event-trace',
+      model: 'openrouter/test/model',
+      startedAt: '2026-04-27T10:11:12.000Z',
+      endedAt: '2026-04-27T10:11:13.000Z',
+      recorder,
+      session: { state: { messages: [] } },
+    });
+
+    assert.equal(wrote, true);
+    const lines = readFileSync(tracePath, 'utf-8').trim().split('\n').map((line) => JSON.parse(line) as {
+      type: string;
+      arguments?: { command?: string };
+    });
+    assert.equal(lines[1]?.type, 'tool_call');
+    assert.equal(lines[1]?.arguments?.command, 'npm test');
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
diff --git a/tests/smoke-workbench-docker-runner.ts b/tests/smoke-workbench-docker-runner.ts
new file mode 100644
index 0000000..1285595
--- /dev/null
+++ b/tests/smoke-workbench-docker-runner.ts
@@ -0,0 +1,433 @@
+import assert from 'node:assert/strict';
+import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { test } from 'node:test';
+
+import {
+  buildDockerAgentCommand,
+  buildDockerGradeCommand,
+  buildDockerMcpServiceCommand,
+  buildDockerMcpServiceProbeCommand,
+  buildDockerSetupCommand,
+  packageRootFromModuleUrl,
+  prepareDockerWorkbenchRun,
+  startMcpServices,
+} from '../src/workbench/docker-runner.js';
+
+test('packageRootFromModuleUrl resolves repo root independently of cwd', () => { // dist/ and src/ module URLs both resolve two levels above workbench/
+  const fromDist = packageRootFromModuleUrl(
+    'file:///tmp/installed-package/dist/workbench/docker-runner.js',
+  );
+  assert.equal(fromDist, '/tmp/installed-package');
+  const fromSrc = packageRootFromModuleUrl(
+    'file:///tmp/source-package/src/workbench/docker-runner.ts',
+  );
+  assert.equal(fromSrc, '/tmp/source-package');
+});
+
+test('workbench image runs agents as non-root with venv-only pip installs', () => { // static checks on the Dockerfile text, located relative to process.cwd()
+  const dockerfileText = readFileSync(join(process.cwd(), 'docker', 'workbench-runner.Dockerfile'), 'utf-8');
+
+  assert.match(dockerfileText, /useradd .* agent/);
+  assert.match(dockerfileText, /USER agent/);
+  assert.match(dockerfileText, /ENTRYPOINT \["node", "\/app\/dist\/workbench\/container-runner\.js"\]/);
+  assert.match(dockerfileText, /PIP_REQUIRE_VIRTUALENV=1/);
+  assert.match(dockerfileText, /PATH="\/app\/node_modules\/\.bin:\/work\/\.venv\/bin:/);
+  assert.doesNotMatch(dockerfileText, /PIP_BREAK_SYSTEM_PACKAGES/);
+});
+
+test('prepareDockerWorkbenchRun writes results under case .results and keeps bundle temp-only', () => { // results go beside the source case; bundle and work dirs live under tempRoot
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-bundle-'));
+  try {
+    const caseSrc = join(sandbox, 'source-case');
+    mkdirSync(join(caseSrc, 'checks'), { recursive: true });
+    mkdirSync(join(caseSrc, 'references'), { recursive: true });
+    writeFileSync(join(caseSrc, 'checks', 'merge-pdfs.mjs'), 'process.exit(0);\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'references', 'SKILL.md'), '# Test Skill\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'case.yml'), [
+      'name: pdf-merge',
+      'references: ./references',
+      'task: Merge PDFs.',
+      'graders:',
+      '  - name: merged-output',
+      '    command: node $CASE/checks/merge-pdfs.mjs',
+      'env:',
+      '  - OPENROUTER_API_KEY',
+    ].join('\n'));
+
+    const run = prepareDockerWorkbenchRun({
+      casePath: join(caseSrc, 'case.yml'),
+      tempRoot: join(sandbox, 'temp'),
+      now: new Date('2026-04-27T10:11:12.000Z'), // fixed clock makes the timestamped directory name predictable
+    });
+
+    assert.equal(run.resultsDir, join(caseSrc, '.results', '20260427-101112'));
+    assert.ok(run.caseDir.startsWith(join(sandbox, 'temp')));
+    assert.ok(run.workDir.startsWith(join(sandbox, 'temp')));
+    assert.ok(existsSync(join(run.caseDir, 'checks', 'merge-pdfs.mjs')));
+    assert.ok(existsSync(join(run.caseDir, 'references', 'SKILL.md')));
+    assert.ok(existsSync(run.bundledCasePath));
+
+    const bundledYaml = readFileSync(run.bundledCasePath, 'utf-8');
+    assert.match(bundledYaml, /references: \.\/references/);
+    assert.match(bundledYaml, /graders:/);
+    assert.match(bundledYaml, /name: merged-output/);
+    assert.match(bundledYaml, /command: node \$CASE\/checks\/merge-pdfs\.mjs/);
+    run.cleanup();
+    assert.equal(existsSync(run.tempDir), false); // cleanup must remove the temp bundle
+  } finally {
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});
+
+test('prepareDockerWorkbenchRun bundles case support directories', () => { // references, workspace and bin reach work; checks, fixtures and case.yml do not
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-support-'));
+  try {
+    const caseSrc = join(sandbox, 'source-case');
+    mkdirSync(join(caseSrc, 'references'), { recursive: true });
+    mkdirSync(join(caseSrc, 'checks'), { recursive: true });
+    mkdirSync(join(caseSrc, 'fixtures'), { recursive: true });
+    mkdirSync(join(caseSrc, 'bin'), { recursive: true });
+    mkdirSync(join(caseSrc, 'workspace'), { recursive: true });
+    writeFileSync(join(caseSrc, 'references', 'SKILL.md'), '# Test Skill\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'checks', 'check.mjs'), 'process.exit(0);\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'fixtures', 'input.json'), '{}\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'bin', 'fixture-tool'), '#!/bin/sh\nexit 0\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'workspace', 'seed.txt'), 'seed\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'case.yml'), [
+      'name: support-case',
+      'references: ./references',
+      'task: Test support dirs.',
+      'graders:',
+      '  - name: passes',
+      '    command: node $CASE/checks/check.mjs',
+    ].join('\n'));
+
+    const run = prepareDockerWorkbenchRun({
+      casePath: join(caseSrc, 'case.yml'),
+      tempRoot: join(sandbox, 'temp'),
+      now: new Date('2026-04-27T10:11:12.000Z'),
+    });
+
+    assert.ok(existsSync(join(run.caseDir, 'checks', 'check.mjs')));
+    assert.ok(existsSync(join(run.caseDir, 'fixtures', 'input.json')));
+    assert.ok(existsSync(join(run.caseDir, 'bin', 'fixture-tool')));
+    assert.equal(existsSync(join(run.caseDir, 'mcp')), false); // the source case has no mcp/ directory
+    assert.ok(existsSync(join(run.caseDir, 'workspace', 'seed.txt')));
+    assert.ok(existsSync(join(run.workDir, 'SKILL.md')));
+    assert.ok(existsSync(join(run.workDir, 'seed.txt')));
+    assert.ok(existsSync(join(run.workDir, 'bin', 'fixture-tool')));
+    assert.equal(existsSync(join(run.workDir, 'case.yml')), false);
+    assert.equal(existsSync(join(run.workDir, 'checks', 'check.mjs')), false);
+    assert.equal(existsSync(join(run.workDir, 'fixtures', 'input.json')), false);
+    run.cleanup();
+  } finally {
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});
+
+test('prepareDockerWorkbenchRun writes isolated mcporter config for MCP servers', () => { // expects mcporter.json plus a bin/mcp wrapper inside the work dir
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-mcp-bundle-'));
+  try {
+    const caseSrc = join(sandbox, 'source-case');
+    mkdirSync(join(caseSrc, 'references'), { recursive: true });
+    writeFileSync(join(caseSrc, 'references', 'SKILL.md'), '# Test Skill\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'case.yml'), [
+      'name: mcp-case',
+      'references: ./references',
+      'task: Use MCP.',
+      'graders:',
+      '  - name: output',
+      '    command: test -f answer.json',
+      'mcpServers:',
+      '  local-tools:',
+      '    command: node',
+      '    args:',
+      '      - mcp/server.mjs',
+      '  context7:',
+      '    baseUrl: https://mcp.context7.com/mcp',
+    ].join('\n'));
+
+    const run = prepareDockerWorkbenchRun({
+      casePath: join(caseSrc, 'case.yml'),
+      tempRoot: join(sandbox, 'temp'),
+      now: new Date('2026-04-27T10:11:12.000Z'),
+    });
+
+    assert.equal(run.mcpConfigPath, join(run.workDir, 'mcporter.json'));
+    assert.ok(existsSync(run.mcpConfigPath));
+    assert.ok(existsSync(join(run.workDir, 'bin', 'mcp')));
+    const wrapperScript = readFileSync(join(run.workDir, 'bin', 'mcp'), 'utf-8');
+    assert.match(wrapperScript, /mcporter --config "\$MCPORTER_CONFIG" --root \/work "\$@"/);
+    const writtenConfig = JSON.parse(readFileSync(run.mcpConfigPath, 'utf-8')) as unknown;
+    assert.deepEqual(writtenConfig, { // both the stdio server and the HTTP server are carried over unchanged
+      imports: [],
+      mcpServers: {
+        'local-tools': {
+          command: 'node',
+          args: ['mcp/server.mjs'],
+        },
+        context7: {
+          baseUrl: 'https://mcp.context7.com/mcp',
+        },
+      },
+    });
+
+    const bundledYaml = readFileSync(run.bundledCasePath, 'utf-8');
+    assert.match(bundledYaml, /mcpServers:/);
+    assert.match(bundledYaml, /local-tools:/);
+    run.cleanup();
+  } finally {
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});
+
+test('prepareDockerWorkbenchRun bundles hidden MCP service support outside work', () => { // mcp/ files stay in the case bundle and never appear in the work dir
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-mcp-service-'));
+  try {
+    const caseSrc = join(sandbox, 'source-case');
+    mkdirSync(join(caseSrc, 'references'), { recursive: true });
+    mkdirSync(join(caseSrc, 'mcp'), { recursive: true });
+    writeFileSync(join(caseSrc, 'references', 'SKILL.md'), '# Test Skill\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'mcp', 'server.mjs'), 'console.log("mcp");\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'case.yml'), [
+      'name: mcp-service-case',
+      'references: ./references',
+      'task: Use MCP.',
+      'graders:',
+      '  - name: output',
+      '    command: test -f answer.json',
+      'mcpServices:',
+      '  calculator:',
+      '    command: node',
+      '    args:',
+      '      - server.mjs',
+      'mcpServers:',
+      '  calculator:',
+      '    baseUrl: http://calculator:3000/mcp',
+    ].join('\n'));
+
+    const run = prepareDockerWorkbenchRun({
+      casePath: join(caseSrc, 'case.yml'),
+      tempRoot: join(sandbox, 'temp'),
+      now: new Date('2026-04-27T10:11:12.000Z'),
+    });
+
+    assert.ok(existsSync(join(run.caseDir, 'mcp', 'server.mjs')));
+    assert.equal(existsSync(join(run.workDir, 'server.mjs')), false);
+    assert.equal(existsSync(join(run.workDir, 'mcp', 'server.mjs')), false);
+    run.cleanup();
+  } finally {
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});
+
+test('startMcpServices records services started before a later service fails', async () => { // the 'ok' container must be recorded even though 'fail' rejects afterwards
+  const launched: string[] = [];
+
+  await assert.rejects(
+    startMcpServices({
+      image: 'skill-optimizer-workbench:local',
+      networkName: 'skill-optimizer-mcp-test',
+      caseDir: '/tmp/case',
+      tempDir: '/tmp/skill-optimizer-workbench-test',
+      services: {
+        ok: { command: 'node', args: ['server.mjs'] },
+        fail: { command: 'node', args: ['server.mjs'] },
+      },
+      repoRoot: '/tmp/repo',
+      startedContainers: launched,
+      runCommand: async (cmd) => ({ // stub runner: only commands mentioning '-fail' exit non-zero
+        exitCode: cmd.includes('-fail') ? 7 : 0,
+        stdout: '',
+        stderr: cmd.includes('-fail') ? 'boom' : '',
+      }),
+    }),
+    /Failed to start MCP service fail/,
+  );
+
+  assert.deepEqual(launched, ['skill-optimizer-mcp-skill-optimizer-workbench-test-ok']);
+});
+
+test('setup docker command mounts case and work before agent phase', () => { // /case read-only, /work read-write, no /results mount and no Docker socket
+  const setupCmd = buildDockerSetupCommand({
+    image: 'skill-optimizer-workbench:local',
+    caseDir: '/tmp/case',
+    workDir: '/tmp/work',
+    envNames: [],
+  });
+
+  assert.match(setupCmd, /--setup/);
+  assert.match(setupCmd, /-v '\/tmp\/case:\/case:ro'/);
+  assert.match(setupCmd, /-v '\/tmp\/work:\/work:rw'/);
+  assert.doesNotMatch(setupCmd, /\/results/);
+  assert.doesNotMatch(setupCmd, /docker\.sock/);
+});
+
+test('agent docker command mounts only work and uses sandbox hardening flags', () => { // the agent must not see /case, /results or the Docker socket
+  const agentCmd = buildDockerAgentCommand({
+    image: 'skill-optimizer-workbench:local',
+    containerName: 'skill-optimizer-agent-test',
+    workDir: '/tmp/work',
+    caseName: 'extract-pdf-facts',
+    model: 'openrouter/google/gemini-2.5-flash',
+    task: 'Read the PDF and write answer.json.',
+    timeoutSeconds: 600,
+    envNames: ['OPENROUTER_API_KEY'],
+  });
+
+  assert.match(agentCmd, /--agent/);
+  assert.match(agentCmd, /--name 'skill-optimizer-agent-test'/);
+  assert.match(agentCmd, /-v '\/tmp\/work:\/work:rw'/);
+  assert.match(agentCmd, /--workdir \/work/);
+  assert.match(agentCmd, /-e PATH=\/work\/bin:\/app\/node_modules\/\.bin:\/work\/\.venv\/bin:\/usr\/local\/sbin:\/usr\/local\/bin:\/usr\/sbin:\/usr\/bin:\/sbin:\/bin/);
+  assert.match(agentCmd, /--cap-drop=ALL/);
+  assert.match(agentCmd, /--security-opt no-new-privileges/);
+  assert.match(agentCmd, /-e OPENROUTER_API_KEY/); // env var is forwarded by name
+  assert.doesNotMatch(agentCmd, /\/case/);
+  assert.doesNotMatch(agentCmd, /\/results/);
+  assert.doesNotMatch(agentCmd, /docker\.sock/);
+});
+
+test('agent docker command passes optional appended system prompt', () => { // the prompt must travel as base64 behind a dedicated flag
+  const command = buildDockerAgentCommand({
+    image: 'skill-optimizer-workbench:local',
+    containerName: 'skill-optimizer-agent-test',
+    workDir: '/tmp/work',
+    caseName: 'prompted-case',
+    model: 'openrouter/google/gemini-2.5-flash',
+    task: 'Write output.txt.',
+    timeoutSeconds: 600,
+    envNames: [],
+    appendSystemPrompt: 'Prefer simple shell commands when possible.',
+  });
+
+  assert.match(command, /--append-system-prompt-base64/);
+  assert.ok(command.includes(Buffer.from('Prefer simple shell commands when possible.', 'utf-8').toString('base64'))); // literal substring check: base64 output can contain '+' and '/', which a RegExp would not treat literally
+});
+
+test('agent docker command passes optional MCP config path', () => { // the path is exposed both as an env var and as a CLI flag
+  const agentCmd = buildDockerAgentCommand({
+    image: 'skill-optimizer-workbench:local',
+    containerName: 'skill-optimizer-agent-test',
+    workDir: '/tmp/work',
+    caseName: 'mcp-case',
+    model: 'openrouter/google/gemini-2.5-flash',
+    task: 'Use MCP.',
+    timeoutSeconds: 600,
+    envNames: [],
+    mcpConfigPath: '/work/mcporter.json',
+  });
+
+  assert.match(agentCmd, /-e MCPORTER_CONFIG=\/work\/mcporter\.json/);
+  assert.match(agentCmd, /--mcp-config '\/work\/mcporter\.json'/);
+});
+
+test('agent docker command joins optional MCP network', () => { // networkName should become a quoted --network argument
+  const agentCmd = buildDockerAgentCommand({
+    image: 'skill-optimizer-workbench:local',
+    containerName: 'skill-optimizer-agent-test',
+    workDir: '/tmp/work',
+    caseName: 'mcp-case',
+    model: 'openrouter/google/gemini-2.5-flash',
+    task: 'Use MCP.',
+    timeoutSeconds: 600,
+    envNames: [],
+    networkName: 'skill-optimizer-mcp-test',
+  });
+
+  assert.match(agentCmd, /--network 'skill-optimizer-mcp-test'/);
+});
+
+test('MCP service docker command mounts hidden service files outside agent work', () => { // the service sees /mcp read-only and nothing under /work
+  const serviceCmd = buildDockerMcpServiceCommand({
+    image: 'skill-optimizer-workbench:local',
+    containerName: 'skill-optimizer-mcp-test-calculator',
+    networkName: 'skill-optimizer-mcp-test',
+    alias: 'calculator',
+    mcpDir: '/tmp/case/mcp',
+    command: 'node',
+    args: ['server.mjs'],
+  });
+
+  assert.match(serviceCmd, /-v '\/tmp\/case\/mcp:\/mcp:ro'/);
+  assert.match(serviceCmd, /--workdir \/mcp/);
+  assert.match(serviceCmd, /--network-alias 'calculator'/);
+  assert.doesNotMatch(serviceCmd, /\/work/);
+});
+
+test('MCP service docker command does not forward case env vars', () => { // no env names are supplied, so no credential flag may appear
+  const serviceCmd = buildDockerMcpServiceCommand({
+    image: 'skill-optimizer-workbench:local',
+    containerName: 'skill-optimizer-mcp-test-calculator',
+    networkName: 'skill-optimizer-mcp-test',
+    alias: 'calculator',
+    mcpDir: '/tmp/case/mcp',
+    command: 'node',
+    args: ['server.mjs'],
+  });
+
+  assert.doesNotMatch(serviceCmd, /-e OPENROUTER_API_KEY/);
+});
+
+test('MCP service probe command verifies service through mcporter on private network', () => { // the probe runs `mcporter ... list` for the named server with --schema
+  const probeCmd = buildDockerMcpServiceProbeCommand({
+    image: 'skill-optimizer-workbench:local',
+    networkName: 'skill-optimizer-mcp-test',
+    workDir: '/tmp/work',
+    serverName: 'calculator',
+  });
+
+  assert.match(probeCmd, /--network 'skill-optimizer-mcp-test'/);
+  assert.match(probeCmd, /-v '\/tmp\/work:\/work:rw'/);
+  assert.match(probeCmd, /mcporter --config \/work\/mcporter\.json --root \/work list/);
+  assert.match(probeCmd, /calculator/);
+  assert.match(probeCmd, /--schema/);
+});
+
+test('grade docker command mounts case after agent phase', () => { // grading sees /case (ro), /work (rw) and /results (rw) with the hardening flags
+  const gradeCmd = buildDockerGradeCommand({
+    image: 'skill-optimizer-workbench:local',
+    caseDir: '/tmp/case',
+    workDir: '/tmp/work',
+    resultsDir: '/tmp/results',
+    envNames: [],
+  });
+
+  assert.match(gradeCmd, /--grade/);
+  assert.match(gradeCmd, /-v '\/tmp\/case:\/case:ro'/);
+  assert.match(gradeCmd, /-v '\/tmp\/work:\/work:rw'/);
+  assert.match(gradeCmd, /-v '\/tmp\/results:\/results:rw'/);
+  assert.match(gradeCmd, /--cap-drop=ALL/);
+  assert.match(gradeCmd, /--security-opt no-new-privileges/);
+});
+
+test('prepareDockerWorkbenchRun honors --out as the results root', () => { // results are expected at <outDir>/<timestamp>
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-out-'));
+  try {
+    const caseSrc = join(sandbox, 'source-case');
+    mkdirSync(join(caseSrc, 'references'), { recursive: true });
+    writeFileSync(join(caseSrc, 'references', 'SKILL.md'), '# Test Skill\n', 'utf-8');
+    writeFileSync(join(caseSrc, 'case.yml'), [
+      'name: pdf-merge',
+      'references: ./references',
+      'task: Merge PDFs.',
+      'graders:',
+      '  - name: merged-output',
+      '    command: node $CASE/checks/merge-pdfs.mjs',
+    ].join('\n'));
+
+    const run = prepareDockerWorkbenchRun({
+      casePath: join(caseSrc, 'case.yml'),
+      outDir: join(sandbox, 'custom-results'),
+      tempRoot: join(sandbox, 'temp'),
+      now: new Date('2026-04-27T10:11:12.000Z'),
+    });
+
+    assert.equal(run.resultsDir, join(sandbox, 'custom-results', '20260427-101112'));
+    run.cleanup();
+  } finally {
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});
diff --git a/tests/smoke-workbench-metrics.ts b/tests/smoke-workbench-metrics.ts
new file mode 100644
index 0000000..96eeae3
--- /dev/null
+++ b/tests/smoke-workbench-metrics.ts
@@ -0,0 +1,75 @@
+import assert from 'node:assert/strict';
+import { test } from 'node:test';
+
+import { buildTrialSummary, buildWorkbenchMetrics } from '../src/workbench/metrics.js';
+import type { WorkbenchResult, WorkbenchTrace } from '../src/workbench/types.js';
+
+test('buildWorkbenchMetrics counts tool calls and sums usage', () => { // fixture: 3 messages, 2 tool calls (bash + read), 1 tool result, 2.5 s apart
+  const traceFixture: WorkbenchTrace = {
+    caseName: 'metrics-case',
+    model: 'openrouter/test/model',
+    startedAt: '2026-04-27T10:00:00.000Z',
+    endedAt: '2026-04-27T10:00:02.500Z',
+    entries: [
+      { type: 'message', role: 'user', text: 'task' },
+      {
+        type: 'message',
+        role: 'assistant',
+        usage: { // the only entry carrying usage, so the expected totals equal these numbers
+          input: 10,
+          output: 5,
+          cacheRead: 2,
+          cacheWrite: 1,
+          totalTokens: 18,
+          cost: { input: 0.1, output: 0.2, cacheRead: 0.03, cacheWrite: 0.04, total: 0.37 },
+        },
+        stopReason: 'toolUse',
+      },
+      { type: 'tool_call', name: 'bash', arguments: { command: 'npm test' } },
+      { type: 'tool_call', name: 'read', arguments: { path: 'file.ts' } },
+      { type: 'tool_result', name: 'bash', text: 'ok' },
+      { type: 'message', role: 'assistant', text: 'done', stopReason: 'stop' },
+    ],
+  };
+
+  const computed = buildWorkbenchMetrics(traceFixture);
+  assert.equal(computed.durationMs, 2500);
+  assert.equal(computed.turns, 3);
+  assert.equal(computed.toolCalls, 2);
+  assert.equal(computed.toolResults, 1);
+  assert.equal(computed.bashCalls, 1);
+  assert.equal(computed.readCalls, 1);
+  assert.equal(computed.stopReason, 'stop'); // matches the last assistant message's stopReason in the fixture
+  assert.equal(computed.tokens.total, 18);
+  assert.equal(computed.cost.total, 0.37);
+});
+
+test('buildTrialSummary extracts final text, failed graders, and bash commands', () => { // one passing and one failing grader; only the failing name is expected back
+  const traceFixture: WorkbenchTrace = {
+    caseName: 'summary-case',
+    model: 'openrouter/test/model',
+    startedAt: '2026-04-27T10:00:00.000Z',
+    endedAt: '2026-04-27T10:00:01.000Z',
+    entries: [
+      { type: 'tool_call', name: 'bash', arguments: { command: 'firecrawl search "x"' } },
+      { type: 'message', role: 'assistant', text: 'final answer', stopReason: 'stop' },
+    ],
+  };
+  const gradedResult: WorkbenchResult = {
+    caseName: 'summary-case',
+    model: 'openrouter/test/model',
+    pass: false,
+    score: 0.5,
+    evidence: ['missing output'],
+    graders: [
+      { name: 'uses-tool', command: 'true', pass: true, score: 1, evidence: [] },
+      { name: 'saves-output', command: 'false', pass: false, score: 0, evidence: ['missing output'] },
+    ],
+  };
+
+  const trialSummary = buildTrialSummary({ trace: traceFixture, result: gradedResult });
+  assert.equal(trialSummary.finalAssistantMessage, 'final answer');
+  assert.deepEqual(trialSummary.failedGraders, ['saves-output']);
+  assert.deepEqual(trialSummary.bashCommands, ['firecrawl search "x"']);
+  assert.equal(trialSummary.metrics.bashCalls, 1);
+});
diff --git a/tests/smoke-workbench-models.ts b/tests/smoke-workbench-models.ts
new file mode 100644
index 0000000..7527321
--- /dev/null
+++ b/tests/smoke-workbench-models.ts
@@ -0,0 +1,201 @@
+import assert from 'node:assert/strict';
+import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { test } from 'node:test';
+
+import { parseModelList, slugModelRef } from '../src/workbench/models.js';
+import { runWorkbenchCase } from '../src/workbench/run-case.js';
+
+test('parseModelList accepts comma-separated OpenRouter models', () => { // the raw list has leading and trailing spaces; the parsed refs have none
+  const parsed = parseModelList(' openrouter/google/gemini-2.5-flash,openrouter/openai/gpt-5.4 ');
+  assert.deepEqual(parsed, [
+    'openrouter/google/gemini-2.5-flash', 'openrouter/openai/gpt-5.4',
+  ]);
+});
+
+test('parseModelList rejects empty and non-OpenRouter models', () => { // each bad input must throw with a recognisable message
+  assert.throws(() => { parseModelList(''); }, /at least one model/);
+  assert.throws(() => { parseModelList('anthropic/claude-sonnet-4-6'); }, /OpenRouter/);
+});
+
+test('slugModelRef creates filesystem-safe model directories', () => { // in the expected slugs both '/' and ':' are replaced by '-'
+  const slugs = ['openrouter/google/gemini-2.5-flash', 'openrouter/meta-llama/llama-3.3-70b-instruct:free'].map((ref) => slugModelRef(ref));
+  assert.deepEqual(slugs, ['openrouter-google-gemini-2.5-flash', 'openrouter-meta-llama-llama-3.3-70b-instruct-free']);
+});
+
+test('runWorkbenchCase writes aggregate output for multi-model runs', async () => { // the stub fails the second model, so the aggregate should report a 0.5 pass rate
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-models-'));
+  const savedExitCode = process.exitCode;
+  try {
+    const caseFile = join(sandbox, 'case.yml');
+    const resultsRoot = join(sandbox, 'results');
+    const invocations: Array<{ model?: string; resultsDir?: string }> = [];
+    mkdirSync(resultsRoot, { recursive: true });
+    writeFileSync(caseFile, 'name: model-case\nreferences: ./refs\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n', 'utf-8');
+
+    process.exitCode = undefined;
+    await runWorkbenchCase(
+      {
+        casePath: caseFile,
+        outDir: resultsRoot,
+        models: ['openrouter/google/gemini-2.5-flash', 'openrouter/openai/gpt-5.4'],
+      },
+      {
+        runDockerWorkbenchCase: async (runOpts) => { // stub runner: writes result/trace files instead of starting Docker
+          invocations.push({ model: runOpts.model, resultsDir: runOpts.resultsDir });
+          assert.ok(runOpts.resultsDir);
+          mkdirSync(runOpts.resultsDir, { recursive: true });
+          const resultFile = join(runOpts.resultsDir, 'result.json');
+          const traceFile = join(runOpts.resultsDir, 'trace.jsonl');
+          const passed = runOpts.model !== 'openrouter/openai/gpt-5.4';
+          writeFileSync(resultFile, JSON.stringify({ pass: passed, score: passed ? 1 : 0, evidence: [runOpts.model] }), 'utf-8');
+          writeFileSync(traceFile, JSON.stringify({ entries: [] }), 'utf-8');
+          return {
+            tempDir: join(sandbox, 'temp'),
+            caseDir: join(sandbox, 'temp', 'case'),
+            bundledCasePath: join(sandbox, 'temp', 'case', 'case.yml'),
+            workDir: join(sandbox, 'temp', 'work'),
+            resultsDir: runOpts.resultsDir,
+            resultPath: resultFile,
+            tracePath: traceFile,
+            cleanup: () => {},
+          };
+        },
+        now: new Date('2026-04-27T10:11:12.000Z'),
+      },
+    );
+
+    const aggregatePath = join(resultsRoot, '20260427-101112', 'run-result.json');
+    assert.ok(existsSync(aggregatePath));
+    const report = JSON.parse(readFileSync(aggregatePath, 'utf-8')) as {
+      summary: { total: number; passed: number; failed: number; passRate: number; totalTrials: number; passedTrials: number; failedTrials: number };
+      results: Array<{ model: string; passHatK: boolean; trials: Array<{ resultPath: string; tracePath: string }> }>;
+    };
+    assert.deepEqual(invocations.map((entry) => entry.model), [
+      'openrouter/google/gemini-2.5-flash',
+      'openrouter/openai/gpt-5.4',
+    ]);
+    assert.equal(report.summary.total, 2);
+    assert.equal(report.summary.passed, 1);
+    assert.equal(report.summary.failed, 1);
+    assert.equal(report.summary.passRate, 0.5);
+    assert.equal(report.summary.totalTrials, 2);
+    assert.equal(report.summary.passedTrials, 1);
+    assert.equal(report.summary.failedTrials, 1);
+    assert.equal(report.results[0]?.trials[0]?.resultPath, 'trials/openrouter-google-gemini-2.5-flash--001/result.json');
+    assert.equal(report.results[1]?.trials[0]?.tracePath, 'trials/openrouter-openai-gpt-5.4--001/trace.jsonl');
+    assert.equal(process.exitCode, 1); // a failing model is expected to set a non-zero exit code
+  } finally {
+    process.exitCode = savedExitCode;
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});
+
+test('runWorkbenchCase writes aggregate output when --models has one model', async () => { // a single passing model still yields run-result.json and leaves exitCode unset
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-one-model-'));
+  const savedExitCode = process.exitCode;
+  try {
+    const caseFile = join(sandbox, 'case.yml');
+    const resultsRoot = join(sandbox, 'results');
+    mkdirSync(resultsRoot, { recursive: true });
+    writeFileSync(caseFile, 'name: model-case\nreferences: ./refs\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n', 'utf-8');
+
+    process.exitCode = undefined;
+    await runWorkbenchCase(
+      {
+        casePath: caseFile,
+        outDir: resultsRoot,
+        models: ['openrouter/google/gemini-2.5-flash'],
+      },
+      {
+        runDockerWorkbenchCase: async (runOpts) => { // stub runner: always writes a passing result
+          assert.ok(runOpts.resultsDir);
+          mkdirSync(runOpts.resultsDir, { recursive: true });
+          const resultFile = join(runOpts.resultsDir, 'result.json');
+          const traceFile = join(runOpts.resultsDir, 'trace.jsonl');
+          writeFileSync(resultFile, JSON.stringify({ pass: true, score: 1, evidence: [] }), 'utf-8');
+          writeFileSync(traceFile, JSON.stringify({ entries: [] }), 'utf-8');
+          return {
+            tempDir: join(sandbox, 'temp'),
+            caseDir: join(sandbox, 'temp', 'case'),
+            bundledCasePath: join(sandbox, 'temp', 'case', 'case.yml'),
+            workDir: join(sandbox, 'temp', 'work'),
+            resultsDir: runOpts.resultsDir,
+            resultPath: resultFile,
+            tracePath: traceFile,
+            cleanup: () => {},
+          };
+        },
+        now: new Date('2026-04-27T10:11:12.000Z'),
+      },
+    );
+
+    const aggregatePath = join(resultsRoot, '20260427-101112', 'run-result.json');
+    assert.ok(existsSync(aggregatePath));
+    assert.ok(existsSync(join(resultsRoot, '20260427-101112', 'trials', 'openrouter-google-gemini-2.5-flash--001', 'result.json')));
+    assert.equal(process.exitCode, undefined);
+  } finally {
+    process.exitCode = savedExitCode;
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});
+
+test('runWorkbenchCase --trials uses the case model when no model override is provided', async () => { // no models option is passed; both trials should run with the model from case.yml
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-case-model-'));
+  const savedExitCode = process.exitCode;
+  try {
+    const caseFile = join(sandbox, 'case.yml');
+    const resultsRoot = join(sandbox, 'results');
+    const referencesDir = join(sandbox, 'refs');
+    const invocations: Array<{ model?: string }> = [];
+    mkdirSync(referencesDir, { recursive: true });
+    mkdirSync(resultsRoot, { recursive: true });
+    writeFileSync(
+      caseFile,
+      'name: model-case\nreferences: ./refs\nmodel: openrouter/openai/gpt-5.4\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n',
+      'utf-8',
+    );
+
+    process.exitCode = undefined;
+    await runWorkbenchCase(
+      {
+        casePath: caseFile,
+        outDir: resultsRoot,
+        trials: 2,
+      },
+      {
+        runDockerWorkbenchCase: async (runOpts) => { // stub runner: records the model it was given and writes a passing result
+          invocations.push({ model: runOpts.model });
+          assert.ok(runOpts.resultsDir);
+          mkdirSync(runOpts.resultsDir, { recursive: true });
+          const resultFile = join(runOpts.resultsDir, 'result.json');
+          const traceFile = join(runOpts.resultsDir, 'trace.jsonl');
+          writeFileSync(resultFile, JSON.stringify({ pass: true, score: 1, evidence: [] }), 'utf-8');
+          writeFileSync(traceFile, JSON.stringify({ entries: [] }), 'utf-8');
+          return {
+            tempDir: join(sandbox, 'temp'),
+            caseDir: join(sandbox, 'temp', 'case'),
+            bundledCasePath: join(sandbox, 'temp', 'case', 'case.yml'),
+            workDir: join(sandbox, 'temp', 'work'),
+            resultsDir: runOpts.resultsDir,
+            resultPath: resultFile,
+            tracePath: traceFile,
+            cleanup: () => {},
+          };
+        },
+        now: new Date('2026-04-27T10:11:12.000Z'),
+      },
+    );
+
+    assert.deepEqual(invocations.map((entry) => entry.model), [
+      'openrouter/openai/gpt-5.4',
+      'openrouter/openai/gpt-5.4',
+    ]);
+    assert.ok(existsSync(join(resultsRoot, '20260427-101112', 'trials', 'openrouter-openai-gpt-5.4--002', 'result.json')));
+    assert.equal(process.exitCode, undefined);
+  } finally {
+    process.exitCode = savedExitCode;
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});
diff --git a/tests/smoke-workbench-pi-agent.ts b/tests/smoke-workbench-pi-agent.ts
new file mode 100644
index 0000000..f2c01ee
--- /dev/null
+++ b/tests/smoke-workbench-pi-agent.ts
@@ -0,0 +1,159 @@
+import assert from 'node:assert/strict';
+import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { test } from 'node:test';
+
+import { createWorkbenchPiResourceLoader, createWorkbenchPiSession, createWorkbenchPiTools, stripSensitiveEnv } from '../src/workbench/pi-agent.js';
+
+function toolText(result: unknown): string { // Concatenates the `text` of every item in a tool result's `content` array.
+  const content = (result as { content?: Array<{ text?: string }> }).content ?? []; // absent content -> empty list
+  return content.map((item) => item.text ?? '').join(''); // items without text contribute ''
+}
+
+test('createWorkbenchPiTools enables coding plus repo-scale search tools', () => {
+  const tools = createWorkbenchPiTools('/work');
+  const names = tools.map((tool) => tool.name).sort(); // sorted so the check ignores registration order
+
+  assert.deepEqual(names, ['bash', 'edit', 'find', 'grep', 'ls', 'read', 'write']); // exact set: no extra or missing tools
+});
+
+test('stripSensitiveEnv preserves all case-allowed credentials for tool subprocesses', () => {
+  const env = stripSensitiveEnv({ // input mixes API keys, tokens, credential file paths and plain variables
+    OPENROUTER_API_KEY: 'secret',
+    OPENAI_API_KEY: 'secret',
+    GOOGLE_WORKSPACE_CLI_TOKEN: 'gws-token',
+    GOOGLE_WORKSPACE_CLI_CLIENT_SECRET: 'gws-secret',
+    GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE: '/work/gws-credentials.json',
+    MODEL_AUTH_FILE: '/run/secrets/model-auth.json',
+    WHATSAPP_ACCESS_TOKEN: 'secret',
+    DASHBOARD_TOKEN_SECRET: 'secret',
+    PATH: '/usr/bin',
+    WORK: '/work',
+  });
+
+  assert.equal(env.OPENROUTER_API_KEY, 'secret'); // from here on: all ten input keys must come back unchanged
+  assert.equal(env.OPENAI_API_KEY, 'secret');
+  assert.equal(env.GOOGLE_WORKSPACE_CLI_TOKEN, 'gws-token');
+  assert.equal(env.GOOGLE_WORKSPACE_CLI_CLIENT_SECRET, 'gws-secret');
+  assert.equal(env.GOOGLE_WORKSPACE_CLI_CREDENTIALS_FILE, '/work/gws-credentials.json');
+  assert.equal(env.MODEL_AUTH_FILE, '/run/secrets/model-auth.json');
+  assert.equal(env.WHATSAPP_ACCESS_TOKEN, 'secret');
+  assert.equal(env.DASHBOARD_TOKEN_SECRET, 'secret');
+  assert.equal(env.PATH, '/usr/bin');
+  assert.equal(env.WORK, '/work');
+});
+
+test('createWorkbenchPiResourceLoader discovers a root SKILL.md from references', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-skill-')); // throwaway cwd, removed in finally
+  try {
+    writeFileSync(join(root, 'SKILL.md'), [ // path built with join, like every other path in this file
+      '---',
+      'name: pdf', // front-matter name the loader is expected to report
+      'description: PDF merge instructions',
+      '---',
+      '',
+      '# PDF Skill',
+    ].join('\n'), 'utf-8');
+    mkdirSync(join(root, 'inputs')); // NOTE(review): presumably an unrelated sibling dir that must not disturb discovery — confirm
+
+    const loader = await createWorkbenchPiResourceLoader({ cwd: root });
+    const loaded = loader.getSkills().skills.map((skill) => skill.name);
+
+    assert.ok(loaded.includes('pdf')); // membership only: other discovered skills are tolerated
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('createWorkbenchPiResourceLoader appends suite prompt after workbench prompt', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-append-prompt-'));
+  try {
+    const loader = await createWorkbenchPiResourceLoader({
+      cwd: root,
+      appendSystemPrompt: 'Prefer simple shell commands when possible.',
+    });
+
+    const appended = loader.getAppendSystemPrompt().join('\n\n'); // segments flattened so both can be matched in one string
+    assert.match(appended, /Operating environment:/);
+    assert.match(appended, /Prefer simple shell commands when possible\./); // NOTE(review): presence only; relative order is not asserted
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('createWorkbenchPiResourceLoader documents MCP command when configured', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-mcp-prompt-'));
+  try {
+    const loader = await createWorkbenchPiResourceLoader({
+      cwd: root,
+      mcpConfigPath: '/work/mcporter.json', // only a config path is supplied; the file is not created by this test
+    });
+
+    const appended = loader.getAppendSystemPrompt().join('\n\n');
+    assert.match(appended, /`mcp` is available on PATH/);
+    assert.match(appended, /Run `mcp list <server> --schema`/);
+    assert.doesNotMatch(appended, /calculator\.add/); // prompt must not mention this specific tool name
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('createWorkbenchPiTools passes process env through bash subprocesses', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-tool-env-'));
+  const previousSecret = process.env.WORKBENCH_AGENT_SECRET; // remembered so finally can restore it
+  try {
+    process.env.WORKBENCH_AGENT_SECRET = 'agent-secret'; // set on this process before the tools are created
+    const bashTool = createWorkbenchPiTools(root).find((tool) => tool.name === 'bash');
+    assert.ok(bashTool);
+
+    const result = await bashTool.execute(
+      'call-1',
+      { command: 'printf "%s" "$WORKBENCH_AGENT_SECRET"', timeout: 5 }, // printf emits the value with no trailing newline
+      new AbortController().signal,
+    );
+
+    assert.equal(toolText(result), 'agent-secret');
+  } finally {
+    if (previousSecret === undefined) { // distinguish "was unset" from "had a value"
+      delete process.env.WORKBENCH_AGENT_SECRET;
+    } else {
+      process.env.WORKBENCH_AGENT_SECRET = previousSecret;
+    }
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('createWorkbenchPiSession leaves runtime API key env available after session creation', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-session-env-'));
+  const previousApiKey = process.env.OPENROUTER_API_KEY; // remembered so finally can restore it
+  try {
+    process.env.OPENROUTER_API_KEY = 'test-openrouter-key';
+    const created = await createWorkbenchPiSession({
+      cwd: root,
+      modelRef: 'openrouter/google/gemini-2.5-flash',
+    });
+
+    assert.equal(process.env.OPENROUTER_API_KEY, 'test-openrouter-key'); // creation must not clear or rewrite the key
+    (created.session as { dispose?: () => void }).dispose?.(); // NOTE(review): skipped if the assertion above throws
+  } finally {
+    if (previousApiKey === undefined) {
+      delete process.env.OPENROUTER_API_KEY;
+    } else {
+      process.env.OPENROUTER_API_KEY = previousApiKey;
+    }
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('createWorkbenchPiSession rejects non-OpenRouter model refs', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-openrouter-only-'));
+  try {
+    await assert.rejects(
+      () => createWorkbenchPiSession({ cwd: root, modelRef: 'direct/model' }), // ref without the openrouter/ prefix
+      /only supports OpenRouter/, // matched against the rejection message
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
diff --git a/tests/smoke-workbench-run-case.ts b/tests/smoke-workbench-run-case.ts
new file mode 100644
index 0000000..b9bfacd
--- /dev/null
+++ b/tests/smoke-workbench-run-case.ts
@@ -0,0 +1,60 @@
+import assert from 'node:assert/strict';
+import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { test } from 'node:test';
+
+import { runWorkbenchCase, runWorkbenchCaseFromCli } from '../src/workbench/run-case.js';
+
+test('runWorkbenchCase preserves failing result as process exitCode 1', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-run-case-'));
+  const previousExitCode = process.exitCode; // restored in finally so this test cannot fail the whole run
+  try {
+    const resultsDir = join(root, 'results');
+    mkdirSync(resultsDir, { recursive: true });
+    const resultPath = join(resultsDir, 'result.json');
+    writeFileSync(resultPath, JSON.stringify({ // failing result written up front; the stub below points at it
+      pass: false,
+      score: 0,
+      evidence: ['expected failure'],
+    }), 'utf-8');
+    process.exitCode = undefined; // cleared so the assertion reflects only this call
+
+    await runWorkbenchCase(
+      { casePath: join(root, 'case.yml') }, // case.yml is never written by this test
+      {
+        runDockerWorkbenchCase: async () => ({ // injected stub: returns fixed paths and does no work
+          tempDir: join(root, 'temp'),
+          caseDir: join(root, 'temp', 'case'),
+          bundledCasePath: join(root, 'temp', 'case', 'case.yml'),
+          workDir: join(root, 'temp', 'work'),
+          resultsDir,
+          resultPath,
+          tracePath: join(resultsDir, 'trace.jsonl'),
+          cleanup: () => {},
+        }),
+      },
+    );
+
+    assert.equal(process.exitCode, 1);
+  } finally {
+    process.exitCode = previousExitCode;
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('runWorkbenchCaseFromCli rejects invalid --model before loading the case', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-run-case-model-'));
+  try {
+    await assert.rejects(
+      () => runWorkbenchCaseFromCli([ // deferred call, same form as the other assert.rejects usages
+        join(root, 'missing-case.yml'), // never created on disk
+        '--model',
+        'anthropic/claude-3-5-haiku-latest', // ref without the openrouter/ prefix
+      ]),
+      /Workbench only supports OpenRouter model refs, got: anthropic\/claude-3-5-haiku-latest/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
diff --git a/tests/smoke-workbench-suite.ts b/tests/smoke-workbench-suite.ts
new file mode 100644
index 0000000..7ea1792
--- /dev/null
+++ b/tests/smoke-workbench-suite.ts
@@ -0,0 +1,403 @@
+import assert from 'node:assert/strict';
+import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { test } from 'node:test';
+
+import { loadWorkbenchSuite } from '../src/workbench/suite-loader.js';
+import { runWorkbenchSuite, runWorkbenchSuiteFromCli } from '../src/workbench/run-suite.js';
+
+test('loadWorkbenchSuite resolves case paths and validates models', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-load-'));
+  try {
+    const suitePath = join(root, 'suite.yml');
+    mkdirSync(join(root, 'cases', 'missing-index'), { recursive: true }); // directory only; case.yml itself is not written
+    writeFileSync(suitePath, [
+      'name: supabase-postgres-best-practices',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'cases:',
+      '  - cases/missing-index/case.yml',
+    ].join('\n'), 'utf-8');
+
+    const suite = loadWorkbenchSuite(suitePath);
+
+    assert.equal(suite.name, 'supabase-postgres-best-practices');
+    assert.deepEqual(suite.models, ['openrouter/google/gemini-2.5-flash']);
+    assert.deepEqual(suite.casePaths, [join(root, 'cases', 'missing-index', 'case.yml')]); // relative entry resolved under the suite file's directory
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('loadWorkbenchSuite supports inline cases with suite defaults', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-inline-load-'));
+  try {
+    const suitePath = join(root, 'suite.yml');
+    mkdirSync(join(root, 'references'), { recursive: true });
+    writeFileSync(join(root, 'references', 'SKILL.md'), '# Skill\n', 'utf-8');
+    writeFileSync(suitePath, [ // references/env/timeoutSeconds appear only at suite level; the inline case sets none of them
+      'name: react-best-practices',
+      'references: ./references',
+      'env:',
+      '  - OPENROUTER_API_KEY',
+      'timeoutSeconds: 123',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'cases:',
+      '  - name: async-parallel',
+      '    task: Make this faster',
+      '    graders:',
+      '      - name: async-parallel',
+      '        command: node checks/react.mjs async-parallel',
+    ].join('\n'), 'utf-8');
+
+    const suite = loadWorkbenchSuite(suitePath);
+
+    assert.equal(suite.name, 'react-best-practices');
+    assert.equal(suite.cases[0]?.slug, 'async-parallel');
+    assert.equal(suite.cases[0]?.case?.referencesDir, join(root, 'references')); // suite-level value shows up on the case
+    assert.deepEqual(suite.cases[0]?.case?.env, ['OPENROUTER_API_KEY']);
+    assert.equal(suite.cases[0]?.case?.timeoutSeconds, 123);
+    assert.deepEqual(suite.cases[0]?.case?.graders, [
+      { name: 'async-parallel', command: 'node checks/react.mjs async-parallel' },
+    ]);
+    assert.deepEqual(suite.casePaths, []); // inline cases contribute no file paths
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('loadWorkbenchSuite applies and merges MCP defaults for inline cases', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-mcp-'));
+  try {
+    const suitePath = join(root, 'suite.yml');
+    mkdirSync(join(root, 'references'), { recursive: true });
+    writeFileSync(join(root, 'references', 'SKILL.md'), '# Skill\n', 'utf-8');
+    writeFileSync(suitePath, [ // context7 is declared at both suite and case level; local-tools only at suite level
+      'name: mcp-suite',
+      'references: ./references',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'mcpServers:',
+      '  context7:',
+      '    baseUrl: https://mcp.context7.com/mcp',
+      '  local-tools:',
+      '    command: node',
+      '    args:',
+      '      - mcp/default-server.mjs',
+      'cases:',
+      '  - name: mcp-inline',
+      '    task: Use MCP.',
+      '    mcpServers:',
+      '      context7:',
+      '        baseUrl: https://example.test/mcp',
+      '        allowedTools:',
+      '          - lookup',
+      '    graders:',
+      '      - name: output',
+      '        command: test -f answer.json',
+    ].join('\n'), 'utf-8');
+
+    const suite = loadWorkbenchSuite(suitePath);
+
+    assert.deepEqual(suite.cases[0]?.case?.mcpServers, {
+      context7: { // the case-level entry is the one expected here
+        baseUrl: 'https://example.test/mcp',
+        allowedTools: ['lookup'],
+      },
+      'local-tools': { // suite-level entry expected to carry over unchanged
+        command: 'node',
+        args: ['mcp/default-server.mjs'],
+      },
+    });
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('loadWorkbenchSuite reads suite appendSystemPrompt', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-prompt-'));
+  try {
+    const suitePath = join(root, 'suite.yml');
+    writeFileSync(suitePath, [
+      'name: prompted-suite',
+      'appendSystemPrompt: |', // YAML literal block scalar
+      '  Prefer simple shell commands when possible.',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'cases:',
+      '  - cases/noop/case.yml', // referenced case file is never created by this test
+    ].join('\n'), 'utf-8');
+
+    const suite = loadWorkbenchSuite(suitePath);
+
+    assert.equal(suite.appendSystemPrompt, 'Prefer simple shell commands when possible.'); // expected value carries no trailing newline
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('loadWorkbenchSuite rejects suite artifacts defaults', () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-artifacts-'));
+  try {
+    const suitePath = join(root, 'suite.yml');
+    writeFileSync(suitePath, [
+      'name: artifact-suite',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'artifacts:', // suite-level field the loader is expected to reject
+      '  - output.json',
+      'cases:',
+      '  - cases/noop/case.yml',
+    ].join('\n'), 'utf-8');
+
+    assert.throws(
+      () => loadWorkbenchSuite(suitePath), // synchronous loader, hence throws rather than rejects
+      /field "artifacts" is invalid; inspect outputs in the workspace or use --keep-workspace/,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('runWorkbenchSuite writes case-model matrix aggregate output', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-'));
+  const previousExitCode = process.exitCode; // restored in finally; this test expects exitCode to become 1
+  try {
+    const suitePath = join(root, 'suite.yml');
+    const outDir = join(root, 'results');
+    const caseA = join(root, 'cases', 'missing-index', 'case.yml');
+    const caseB = join(root, 'cases', 'partial-index', 'case.yml');
+    mkdirSync(join(root, 'cases', 'missing-index'), { recursive: true });
+    mkdirSync(join(root, 'cases', 'partial-index'), { recursive: true });
+    writeFileSync(caseA, 'name: missing-index\nreferences: ./refs\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n', 'utf-8');
+    writeFileSync(caseB, 'name: partial-index\nreferences: ./refs\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n', 'utf-8');
+    writeFileSync(suitePath, [ // two models x two cases
+      'name: supabase-postgres-best-practices',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      '  - openrouter/openai/gpt-5.4',
+      'cases:',
+      '  - cases/missing-index/case.yml',
+      '  - cases/partial-index/case.yml',
+    ].join('\n'), 'utf-8');
+
+    process.exitCode = undefined;
+    await runWorkbenchSuite(
+      { suitePath, outDir },
+      {
+        now: new Date('2026-04-27T10:11:12.000Z'), // fixed clock; the run directory asserted below is 20260427-101112
+        runDockerWorkbenchCase: async (options) => {
+          assert.ok(options.resultsDir);
+          mkdirSync(options.resultsDir, { recursive: true });
+          const resultPath = join(options.resultsDir, 'result.json');
+          const tracePath = join(options.resultsDir, 'trace.jsonl');
+          const pass = !(options.casePath.includes('partial-index') && options.model === 'openrouter/openai/gpt-5.4'); // only this one pair fails
+          writeFileSync(resultPath, JSON.stringify({ pass, score: pass ? 1 : 0, evidence: [options.casePath, options.model] }), 'utf-8');
+          writeFileSync(tracePath, JSON.stringify({ entries: [] }), 'utf-8');
+          return {
+            tempDir: join(root, 'temp'),
+            caseDir: join(root, 'temp', 'case'),
+            bundledCasePath: join(root, 'temp', 'case', 'case.yml'),
+            workDir: join(root, 'temp', 'work'),
+            resultsDir: options.resultsDir,
+            resultPath,
+            tracePath,
+            cleanup: () => {},
+          };
+        },
+      },
+    );
+
+    const suiteResultPath = join(outDir, '20260427-101112', 'suite-result.json');
+    assert.ok(existsSync(suiteResultPath));
+    const aggregate = JSON.parse(readFileSync(suiteResultPath, 'utf-8')) as { // only the fields asserted below are typed
+      summary: { total: number; passed: number; failed: number; passRate: number; totalTrials: number; passedTrials: number; failedTrials: number };
+      results: Array<{ caseName: string; model: string; passHatK: boolean; trials: Array<{ resultPath: string; tracePath: string }> }>;
+    };
+
+    assert.equal(aggregate.summary.total, 4); // 2 cases x 2 models
+    assert.equal(aggregate.summary.passed, 3);
+    assert.equal(aggregate.summary.failed, 1);
+    assert.equal(aggregate.summary.passRate, 0.75);
+    assert.equal(aggregate.summary.totalTrials, 4); // no trials option given: one trial per pair
+    assert.equal(aggregate.summary.passedTrials, 3);
+    assert.equal(aggregate.summary.failedTrials, 1);
+    assert.equal(aggregate.results[0]?.trials[0]?.resultPath, 'trials/missing-index--openrouter-google-gemini-2.5-flash--001/result.json'); // first matrix entry; path has no outDir prefix
+    assert.equal(aggregate.results[3]?.trials[0]?.resultPath, 'trials/partial-index--openrouter-openai-gpt-5.4--001/result.json'); // last matrix entry
+    assert.equal(process.exitCode, 1);
+  } finally {
+    process.exitCode = previousExitCode;
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('runWorkbenchSuite passes suite appendSystemPrompt to every trial', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-prompt-run-'));
+  const previousExitCode = process.exitCode;
+  try {
+    const suitePath = join(root, 'suite.yml');
+    const outDir = join(root, 'results');
+    const casePath = join(root, 'cases', 'prompted', 'case.yml');
+    mkdirSync(join(root, 'cases', 'prompted'), { recursive: true });
+    writeFileSync(casePath, 'name: prompted\nreferences: ./refs\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n', 'utf-8');
+    writeFileSync(suitePath, [
+      'name: prompted-suite',
+      'appendSystemPrompt: |',
+      '  Prefer simple shell commands when possible.',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'cases:',
+      '  - cases/prompted/case.yml',
+    ].join('\n'), 'utf-8');
+
+    process.exitCode = undefined;
+    await runWorkbenchSuite(
+      { suitePath, outDir },
+      {
+        now: new Date('2026-04-27T10:11:12.000Z'),
+        runDockerWorkbenchCase: async (options) => {
+          assert.equal(options.appendSystemPrompt, 'Prefer simple shell commands when possible.'); // checked inside the stub, i.e. on each invocation
+          assert.ok(options.resultsDir);
+          mkdirSync(options.resultsDir, { recursive: true });
+          const resultPath = join(options.resultsDir, 'result.json');
+          const tracePath = join(options.resultsDir, 'trace.jsonl');
+          writeFileSync(resultPath, JSON.stringify({ pass: true, score: 1, evidence: [] }), 'utf-8');
+          writeFileSync(tracePath, JSON.stringify({ type: 'trace_start', entries: [] }), 'utf-8');
+          return {
+            tempDir: join(root, 'temp'),
+            caseDir: join(root, 'temp', 'case'),
+            bundledCasePath: join(root, 'temp', 'case', 'case.yml'),
+            workDir: join(root, 'temp', 'work'),
+            resultsDir: options.resultsDir,
+            resultPath,
+            tracePath,
+            cleanup: () => {},
+          };
+        },
+      },
+    );
+
+    assert.equal(process.exitCode, undefined); // the stub always writes a passing result, so exitCode stays unset
+  } finally {
+    process.exitCode = previousExitCode;
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('runWorkbenchSuiteFromCli rejects model overrides because suites own models', async () => {
+  await assert.rejects(
+    () => runWorkbenchSuiteFromCli(['suite.yml', '--models', 'openrouter/google/gemini-2.5-pro']), // no suite.yml is created for this call
+    /Unknown flag: --models/, // the flag must be reported as unknown
+  );
+});
+
+test('runWorkbenchSuite missing models error references suite models only', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-missing-models-'));
+  try {
+    const suitePath = join(root, 'suite.yml');
+    const casePath = join(root, 'cases', 'no-models', 'case.yml');
+    mkdirSync(join(root, 'cases', 'no-models'), { recursive: true });
+    writeFileSync(casePath, 'name: no-models\nreferences: ./refs\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n', 'utf-8');
+    writeFileSync(suitePath, [ // deliberately has no `models:` key
+      'name: no-models-suite',
+      'cases:',
+      '  - cases/no-models/case.yml',
+    ].join('\n'), 'utf-8');
+
+    await assert.rejects(
+      () => runWorkbenchSuite({ suitePath }),
+      (error: unknown) => { // custom validator: must return true for the rejection to count as expected
+        assert.ok(error instanceof Error);
+        assert.match(error.message, /suite\.yml|models/); // NOTE(review): either alternative alone satisfies this match
+        assert.doesNotMatch(error.message, /--models/); // message must not mention a --models flag
+        return true;
+      },
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('runWorkbenchSuite rejects non-integer programmatic concurrency', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-concurrency-invalid-'));
+  try {
+    const suitePath = join(root, 'suite.yml');
+    const casePath = join(root, 'cases', 'invalid-concurrency', 'case.yml');
+    mkdirSync(join(root, 'cases', 'invalid-concurrency'), { recursive: true });
+    mkdirSync(join(root, 'cases', 'invalid-concurrency', 'refs'), { recursive: true }); // refs dir named by the case's `references: ./refs`
+    writeFileSync(casePath, 'name: invalid-concurrency\nreferences: ./refs\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n', 'utf-8');
+    writeFileSync(suitePath, [
+      'name: invalid-concurrency-suite',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'cases:',
+      '  - cases/invalid-concurrency/case.yml',
+    ].join('\n'), 'utf-8');
+
+    await assert.rejects(
+      () => runWorkbenchSuite({ suitePath, concurrency: 1.5 }), // fractional value passed via options; no runner stub is injected
+      /concurrency.*positive integer/i,
+    );
+  } finally {
+    rmSync(root, { recursive: true, force: true });
+  }
+});
+
+test('runWorkbenchSuite honors concurrency for independent trials', async () => {
+  const root = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-suite-concurrency-'));
+  const previousExitCode = process.exitCode;
+  try {
+    const suitePath = join(root, 'suite.yml');
+    const outDir = join(root, 'results');
+    const casePath = join(root, 'cases', 'parallel', 'case.yml');
+    mkdirSync(join(root, 'cases', 'parallel'), { recursive: true });
+    writeFileSync(casePath, 'name: parallel\nreferences: ./refs\ntask: Test\ngraders:\n  - name: passes\n    command: "true"\n', 'utf-8');
+    writeFileSync(suitePath, [
+      'name: parallel-suite',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'cases:',
+      '  - cases/parallel/case.yml',
+    ].join('\n'), 'utf-8');
+
+    let active = 0; // stub invocations currently in flight
+    let maxActive = 0; // highest value `active` has reached
+    process.exitCode = undefined;
+    await runWorkbenchSuite(
+      { suitePath, outDir, trials: 3, concurrency: 2 }, // more trials than slots, so a cap is observable
+      {
+        now: new Date('2026-04-27T10:11:12.000Z'),
+        runDockerWorkbenchCase: async (options) => {
+          assert.ok(options.resultsDir);
+          active += 1;
+          maxActive = Math.max(maxActive, active);
+          await new Promise((resolve) => setTimeout(resolve, 25)); // hold the slot so overlapping invocations can be counted
+          mkdirSync(options.resultsDir, { recursive: true });
+          const resultPath = join(options.resultsDir, 'result.json');
+          const tracePath = join(options.resultsDir, 'trace.jsonl');
+          writeFileSync(resultPath, JSON.stringify({ pass: true, score: 1, evidence: [] }), 'utf-8');
+          writeFileSync(tracePath, JSON.stringify({ type: 'trace_start', entries: [] }), 'utf-8');
+          active -= 1; // NOTE(review): not reached if one of the writes above throws
+          return {
+            tempDir: join(root, 'temp'),
+            caseDir: join(root, 'temp', 'case'),
+            bundledCasePath: join(root, 'temp', 'case', 'case.yml'),
+            workDir: join(root, 'temp', 'work'),
+            resultsDir: options.resultsDir,
+            resultPath,
+            tracePath,
+            cleanup: () => {},
+          };
+        },
+      },
+    );
+
+    assert.equal(maxActive, 2); // exactly two overlapped at the peak: parallel, but never above the limit
+    assert.equal(process.exitCode, undefined);
+  } finally {
+    process.exitCode = previousExitCode;
+    rmSync(root, { recursive: true, force: true });
+  }
+});
diff --git a/tests/smoke-workbench-trace.ts b/tests/smoke-workbench-trace.ts
new file mode 100644
index 0000000..7c5781b
--- /dev/null
+++ b/tests/smoke-workbench-trace.ts
@@ -0,0 +1,176 @@
+import { buildWorkbenchTrace, createTraceCollector, createTraceRecorder } from '../src/workbench/trace.js';
+
+let passed = 0; // incremented by test() when the callback completes without throwing
+let failed = 0; // incremented by test() when the callback throws or rejects
+
+async function test(name: string, fn: () => Promise<void> | void) { // Runs one named check, prints the outcome and tallies it.
+  try {
+    await fn(); // await covers both synchronous and async callbacks
+    passed++;
+    console.log(`  ✓ ${name}`);
+  } catch (error: unknown) { // any value can be thrown, so narrow before reading .message; not rethrown, later tests still run
+    failed++;
+    console.log(`  ✗ ${name}`);
+    console.log(`    ${error instanceof Error ? error.message : String(error)}`);
+  }
+}
+
+function assert(condition: boolean, message: string) { // Throws when `condition` is false; returns nothing otherwise.
+  if (!condition) throw new Error(`Assertion failed: ${message}`); // caller's message is prefixed with "Assertion failed: "
+}
+
+function assertEqual<T>(actual: T, expected: T, message: string) { // Throws unless `actual` and `expected` are strictly equal.
+  if (actual !== expected) { // !== comparison: objects and arrays match only by identity
+    throw new Error(`${message}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`); // both values JSON-encoded into the message
+  }
+}
+
+console.log('\n=== Workbench Trace Smoke Tests ===\n'); // heading printed before the first test result
+
+await test('buildWorkbenchTrace stores a deduped interaction timeline', () => {
+  const trace = buildWorkbenchTrace({
+    caseName: 'case-1',
+    model: 'openrouter/google/gemini-2.5-flash',
+    startedAt: '2026-01-01T00:00:00.000Z',
+    endedAt: '2026-01-01T00:00:02.000Z',
+    messages: [ // one user turn, one assistant turn that includes a tool call, and the matching tool result
+      { role: 'user', content: [{ type: 'text', text: 'Do the task' }] },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'thinking', thinking: 'Need to inspect files' },
+          { type: 'text', text: 'I will read the skill.' },
+          { type: 'toolCall', id: 'call-1', name: 'read', arguments: { path: '/work/SKILL.md' } },
+        ],
+        usage: { totalTokens: 10 },
+      },
+      {
+        role: 'toolResult',
+        toolCallId: 'call-1', // same id as the toolCall part above
+        toolName: 'read',
+        content: [{ type: 'text', text: '# Skill' }],
+        isError: false,
+      },
+    ],
+  });
+
+  assertEqual(trace.caseName, 'case-1', 'trace should preserve caseName');
+  assertEqual(trace.entries.length, 4, 'trace should normalize messages into entries'); // 3 input messages become 4 entries
+  assertEqual(trace.entries[0].type, 'message', 'first entry should be user message');
+  assertEqual(trace.entries[1].type, 'message', 'second entry should be assistant message');
+  assertEqual(trace.entries[2].type, 'tool_call', 'third entry should be tool call');
+  assertEqual(trace.entries[3].type, 'tool_result', 'fourth entry should be tool result');
+  assert(!('events' in trace), 'trace should not include raw streaming events');
+  assert(!('messages' in trace), 'trace should not duplicate raw messages');
+});
+
+await test('buildWorkbenchTrace preserves assistant provider error messages', () => {
+  const trace = buildWorkbenchTrace({
+    caseName: 'case-error',
+    model: 'openrouter/google/gemini-2.5-flash',
+    startedAt: '2026-01-01T00:00:00.000Z',
+    endedAt: '2026-01-01T00:00:02.000Z',
+    messages: [
+      {
+        role: 'assistant',
+        content: [], // no content parts: the message carries only the error fields
+        stopReason: 'error',
+        errorMessage: 'Provider returned 500',
+      },
+    ],
+  });
+
+  const entry = trace.entries[0] as { type: string; errorMessage?: string; stopReason?: unknown }; // cast exposes the fields asserted below
+  assertEqual(entry.type, 'message', 'entry should be a message');
+  assertEqual(entry.stopReason, 'error', 'entry should preserve stop reason');
+  assertEqual(entry.errorMessage, 'Provider returned 500', 'entry should preserve provider error message');
+});
+
+await test('createTraceCollector records arbitrary events in order', () => {
+  const collector = createTraceCollector();
+  collector.record({ step: 1 }); // three payload kinds: object, string, number
+  collector.record('tool-call');
+  collector.record(42);
+
+  assertEqual(collector.events.length, 3, 'collector should record all events');
+  assertEqual((collector.events[0] as { step?: number }).step, 1, 'collector should preserve object payload'); // indices follow record() order
+  assertEqual(collector.events[1], 'tool-call', 'collector should preserve string payload');
+  assertEqual(collector.events[2], 42, 'collector should preserve numeric payload');
+});
+
+await test('createTraceRecorder captures Pi session events and normalized entries', () => {
+  const recorder = createTraceRecorder({ now: () => '2026-01-01T00:00:01.000Z' }); // fixed clock so the event timestamp can be asserted exactly
+
+  recorder.record({
+    type: 'message_end',
+    message: {
+      role: 'assistant',
+      content: [{ type: 'text', text: 'I will run the command.' }],
+      stopReason: 'toolUse',
+    },
+  });
+  recorder.record({
+    type: 'tool_execution_start',
+    toolCallId: 'call-1',
+    toolName: 'bash',
+    args: { command: 'firecrawl search browser --scrape' },
+  });
+  recorder.record({
+    type: 'tool_execution_end',
+    toolCallId: 'call-1', // same id as the start event
+    toolName: 'bash',
+    result: { content: [{ type: 'text', text: 'ok' }] },
+    isError: false,
+  });
+
+  const trace = recorder.toTrace({ // no `messages` supplied: entries must come from the recorded events alone
+    caseName: 'case-events',
+    model: 'openrouter/test/model',
+    startedAt: '2026-01-01T00:00:00.000Z',
+    endedAt: '2026-01-01T00:00:02.000Z',
+  });
+
+  assertEqual(trace.events?.length, 3, 'trace should preserve raw-ish Pi events');
+  assertEqual(trace.events?.[0]?.timestamp, '2026-01-01T00:00:01.000Z', 'trace events should have capture timestamps');
+  assertEqual(trace.entries.length, 3, 'trace should derive normalized entries from events');
+  assertEqual(trace.entries[0].type, 'message', 'first entry should be assistant message');
+  assertEqual(trace.entries[1].type, 'tool_call', 'second entry should be tool call');
+  assertEqual(trace.entries[2].type, 'tool_result', 'third entry should be tool result');
+  assertEqual(
+    ((trace.entries[1] as { arguments?: { command?: string } }).arguments)?.command, // the event's `args` is expected under `arguments` on the entry
+    'firecrawl search browser --scrape',
+    'tool call entry should preserve bash command',
+  );
+});
+
+await test('createTraceRecorder preserves session messages when events are partial', () => {
+  const recorder = createTraceRecorder({ now: () => '2026-01-01T00:00:01.000Z' });
+
+  recorder.record({ // only a start event is recorded: no message_end and no tool_execution_end
+    type: 'tool_execution_start',
+    toolCallId: 'call-1',
+    toolName: 'bash',
+    args: { command: 'node parse-pdf.mjs' },
+  });
+
+  const trace = recorder.toTrace({
+    caseName: 'partial-events',
+    model: 'openrouter/test/model',
+    startedAt: '2026-01-01T00:00:00.000Z',
+    endedAt: '2026-01-01T00:00:02.000Z',
+    messages: [ // session messages supplied alongside the recorded event
+      { role: 'user', content: [{ type: 'text', text: 'Extract the PDF facts.' }] },
+      { role: 'assistant', content: [{ type: 'text', text: 'I will parse the PDF.' }] },
+    ],
+  });
+
+  assertEqual(trace.entries.length, 3, 'trace should keep session messages plus partial event entries'); // 2 messages + 1 tool call
+  assertEqual(trace.entries[0].type, 'message', 'first entry should be a session message');
+  assertEqual((trace.entries[0] as { role?: string }).role, 'user', 'first session message should be user');
+  assertEqual(trace.entries[1].type, 'message', 'second entry should be a session message');
+  assertEqual((trace.entries[1] as { role?: string }).role, 'assistant', 'second session message should be assistant');
+  assertEqual(trace.entries[2].type, 'tool_call', 'partial tool event should still be included');
+});
+
+console.log(`\n${passed} passed, ${failed} failed\n`); // summary printed after every awaited test above has finished
+process.exit(failed > 0 ? 1 : 0); // exit status 1 when at least one test failed, otherwise 0
diff --git a/tests/smoke-workbench-trials.ts b/tests/smoke-workbench-trials.ts
new file mode 100644
index 0000000..ea49b51
--- /dev/null
+++ b/tests/smoke-workbench-trials.ts
@@ -0,0 +1,109 @@
+import assert from 'node:assert/strict';
+import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { test } from 'node:test';
+
+import { aggregateTrials, formatTrialNumber, parseTrialsFlag } from '../src/workbench/trials.js';
+import { runWorkbenchSuite } from '../src/workbench/run-suite.js';
+
+test('formatTrialNumber returns stable three-digit slugs', () => {
+  // Each pair is [trial number, expected slug]; shorter numbers gain leading zeros.
+  const expectations = [[1, '001'], [12, '012'], [123, '123']] as const;
+  for (const [trial, slug] of expectations) assert.equal(formatTrialNumber(trial), slug);
+});
+
+test('parseTrialsFlag rejects invalid trial counts', () => {
+  // Accepted inputs first: an omitted flag yields 1 and a numeric string is parsed.
+  assert.equal(parseTrialsFlag(undefined), 1);
+  assert.equal(parseTrialsFlag('3'), 3);
+  for (const bad of ['0', '1.5']) assert.throws(() => parseTrialsFlag(bad), /positive integer/);
+});
+
+test('aggregateTrials computes pass@k, pass^k, pass rate, and mean score', () => {
+  // Three trials where only the second passes: pass@k is expected true, pass^k false.
+  const stats = aggregateTrials([
+    { trial: 1, pass: false, score: 0.25 },
+    { trial: 2, pass: true, score: 1 },
+    { trial: 3, pass: false, score: 0.5 },
+  ]);
+  assert.strictEqual(stats.totalTrials, 3);
+  assert.strictEqual(stats.passedTrials, 1);
+  assert.strictEqual(stats.failedTrials, 2);
+  assert.strictEqual(stats.trialPassRate, 1 / 3);
+  assert.strictEqual(stats.meanScore, (0.25 + 1 + 0.5) / 3);
+  assert.strictEqual(stats.passAtK, true);
+  assert.strictEqual(stats.passHatK, false);
+});
+
+test('runWorkbenchSuite writes trial directories and case-model aggregates', async () => {
+  const savedExitCode = process.exitCode;
+  const sandbox = mkdtempSync(join(tmpdir(), 'skill-opt-workbench-trials-'));
+  try {
+    // Minimal on-disk suite: one reference file, one model, and one case with a single grader.
+    const suitePath = join(sandbox, 'suite.yml');
+    const outDir = join(sandbox, 'results');
+    mkdirSync(join(sandbox, 'references'), { recursive: true });
+    writeFileSync(join(sandbox, 'references', 'SKILL.md'), '# Skill\n', 'utf-8');
+    writeFileSync(suitePath, [
+      'name: trial-suite',
+      'references: ./references',
+      'models:',
+      '  - openrouter/google/gemini-2.5-flash',
+      'cases:',
+      '  - name: trial-case',
+      '    task: Test trials',
+      '    graders:',
+      '      - name: passes',
+      '        command: "true"',
+    ].join('\n'), 'utf-8');
+    process.exitCode = undefined; // cleared so the final exit-code assertion reflects this run only
+    await runWorkbenchSuite(
+      { suitePath, outDir, trials: 3 },
+      {
+        now: new Date('2026-04-27T10:11:12.000Z'),
+        // Stub for the injected runner: writes result/trace files itself; only the "--002" directory passes.
+        runDockerWorkbenchCase: async ({ resultsDir }) => {
+          assert.ok(resultsDir);
+          mkdirSync(resultsDir, { recursive: true });
+          const resultPath = join(resultsDir, 'result.json');
+          const tracePath = join(resultsDir, 'trace.jsonl');
+          const passed = resultsDir.endsWith('--002');
+          writeFileSync(resultPath, JSON.stringify({ pass: passed, score: passed ? 1 : 0, evidence: [] }), 'utf-8');
+          writeFileSync(tracePath, JSON.stringify({ entries: [] }), 'utf-8');
+          return {
+            tempDir: join(sandbox, 'temp'),
+            caseDir: join(sandbox, 'temp', 'case'),
+            bundledCasePath: join(sandbox, 'temp', 'case', 'case.yml'),
+            workDir: join(sandbox, 'temp', 'work'),
+            resultsDir,
+            resultPath,
+            tracePath,
+            cleanup: () => {},
+          };
+        },
+      },
+    );
+    // The expected run directory name corresponds to the injected `now` timestamp.
+    const runRoot = join(outDir, '20260427-101112');
+    const trialDirs = ['001', '002', '003'].map((n) => join(runRoot, 'trials', `trial-case--openrouter-google-gemini-2.5-flash--${n}`));
+    for (const trialDir of trialDirs) assert.ok(existsSync(join(trialDir, 'result.json')));
+    // One of three trials passed (scores 0, 1, 0): pass@k true, pass^k false, exit code 1.
+    const { summary, results } = JSON.parse(readFileSync(join(runRoot, 'suite-result.json'), 'utf-8')) as {
+      summary: { totalTrials: number; passedTrials: number; failedTrials: number; trialPassRate: number; meanScore: number };
+      results: Array<{ passAtK: boolean; passHatK: boolean; totalTrials: number }>;
+    };
+    assert.strictEqual(summary.totalTrials, 3);
+    assert.strictEqual(summary.passedTrials, 1);
+    assert.strictEqual(summary.failedTrials, 2);
+    assert.strictEqual(summary.trialPassRate, 1 / 3);
+    assert.strictEqual(summary.meanScore, 1 / 3);
+    assert.strictEqual(results[0]?.passAtK, true);
+    assert.strictEqual(results[0]?.passHatK, false);
+    assert.strictEqual(results[0]?.totalTrials, 3);
+    assert.strictEqual(process.exitCode, 1);
+  } finally {
+    process.exitCode = savedExitCode;
+    rmSync(sandbox, { recursive: true, force: true });
+  }
+});