fastxyz · bucurdavid · Apr 16, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json
@@ -0,0 +1,22 @@
+{
+  "name": "skill-optimizer",
+  "interface": {
+    "displayName": "Skill Optimizer",
+    "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs"
+  },
+  "plugins": [
+    {
+      "name": "skill-optimizer",
+      "source": {
+        "source": "local",
+        "path": "./"
+      },
+      "policy": {
+        "installation": "AVAILABLE",
+        "authentication": "ON_INSTALL"
+      },
+      "category": "Coding"
+    }
+  ]
+}
diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
@@ -0,0 +1,21 @@
+{
+  "name": "skill-optimizer",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "owner": {
+    "name": "Fast"
+  },
+  "plugins": [
+    {
+      "name": "skill-optimizer",
+      "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+      "version": "2.0.0",
+      "source": "./",
+      "author": {
+        "name": "Fast"
+      },
+      "skills": [
+        "./skills/skill-optimizer"
+      ]
+    }
+  ]
+}
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
@@ -0,0 +1,19 @@
+{
+  "name": "skill-optimizer",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "version": "2.0.0",
+  "author": {
+    "name": "Fast"
+  },
+  "homepage": "https://github.com/fastxyz/skill-optimizer#readme",
+  "repository": "https://github.com/fastxyz/skill-optimizer",
+  "license": "MIT",
+  "keywords": [
+    "agent-skills",
+    "evals",
+    "skill-testing",
+    "model-evaluation",
+    "optimization"
+  ],
+  "skills": "./skills/"
+}
diff --git a/.codex b/.codex
diff --git a/.codex-plugin/plugin.json b/.codex-plugin/plugin.json
@@ -0,0 +1,35 @@
+{
+  "name": "skill-optimizer",
+  "version": "2.0.0",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "author": {
+    "name": "Fast"
+  },
+  "homepage": "https://github.com/fastxyz/skill-optimizer#readme",
+  "repository": "https://github.com/fastxyz/skill-optimizer",
+  "license": "MIT",
+  "keywords": [
+    "agent-skills",
+    "evals",
+    "skill-testing",
+    "model-evaluation",
+    "optimization"
+  ],
+  "skills": "./skills/",
+  "interface": {
+    "displayName": "Skill Optimizer",
+    "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "developerName": "Fast",
+    "category": "Coding",
+    "capabilities": [
+      "Read",
+      "Write"
+    ],
+    "defaultPrompt": [
+      "Use Skill Optimizer to design and run an eval suite for this agent skill.",
+      "Use Skill Optimizer to inspect failing traces and improve the skill."
+    ],
+    "brandColor": "#3B82F6"
+  }
+}
diff --git a/.codex/INSTALL.md b/.codex/INSTALL.md
@@ -0,0 +1,31 @@
+# Installing skill-optimizer for Codex
+
+Use `skill-optimizer` in Codex as either a plugin or a native skill.
+
+## Plugin Install
+
+Register this repository as a plugin marketplace:
+
+```bash
+codex plugin marketplace add fastxyz/skill-optimizer
+```
+
+Open `/plugins`, select the `skill-optimizer` marketplace, and install the `skill-optimizer` plugin.
+
+The marketplace file is `.agents/plugins/marketplace.json`. It exposes the repository root as the plugin source so Codex can load `.codex-plugin/plugin.json` and the bundled `skills/` directory.
+
+To pin a Git ref while installing the marketplace:
+
+```bash
+codex plugin marketplace add fastxyz/skill-optimizer --ref main
+```
+
+## Skill-Only Install
+
+Install the canonical skill with the open skills CLI:
+
+```bash
+npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a codex -y
+```
+
+Restart Codex if the skill does not appear immediately.
diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json
@@ -0,0 +1,36 @@
+{
+  "name": "skill-optimizer",
+  "displayName": "Skill Optimizer",
+  "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+  "version": "2.0.0",
+  "author": {
+    "name": "Fast"
+  },
+  "homepage": "https://github.com/fastxyz/skill-optimizer#readme",
+  "repository": "https://github.com/fastxyz/skill-optimizer",
+  "license": "MIT",
+  "keywords": [
+    "agent-skills",
+    "evals",
+    "skill-testing",
+    "model-evaluation",
+    "optimization"
+  ],
+  "skills": "./skills/",
+  "interface": {
+    "displayName": "Skill Optimizer",
+    "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs",
+    "developerName": "Fast",
+    "category": "Coding",
+    "capabilities": [
+      "Read",
+      "Write"
+    ],
+    "defaultPrompt": [
+      "Use Skill Optimizer to design and run an eval suite for this agent skill.",
+      "Use Skill Optimizer to inspect failing traces and improve the skill."
+    ],
+    "brandColor": "#3B82F6"
+  }
+}
diff --git a/.cursor/INSTALL.md b/.cursor/INSTALL.md
@@ -0,0 +1,15 @@
+# Installing skill-optimizer for Cursor
+
+## Skill install
+
+Install the skill into Cursor's project or global skill directory through the open skills CLI:
+
+```bash
+npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a cursor -y
+```
+
+Cursor can also import remote skills from GitHub in Settings -> Rules -> Project Rules -> Add Rule -> Remote Rule (Github).
+
+## Plugin metadata
+
+This repository includes `.cursor-plugin/plugin.json` for Cursor-compatible plugin metadata. The canonical skill remains `skills/skill-optimizer/SKILL.md`.
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,10 @@ dist/
 
 # Cache directories
 .cache/
-.opencode/
+.opencode/*
+!.opencode/INSTALL.md
+!.opencode/plugins/
+!.opencode/plugins/skill-optimizer.js
 
 # Test results and benchmarks
 results/
@@ -55,6 +58,8 @@ docs/specs/
 
 # Skill-optimizer generated artifacts
 .skill-optimizer/
+.skill-eval/
+.results/
 
 # Local user config (personal paths, model choices — not repo artifacts)
 skill-optimizer.json

diff --git a/.opencode/INSTALL.md b/.opencode/INSTALL.md
@@ -0,0 +1,21 @@
+# Installing skill-optimizer for OpenCode
+
+Add the plugin to `opencode.json` at user or project scope:
+
+```json
+{
+  "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git"]
+}
+```
+
+Restart OpenCode. The plugin registers the repository `skills/` directory so the native `skill` tool can load `skill-optimizer`.
+
+Verify the installation by using the `skill` tool to list the available skills or to load `skill-optimizer`.
+
+To pin a version, append a tag or commit ref:
+
+```json
+{
+  "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git#v2.0.0"]
+}
+```
diff --git a/.opencode/plugins/skill-optimizer.js b/.opencode/plugins/skill-optimizer.js
@@ -0,0 +1,25 @@
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const pluginDir = path.dirname(fileURLToPath(import.meta.url)); // directory containing this module file
+const skillsDir = path.resolve(pluginDir, "..", "..", "skills"); // absolute path of `skills`, two levels above pluginDir
+
+function registerSkillsDir(config) {
+  config.skills = config.skills || {};
+  config.skills.paths = config.skills.paths || [];
+
+  if (!config.skills.paths.includes(skillsDir)) {
+    config.skills.paths.push(skillsDir);
+  }
+}
+
+export const SkillOptimizerPlugin = async () => ({
+  config: async (config) => {
+    registerSkillsDir(config);
+  },
+});
+
+export default {
+  id: "skill-optimizer",
+  server: SkillOptimizerPlugin,
+};
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,61 @@
+# AGENTS.md
+
+## Project Overview
+
+`skill-optimizer` is a Docker workbench for running and grading agent skill eval cases. The current public CLI centers on `run-case` and `run-suite`.
+
+## Key Commands
+
+```bash
+npm run build
+npm run typecheck
+npm test
+npx tsx src/cli.ts --help
+npx tsx src/cli.ts run-case --help
+npx tsx src/cli.ts run-suite --help
+```
+
+## Important Files
+
+- `src/cli.ts`: public CLI entrypoint
+- `src/workbench/`: workbench case loading, suite loading, Docker runner, Pi agent, graders, and traces
+- `docker/workbench-runner.Dockerfile`: generic non-root container image for setup, agent, grade, and cleanup phases
+- `skills/skill-optimizer/SKILL.md`: canonical distributable Agent Skill
+- `.claude-plugin/`, `.codex-plugin/`, `.cursor-plugin/`, `.opencode/`: cross-agent plugin manifests and install support
+- `.agents/plugins/marketplace.json`: Codex repo marketplace entry for the root plugin
+- `gemini-extension.json`, `GEMINI.md`: Gemini extension metadata and context file
+- `examples/workbench/`: tracked example eval suites
+- `README.md`: provider-specific installation instructions for Claude Code, Codex, Cursor, OpenCode, Gemini CLI, and skill-only installs
+- `CONTRIBUTING.md`: contributor workflow and current workbench invariants
+
+## Installation Docs
+
+Keep the README installation section aligned with packaged plugin metadata:
+
+- Claude Code: `.claude-plugin/plugin.json` and `.claude-plugin/marketplace.json`
+- Codex: `.agents/plugins/marketplace.json` and `.codex-plugin/plugin.json`
+- Cursor: `.cursor-plugin/plugin.json` and `.cursor/INSTALL.md`
+- OpenCode: `.opencode/plugins/skill-optimizer.js` and `.opencode/INSTALL.md`
+- Gemini CLI: `gemini-extension.json` and `GEMINI.md`
+- Skill-only installs: `npx skills add fastxyz/skill-optimizer --skill skill-optimizer ...`
+
+## Invariants
+
+- Keep evaluation static: extraction and matching are allowed; do not execute model-produced code outside the Docker workbench as part of evaluation.
+- `run-suite` uses models from `suite.yml`; do not add a `run-suite --models` override.
+- Keep OpenRouter model refs as `openrouter/...`; real model runs require `OPENROUTER_API_KEY`.
+- Cases use `graders: [{ name, command }]`; legacy `check:` and `artifacts:` are invalid.
+- Graders are the acceptance contract; evaluate outputs from `/work`, generated artifacts, `answer.json`, `trace.jsonl`, and result state.
+- The agent phase sees only `/work`, not `/case` or `/results`.
+- Keep plugin metadata pointed at the canonical `skills/skill-optimizer/SKILL.md`; do not create divergent skill copies.
+- Codex plugin metadata lives in `.codex-plugin/plugin.json`; the repo marketplace lives in `.agents/plugins/marketplace.json` and points at `./`.
+- Provider install docs should link to the same canonical skill/plugin metadata, not separate skill copies.
+- Do not commit `.skill-eval/`, `.results/`, `.env`, or credentials.
+
+## Testing Guidance
+
+- Run `npm run typecheck` after TypeScript changes.
+- Run `npm test` before finishing behavior changes.
+- For Docker runner or image changes, also run `docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .`.
+- For CLI/docs changes, run `npx tsx src/cli.ts --help` and confirm its output matches any CLI behavior described in the docs you touched.
+- For plugin/package metadata changes, run `npx tsx tests/smoke-skill-distribution.ts` and verify `npm pack --dry-run --json` includes required plugin files without result/cache directories.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,22 @@
 # Changelog
 
+## 2.0.0 — 2026-05-01
+
+### Changed
+
+- Rebuilt Skill Optimizer around the eval workbench: realistic skill cases, model matrices, isolated agent workspaces, trace inspection, deterministic grader evidence, and iterative skill improvement.
+- Repositioned package and plugin metadata around the skill eval lab workflow instead of implementation mechanics.
+
+### Breaking Changes
+
+- Removed the legacy reference-solution preflight flow and `verify-suite`; graders are now the sole acceptance contract.
+- Removed reference-solution SDK exports and packaged example solution scripts.
+
+### Added
+
+- Hidden MCP services for eval cases, exposed to agents through the workbench `mcp` command.
+- Post-run optimization guidance for inspecting failures, updating skills or supporting code, and re-running evals.
+
 ## 1.1.0 — 2026-04-16
 
 ### Breaking Changes
@@ -25,7 +42,7 @@ The config file `skill-benchmark.json` is no longer auto-detected. Rename it to
 ### Added
 - **prompt surface type** — benchmark and optimize prompt templates, Claude Code skills, and agent instructions. Discovers phases and capabilities from markdown, evaluates output quality with content-based criteria.
 - **Codex auth** — direct OpenAI model runs can use browser-login tokens stored by Codex (`~/.codex/auth.json`) instead of requiring `OPENAI_API_KEY`. Set `benchmark.authMode: "codex"` and use `openai/<model>` IDs.
-- **SKILL folder** — bundled AI-agent guidance (`SKILL/SKILL.md`) so agents can use skill-optimizer reliably without extra setup.
+- **skills folder** — bundled AI-agent guidance (`skills/skill-optimizer/SKILL.md`) so agents can use skill-optimizer reliably without extra setup.
 - **Optimizer loop diagram** — README now includes a visual workflow diagram of the optimizer loop.
 - **Stable task IDs** — task IDs are now derived from a SHA-1 hash of the action names (SDK/CLI/MCP surfaces) or prompt text (prompt surface). For SDK/CLI/MCP surfaces, where action names come from discovered code rather than LLM output, IDs are stable across regenerations and the `--task <id>` filter works reliably. For the prompt surface, IDs are stable when the LLM produces identical wording; if it rephrases a task the ID changes (fixes [#17](https://github.com/fastxyz/skill-optimizer/issues/17)).