diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json new file mode 100644 index 0000000..1b088b0 --- /dev/null +++ b/.agents/plugins/marketplace.json @@ -0,0 +1,22 @@ +{ + "name": "skill-optimizer", + "interface": { + "displayName": "Skill Optimizer", + "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs" + }, + "plugins": [ + { + "name": "skill-optimizer", + "source": { + "source": "local", + "path": "./" + }, + "policy": { + "installation": "AVAILABLE", + "authentication": "ON_INSTALL" + }, + "category": "Coding" + } + ] +} diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json new file mode 100644 index 0000000..5bf84bf --- /dev/null +++ b/.claude-plugin/marketplace.json @@ -0,0 +1,21 @@ +{ + "name": "skill-optimizer", + "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "owner": { + "name": "Fast" + }, + "plugins": [ + { + "name": "skill-optimizer", + "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "version": "2.0.0", + "source": "./", + "author": { + "name": "Fast" + }, + "skills": [ + "./skills/skill-optimizer" + ] + } + ] +} diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..7aada51 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,19 @@ +{ + "name": "skill-optimizer", + "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "version": "2.0.0", + "author": { + "name": "Fast" + }, + "homepage": "https://github.com/fastxyz/skill-optimizer#readme", + "repository": "https://github.com/fastxyz/skill-optimizer", + "license": "MIT", + "keywords": [ + "agent-skills", + "evals", + "skill-testing", + "model-evaluation", + 
"optimization" + ], + "skills": "./skills/" +} diff --git a/.codex b/.codex deleted file mode 100644 index e69de29..0000000 diff --git a/.codex-plugin/plugin.json b/.codex-plugin/plugin.json new file mode 100644 index 0000000..71fbfc5 --- /dev/null +++ b/.codex-plugin/plugin.json @@ -0,0 +1,35 @@ +{ + "name": "skill-optimizer", + "version": "2.0.0", + "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "author": { + "name": "Fast" + }, + "homepage": "https://github.com/fastxyz/skill-optimizer#readme", + "repository": "https://github.com/fastxyz/skill-optimizer", + "license": "MIT", + "keywords": [ + "agent-skills", + "evals", + "skill-testing", + "model-evaluation", + "optimization" + ], + "skills": "./skills/", + "interface": { + "displayName": "Skill Optimizer", + "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "developerName": "Fast", + "category": "Coding", + "capabilities": [ + "Read", + "Write" + ], + "defaultPrompt": [ + "Use Skill Optimizer to design and run an eval suite for this agent skill.", + "Use Skill Optimizer to inspect failing traces and improve the skill." + ], + "brandColor": "#3B82F6" + } +} diff --git a/.codex/INSTALL.md b/.codex/INSTALL.md new file mode 100644 index 0000000..489e8ee --- /dev/null +++ b/.codex/INSTALL.md @@ -0,0 +1,31 @@ +# Installing skill-optimizer for Codex + +Use `skill-optimizer` in Codex as either a plugin or a native skill. + +## Plugin Install + +Register this repository as a plugin marketplace: + +```bash +codex plugin marketplace add fastxyz/skill-optimizer +``` + +Open `/plugins`, select the `skill-optimizer` marketplace, and install the `skill-optimizer` plugin. + +The marketplace file is `.agents/plugins/marketplace.json`. 
It exposes the repository root as the plugin source so Codex can load `.codex-plugin/plugin.json` and the bundled `skills/` directory. + +To pin a Git ref while installing the marketplace: + +```bash +codex plugin marketplace add fastxyz/skill-optimizer --ref main +``` + +## Skill-Only Install + +Install the canonical skill with the open skills CLI: + +```bash +npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a codex -y +``` + +Restart Codex if the skill does not appear immediately. diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json new file mode 100644 index 0000000..c65a94a --- /dev/null +++ b/.cursor-plugin/plugin.json @@ -0,0 +1,36 @@ +{ + "name": "skill-optimizer", + "displayName": "Skill Optimizer", + "description": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "version": "2.0.0", + "author": { + "name": "Fast" + }, + "homepage": "https://github.com/fastxyz/skill-optimizer#readme", + "repository": "https://github.com/fastxyz/skill-optimizer", + "license": "MIT", + "keywords": [ + "agent-skills", + "evals", + "skill-testing", + "model-evaluation", + "optimization" + ], + "skills": "./skills/", + "interface": { + "displayName": "Skill Optimizer", + "shortDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "longDescription": "Benchmark, evaluate, and optimize skills to ensure reliable performance across all LLMs", + "developerName": "Fast", + "category": "Coding", + "capabilities": [ + "Read", + "Write" + ], + "defaultPrompt": [ + "Use Skill Optimizer to design and run an eval suite for this agent skill.", + "Use Skill Optimizer to inspect failing traces and improve the skill." 
+ ], + "brandColor": "#3B82F6" + } +} diff --git a/.cursor/INSTALL.md b/.cursor/INSTALL.md new file mode 100644 index 0000000..c6d8de6 --- /dev/null +++ b/.cursor/INSTALL.md @@ -0,0 +1,15 @@ +# Installing skill-optimizer for Cursor + +## Skill install + +Install the skill into Cursor's project or global skill directory through the open skills CLI: + +```bash +npx skills add fastxyz/skill-optimizer --skill skill-optimizer -a cursor -y +``` + +Cursor can also import remote skills from GitHub in Settings -> Rules -> Project Rules -> Add Rule -> Remote Rule (Github). + +## Plugin metadata + +This repository includes `.cursor-plugin/plugin.json` for Cursor-compatible plugin metadata. The canonical skill remains `skills/skill-optimizer/SKILL.md`. diff --git a/.gitignore b/.gitignore index 1a93e1c..955a362 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,10 @@ dist/ # Cache directories .cache/ -.opencode/ +.opencode/* +!.opencode/INSTALL.md +!.opencode/plugins/ +!.opencode/plugins/skill-optimizer.js # Test results and benchmarks results/ @@ -55,6 +58,8 @@ docs/specs/ # Skill-optimizer generated artifacts .skill-optimizer/ +.skill-eval/ +.results/ # Local user config (personal paths, model choices — not repo artifacts) skill-optimizer.json diff --git a/.opencode/INSTALL.md b/.opencode/INSTALL.md new file mode 100644 index 0000000..c93409a --- /dev/null +++ b/.opencode/INSTALL.md @@ -0,0 +1,21 @@ +# Installing skill-optimizer for OpenCode + +Add the plugin to `opencode.json` at user or project scope: + +```json +{ + "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git"] +} +``` + +Restart OpenCode. The plugin registers the repository `skills/` directory so the native `skill` tool can load `skill-optimizer`. + +Verify with the skill tool by listing skills or loading `skill-optimizer`. 
+ +To pin a version, append a tag or commit ref: + +```json +{ + "plugin": ["skill-optimizer@git+https://github.com/fastxyz/skill-optimizer.git#v2.0.0"] +} +``` diff --git a/.opencode/plugins/skill-optimizer.js b/.opencode/plugins/skill-optimizer.js new file mode 100644 index 0000000..f733c67 --- /dev/null +++ b/.opencode/plugins/skill-optimizer.js @@ -0,0 +1,25 @@ +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +const pluginDir = path.dirname(fileURLToPath(import.meta.url)); +const skillsDir = path.resolve(pluginDir, "..", "..", "skills"); + +function registerSkillsDir(config) { + config.skills = config.skills || {}; + config.skills.paths = config.skills.paths || []; + + if (!config.skills.paths.includes(skillsDir)) { + config.skills.paths.push(skillsDir); + } +} + +export const SkillOptimizerPlugin = async () => ({ + config: async (config) => { + registerSkillsDir(config); + }, +}); + +export default { + id: "skill-optimizer", + server: SkillOptimizerPlugin, +}; diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..0d09351 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,61 @@ +# AGENTS.md + +## Project Overview + +`skill-optimizer` is a Docker workbench for running and grading agent skill eval cases. The current public CLI centers on `run-case` and `run-suite`. 
+ +## Key Commands + +```bash +npm run build +npm run typecheck +npm test +npx tsx src/cli.ts --help +npx tsx src/cli.ts run-case --help +npx tsx src/cli.ts run-suite --help +``` + +## Important Files + +- `src/cli.ts`: public CLI entrypoint +- `src/workbench/`: workbench case loading, suite loading, Docker runner, Pi agent, graders, and traces +- `docker/workbench-runner.Dockerfile`: generic non-root container image for setup, agent, grade, and cleanup phases +- `skills/skill-optimizer/SKILL.md`: canonical distributable Agent Skill +- `.claude-plugin/`, `.codex-plugin/`, `.cursor-plugin/`, `.opencode/`: cross-agent plugin manifests and install support +- `.agents/plugins/marketplace.json`: Codex repo marketplace entry for the root plugin +- `gemini-extension.json`, `GEMINI.md`: Gemini extension metadata and context file +- `examples/workbench/`: tracked example eval suites +- `README.md`: provider-specific installation instructions for Claude Code, Codex, Cursor, OpenCode, Gemini CLI, and skill-only installs +- `CONTRIBUTING.md`: contributor workflow and current workbench invariants + +## Installation Docs + +Keep the README installation section aligned with packaged plugin metadata: + +- Claude Code: `.claude-plugin/plugin.json` and `.claude-plugin/marketplace.json` +- Codex: `.agents/plugins/marketplace.json` and `.codex-plugin/plugin.json` +- Cursor: `.cursor-plugin/plugin.json` and `.cursor/INSTALL.md` +- OpenCode: `.opencode/plugins/skill-optimizer.js` and `.opencode/INSTALL.md` +- Gemini CLI: `gemini-extension.json` and `GEMINI.md` +- Skill-only installs: `npx skills add fastxyz/skill-optimizer --skill skill-optimizer ...` + +## Invariants + +- Keep evaluation static: extraction and matching are allowed; do not execute model-produced code outside the Docker workbench as part of evaluation. +- `run-suite` uses models from `suite.yml`; do not add a `run-suite --models` override. 
+- Keep OpenRouter model refs as `openrouter/...`; real model runs require `OPENROUTER_API_KEY`. +- Cases use `graders: [{ name, command }]`; legacy `check:` and `artifacts:` are invalid. +- Graders are the acceptance contract; evaluate outputs from `/work`, generated artifacts, `answer.json`, `trace.jsonl`, and result state. +- The agent phase sees only `/work`, not `/case` or `/results`. +- Keep plugin metadata pointed at the canonical `skills/skill-optimizer/SKILL.md`; do not create divergent skill copies. +- Codex plugin metadata lives in `.codex-plugin/plugin.json`; the repo marketplace lives in `.agents/plugins/marketplace.json` and points at `./`. +- Provider install docs should link to the same canonical skill/plugin metadata, not separate skill copies. +- Do not commit `.skill-eval/`, `.results/`, `.env`, or credentials. + +## Testing Guidance + +- Run `npm run typecheck` after TypeScript changes. +- Run `npm test` before finishing behavior changes. +- For Docker runner or image changes, also run `docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .`. +- For CLI/docs changes, verify `npx tsx src/cli.ts --help` if touched docs mention CLI behavior. +- For plugin/package metadata changes, run `npx tsx tests/smoke-skill-distribution.ts` and verify `npm pack --dry-run --json` includes required plugin files without result/cache directories. diff --git a/CHANGELOG.md b/CHANGELOG.md index d52754a..5dc20ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,22 @@ # Changelog +## 2.0.0 — 2026-05-01 + +### Changed + +- Rebuilt Skill Optimizer around the eval workbench: realistic skill cases, model matrices, isolated agent workspaces, trace inspection, deterministic grader evidence, and iterative skill improvement. +- Repositioned package and plugin metadata around the skill eval lab workflow instead of implementation mechanics. 
+ +### Breaking Changes + +- Removed the legacy reference-solution preflight flow and `verify-suite`; graders are now the sole acceptance contract. +- Removed reference-solution SDK exports and packaged example solution scripts. + +### Added + +- Hidden MCP services for eval cases, exposed to agents through the workbench `mcp` command. +- Post-run optimization guidance for inspecting failures, updating skills or supporting code, and re-running evals. + ## 1.1.0 — 2026-04-16 ### Breaking Changes @@ -25,7 +42,7 @@ The config file `skill-benchmark.json` is no longer auto-detected. Rename it to ### Added - **prompt surface type** — benchmark and optimize prompt templates, Claude Code skills, and agent instructions. Discovers phases and capabilities from markdown, evaluates output quality with content-based criteria. - **Codex auth** — direct OpenAI model runs can use browser-login tokens stored by Codex (`~/.codex/auth.json`) instead of requiring `OPENAI_API_KEY`. Set `benchmark.authMode: "codex"` and use `openai/` IDs. -- **SKILL folder** — bundled AI-agent guidance (`SKILL/SKILL.md`) so agents can use skill-optimizer reliably without extra setup. +- **skills folder** — bundled AI-agent guidance (`skills/skill-optimizer/SKILL.md`) so agents can use skill-optimizer reliably without extra setup. - **Optimizer loop diagram** — README now includes a visual workflow diagram of the optimizer loop. - **Stable task IDs** — task IDs are now derived from a SHA-1 hash of the action names (SDK/CLI/MCP surfaces) or prompt text (prompt surface). For SDK/CLI/MCP surfaces, where action names come from discovered code rather than LLM output, IDs are stable across regenerations and the `--task <id>` filter works reliably. For the prompt surface, IDs are stable when the LLM produces identical wording; if it rephrases a task the ID changes (fixes [#17](https://github.com/fastxyz/skill-optimizer/issues/17)). 
diff --git a/CLAUDE.md b/CLAUDE.md index f5fd890..e91b35e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,17 +2,9 @@ ## Project Overview -`skill-optimizer` measures whether LLMs pick the right SDK methods, CLI commands, or MCP tools from docs and task prompts, and can run a benchmark-driven optimization loop over an allowed target repo. +`skill-optimizer` is a Docker workbench for running and grading agent skill eval cases. The current public CLI centers on `run-case` and `run-suite`. -The repo has five layers: - -- `src/project/`: unified `skill-optimizer.json` config loading, validation, and path resolution -- `src/runtime/pi/`: shared Pi auth/model/runtime helpers -- `src/tasks/`: shared task generation, grounding, and artifact freezing from discovered surfaces -- `src/benchmark/`: loads tasks and surface definitions, builds prompts, calls models, extracts actions, evaluates them, and writes reports -- `src/optimizer/`: runs a benchmark-driven optimization loop against a constrained target repo - -The benchmark is static. Do not change behavior in ways that execute model-produced code or shell commands as part of evaluation. +The workbench gives an agent an isolated Docker `/work` directory, captures traces, and grades deterministic local outcomes from files, command logs, generated artifacts, or other workspace state. ## Key Commands @@ -21,101 +13,52 @@ npm run build npm run typecheck npm test npx tsx src/cli.ts --help -npx tsx src/cli.ts generate-tasks --help -npx tsx src/cli.ts optimize --help -``` - -Typical benchmark run: - -```bash -export OPENROUTER_API_KEY=... 
-npx tsx src/cli.ts run --config ./skill-optimizer.json -``` - -Generate tasks only: - -```bash -npx tsx src/cli.ts generate-tasks --config ./skill-optimizer.json -``` - -Typical optimizer run: - -```bash -tsx src/optimizer/materialize-mock-repo.ts mcp-tracker-demo ./.tmp/mock-repos -npx tsx src/cli.ts optimize --config ./.tmp/mock-repos/mcp-tracker-demo/skill-optimizer.json +npx tsx src/cli.ts run-case --help +npx tsx src/cli.ts run-suite --help ``` ## Important Files -- `src/cli.ts`: public CLI entrypoint (`init`, `run`, `optimize`, `compare`) -- `src/project/types.ts`: unified public project config types -- `src/project/load.ts`: unified `skill-optimizer.json` loader -- `src/runtime/pi/models.ts`: shared Pi model/auth resolution -- `src/tasks/index.ts`: shared task generation entrypoint over discovered surfaces -- `src/benchmark/runner.ts`: orchestration for benchmark execution -- `src/benchmark/types.ts`: benchmark report, metric, and extraction types -- `src/benchmark/init.ts`: scaffolded starter `skill-optimizer.json` -- `src/optimizer/loop.ts`: accept/reject iteration loop -- `src/optimizer/manifest.ts`: adapter from unified project config into the current optimizer loop -- `src/optimizer/mock-repos.ts`: tracked template materialization and isolated git init -- `mock-repos/mcp-tracker-demo/`: current richer demo target for optimizer testing +- `src/cli.ts`: public CLI entrypoint +- `src/workbench/`: workbench case loading, suite loading, Docker runner, Pi agent, graders, and traces +- `docker/workbench-runner.Dockerfile`: generic non-root container image for setup, agent, grade, and cleanup phases +- `skills/skill-optimizer/SKILL.md`: canonical distributable Agent Skill +- `skills/skill-optimizer/references/workbench.md`: detailed workbench schema and usage reference +- `.claude-plugin/`, `.codex-plugin/`, `.cursor-plugin/`, `.opencode/`: cross-agent plugin manifests and install support +- `.agents/plugins/marketplace.json`: Codex repo marketplace entry for 
the root plugin +- `gemini-extension.json`, `GEMINI.md`: Gemini extension metadata and context file +- `examples/workbench/`: tracked example eval suites +- `README.md`: provider-specific installation instructions for Claude Code, Codex, Cursor, OpenCode, Gemini CLI, and skill-only installs +- `CONTRIBUTING.md`: contributor workflow and current workbench invariants + +## Installation Docs + +Keep the README installation section aligned with packaged plugin metadata: + +- Claude Code: `.claude-plugin/plugin.json` and `.claude-plugin/marketplace.json` +- Codex: `.agents/plugins/marketplace.json` and `.codex-plugin/plugin.json` +- Cursor: `.cursor-plugin/plugin.json` and `.cursor/INSTALL.md` +- OpenCode: `.opencode/plugins/skill-optimizer.js` and `.opencode/INSTALL.md` +- Gemini CLI: `gemini-extension.json` and `GEMINI.md` +- Skill-only installs: `npx skills add fastxyz/skill-optimizer --skill skill-optimizer ...` ## Invariants -- Keep benchmark evaluation static. Extraction and matching are allowed; executing generated code is not. -- Keep path resolution relative to the unified config file being loaded. -- `targetRepo.allowedPaths` is the optimizer safety boundary. Do not widen edits outside it during mutation. -- `requireCleanGit` must remain effectively enforced for optimizer targets. -- Optimizer-owned artifacts under the configured task-generation output dir must not be treated as target-repo mutations. -- **The target repo's skill file is never modified.** The optimizer copies it to `.skill-optimizer/skill-v0.md` on start and creates versioned copies per accepted iteration. The mutation agent writes to these local copies; `skillOverride` makes the benchmark read from them. -- Stable-surface optimize runs assume the callable surface is frozen for the duration of the run. If a change renames commands/tools/APIs, the surface must be rediscovered and the benchmark snapshot regenerated before further comparisons are meaningful. 
-- Materialized mock repos must stay isolated from tracked templates. -- Documentation examples should match the current CLI and config schema. - -## Editing Guidance - -- Prefer small changes in the existing architecture over broad refactors. -- When updating config or project types, also update the README examples and any scaffolding in `src/benchmark/init.ts` if needed. -- When changing optimizer behavior, verify both the loop and the unified project defaults still agree. -- Code-first surface discovery is the preferred mode for `sdk`, `cli`, and `mcp` via `target.discovery.sources`. Explicit manifest files (`target.cli.commands`, `target.mcp.tools`, `target.discovery.fallbackManifest`) are supported for projects that cannot use code-first discovery. -- Be careful around mock repo references: code may support template names that are not currently present in the working tree. +- Keep evaluation static: extraction and matching are allowed; do not execute model-produced code outside the Docker workbench as part of evaluation. +- `run-suite` uses models from `suite.yml`; do not add a `run-suite --models` override. +- Keep OpenRouter model refs as `openrouter/...`; real model runs require `OPENROUTER_API_KEY`. +- Cases use `graders: [{ name, command }]`; legacy `check:` and `artifacts:` are invalid. +- Graders are the acceptance contract; evaluate outputs from `/work`, generated artifacts, `answer.json`, `trace.jsonl`, and result state. +- The agent phase sees only `/work`, not `/case` or `/results`. +- Keep plugin metadata pointed at the canonical `skills/skill-optimizer/SKILL.md`; do not create divergent skill copies. +- Codex plugin metadata lives in `.codex-plugin/plugin.json`; the repo marketplace lives in `.agents/plugins/marketplace.json` and points at `./`. +- Provider install docs should link to the same canonical skill/plugin metadata, not separate skill copies. +- Do not commit `.skill-eval/`, `.results/`, `.env`, or credentials. 
## Testing Guidance - Run `npm run typecheck` after TypeScript changes. -- Run `npm test` before finishing when behavior changes may affect extraction, evaluation, reporting, or optimizer flow. -- For CLI-only or docs-only changes, at minimum verify `npx tsx src/cli.ts --help` still works if the touched docs reference CLI behavior. - -## Model ID Convention - -Model IDs use a provider-prefixed format. The prefix determines how the request is routed: - -``` -openrouter/<provider>/<model> — routed through OpenRouter -anthropic/<model> — direct Anthropic API -openai/<model> — direct OpenAI API -``` - -**For `openrouter/` model IDs, preserve the exact slug from OpenRouter's catalog** — these are passed verbatim to OpenRouter's API and must match exactly, including dots in version numbers: -- `openrouter/anthropic/claude-sonnet-4.6` ✓ (dots — OpenRouter's catalog format) -- `openrouter/openai/gpt-5.4` ✓ (dots) -- `openrouter/deepseek/deepseek-v3.2` ✓ (dots) -- `openrouter/google/gemini-2.5-flash` ✓ - -**For `anthropic/` direct-API model IDs, use hyphens** — Anthropic's own API slugs use hyphens: -- `anthropic/claude-sonnet-4-6` ✓ (hyphens) -- `anthropic/claude-opus-4-6` ✓ (hyphens) - -**For `openai/` direct-API model IDs, use dots in version segments** — OpenAI's API slugs use dots: -- `openai/gpt-5.4` ✓ (dot) -- `openai/gpt-4.1` ✓ (dot) - -`src/project/validate.ts` warns on dot-notation for `anthropic/` model IDs only (`model-id-bad-format`) and `src/project/fix.ts` auto-corrects them. Both `openai/` and `openrouter/` are fully exempt from any dot→hyphen rewriting. When adding new model presets to `src/init/scaffold.ts`, `src/init/wizard.ts`, or `src/benchmark/init.ts`, copy the slug exactly from the OpenRouter catalog for `openrouter/` models. - -Display names (`name:` / `label:` fields) are human-readable and should keep dots (e.g. `'Claude Sonnet 4.6'`, `'Gemini 2.5 Flash'`). - -## Environment Notes - -- Do not commit `.env` or secrets. 
-- Pi-based examples use `benchmark.format: "pi"` and typically expect `OPENROUTER_API_KEY`. -- The current unified config also allows the optimizer model to use `OPENROUTER_API_KEY`. +- Run `npm test` before finishing behavior changes. +- For Docker runner or image changes, also run `docker build -t skill-optimizer-workbench:local -f docker/workbench-runner.Dockerfile .`. +- For CLI/docs changes, verify `npx tsx src/cli.ts --help` if touched docs mention CLI behavior. +- For plugin/package metadata changes, run `npx tsx tests/smoke-skill-distribution.ts` and verify `npm pack --dry-run --json` includes required plugin files without result/cache directories. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2a4cd71..870e01f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,10 @@ # Contributing to skill-optimizer -Thanks for contributing! This project is a small, opinionated tool — changes should preserve its core invariants (static evaluation, `allowedPaths` safety boundary, per-model universality). +Thanks for contributing! This project is a small, opinionated Docker workbench for evaluating agent skills. Changes should preserve deterministic grading, isolated agent workspaces, and the canonical `skills/skill-optimizer/SKILL.md` distribution path. + +## Installing The Skill + +See `README.md#installation` for provider-specific install instructions for Claude Code, OpenAI Codex CLI/App, Cursor, OpenCode, Gemini CLI, and skill-only installs. ## Local workflow @@ -13,42 +17,48 @@ npm test npm run build ``` -All three commands must pass before opening a PR. +All three commands must pass before opening a PR when code changes are involved. ## Project layout -- `src/cli.ts` — CLI entry point (single source of truth; all `npm run