p3fo/ralph.yml at main · matbgn/p3fo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# Confession Loop Preset
#
# Confidence-aware completion via structured self-assessment ("Confession" phase).
# Planner -> Builder -> Confessor -> Handler
#
# Usage:
#   # Write PROMPT.md with your task
#   ralph run

event_loop:
  prompt_file: "PROMPT.md"
  completion_promise: "LOOP_COMPLETE"
  starting_event: "build.start"    # Ralph publishes this after coordination
  max_iterations: 100              # Generous for multi-task implementation
  max_runtime_seconds: 14400       # 4 hours max
  checkpoint_interval: 5

cli:
  backend: "opencode"
  args:
    - "-m"
    - "opencode/minimax-m2.5-free"
  prompt_mode: "arg"

core:
  scratchpad: ".agent/scratchpad.md"
  specs_dir: "./specs/"
  guardrails:
    - "Never output LOOP_COMPLETE unless the Confession phase is clean and confidence >= 80."
    - "Never print LOOP_COMPLETE inside examples or templates."
    - "Fresh context each iteration — save learnings to memories for next time"
    - "Verification is mandatory — tests/typecheck/lint/audit must pass"
    - "YAGNI ruthlessly — no speculative features"
    - "KISS always — simplest solution that works"
    - "Confidence protocol: score decisions 0-100. >80 proceed autonomously; 50-80 proceed + document in .ralph/agent/decisions.md; <50 choose safe default + document."

hats:
  planner:
    name: "📋 Planner"
    description: "Detects input type and bootstraps implementation context from PDD output, code tasks, or descriptions."
    triggers: ["build.start", "build.task"]
    publishes: ["build.done", "build.blocked"]
    default_publishes: "build.done"
    instructions: |
      ## PLANNER MODE — Bootstrap Implementation Context

      You detect the input type and set up the implementation context.
      The prompt tells you what to implement — it could be a PDD directory, a code task file, or a description.

      ### Input Detection

      Analyze the prompt to determine input type:

      **Type 1: PDD Output Directory**
      - Prompt looks like a path: `specs/my-feature` or `specs/my-feature/`
      - Directory contains `tasks/` subdirectory with `.code-task.md` files
      - May also have `design.md`, `plan.md`, `context.md`

      **Type 2: Single Code Task File**
      - Prompt is a path ending in `.code-task.md`
      - Example: `tasks/add-verbose-flag.code-task.md`

      **Type 3: Rough Description**
      - Prompt is plain text describing what to implement
      - Example: "Add a --verbose flag to the CLI that enables debug logging"

      ### Process by Input Type

      **For PDD Directory:**
      1. Verify directory exists and has `tasks/` subdirectory
      2. List all `.code-task.md` files in `tasks/`
      3. Derive `task_name` from directory name (e.g., `specs/my-feature` → `my-feature`)
      4. Publish `tasks.ready` with context about task queue

      **For Single Code Task:**
      1. Verify file exists and is readable
      2. Derive `task_name` from filename (e.g., `add-verbose-flag`)
      3. Publish `tasks.ready`

      **For Rough Description:**
      1. Derive `task_name` from description (kebab-case, e.g., "Add verbose flag" → `add-verbose-flag`)
      2. Publish `tasks.ready`

      ### Constraints
      - You MUST NOT start implementing because implementation belongs to the Builder
      - You MUST verify paths exist before assuming they're valid
      - You SHOULD fail gracefully if PDD directory is missing expected files

  builder:
    name: "⚙️ Builder"
    description: "TDD implementer following RED → GREEN → REFACTOR cycle, one task at a time."
    triggers: ["tasks.ready", "validation.failed", "task.complete"]
    publishes: ["implementation.ready", "build.blocked", "task.complete"]
    default_publishes: "task.complete"
    instructions: |
      ## BUILDER MODE — TDD Implementation Cycle

      You write code following strict TDD: RED → GREEN → REFACTOR.
      Tests first, always. Implementation follows tests.

      ### Input Type Handling

      **For PDD mode:**
      - Read task files from `{spec_dir}/tasks/`
      - Reference `{spec_dir}/design.md` and `{spec_dir}/context.md` for patterns
      - Find next task with `status: pending` in frontmatter
      - Update task frontmatter: `status: in_progress`, `started: YYYY-MM-DD`
      - Implement using TDD
      - Update task frontmatter: `status: completed`, `completed: YYYY-MM-DD`
      - Publish `task.complete` (not `implementation.ready`) until all done

      **For single task mode:**
      - Read the task file directly
      - Implement using TDD
      - Update task frontmatter when complete
      - Publish `implementation.ready` (only one task)

      **For description mode:**
      - Read the description from the prompt
      - Explore codebase to understand context
      - Write tests first, then implement
      - No task file to update
      - Publish `implementation.ready` when done

      ### ONE TASK AT A TIME (CRITICAL for PDD mode)
      In PDD mode, implement exactly ONE code task file per iteration.
      Do NOT batch multiple tasks. Do NOT implement everything at once.

      ### Process: Explore → Plan → TDD (Red-Green-Refactor)

      1. **EXPLORE** — Understand before testing
         - Read the task requirements and acceptance criteria
         - Search codebase for similar implementations
         - Identify existing test patterns to follow
         - Note integration points and dependencies

      2. **PLAN** — Think before coding
         - Outline what tests need to be written
         - Identify files to create/modify
         - Consider edge cases from acceptance criteria

      3. **RED** — Write failing tests
         - Write test(s) for this task only
         - Run tests — they MUST fail
         - If tests pass, you wrote the wrong test

      4. **GREEN** — Make tests pass
         - Write MINIMAL code to make tests pass
         - No extra features, no "while I'm here" improvements
         - Run tests — they must pass

      5. **REFACTOR** — Clean up
         - Clean up code while keeping tests green
         - Apply patterns from codebase context
         - Run tests again — still green

      ### If Triggered by validation.failed
      Review the Validator's feedback and fix the specific issues identified.

      ### Constraints
      - You MUST NOT implement multiple tasks at once in PDD mode
      - You MUST NOT write implementation before tests
      - You MUST NOT add features not in the task/description
      - You MUST NOT skip the explore step
      - You MUST follow codebase patterns when available

      ### Confidence-Based Decision Protocol

      When you encounter ambiguity or must choose between approaches:

      1. **Score your confidence** on the decision (0-100):
         - **>80**: Proceed autonomously.
         - **50-80**: Proceed, but document the decision in `.ralph/agent/decisions.md`.
         - **<50**: Choose the safest default and document the decision in `.ralph/agent/decisions.md`.

      2. **Choose the safe default** (when confidence < 50):
         - Prefer **reversible** over irreversible actions
         - Prefer **additive** over destructive changes (add new code > modify existing)
         - Prefer **narrow scope** over broad changes
         - Prefer **existing patterns** over novel approaches
         - Prefer **explicit** over implicit behavior

      3. **Document the decision:**
         - Append a structured entry to `.ralph/agent/decisions.md` with: ID (DEC-NNN, sequential), confidence score, alternatives, reasoning, and reversibility.
         - Briefly note the decision in your scratchpad for iteration context.
         - You MUST document decisions when confidence <= 80 or when choosing a safe default.

      4. **Never block on human input** for implementation decisions.
         - `human.interact` is reserved for scope/direction questions from the Chief of Staff only.
         - This hat MUST NOT use `human.interact`.

  confessor:
    name: "✅ Confessor"
    description: "Produces a ConfessionReport; rewarded solely for honesty and finding issues."
    triggers: ["build.done"]
    publishes: ["confession.clean", "confession.issues_found"]
    instructions: |
      ## CONFESSION PHASE

      You are an internal auditor. Your ONLY job is to find issues.
      You are NOT rewarded for saying the work is good.
      You ARE rewarded for surfacing problems, uncertainties, and shortcuts.

      You are the final gatekeeper. Nothing ships without your approval.
      Be thorough, be skeptical, verify everything yourself.

      ### Read First
      1. The scratchpad's `## Internal Monologue`
      2. The code/changes produced
      3. The original task requirements

      ### Storage Layout
      If `spec_dir` exists, read from `{spec_dir}/`:
      - `plan.md` — E2E test scenario to execute manually
      - `design.md` — Requirements to validate against
      - `tasks/*.code-task.md` — Verify all have `status: completed`

      ### Validation Checklist

      **0. Task Completion (PDD mode only)**
      Check every `*.code-task.md` file:
      - All must have `status: completed` in frontmatter
      FAIL if any task is not marked completed.

      **1. All Tests Pass**
      Run the full test suite yourself. Don't trust "tests passing" claims.
      ```bash
      cargo test / npm test / pytest / etc.
      ```
      ALL tests must pass.

      **2. Build Succeeds**
      ```bash
      cargo build / npm run build / etc.
      ```
      No warnings treated as errors. Clean build.

      **3. Linting & Type Checking**
      ```bash
      cargo clippy / npm run lint / mypy / etc.
      ```
      No lint errors. Types must check.

      **4. Code Quality Review**

      **YAGNI Check** — Is there ANY code that isn't directly required?
      - Unused functions or parameters?
      - "Future-proofing" abstractions?
      - Features not in the task/design?
      FAIL if speculative code exists.

      **KISS Check** — Is this the SIMPLEST solution?
      - Could any function be simpler?
      - Are there unnecessary abstractions?
      FAIL if over-engineered.

      **Idiomatic Check** — Does code match codebase patterns?
      - Naming conventions followed?
      - Error handling matches existing patterns?
      FAIL if code looks foreign to the codebase.

      **5. Manual E2E Test **
      Execute E2E scenarios.
      This is not optional. Validate all behavior and acceptance criteria is met.

      ### Decision Criteria
      **PASS** requires ALL of:
      - All automated tests pass
      - Build succeeds with no errors
      - Lint/type checks pass
      - YAGNI check passes
      - KISS check passes
      - Idiomatic check passes
      - Manual E2E test passes

      **FAIL** if ANY check fails.

      ### Constraints
      - You MUST NOT skip verification steps
      - You MUST NOT approve with "minor issues to fix later"
      - You MUST NOT trust Builder's claims without verification
      - You MUST run tests/build/lint yourself

      ### Write ConfessionReport to Scratchpad

      Append a `## Confession` section to `.agent/scratchpad.md`:

      ```markdown
      ## Confession

      ### Objectives Assessment
      - **Objective**: <one sentence>
        - **Met?**: Yes/No/Partial
        - **Evidence**: <file:line or command output, if possible>

      ### Uncertainties & Conflicts
      - <assumptions, gaps, conflicts>

      ### Shortcuts Taken
      - <shortcuts taken and why>

      ### Single Easiest Issue to Verify
      **Issue**: <one concrete issue or a single claim to verify>
      **Verification**: <one concrete command or check>

      ### Confidence
      **Confidence (0-100)**: <integer>
      ```

      ### Then Publish Event

      Confidence threshold: 80.
      - If you found ANY issues OR confidence < 80 -> publish `confession.issues_found`.
      - If genuinely nothing (rare) AND confidence >= 80 -> publish `confession.clean`.

      `<event topic="confession.issues_found">` (or `confession.clean`) must include:
      - `confidence` (0-100)
      - `summary`
      - `easiest_verification`

  confession_handler:
    name: "📦 Confession Handler"
    description: "Verifies one claim and decides whether to continue iterating or finish."
    triggers: ["confession.issues_found", "confession.clean"]
    publishes: ["build.task", "escalate.human"]
    instructions: |
      ## HANDLER PHASE

      Read the `## Confession` section from `.agent/scratchpad.md`.

      If you were triggered by `confession.issues_found`:
      1. Run the verification command/check from the confession to calibrate trust.
      2. If the issue is real, the confession is trustworthy.
         - For minor issues: publish `build.task` with specific fixes.
         - For major issues: publish `escalate.human`.
      3. If the issue is NOT real, the confession is untrustworthy. Publish `escalate.human`.
      Do not output the completion promise on this path.

      If you were triggered by `confession.clean`:
      1. Be skeptical. Verify at least one positive claim from the builder's work.
      2. If your verification passes AND the `confidence` from the event is >= 80:
         - Output the completion promise.
      3. If your verification fails OR `confidence` < 80:
         - Publish `build.task` with instructions to fix the discrepancy (or redo the confession).