From 3c23fdc3e318c7e70faf1ddb4dc92601183c1ddd Mon Sep 17 00:00:00 2001 From: Amit Paz Date: Sat, 9 May 2026 09:08:27 +0300 Subject: [PATCH] =?UTF-8?q?fix(subagents):=20break=20recursion=20=E2=80=94?= =?UTF-8?q?=20disarm=20user=20hooks=20for=20subagent=20sessions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each ``claude -p`` capture-extract / dream / graph_extraction subagent is itself a Claude Code session. The user's PostToolUse / Stop / SessionEnd hooks therefore fire for the subagent's *own* tool uses. Without a guard, this cascades: 1. Active session does N tool uses → PostToolUse hook → spawns capture-extract subagent. 2. Subagent's own tool uses (Read, Bash, Grep …) re-fire PostToolUse, which writes to *its* session_id buffer.jsonl. 3. After ``LORE_CAPTURE_N`` entries, the hook spawns *another* capture-extract for the subagent — and so on. Observed in production for one user: ~700 spawns/hour, ~$34/h on Haiku, 685 distinct sessions hit in 60 minutes despite only one interactive ``claude`` process running. ``--strict-mcp-config`` from PR #52 isolates MCP but does not isolate hooks. Two layers of defense: 1. ``subagent_config._settings_body()`` now writes ``hooks`` as empty arrays. ``--settings`` overrides the user's hooks for the subagent's session. 2. New ``SubagentConfig.env_overrides()`` returns ``LORE_AUTO_SAVE=false`` and ``LORE_DREAM_AUTO=false``. The hook scripts honor these as master kill switches and exit 0 immediately. All three spawn sites now pass ``env={**os.environ, **cfg.env_overrides()}`` so the guard survives any caching of ``settings.json`` by the running parent Claude Code process. Three new tests: hooks-empty in materialized settings, env_overrides shape, env_overrides parity across roles. Existing TestSpawnClaudeArgs extended to assert recursion-guard env vars are passed to subprocess.Popen. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lore/cli/commands/capture.py | 6 +++ src/lore/cli/commands/dream.py | 2 + src/lore/services/graph_extraction.py | 2 + src/lore/subagent_config.py | 52 +++++++++++++++++++++++-- tests/services/test_graph_extraction.py | 7 ++++ tests/test_subagent_config.py | 38 ++++++++++++++++++ 6 files changed, 104 insertions(+), 3 deletions(-) diff --git a/src/lore/cli/commands/capture.py b/src/lore/cli/commands/capture.py index b31842d..3f7e92c 100644 --- a/src/lore/cli/commands/capture.py +++ b/src/lore/cli/commands/capture.py @@ -616,6 +616,12 @@ def _spawn_subagent( stdout=log_fh, stderr=subprocess.STDOUT, start_new_session=not foreground, + # Recursion guard: the subagent is itself a Claude Code + # session, so the user's PostToolUse / Stop hooks fire on + # *its* tool uses. Without these env vars the lore-capture-* + # hooks would spawn another capture-extract for the + # subagent's session, which spawns another, etc. + env={**os.environ, **cfg.env_overrides()}, ) except OSError: log_fh.close() diff --git a/src/lore/cli/commands/dream.py b/src/lore/cli/commands/dream.py index cdd7cd9..2fbd6ec 100644 --- a/src/lore/cli/commands/dream.py +++ b/src/lore/cli/commands/dream.py @@ -440,6 +440,8 @@ def _spawn_subagent( stdout=log_fh, stderr=subprocess.STDOUT, start_new_session=True, + # Recursion guard — see lore.subagent_config docstring. + env={**os.environ, **cfg.env_overrides()}, ) except OSError: log_fh.close() diff --git a/src/lore/services/graph_extraction.py b/src/lore/services/graph_extraction.py index 1ee6a7a..fdd3e57 100644 --- a/src/lore/services/graph_extraction.py +++ b/src/lore/services/graph_extraction.py @@ -251,6 +251,8 @@ def _spawn_claude(prompt: str) -> "subprocess.Popen[bytes]": stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + # Recursion guard — see lore.subagent_config docstring. 
+ env={**os.environ, **cfg.env_overrides()}, ) diff --git a/src/lore/subagent_config.py b/src/lore/subagent_config.py index 737aa29..b6b32b1 100644 --- a/src/lore/subagent_config.py +++ b/src/lore/subagent_config.py @@ -10,10 +10,31 @@ This module materializes two artifacts under ``~/.lore/subagent/`` — a minimal MCP config (lore-only or empty) and a minimal settings -override (no plugins, no thinking, low effort) — and returns the -paths plus the chosen model. Spawn sites add ``--model``, +override (no plugins, no thinking, low effort, **no hooks**) — and +returns the paths plus the chosen model. Spawn sites add ``--model``, ``--strict-mcp-config``, ``--mcp-config``, and ``--settings``. +Recursion guard +--------------- +Subagents are themselves Claude Code sessions, which means the user's +PostToolUse / Stop / SessionEnd hooks fire for **the subagent's own +tool uses**. Without a guard, a single capture-extract spawn produces +its own session JSONL, accumulates tool-use entries in its own +buffer.jsonl, crosses ``LORE_CAPTURE_N``, and spawns *another* +capture-extract for itself — and the cascade continues. This was +observed in production: ~700 spawns/hour for a single user, ~$34/h +of background spend on Haiku. + +Two layers of defense: + + 1. ``settings_body()`` writes ``hooks`` as empty arrays. ``--settings`` + overrides the user's hooks for the subagent's session. + 2. ``env_overrides()`` returns ``LORE_AUTO_SAVE=false`` and + ``LORE_DREAM_AUTO=false``. The hook scripts honor these as master + kill switches and exit 0 immediately. Spawn sites merge this into + the subprocess env so the guard survives any caching of + ``settings.json`` by the running Claude Code process. + Environment overrides: * ``LORE_SUBAGENT_MODEL`` — fallback default for all roles * ``LORE_DREAM_MODEL`` — dream-specific override @@ -90,11 +111,21 @@ def _mcp_empty_body() -> dict: def _settings_body() -> dict: # ``--settings`` merges with the user's settings.json. 
Setting these - # keys explicitly overrides any inherited values. + # keys explicitly overrides any inherited values. ``hooks`` as empty + # arrays prevents the recursion described in the module docstring — + # without it, the subagent's own PostToolUse / Stop / SessionEnd + # events would fire the user's lore-capture-* hooks and spawn nested + # capture-extracts ad infinitum. return { "enabledPlugins": {}, "alwaysThinkingEnabled": False, "effortLevel": "low", + "hooks": { + "UserPromptSubmit": [], + "PostToolUse": [], + "Stop": [], + "SessionEnd": [], + }, } @@ -126,6 +157,21 @@ def claude_flags(self) -> list[str]: "--settings", str(self.settings_path), ] + def env_overrides(self) -> dict[str, str]: + """Env additions for the subagent's subprocess. + + Master kill switches the lore hooks honor — keeps the + recursion guard working even if the parent Claude Code + process has cached ``~/.claude/settings.json`` and is still + firing the user's hooks against the subagent's session. + Spawn sites should pass ``env={**os.environ, **cfg.env_overrides()}`` + to ``subprocess.Popen``. + """ + return { + "LORE_AUTO_SAVE": "false", + "LORE_DREAM_AUTO": "false", + } + def subagent_config(*, role: str, with_lore_mcp: bool) -> SubagentConfig: """Return paths + model for a subagent spawn. diff --git a/tests/services/test_graph_extraction.py b/tests/services/test_graph_extraction.py index 667aac3..6de0531 100644 --- a/tests/services/test_graph_extraction.py +++ b/tests/services/test_graph_extraction.py @@ -152,6 +152,13 @@ def __init__(self, cmd, **kwargs): assert "--settings" in flags # Stdin/stdout hygiene. assert captured["kwargs"]["stdin"] is subprocess.DEVNULL + # Recursion-guard env vars must be set (LORE_AUTO_SAVE=false, + # LORE_DREAM_AUTO=false). Without these, the subagent's own + # tool uses fire the user's lore-capture-* hooks and spawn + # nested capture-extracts. 
+ env = captured["kwargs"]["env"] + assert env["LORE_AUTO_SAVE"] == "false" + assert env["LORE_DREAM_AUTO"] == "false" # ── extract_and_persist with stub spawn_fn ───────────────────────── diff --git a/tests/test_subagent_config.py b/tests/test_subagent_config.py index 01a72ff..3ddaf31 100644 --- a/tests/test_subagent_config.py +++ b/tests/test_subagent_config.py @@ -88,6 +88,44 @@ def test_disables_plugins_thinking_and_high_effort(self, _isolated_lore_home): assert body["alwaysThinkingEnabled"] is False assert body["effortLevel"] == "low" + def test_hooks_are_empty_to_break_subagent_recursion(self, _isolated_lore_home): + # Without this, the subagent's own PostToolUse / Stop / SessionEnd + # events fire the user's lore-capture-* hooks and spawn nested + # capture-extracts ad infinitum. Observed in production: + # ~700 spawns/hour on Haiku, ~$34/h. + cfg = sc.subagent_config(role="capture", with_lore_mcp=True) + body = json.loads(Path(cfg.settings_path).read_text()) + assert body["hooks"] == { + "UserPromptSubmit": [], + "PostToolUse": [], + "Stop": [], + "SessionEnd": [], + } + + +# ── env_overrides() — recursion guard fallback ──────────────────── + + +class TestEnvOverrides: + def test_disarms_capture_and_dream_hook_kill_switches(self, _isolated_lore_home): + cfg = sc.subagent_config(role="capture", with_lore_mcp=True) + env = cfg.env_overrides() + # Master kill switches the lore hook scripts honor — second line + # of defense in case the parent claude process has cached + # ~/.claude/settings.json and is still firing user hooks against + # the subagent's session. + assert env["LORE_AUTO_SAVE"] == "false" + assert env["LORE_DREAM_AUTO"] == "false" + + def test_env_overrides_same_for_all_roles(self, _isolated_lore_home): + # Recursion is a Claude-Code-level concern; affects all subagent + # types equally regardless of role. 
+ for role in ("capture", "dream", "graph"): + cfg = sc.subagent_config(role=role, with_lore_mcp=False) + env = cfg.env_overrides() + assert env["LORE_AUTO_SAVE"] == "false" + assert env["LORE_DREAM_AUTO"] == "false" + # ── claude_flags() shape ──────────────────────────────────────────