From 3c23fdc3e318c7e70faf1ddb4dc92601183c1ddd Mon Sep 17 00:00:00 2001
From: Amit Paz <amit.paz@gmail.com>
Date: Sat, 9 May 2026 09:08:27 +0300
Subject: [PATCH] =?UTF-8?q?fix(subagents):=20break=20recursion=20=E2=80=94?=
 =?UTF-8?q?=20disarm=20user=20hooks=20for=20subagent=20sessions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each ``claude -p`` capture-extract / dream / graph_extraction subagent
is itself a Claude Code session. The user's PostToolUse / Stop /
SessionEnd hooks therefore fire for the subagent's *own* tool uses.
Without a guard, this cascades:

  1. Active session does N tool uses → PostToolUse hook → spawns
     capture-extract subagent.
  2. Subagent's own tool uses (Read, Bash, Grep …) re-fire PostToolUse,
     which writes to *its* session_id buffer.jsonl.
  3. After ``LORE_CAPTURE_N`` entries, the hook spawns *another*
     capture-extract for the subagent — and so on.

Observed in production for one user: ~700 spawns/hour, ~$34/h on
Haiku, 685 distinct sessions hit in 60 minutes despite only one
interactive ``claude`` process running. ``--strict-mcp-config`` from
PR #52 isolates MCP but does not isolate hooks.

Two layers of defense:

  1. ``subagent_config._settings_body()`` now writes ``hooks`` with
     empty arrays for UserPromptSubmit, PostToolUse, Stop, and
     SessionEnd. ``--settings`` overrides the user's hooks for the
     subagent's session.
  2. New ``SubagentConfig.env_overrides()`` returns
     ``LORE_AUTO_SAVE=false`` and ``LORE_DREAM_AUTO=false``. The
     hook scripts honor these as master kill switches and exit 0
     immediately. All three spawn sites now pass
     ``env={**os.environ, **cfg.env_overrides()}`` so the guard
     survives any caching of ``settings.json`` by the running
     parent Claude Code process.

3 new tests: hooks-empty in materialized settings, env_overrides
shape, env_overrides parity across roles. Existing
TestSpawnClaudeArgs extended to assert recursion-guard env vars
are passed to subprocess.Popen.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/lore/cli/commands/capture.py        |  6 +++
 src/lore/cli/commands/dream.py          |  2 +
 src/lore/services/graph_extraction.py   |  2 +
 src/lore/subagent_config.py             | 52 +++++++++++++++++++++++--
 tests/services/test_graph_extraction.py |  7 ++++
 tests/test_subagent_config.py           | 38 ++++++++++++++++++
 6 files changed, 104 insertions(+), 3 deletions(-)

diff --git a/src/lore/cli/commands/capture.py b/src/lore/cli/commands/capture.py
index b31842d..3f7e92c 100644
--- a/src/lore/cli/commands/capture.py
+++ b/src/lore/cli/commands/capture.py
@@ -616,6 +616,12 @@ def _spawn_subagent(
             stdout=log_fh,
             stderr=subprocess.STDOUT,
             start_new_session=not foreground,
+            # Recursion guard: the subagent is itself a Claude Code
+            # session, so the user's PostToolUse / Stop hooks fire on
+            # *its* tool uses. Without these env vars the lore-capture-*
+            # hooks would spawn another capture-extract for the
+            # subagent's session, which spawns another, etc.
+            env={**os.environ, **cfg.env_overrides()},
         )
     except OSError:
         log_fh.close()
diff --git a/src/lore/cli/commands/dream.py b/src/lore/cli/commands/dream.py
index cdd7cd9..2fbd6ec 100644
--- a/src/lore/cli/commands/dream.py
+++ b/src/lore/cli/commands/dream.py
@@ -440,6 +440,8 @@ def _spawn_subagent(
             stdout=log_fh,
             stderr=subprocess.STDOUT,
             start_new_session=True,
+            # Recursion guard — see lore.subagent_config docstring.
+            env={**os.environ, **cfg.env_overrides()},
         )
     except OSError:
         log_fh.close()
diff --git a/src/lore/services/graph_extraction.py b/src/lore/services/graph_extraction.py
index 1ee6a7a..fdd3e57 100644
--- a/src/lore/services/graph_extraction.py
+++ b/src/lore/services/graph_extraction.py
@@ -251,6 +251,8 @@ def _spawn_claude(prompt: str) -> "subprocess.Popen[bytes]":
         stdin=subprocess.DEVNULL,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
+        # Recursion guard — see lore.subagent_config docstring.
+        env={**os.environ, **cfg.env_overrides()},
     )
 
 
diff --git a/src/lore/subagent_config.py b/src/lore/subagent_config.py
index 737aa29..b6b32b1 100644
--- a/src/lore/subagent_config.py
+++ b/src/lore/subagent_config.py
@@ -10,10 +10,31 @@
 
 This module materializes two artifacts under ``~/.lore/subagent/`` —
 a minimal MCP config (lore-only or empty) and a minimal settings
-override (no plugins, no thinking, low effort) — and returns the
-paths plus the chosen model. Spawn sites add ``--model``,
+override (no plugins, no thinking, low effort, **no hooks**) — and
+returns the paths plus the chosen model. Spawn sites add ``--model``,
 ``--strict-mcp-config``, ``--mcp-config``, and ``--settings``.
 
+Recursion guard
+---------------
+Subagents are themselves Claude Code sessions, which means the user's
+PostToolUse / Stop / SessionEnd hooks fire for **the subagent's own
+tool uses**. Without a guard, a single capture-extract spawn produces
+its own session JSONL, accumulates tool-use entries in its own
+buffer.jsonl, crosses ``LORE_CAPTURE_N``, and spawns *another*
+capture-extract for itself — and the cascade continues. This was
+observed in production: ~700 spawns/hour for a single user, ~$34/h
+of background spend on Haiku.
+
+Two layers of defense:
+
+  1. ``_settings_body()`` writes ``hooks`` as empty arrays. ``--settings``
+     overrides the user's hooks for the subagent's session.
+  2. ``env_overrides()`` returns ``LORE_AUTO_SAVE=false`` and
+     ``LORE_DREAM_AUTO=false``. The hook scripts honor these as master
+     kill switches and exit 0 immediately. Spawn sites merge this into
+     the subprocess env so the guard survives any caching of
+     ``settings.json`` by the running Claude Code process.
+
 Environment overrides:
   * ``LORE_SUBAGENT_MODEL``  — fallback default for all roles
   * ``LORE_DREAM_MODEL``     — dream-specific override
@@ -90,11 +111,21 @@ def _mcp_empty_body() -> dict:
 
 def _settings_body() -> dict:
     # ``--settings`` merges with the user's settings.json. Setting these
-    # keys explicitly overrides any inherited values.
+    # keys explicitly overrides any inherited values. ``hooks`` as empty
+    # arrays prevents the recursion described in the module docstring —
+    # without it, the subagent's own PostToolUse / Stop / SessionEnd
+    # events would fire the user's lore-capture-* hooks and spawn nested
+    # capture-extracts ad infinitum.
     return {
         "enabledPlugins": {},
         "alwaysThinkingEnabled": False,
         "effortLevel": "low",
+        "hooks": {
+            "UserPromptSubmit": [],
+            "PostToolUse": [],
+            "Stop": [],
+            "SessionEnd": [],
+        },
     }
 
 
@@ -126,6 +157,21 @@ def claude_flags(self) -> list[str]:
             "--settings", str(self.settings_path),
         ]
 
+    def env_overrides(self) -> dict[str, str]:
+        """Env additions for the subagent's subprocess.
+
+        Master kill switches the lore hooks honor — keeps the
+        recursion guard working even if the parent Claude Code
+        process has cached ``~/.claude/settings.json`` and is still
+        firing the user's hooks against the subagent's session.
+        Spawn sites should pass ``env={**os.environ, **cfg.env_overrides()}``
+        to ``subprocess.Popen``.
+        """
+        return {
+            "LORE_AUTO_SAVE": "false",
+            "LORE_DREAM_AUTO": "false",
+        }
+
 
 def subagent_config(*, role: str, with_lore_mcp: bool) -> SubagentConfig:
     """Return paths + model for a subagent spawn.
diff --git a/tests/services/test_graph_extraction.py b/tests/services/test_graph_extraction.py
index 667aac3..6de0531 100644
--- a/tests/services/test_graph_extraction.py
+++ b/tests/services/test_graph_extraction.py
@@ -152,6 +152,13 @@ def __init__(self, cmd, **kwargs):
         assert "--settings" in flags
         # Stdin/stdout hygiene.
         assert captured["kwargs"]["stdin"] is subprocess.DEVNULL
+        # Recursion-guard env vars must be set (LORE_AUTO_SAVE=false,
+        # LORE_DREAM_AUTO=false). Without these, the subagent's own
+        # tool uses fire the user's lore-capture-* hooks and spawn
+        # nested capture-extracts.
+        env = captured["kwargs"]["env"]
+        assert env["LORE_AUTO_SAVE"] == "false"
+        assert env["LORE_DREAM_AUTO"] == "false"
 
 
 # ── extract_and_persist with stub spawn_fn ─────────────────────────
diff --git a/tests/test_subagent_config.py b/tests/test_subagent_config.py
index 01a72ff..3ddaf31 100644
--- a/tests/test_subagent_config.py
+++ b/tests/test_subagent_config.py
@@ -88,6 +88,44 @@ def test_disables_plugins_thinking_and_high_effort(self, _isolated_lore_home):
         assert body["alwaysThinkingEnabled"] is False
         assert body["effortLevel"] == "low"
 
+    def test_hooks_are_empty_to_break_subagent_recursion(self, _isolated_lore_home):
+        # Without this, the subagent's own PostToolUse / Stop / SessionEnd
+        # events fire the user's lore-capture-* hooks and spawn nested
+        # capture-extracts ad infinitum. Observed in production:
+        # ~700 spawns/hour on Haiku, ~$34/h.
+        cfg = sc.subagent_config(role="capture", with_lore_mcp=True)
+        body = json.loads(Path(cfg.settings_path).read_text())
+        assert body["hooks"] == {
+            "UserPromptSubmit": [],
+            "PostToolUse": [],
+            "Stop": [],
+            "SessionEnd": [],
+        }
+
+
+# ── env_overrides() — recursion guard fallback ────────────────────
+
+
+class TestEnvOverrides:
+    def test_disarms_capture_and_dream_hook_kill_switches(self, _isolated_lore_home):
+        cfg = sc.subagent_config(role="capture", with_lore_mcp=True)
+        env = cfg.env_overrides()
+        # Master kill switches the lore hook scripts honor — second line
+        # of defense in case the parent claude process has cached
+        # ~/.claude/settings.json and is still firing user hooks against
+        # the subagent's session.
+        assert env["LORE_AUTO_SAVE"] == "false"
+        assert env["LORE_DREAM_AUTO"] == "false"
+
+    def test_env_overrides_same_for_all_roles(self, _isolated_lore_home):
+        # Recursion is a Claude-Code-level concern; it affects all
+        # subagent types equally regardless of role.
+        for role in ("capture", "dream", "graph"):
+            cfg = sc.subagent_config(role=role, with_lore_mcp=False)
+            env = cfg.env_overrides()
+            assert env["LORE_AUTO_SAVE"] == "false"
+            assert env["LORE_DREAM_AUTO"] == "false"
+
 
 # ── claude_flags() shape ──────────────────────────────────────────