
Commit 939fa72

Author: Ralf Waldukat (committed)
fix: prevent KV cache corruption on SWA/ISWA models (e.g. Gemma-4)
SWA/ISWA KV caches maintain global position maps (g_iswa_pos_max/min) that are only cleared by llama_memory_clear(), not by kv_cache_seq_rm(). When generate() finds a prefix match (e.g. a shared BOS token), it calls kv_cache_seq_rm, which returns True for ISWA caches, so the full reset is skipped. The stale position maps then cause batch-allocator inconsistency, and llama_decode fails (returns -1) on subsequent prompts.

Changes:
- Add a _has_swa property via llama_model_n_swa() > 0
- reset() now calls llama_memory_clear() unconditionally
- generate() bypasses the prefix-match optimization for SWA models, forcing a full state reset (same path as recurrent models)
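A minimal reproduction sketch of the failure mode described above, using the high-level llama-cpp-python API. The model path is hypothetical and stands in for any SWA/ISWA model (e.g. a Gemma-style GGUF); the prompts are illustrative only:

from llama_cpp import Llama

# Hypothetical path: any model for which llama_model_n_swa() > 0 hits this code path.
llm = Llama(model_path="./models/gemma-swa.gguf", n_ctx=2048, verbose=False)

# First prompt fills the split KV cache and its internal position tracking.
print(llm("Explain sliding-window attention briefly.", max_tokens=32)["choices"][0]["text"])

# The second prompt shares only the BOS token with the first. Before this
# commit, generate() treated that 1-token match as a usable prefix,
# kv_cache_seq_rm() reported success for the ISWA cache, the full reset was
# skipped, and llama_decode could fail with -1. With this commit, SWA models
# force longest_prefix = 0 and take the full state-reset path instead.
print(llm("List two kinds of KV cache used by llama.cpp.", max_tokens=32)["choices"][0]["text"])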
1 parent 1cb8b9f commit 939fa72

1 file changed

Lines changed: 39 additions & 1 deletion

File tree

llama_cpp/llama.py

@@ -553,6 +553,14 @@ def free_lora_adapter():
 
         self._sampler = None
 
+        # Cache model architecture flags to avoid repeated FFI calls
+        self._is_recurrent_model = llama_cpp.llama_model_is_recurrent(
+            self._model.model
+        ) or llama_cpp.llama_model_is_hybrid(self._model.model)
+        self._has_swa_model = llama_cpp.llama_model_n_swa(
+            self._model.model
+        ) > 0
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         return self._ctx.ctx
@@ -580,6 +588,14 @@ def eval_logits(self) -> Deque[List[float]]:
             maxlen=self._n_ctx if self._logits_all else 1,
         )
 
+    @property
+    def _is_recurrent(self) -> bool:
+        return self._is_recurrent_model
+
+    @property
+    def _has_swa(self) -> bool:
+        return self._has_swa_model
+
     def tokenize(
         self, text: bytes, add_bos: bool = True, special: bool = False
     ) -> List[int]:
@@ -638,6 +654,10 @@ def reset(self):
         """Reset the model state."""
         self.n_tokens = 0
 
+        mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+        if mem is not None:
+            llama_cpp.llama_memory_clear(mem, True)
+
     def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
 
@@ -889,11 +909,29 @@ def generate(
         # Check for kv cache prefix match
         if reset and self.n_tokens > 0:
             longest_prefix = 0
-            for a, b in zip(self._input_ids, tokens[:-1]):
+            for a, b in zip(self._input_ids, tokens):
                 if a == b:
                     longest_prefix += 1
                 else:
                     break
+
+            # Recurrent models cannot rewind state; reset if needed
+            if self._is_recurrent and longest_prefix < self.n_tokens:
+                longest_prefix = 0
+                reset = True
+                if self.verbose:
+                    print(
+                        "Llama.generate: recurrent model requires full state reset",
+                        file=sys.stderr,
+                    )
+
+            # SWA/ISWA models (e.g. Gemma-4) have split KV caches whose
+            # position-tracking maps are only cleared by a full reset.
+            # Partial seq_rm leaves stale positions and causes decode failure.
+            if self._has_swa and longest_prefix < self.n_tokens:
+                longest_prefix = 0
+                reset = True
+
             if longest_prefix > 0:
                 if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
                     reset = False
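As a usage-level sketch of the reset() change (model path again hypothetical): after this commit, explicitly resetting a reused Llama instance clears the native memory/KV cache via llama_memory_clear() in addition to the Python-side token count, so unrelated prompts on SWA models start from a clean state even without relying on the prefix-match logic in generate():

from llama_cpp import Llama

llm = Llama(model_path="./models/gemma-swa.gguf", n_ctx=2048, verbose=False)  # hypothetical path
first = llm("First prompt.", max_tokens=16)

# reset() now zeroes n_tokens and also calls llama_memory_clear() on the
# context's memory, dropping the SWA/ISWA position maps along with it.
llm.reset()
second = llm("A completely unrelated second prompt.", max_tokens=16)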
