
Update July 2025 #1225

Merged (5 commits, Aug 6, 2025)
Changes from all commits
2 changes: 1 addition & 1 deletion LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
@@ -119,7 +119,7 @@ public void GlobalCleanup()
{
if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
{
- Executor.Context.NativeHandle.KvCacheClear();
+ Executor.Context.NativeHandle.MemoryClear();
}
}

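This is the first of several identical renames in this PR: calls on `Context.NativeHandle` move from the old `KvCache*` names to the new `Memory*` names (`KvCacheClear` to `MemoryClear`, `KvCacheRemove` to `MemorySequenceRemove`, and likewise for the sequence copy/add/divide methods further down). A minimal usage sketch of the new names, assuming a hypothetical model path:

```csharp
using LLama;
using LLama.Common;
using LLama.Native;

// Hypothetical model path; any GGUF model will do for this sketch.
var parameters = new ModelParams("models/example.gguf");
using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);

// Formerly KvCacheClear(): clear the whole KV memory.
context.NativeHandle.MemoryClear();

// Formerly KvCacheRemove(): drop the cells of sequence 0. The examples in this PR
// pass -1, -1 to cover the full position range.
context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, -1, -1);
```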
4 changes: 2 additions & 2 deletions LLama.Examples/Examples/BatchedExecutorSimple.cs
@@ -97,8 +97,8 @@ await AnsiConsole.Live(table).StartAsync(async ctx =>

// A generic error, this is fatal and the batch can no longer be used. This should never occur and generally indicates
// a bug in LLamaSharp, llama.cpp or a hardware error.
- if (decodeResult == DecodeResult.Error)
- throw new Exception("Unknown error occurred while inferring.");
+ if (decodeResult != DecodeResult.Ok)
+ throw new Exception($"Error occurred while inferring: {decodeResult}");

// After inference all of the conversations must be sampled before running inference again.
foreach (var conversationData in conversations)
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
@@ -79,7 +79,7 @@ public static async Task Run()
// When the prompt contains images we clear KV_CACHE to restart conversation
// See:
// https://github.com/ggerganov/llama.cpp/discussions/3620
- ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
+ ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );

int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
54 changes: 0 additions & 54 deletions LLama.Unittest/LLavaWeightsTests.cs

This file was deleted.

9 changes: 9 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
@@ -110,6 +110,15 @@ public class ModelOptions
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <inheritdoc />
public bool? OpOffload { get; set; }

/// <inheritdoc />
public bool? SwaFull { get; set; }

/// <inheritdoc />
public bool? KVUnified { get; set; }

/// <inheritdoc />
public float? DefragThreshold { get; set; }

24 changes: 22 additions & 2 deletions LLama/Abstractions/IContextParams.cs
@@ -109,8 +109,7 @@ public interface IContextParams
bool FlashAttention { get; }

/// <summary>
- /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
- /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)
+ /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
/// </summary>
float? DefragThreshold { get; }

@@ -123,4 +122,25 @@ public interface IContextParams
/// Attention type to use for embeddings
/// </summary>
LLamaAttentionType AttentionType { get; }

/// <summary>
/// Offload host tensor operations to device
/// </summary>
bool? OpOffload { get; }

/// <summary>
/// use a unified buffer across the input sequences when computing the attention.
/// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
/// <br />
/// ref: <a href="https://github.com/ggml-org/llama.cpp/pull/14363">https://github.com/ggml-org/llama.cpp/pull/14363</a>
/// </summary>
bool? KVUnified { get; }

/// <summary>
/// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
/// </summary>
/// <remarks>Setting to false when n_seq_max > 1 can cause bad performance in some cases
/// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
/// </remarks>
bool? SwaFull { get; }
}
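The three new nullable context options above are also surfaced on `ModelOptions` and `ModelParams` in this PR, and `ToLlamaContextParams` only copies them to the native struct when a value is set, so `null` keeps llama.cpp's defaults. A small configuration sketch with illustrative values (model path and numbers are assumptions, not from the PR):

```csharp
using LLama.Common;

// Illustrative values only; leaving any of the three new options null keeps the llama.cpp default.
var parameters = new ModelParams("models/example.gguf")
{
    ContextSize = 4096,
    OpOffload = true,    // offload host tensor operations to the device
    KVUnified = false,   // separate KV buffers per sequence; the docs above suggest trying this when n_seq_max > 1
    SwaFull = true       // full-size SWA cache
};
```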
12 changes: 6 additions & 6 deletions LLama/Batched/Conversation.cs
@@ -84,7 +84,7 @@
_disposed = true;

// Remove this conversation from the KV cache
- Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1);
+ Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1);

// Prevent finalizer from running
GC.SuppressFinalize(this);
@@ -129,7 +129,7 @@
_forked = true;

// Assign tokens to the new sequence
- Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end);
+ Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end);

return c;
}
@@ -406,11 +406,11 @@
/// <param name="end">End position (exclusive)</param>
public void Remove(LLamaPos start, LLamaPos end)
{
- _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}

/// <summary>
/// Removes <see cref="count"/> tokens starting from <see cref="start"/>

[GitHub Actions annotation on line 413 (Test: linux-release, osx-release, windows-release): XML comment has cref attributes 'start' and 'count' that could not be resolved]
/// </summary>
/// <param name="start">Start position (inclusive)</param>
/// <param name="count">Number of tokens</param>
@@ -420,7 +420,7 @@
return;

var end = start.Value + count;
- _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}
#endregion

@@ -435,7 +435,7 @@
/// <param name="delta">Amount to add on to each token position</param>
public void Add(LLamaPos start, LLamaPos end, int delta)
{
- _conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta);
}
#endregion

@@ -452,7 +452,7 @@
if (divisor <= 0)
throw new ArgumentOutOfRangeException(nameof(divisor));

- _conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor);
}
#endregion
}
2 changes: 1 addition & 1 deletion LLama/ChatSession.cs
@@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true)
}
if (state.ContextState is null)
{
- Executor.Context.NativeHandle.KvCacheClear();
+ Executor.Context.NativeHandle.MemoryClear();
}
else
{
9 changes: 9 additions & 0 deletions LLama/Common/ModelParams.cs
@@ -112,6 +112,15 @@ public record ModelParams
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <inheritdoc />
public bool? OpOffload { get; set; }

/// <inheritdoc />
public bool? SwaFull { get; set; }

/// <inheritdoc />
public bool? KVUnified { get; set; }

/// <summary>
/// `Encoding` cannot be directly JSON serialized, instead store the name as a string which can
/// </summary>
7 changes: 7 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -55,6 +55,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo

result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);

if (@params.SwaFull.HasValue)
result.swa_full = @params.SwaFull.Value;
if (@params.OpOffload.HasValue)
result.op_offload = @params.OpOffload.Value;
if (@params.KVUnified.HasValue)
result.kv_unified = @params.KVUnified.Value;
}

private static int Threads(int? value)
13 changes: 7 additions & 6 deletions LLama/LLamaExecutorBase.cs
@@ -128,15 +128,16 @@ public StatefulExecutorBase WithSessionFile(string filename)
}
if (File.Exists(filename))
{
- _logger?.LogInformation($"[LLamaExecutor] Attempting to load saved session from {filename}");
+ _logger?.LogInformation("[LLamaExecutor] Attempting to load saved session from {0}", filename);

var session_tokens = new LLamaToken[Context.ContextSize];
if (!NativeApi.llama_state_load_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out))
{
_logger?.LogError($"[LLamaExecutor] Failed to load session file {filename}");
throw new RuntimeError($"Failed to load session file {_pathSession}");
}
_session_tokens = session_tokens.Take((int)n_token_count_out).ToList();
- _logger?.LogInformation($"[LLamaExecutor] Loaded a session with prompt size of {session_tokens.Length} tokens");
+ _logger?.LogInformation("[LLamaExecutor] Loaded a session with prompt size of {0} tokens", session_tokens.Length);
}
else
{
@@ -190,11 +191,11 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep)
// if we run out of context:
// - take the tokensToKeep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches
- int n_left = _pastTokensCount - tokensToKeep;
- int n_discard = n_left / 2;
+ var n_left = _pastTokensCount - tokensToKeep;
+ var n_discard = n_left / 2;

- NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
- NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
+ Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
+ Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);

_pastTokensCount -= n_discard;
// stop saving session if we run out of context
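As a worked example of the context-shift arithmetic in `HandleRunOutOfContext` (numbers are illustrative, not from the PR): with `_pastTokensCount` = 4096 and `tokensToKeep` = 4, `n_left` = 4092 and `n_discard` = 2046, so `MemorySequenceRemove` drops positions [4, 2050) of sequence 0 and `MemorySequenceAdd` shifts positions [2050, 4096) back by 2046, leaving 2050 tokens in the context.

```csharp
using System;

// Sketch of the arithmetic only, with assumed numbers.
const int pastTokensCount = 4096; // stands in for _pastTokensCount
const int tokensToKeep = 4;

var n_left = pastTokensCount - tokensToKeep; // 4092
var n_discard = n_left / 2;                  // 2046

// Remove [4, 2050), then shift [2050, 4096) by -2046 so the remaining tokens stay contiguous.
Console.WriteLine($"remove [{tokensToKeep}, {tokensToKeep + n_discard}), shift by {-n_discard}");
```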
8 changes: 4 additions & 4 deletions LLama/LLamaReranker.cs
@@ -114,7 +114,7 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOn
batch.Add(tokens[i], i, LLamaSeqId.Zero, true);

// clear previous kv_cache values
- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();

// Check if we should cancel the work, just before doing anything expensive (encode/decode)
cancellationToken.ThrowIfCancellationRequested();
@@ -144,7 +144,7 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOn

var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];

- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();

return (normalize ? Sigmoid(score) : score, tokens.Length);
}
@@ -155,7 +155,7 @@ private async Task<IReadOnlyList<float>> CalcRelevanceScores(LLamaBatch batch, b
var seqNum = logicCap.Value + 1;
List<float> scores = new List<float>(seqNum);
// clear previous kv_cache values
- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();

// Check if we should cancel the work, just before doing anything expensive (encode/decode)
cancellationToken.ThrowIfCancellationRequested();
@@ -189,7 +189,7 @@ private async Task<IReadOnlyList<float>> CalcRelevanceScores(LLamaBatch batch, b
scores.Add(normalize ? Sigmoid(score) : score);
}

- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();

return scores;
}