
Update July 2025 #1225

Merged (5 commits, Aug 6, 2025)
Changes from all commits
2 changes: 1 addition & 1 deletion LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
@@ -119,7 +119,7 @@ public void GlobalCleanup()
{
if(ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
{
- Executor.Context.NativeHandle.KvCacheClear();
+ Executor.Context.NativeHandle.MemoryClear();
}
}

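This is the first of several identical renames in this PR: calls on `Context.NativeHandle` move from the old `KvCache*` names to the new `Memory*` names (`KvCacheClear` to `MemoryClear`, `KvCacheRemove` to `MemorySequenceRemove`, and likewise for the sequence copy/add/divide methods further down). A minimal usage sketch of the new names, assuming a hypothetical model path:

```csharp
using LLama;
using LLama.Common;
using LLama.Native;

// Hypothetical model path; any GGUF model will do for this sketch.
var parameters = new ModelParams("models/example.gguf");
using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);

// Formerly KvCacheClear(): clear the whole KV memory.
context.NativeHandle.MemoryClear();

// Formerly KvCacheRemove(): drop the cells of sequence 0. The examples in this PR
// pass -1, -1 to cover the full position range.
context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, -1, -1);
```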
4 changes: 2 additions & 2 deletions LLama.Examples/Examples/BatchedExecutorSimple.cs
@@ -97,8 +97,8 @@ await AnsiConsole.Live(table).StartAsync(async ctx =>

// A generic error, this is fatal and the batch can no longer be used. This should never occur and generally indicates
// a bug in LLamaSharp, llama.cpp or a hardware error.
- if (decodeResult == DecodeResult.Error)
- throw new Exception("Unknown error occurred while inferring.");
+ if (decodeResult != DecodeResult.Ok)
+ throw new Exception($"Error occurred while inferring: {decodeResult}");

// After inference all of the conversations must be sampled before running inference again.
foreach (var conversationData in conversations)
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
@@ -79,7 +79,7 @@ public static async Task Run()
// When the prompt contains images we clear KV_CACHE to restart conversation
// See:
// https://github.com/ggerganov/llama.cpp/discussions/3620
- ex.Context.NativeHandle.KvCacheRemove( LLamaSeqId.Zero, -1, -1 );
+ ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );

int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
54 changes: 0 additions & 54 deletions LLama.Unittest/LLavaWeightsTests.cs

This file was deleted.

9 changes: 9 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
@@ -110,6 +110,15 @@ public class ModelOptions
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <inheritdoc />
public bool? OpOffload { get; set; }

/// <inheritdoc />
public bool? SwaFull { get; set; }

/// <inheritdoc />
public bool? KVUnified { get; set; }

/// <inheritdoc />
public float? DefragThreshold { get; set; }

24 changes: 22 additions & 2 deletions LLama/Abstractions/IContextParams.cs
@@ -109,8 +109,7 @@ public interface IContextParams
bool FlashAttention { get; }

/// <summary>
- /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
- /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)
+ /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
/// </summary>
float? DefragThreshold { get; }

@@ -123,4 +122,25 @@ public interface IContextParams
/// Attention type to use for embeddings
/// </summary>
LLamaAttentionType AttentionType { get; }

/// <summary>
/// Offload host tensor operations to device
/// </summary>
bool? OpOffload { get; }

/// <summary>
/// use a unified buffer across the input sequences when computing the attention.
/// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
/// <br />
/// ref: <a href="https://github.com/ggml-org/llama.cpp/pull/14363">https://github.com/ggml-org/llama.cpp/pull/14363</a>
/// </summary>
bool? KVUnified { get; }

/// <summary>
/// Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
/// </summary>
/// <remarks>Setting to false when n_seq_max > 1 can cause bad performance in some cases
/// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
/// </remarks>
bool? SwaFull { get; }
}
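The three new nullable context options above are also surfaced on `ModelOptions` and `ModelParams` in this PR, and `ToLlamaContextParams` only copies them to the native struct when a value is set, so `null` keeps llama.cpp's defaults. A small configuration sketch with illustrative values (model path and numbers are assumptions, not from the PR):

```csharp
using LLama.Common;

// Illustrative values only; leaving any of the three new options null keeps the llama.cpp default.
var parameters = new ModelParams("models/example.gguf")
{
    ContextSize = 4096,
    OpOffload = true,    // offload host tensor operations to the device
    KVUnified = false,   // separate KV buffers per sequence; the docs above suggest trying this when n_seq_max > 1
    SwaFull = true       // full-size SWA cache
};
```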
12 changes: 6 additions & 6 deletions LLama/Batched/Conversation.cs
@@ -84,7 +84,7 @@
_disposed = true;

// Remove this conversation from the KV cache
- Executor.Context.NativeHandle.KvCacheRemove(ConversationId, -1, -1);
+ Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1);

// Prevent finalizer from running
GC.SuppressFinalize(this);
@@ -129,7 +129,7 @@
_forked = true;

// Assign tokens to the new sequence
- Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end);
+ Executor.Context.NativeHandle.MemorySequenceCopy(ConversationId, c.ConversationId, 0, _end);

return c;
}
@@ -406,11 +406,11 @@
/// <param name="end">End position (exclusive)</param>
public void Remove(LLamaPos start, LLamaPos end)
{
- _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}

/// <summary>
/// Removes <see cref="count"/> tokens starting from <see cref="start"/>

[GitHub Actions annotation on line 413 (Test: linux-release, osx-release, windows-release): XML comment has cref attributes 'start' and 'count' that could not be resolved]
/// </summary>
/// <param name="start">Start position (inclusive)</param>
/// <param name="count">Number of tokens</param>
@@ -420,7 +420,7 @@
return;

var end = start.Value + count;
- _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceRemove(_conversation.ConversationId, start, end);
}
#endregion

@@ -435,7 +435,7 @@
/// <param name="delta">Amount to add on to each token position</param>
public void Add(LLamaPos start, LLamaPos end, int delta)
{
- _conversation.Executor.Context.NativeHandle.KvCacheSequenceAdd(_conversation.ConversationId, start, end, delta);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceAdd(_conversation.ConversationId, start, end, delta);
}
#endregion

@@ -452,7 +452,7 @@
if (divisor <= 0)
throw new ArgumentOutOfRangeException(nameof(divisor));

- _conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor);
+ _conversation.Executor.Context.NativeHandle.MemorySequenceDivide(_conversation.ConversationId, start, end, divisor);
}
#endregion
}
2 changes: 1 addition & 1 deletion LLama/ChatSession.cs
@@ -199,7 +199,7 @@ public void LoadSession(SessionState state, bool loadTransforms = true)
}
if (state.ContextState is null)
{
- Executor.Context.NativeHandle.KvCacheClear();
+ Executor.Context.NativeHandle.MemoryClear();
}
else
{
9 changes: 9 additions & 0 deletions LLama/Common/ModelParams.cs
@@ -112,6 +112,15 @@ public record ModelParams
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <inheritdoc />
public bool? OpOffload { get; set; }

/// <inheritdoc />
public bool? SwaFull { get; set; }

/// <inheritdoc />
public bool? KVUnified { get; set; }

/// <summary>
/// `Encoding` cannot be directly JSON serialized, instead store the name as a string which can
/// </summary>
7 changes: 7 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -55,6 +55,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo

result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);

if (@params.SwaFull.HasValue)
result.swa_full = @params.SwaFull.Value;
if (@params.OpOffload.HasValue)
result.op_offload = @params.OpOffload.Value;
if (@params.KVUnified.HasValue)
result.kv_unified = @params.KVUnified.Value;
}

private static int Threads(int? value)
13 changes: 7 additions & 6 deletions LLama/LLamaExecutorBase.cs
@@ -128,15 +128,16 @@ public StatefulExecutorBase WithSessionFile(string filename)
}
if (File.Exists(filename))
{
- _logger?.LogInformation($"[LLamaExecutor] Attempting to load saved session from {filename}");
+ _logger?.LogInformation("[LLamaExecutor] Attempting to load saved session from {0}", filename);

var session_tokens = new LLamaToken[Context.ContextSize];
if (!NativeApi.llama_state_load_file(Context.NativeHandle, _pathSession, session_tokens, (ulong)Context.ContextSize, out var n_token_count_out))
{
_logger?.LogError($"[LLamaExecutor] Failed to load session file {filename}");
throw new RuntimeError($"Failed to load session file {_pathSession}");
}
_session_tokens = session_tokens.Take((int)n_token_count_out).ToList();
- _logger?.LogInformation($"[LLamaExecutor] Loaded a session with prompt size of {session_tokens.Length} tokens");
+ _logger?.LogInformation("[LLamaExecutor] Loaded a session with prompt size of {0} tokens", session_tokens.Length);
}
else
{
@@ -190,11 +191,11 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep)
// if we run out of context:
// - take the tokensToKeep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches
- int n_left = _pastTokensCount - tokensToKeep;
- int n_discard = n_left / 2;
+ var n_left = _pastTokensCount - tokensToKeep;
+ var n_discard = n_left / 2;

- NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
- NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
+ Context.NativeHandle.MemorySequenceRemove(LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
+ Context.NativeHandle.MemorySequenceAdd(LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);

_pastTokensCount -= n_discard;
// stop saving session if we run out of context
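As a worked example of the context-shift arithmetic in `HandleRunOutOfContext` (numbers are illustrative, not from the PR): with `_pastTokensCount` = 4096 and `tokensToKeep` = 4, `n_left` = 4092 and `n_discard` = 2046, so `MemorySequenceRemove` drops positions [4, 2050) of sequence 0 and `MemorySequenceAdd` shifts positions [2050, 4096) back by 2046, leaving 2050 tokens in the context.

```csharp
using System;

// Sketch of the arithmetic only, with assumed numbers.
const int pastTokensCount = 4096; // stands in for _pastTokensCount
const int tokensToKeep = 4;

var n_left = pastTokensCount - tokensToKeep; // 4092
var n_discard = n_left / 2;                  // 2046

// Remove [4, 2050), then shift [2050, 4096) by -2046 so the remaining tokens stay contiguous.
Console.WriteLine($"remove [{tokensToKeep}, {tokensToKeep + n_discard}), shift by {-n_discard}");
```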
8 changes: 4 additions & 4 deletions LLama/LLamaReranker.cs
@@ -114,7 +114,7 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOn
batch.Add(tokens[i], i, LLamaSeqId.Zero, true);

// clear previous kv_cache values
- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();

// Check if we should cancel the work, just before doing anything expensive (encode/decode)
cancellationToken.ThrowIfCancellationRequested();
@@ -144,7 +144,7 @@ public async Task<IReadOnlyList<float>> GetRelevanceScores(string input, IReadOn

var score = Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero)[0];

- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();

return (normalize ? Sigmoid(score) : score, tokens.Length);
}
@@ -155,7 +155,7 @@ private async Task<IReadOnlyList<float>> CalcRelevanceScores(LLamaBatch batch, b
var seqNum = logicCap.Value + 1;
List<float> scores = new List<float>(seqNum);
// clear previous kv_cache values
- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();

// Check if we should cancel the work, just before doing anything expensive (encode/decode)
cancellationToken.ThrowIfCancellationRequested();
@@ -189,7 +189,7 @@ private async Task<IReadOnlyList<float>> CalcRelevanceScores(LLamaBatch batch, b
scores.Add(normalize ? Sigmoid(score) : score);
}

- Context.NativeHandle.KvCacheClear();
+ Context.NativeHandle.MemoryClear();

return scores;
}