From 15602eca462dafe358f41c476d3b538f0539a522 Mon Sep 17 00:00:00 2001
From: sergey21000 <67040429+sergey21000@users.noreply.github.com>
Date: Sat, 2 Aug 2025 10:28:56 +0300
Subject: [PATCH] fix: rename op_offloat to op_offload in llama.py

---
 llama_cpp/llama.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2e93670e6..71d94ebd8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -92,7 +92,7 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offloat: Optional[bool] = None,
+        op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
@@ -174,7 +174,7 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
-            op_offloat: offload host tensor operations to device
+            op_offload: offload host tensor operations to device
             swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
@@ -343,8 +343,8 @@ def __init__(
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
 
-        if op_offloat is not None:
-            self.context_params.op_offloat = op_offloat
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload
 
         if swa_full is not None:
             self.context_params.swa_full = swa_full
@@ -2097,7 +2097,7 @@ def __getstate__(self):
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
-            op_offloat=self.context_params.op_offloat,
+            op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
             no_perf=self.context_params.no_perf,
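
Usage note (not part of the patch): after this rename, the context flag is passed
to the Llama constructor as op_offload rather than the misspelled op_offloat. A
minimal sketch follows; the model path and the n_gpu_layers value are placeholder
assumptions for illustration, not taken from this change.

    from llama_cpp import Llama

    # Enable offloading of host tensor operations to the device via the
    # corrected keyword introduced by this patch.
    llm = Llama(
        model_path="./models/model.gguf",  # placeholder path (assumption)
        n_gpu_layers=-1,                   # assumed setting, not from this patch
        op_offload=True,                   # previously misspelled as op_offloat
    )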