@@ -92,7 +92,7 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offloat: Optional[bool] = None,
+        op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
@@ -174,7 +174,7 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
-            op_offloat: offload host tensor operations to device
+            op_offload: offload host tensor operations to device
             swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
@@ -343,8 +343,8 @@ def __init__(
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn

-        if op_offloat is not None:
-            self.context_params.op_offloat = op_offloat
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload

         if swa_full is not None:
             self.context_params.swa_full = swa_full
@@ -2097,7 +2097,7 @@ def __getstate__(self):
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
-            op_offloat=self.context_params.op_offloat,
+            op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
             no_perf=self.context_params.no_perf,
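For reference, a minimal usage sketch of the renamed keyword argument. The model path below is a hypothetical placeholder, not part of this diff; the parameter names come from the constructor signature shown above.

    from llama_cpp import Llama

    # op_offload (previously misspelled op_offloat) is forwarded to
    # llama.cpp's context params and, when set, offloads host tensor
    # operations to the device.
    llm = Llama(
        model_path="./models/model.gguf",  # hypothetical local GGUF path
        offload_kqv=True,
        flash_attn=True,
        op_offload=True,
    )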