Commit 2c064e8

Update gguf.py
1 parent 02932be commit 2c064e8

File tree

1 file changed

+108 -84 lines changed

quantllm/quant/gguf.py

Lines changed: 108 additions & 84 deletions
@@ -1,4 +1,4 @@
-"""GGUF (GGML Universal Format) quantization implementation."""
+"""GGUF (GGML Universal Format) quantization implementation with enhanced 2-bit support."""
 
 import gc
 import math
@@ -25,7 +25,7 @@
 except ImportError:
     CT_AVAILABLE = False
 
-# Updated GGUF quantization types with detailed configurations
+# Updated GGUF quantization types with modern 2-bit support
 SUPPORTED_GGUF_TYPES = {
     2: {
         "Q2_K": {
@@ -35,6 +35,18 @@
                 "feed_forward.w2": "Q4_K",
                 "default": "Q2_K"
             }
+        },
+        "IQ2_XXS": {
+            "description": "Importance-based 2-bit quantization (extra small)",
+            "tensor_configs": {
+                "default": "IQ2_XXS"
+            }
+        },
+        "IQ2_XS": {
+            "description": "Importance-based 2-bit quantization (small)",
+            "tensor_configs": {
+                "default": "IQ2_XS"
+            }
         }
     },
     3: {
@@ -79,8 +91,8 @@
         "Q4_K_M": {
             "description": "Uses Q6_K for half of attention.wv and feed_forward.w2, Q4_K for others",
             "tensor_configs": {
-                "attention.wv": ["Q6_K", "Q4_K"],  # Split tensors
-                "feed_forward.w2": ["Q6_K", "Q4_K"],  # Split tensors
+                "attention.wv": ["Q6_K", "Q4_K"],
+                "feed_forward.w2": ["Q6_K", "Q4_K"],
                 "default": "Q4_K"
             }
         },
@@ -107,8 +119,8 @@
         "Q5_K_M": {
             "description": "Uses Q6_K for half of attention.wv and feed_forward.w2, Q5_K for others",
             "tensor_configs": {
-                "attention.wv": ["Q6_K", "Q5_K"],  # Split tensors
-                "feed_forward.w2": ["Q6_K", "Q5_K"],  # Split tensors
+                "attention.wv": ["Q6_K", "Q5_K"],
+                "feed_forward.w2": ["Q6_K", "Q5_K"],
                 "default": "Q5_K"
             }
         },
@@ -141,7 +153,7 @@
 SUPPORTED_GGUF_BITS = list(SUPPORTED_GGUF_TYPES.keys())
 
 class GGUFQuantizer:
-    """GGUF-specific quantizer implementation."""
+    """GGUF-specific quantizer implementation with enhanced quantization support."""
 
     def __init__(
         self,
@@ -194,48 +206,43 @@ def __init__(
 
     def _get_default_quant_type(self, bits: int) -> str:
         """Select optimal GGUF quantization type based on bit width."""
+        preferences = {
+            2: "IQ2_XS",  # Prefer modern 2-bit quantization
+            3: "Q3_K_M",
+            4: "Q4_K_M",
+            5: "Q5_K_M",
+            6: "Q6_K",
+            8: "Q8_0"
+        }
        if bits in SUPPORTED_GGUF_TYPES:
            types = list(SUPPORTED_GGUF_TYPES[bits].keys())
-            # Prefer balanced options (e.g., Q4_K_M over Q4_K_S or Q4_0)
-            preferences = {
-                2: "Q2_K",
-                3: "Q3_K_M",
-                4: "Q4_K_M",
-                5: "Q5_K_M",
-                6: "Q6_K",
-                8: "Q8_0"
-            }
            return preferences.get(bits, types[0])
        raise ValueError(f"No supported GGUF types for {bits} bits")
 
     def get_tensor_quant_type(self, tensor_name: str) -> Union[str, List[str]]:
         """Get the quantization type for a specific tensor."""
-        # Check for exact match
         if tensor_name in self.tensor_configs:
             return self.tensor_configs[tensor_name]
 
-        # Check for partial matches (e.g., "attention.wv" in "model.attention.wv.weight")
         for key in self.tensor_configs:
             if key != "default" and key in tensor_name:
                 return self.tensor_configs[key]
 
-        # Return default if no specific config found
         return self.tensor_configs["default"]
 
     def _get_quant_description(self) -> str:
         """Get the description of the current quantization configuration."""
         return SUPPORTED_GGUF_TYPES[self.bits][self.quant_type]["description"]
 
     def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel]):
-        """Initialize model and tokenizer using BitsAndBytes."""
+        """Initialize model and tokenizer using BitsAndBytes for 4/8-bit or FP16 for others."""
         try:
             if isinstance(model_name, str):
                 logger.log_info(f"Loading tokenizer from: {model_name}")
                 self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-                logger.log_info(f"Loading model with BitsAndBytes configuration")
+                logger.log_info(f"Loading model with {'BitsAndBytes' if self.bits in [4, 8] else 'FP16'} configuration")
 
-                # Load model with quantization settings
                 load_kwargs = {
                     "device_map": self.device_map,
                     "max_memory": self.max_memory,
@@ -245,8 +252,7 @@ def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel
                     "trust_remote_code": True,
                 }
 
-                # Add quantization config if provided
-                if self.quantization_config:
+                if self.quantization_config and self.bits in [4, 8]:
                     load_kwargs["quantization_config"] = self.quantization_config
 
                 self.model = AutoModelForCausalLM.from_pretrained(
@@ -266,7 +272,6 @@ def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel
             else:
                 raise TypeError("model_name must be a string or PreTrainedModel instance")
 
-            # Enable gradient checkpointing if requested
             if self.use_gradient_checkpointing and hasattr(self.model, "gradient_checkpointing_enable"):
                 self.model.gradient_checkpointing_enable()
                 logger.log_info("Gradient checkpointing enabled for memory efficiency")
@@ -282,7 +287,7 @@ def _log_model_stats(self, model: PreTrainedModel, stage: str = ""):
         total_params = sum(p.numel() for p in model.parameters())
         param_size = sum(p.numel() * p.element_size() for p in model.parameters())
         buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
-        total_size = (param_size + buffer_size) / (1024 * 1024)  # MB
+        total_size = (param_size + buffer_size) / (1024 * 1024)
 
         prefix = f"{stage} " if stage else ""
         logger.log_info(f"\n{prefix}Model Statistics:")
@@ -293,18 +298,18 @@ def _log_model_stats(self, model: PreTrainedModel, stage: str = ""):
             logger.log_info(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / (1024 * 1024):.2f} MB")
 
     def convert_to_gguf(self, output_path: str):
-        """Convert quantized model to GGUF format using llama.cpp conversion tools."""
+        """Convert model to GGUF format with separate quantization step."""
         if not CT_AVAILABLE:
             raise ImportError("CTransformers is required for GGUF conversion")
 
         temp_dir = None
+        temp_gguf = None
         try:
-            # Print header
             logger.log_info("\n" + "="*80)
             logger.log_info("🚀 Starting GGUF Conversion Process".center(80))
             logger.log_info("="*80 + "\n")
 
-            # Model Information Section
+            # Model Information
             logger.log_info("📊 Model Information:")
             logger.log_info("-"*40)
             model_type = self.model.config.model_type if hasattr(self.model, 'config') else None
@@ -322,7 +327,7 @@ def convert_to_gguf(self, output_path: str):
             logger.log_info(f"• Model Size: {model_size:.2f} GB")
             logger.log_info("")
 
-            # Conversion Settings Section
+            # Conversion Settings
             logger.log_info("⚙️ Conversion Settings:")
             logger.log_info("-"*40)
             logger.log_info(f"• Output Path: {output_path}")
@@ -340,93 +345,115 @@ def convert_to_gguf(self, output_path: str):
             logger.log_info("• Checkpoint saved successfully")
             logger.log_info("")
 
-            # Find convert.py script
+            # Find llama.cpp tools
             logger.log_info("🔍 Locating GGUF Conversion Tools:")
             logger.log_info("-"*40)
 
-            # First try pip installation path
             try:
                 import llama_cpp
                 llama_cpp_path = os.path.dirname(llama_cpp.__file__)
-                potential_convert = os.path.join(llama_cpp_path, "convert.py")
-                if os.path.exists(potential_convert):
-                    convert_script = potential_convert
-                    logger.log_info(f"• Found convert.py in llama_cpp package: {convert_script}")
-                else:
-                    convert_script = None
-            except ImportError:
-                convert_script = None
-
-            if not convert_script:
-                logger.log_info("• Attempting to install llama-cpp-python...")
+                convert_script = os.path.join(llama_cpp_path, "convert.py")
+                quantize_bin = os.path.join(llama_cpp_path, "quantize")
+                if not os.path.exists(convert_script):
+                    raise FileNotFoundError("convert.py not found")
+                if not os.path.exists(quantize_bin):
+                    raise FileNotFoundError("quantize binary not found")
+                logger.log_info(f"• Found convert.py: {convert_script}")
+                logger.log_info(f"• Found quantize: {quantize_bin}")
+            except (ImportError, FileNotFoundError) as e:
+                logger.log_error(f"• Failed to locate llama.cpp tools: {e}")
                 try:
+                    logger.log_info("• Attempting to install llama-cpp-python...")
                     subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "llama-cpp-python"])
                     import llama_cpp
                     llama_cpp_path = os.path.dirname(llama_cpp.__file__)
                     convert_script = os.path.join(llama_cpp_path, "convert.py")
-                    if not os.path.exists(convert_script):
-                        raise FileNotFoundError("convert.py not found after installation")
-                    logger.log_info("• Successfully installed and located convert.py")
-                except Exception as e:
-                    logger.log_error(f"• Failed to install/locate llama-cpp-python: {e}")
+                    quantize_bin = os.path.join(llama_cpp_path, "quantize")
+                    logger.log_info("• Successfully installed and located tools")
+                except Exception as inst_err:
                     raise RuntimeError(
-                        "Could not find or install llama-cpp-python. Please install manually:\n"
-                        "pip install llama-cpp-python --upgrade"
-                    )
+                        f"Could not find or install llama-cpp-python: {inst_err}\n"
+                        "Install manually: pip install llama-cpp-python --upgrade"
+                    ) from e
 
-            logger.log_info("")
-
-            # Build conversion command
-            logger.log_info("🛠️ Preparing Conversion Command:")
+            # Convert to FP16 GGUF
+            logger.log_info("🛠️ Converting to FP16 GGUF:")
             logger.log_info("-"*40)
-
-            cmd = [
+            temp_gguf = f"{output_path}_temp_f16.gguf"
+            cmd_convert = [
                 sys.executable,
                 convert_script,
                 temp_dir,
-                "--outfile", output_path,
-                "--outtype", f"q{self.bits}" if self.bits < 16 else "f16" if self.bits == 16 else "f32",
+                "--outfile", temp_gguf,
+                "--outtype", "f16",
                 "--model-type", model_type
             ]
 
-            logger.log_info(f"• Command: {' '.join(cmd)}")
-            logger.log_info("")
+            logger.log_info(f"• Command: {' '.join(cmd_convert)}")
+            with tqdm(total=100, desc="Converting to FP16", unit="%") as pbar:
+                process = subprocess.Popen(
+                    cmd_convert,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    universal_newlines=True
+                )
+
+                while True:
+                    output = process.stdout.readline()
+                    if output == '' and process.poll() is not None:
+                        break
+                    if output and "Converting" in output:
+                        try:
+                            progress = int(output.split("%")[0].split()[-1])
+                            pbar.n = progress
+                            pbar.refresh()
+                        except:
+                            pass
+                        logger.log_info(f"• {output.strip()}")
 
-            # Execute conversion
-            logger.log_info("🔄 Running GGUF Conversion:")
+            return_code = process.wait()
+            if return_code != 0:
+                error_output = process.stderr.read()
+                raise RuntimeError(f"FP16 GGUF conversion failed:\n{error_output}")
+
+            # Quantize to target type
+            logger.log_info("\n🔄 Quantizing GGUF:")
             logger.log_info("-"*40)
+            cmd_quantize = [
+                quantize_bin,
+                temp_gguf,
+                output_path,
+                self.quant_type.lower()  # llama.cpp expects lowercase
+            ]
 
-            with tqdm(total=100, desc="Converting to GGUF", unit="%",
-                      bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]") as pbar:
+            logger.log_info(f"• Command: {' '.join(cmd_quantize)}")
+            with tqdm(total=100, desc="Quantizing GGUF", unit="%") as pbar:
                 process = subprocess.Popen(
-                    cmd,
+                    cmd_quantize,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                    universal_newlines=True
                )
 
-                # Monitor conversion progress
                 while True:
                     output = process.stdout.readline()
                     if output == '' and process.poll() is not None:
                         break
-                    if output:
-                        if "Converting" in output:
-                            try:
-                                progress = int(output.split("%")[0].split()[-1])
-                                pbar.n = progress
-                                pbar.refresh()
-                            except:
-                                pass
+                    if output and "%" in output:
+                        try:
+                            progress = int(output.split("%")[0].split()[-1])
+                            pbar.n = progress
+                            pbar.refresh()
+                        except:
+                            pass
                         logger.log_info(f"• {output.strip()}")
 
-            # Check for errors
             return_code = process.wait()
             if return_code != 0:
                 error_output = process.stderr.read()
-                raise RuntimeError(f"GGUF conversion failed with error:\n{error_output}")
-
-            # Verify and report results
+                raise RuntimeError(f"GGUF quantization failed:\n{error_output}")
+
+            # Verify results
             if os.path.exists(output_path):
                 logger.log_info("\n✅ Conversion Results:")
                 logger.log_info("-"*40)
@@ -448,17 +475,16 @@ def convert_to_gguf(self, output_path: str):
             logger.log_error("\n❌ Conversion Failed:")
             logger.log_error("-"*40)
             logger.log_error(f"• Error: {str(e)}")
-            if temp_dir and os.path.exists(temp_dir):
-                shutil.rmtree(temp_dir, ignore_errors=True)
             raise RuntimeError(f"Failed to convert model to GGUF: {str(e)}") from e
 
         finally:
-            # Cleanup
             if temp_dir and os.path.exists(temp_dir):
                 logger.log_info("\n🧹 Cleaning Up:")
                 logger.log_info("-"*40)
                 logger.log_info("• Removing temporary files...")
                 shutil.rmtree(temp_dir, ignore_errors=True)
+            if temp_gguf and os.path.exists(temp_gguf):
+                os.remove(temp_gguf)
             self._clear_memory()
 
     def _clear_memory(self):
@@ -467,5 +493,3 @@ def _clear_memory(self):
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.synchronize()
-
-
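Note (not part of the commit): below is a minimal usage sketch of the two-step flow that convert_to_gguf now implements, i.e. llama.cpp's convert.py producing a temporary FP16 GGUF followed by the quantize binary producing the final file. The GGUFQuantizer constructor arguments shown (model_name, bits, quant_type) are assumptions for illustration; only bits, quant_type, and convert_to_gguf(output_path) appear in the diff above.

# Hypothetical usage sketch, assuming GGUFQuantizer accepts model_name/bits/quant_type.
from quantllm.quant.gguf import GGUFQuantizer

quantizer = GGUFQuantizer(
    model_name="my-org/my-7b-model",  # assumed: an HF checkpoint id or a PreTrainedModel
    bits=2,                           # 2-bit selections now default to IQ2_XS
    quant_type="IQ2_XS",              # or "IQ2_XXS" / "Q2_K" from SUPPORTED_GGUF_TYPES[2]
)

# convert_to_gguf saves a temporary HF checkpoint, converts it to
# "<output>_temp_f16.gguf" with llama.cpp's convert.py, quantizes that file to
# the lowercased quant type with the quantize binary, and removes the
# temporary directory and temporary GGUF in the finally block.
quantizer.convert_to_gguf("my-7b-model.IQ2_XS.gguf")

Splitting conversion and quantization this way follows the usual llama.cpp workflow (convert to f16, then quantize), and it is what makes importance-based types such as IQ2_XS reachable here: the quantization type is applied by the quantize tool rather than by convert.py.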