1- """GGUF (GGML Universal Format) quantization implementation."""
1+ """GGUF (GGML Universal Format) quantization implementation with enhanced 2-bit support ."""
22
33import gc
44import math
2525except ImportError :
2626 CT_AVAILABLE = False
2727
28- # Updated GGUF quantization types with detailed configurations
28+ # Updated GGUF quantization types with modern 2-bit support
2929SUPPORTED_GGUF_TYPES = {
3030 2 : {
3131 "Q2_K" : {
@@ -35,6 +35,18 @@
                 "feed_forward.w2": "Q4_K",
                 "default": "Q2_K"
             }
+        },
+        "IQ2_XXS": {
+            "description": "Importance-based 2-bit quantization (extra small)",
+            "tensor_configs": {
+                "default": "IQ2_XXS"
+            }
+        },
+        "IQ2_XS": {
+            "description": "Importance-based 2-bit quantization (small)",
+            "tensor_configs": {
+                "default": "IQ2_XS"
+            }
         }
     },
     3: {
@@ -79,8 +91,8 @@
         "Q4_K_M": {
             "description": "Uses Q6_K for half of attention.wv and feed_forward.w2, Q4_K for others",
             "tensor_configs": {
-                "attention.wv": ["Q6_K", "Q4_K"],  # Split tensors
-                "feed_forward.w2": ["Q6_K", "Q4_K"],  # Split tensors
+                "attention.wv": ["Q6_K", "Q4_K"],
+                "feed_forward.w2": ["Q6_K", "Q4_K"],
                 "default": "Q4_K"
             }
         },
@@ -107,8 +119,8 @@
         "Q5_K_M": {
             "description": "Uses Q6_K for half of attention.wv and feed_forward.w2, Q5_K for others",
             "tensor_configs": {
-                "attention.wv": ["Q6_K", "Q5_K"],  # Split tensors
-                "feed_forward.w2": ["Q6_K", "Q5_K"],  # Split tensors
+                "attention.wv": ["Q6_K", "Q5_K"],
+                "feed_forward.w2": ["Q6_K", "Q5_K"],
                 "default": "Q5_K"
             }
         },
@@ -141,7 +153,7 @@
 SUPPORTED_GGUF_BITS = list(SUPPORTED_GGUF_TYPES.keys())
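+# Illustrative lookup against the table above (a sketch, not part of the public API):
+#   SUPPORTED_GGUF_TYPES[2]["IQ2_XS"]["tensor_configs"]["default"]  -> "IQ2_XS"
+# Note: llama.cpp's IQ2 types generally expect an importance matrix (imatrix)
+# to be supplied at quantization time for usable quality.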
 
 class GGUFQuantizer:
-    """GGUF-specific quantizer implementation."""
+    """GGUF-specific quantizer implementation with enhanced quantization support."""
 
     def __init__(
         self,
@@ -194,48 +206,43 @@ def __init__(
 
     def _get_default_quant_type(self, bits: int) -> str:
         """Select optimal GGUF quantization type based on bit width."""
+        preferences = {
+            2: "IQ2_XS",  # Prefer modern 2-bit quantization
+            3: "Q3_K_M",
+            4: "Q4_K_M",
+            5: "Q5_K_M",
+            6: "Q6_K",
+            8: "Q8_0"
+        }
        if bits in SUPPORTED_GGUF_TYPES:
             types = list(SUPPORTED_GGUF_TYPES[bits].keys())
-            # Prefer balanced options (e.g., Q4_K_M over Q4_K_S or Q4_0)
-            preferences = {
-                2: "Q2_K",
-                3: "Q3_K_M",
-                4: "Q4_K_M",
-                5: "Q5_K_M",
-                6: "Q6_K",
-                8: "Q8_0"
-            }
             return preferences.get(bits, types[0])
         raise ValueError(f"No supported GGUF types for {bits} bits")
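+    # Illustrative behavior (assumes the table holds the widths listed in
+    # `preferences`): _get_default_quant_type(4) -> "Q4_K_M", while an
+    # unregistered width such as 7 raises ValueError.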
 
     def get_tensor_quant_type(self, tensor_name: str) -> Union[str, List[str]]:
         """Get the quantization type for a specific tensor."""
-        # Check for exact match
         if tensor_name in self.tensor_configs:
             return self.tensor_configs[tensor_name]
 
-        # Check for partial matches (e.g., "attention.wv" in "model.attention.wv.weight")
         for key in self.tensor_configs:
             if key != "default" and key in tensor_name:
                 return self.tensor_configs[key]
 
-        # Return default if no specific config found
         return self.tensor_configs["default"]
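+    # Resolution order (illustrative; the tensor name below is hypothetical):
+    # an exact key wins, then the first substring match ("attention.wv" matches
+    # "model.layers.0.attention.wv.weight"), then the "default" entry.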
 
     def _get_quant_description(self) -> str:
         """Get the description of the current quantization configuration."""
         return SUPPORTED_GGUF_TYPES[self.bits][self.quant_type]["description"]
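+    # e.g. with bits=4 and quant_type="Q4_K_M" this returns the string from the
+    # table above: "Uses Q6_K for half of attention.wv and feed_forward.w2,
+    # Q4_K for others".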
 
     def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel]):
-        """Initialize model and tokenizer using BitsAndBytes."""
+        """Initialize model and tokenizer using BitsAndBytes for 4/8-bit or FP16 for others."""
         try:
             if isinstance(model_name, str):
                 logger.log_info(f"Loading tokenizer from: {model_name}")
                 self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-                logger.log_info(f"Loading model with BitsAndBytes configuration")
+                logger.log_info(f"Loading model with {'BitsAndBytes' if self.bits in [4, 8] else 'FP16'} configuration")
 
-                # Load model with quantization settings
                 load_kwargs = {
                     "device_map": self.device_map,
                     "max_memory": self.max_memory,
@@ -245,8 +252,7 @@ def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel
                     "trust_remote_code": True,
                 }
 
-                # Add quantization config if provided
-                if self.quantization_config:
+                if self.quantization_config and self.bits in [4, 8]:
                     load_kwargs["quantization_config"] = self.quantization_config
 
                 self.model = AutoModelForCausalLM.from_pretrained(
@@ -266,7 +272,6 @@ def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel
             else:
                 raise TypeError("model_name must be a string or PreTrainedModel instance")
 
-            # Enable gradient checkpointing if requested
             if self.use_gradient_checkpointing and hasattr(self.model, "gradient_checkpointing_enable"):
                 self.model.gradient_checkpointing_enable()
                 logger.log_info("Gradient checkpointing enabled for memory efficiency")
@@ -282,7 +287,7 @@ def _log_model_stats(self, model: PreTrainedModel, stage: str = ""):
         total_params = sum(p.numel() for p in model.parameters())
         param_size = sum(p.numel() * p.element_size() for p in model.parameters())
         buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
-        total_size = (param_size + buffer_size) / (1024 * 1024)  # MB
+        total_size = (param_size + buffer_size) / (1024 * 1024)
 
         prefix = f"{stage} " if stage else ""
         logger.log_info(f"\n{prefix}Model Statistics:")
@@ -293,18 +298,18 @@ def _log_model_stats(self, model: PreTrainedModel, stage: str = ""):
             logger.log_info(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / (1024 * 1024):.2f} MB")
 
     def convert_to_gguf(self, output_path: str):
-        """Convert quantized model to GGUF format using llama.cpp conversion tools."""
+        """Convert model to GGUF format with a separate quantization step."""
         if not CT_AVAILABLE:
             raise ImportError("CTransformers is required for GGUF conversion")
 
         temp_dir = None
+        temp_gguf = None
         try:
-            # Print header
             logger.log_info("\n" + "=" * 80)
             logger.log_info("🚀 Starting GGUF Conversion Process".center(80))
             logger.log_info("=" * 80 + "\n")
 
-            # Model Information Section
+            # Model Information
             logger.log_info("📊 Model Information:")
             logger.log_info("-" * 40)
             model_type = self.model.config.model_type if hasattr(self.model, 'config') else None
@@ -322,7 +327,7 @@ def convert_to_gguf(self, output_path: str):
             logger.log_info(f"• Model Size: {model_size:.2f} GB")
             logger.log_info("")
 
-            # Conversion Settings Section
+            # Conversion Settings
             logger.log_info("⚙️ Conversion Settings:")
             logger.log_info("-" * 40)
             logger.log_info(f"• Output Path: {output_path}")
@@ -340,93 +345,115 @@ def convert_to_gguf(self, output_path: str):
             logger.log_info("• Checkpoint saved successfully")
             logger.log_info("")
 
-            # Find convert.py script
+            # Find llama.cpp tools
             logger.log_info("🔍 Locating GGUF Conversion Tools:")
             logger.log_info("-" * 40)
 
-            # First try pip installation path
             try:
                 import llama_cpp
                 llama_cpp_path = os.path.dirname(llama_cpp.__file__)
-                potential_convert = os.path.join(llama_cpp_path, "convert.py")
-                if os.path.exists(potential_convert):
-                    convert_script = potential_convert
-                    logger.log_info(f"• Found convert.py in llama_cpp package: {convert_script}")
-                else:
-                    convert_script = None
-            except ImportError:
-                convert_script = None
-
-            if not convert_script:
-                logger.log_info("• Attempting to install llama-cpp-python...")
+                convert_script = os.path.join(llama_cpp_path, "convert.py")
+                quantize_bin = os.path.join(llama_cpp_path, "quantize")
+                if not os.path.exists(convert_script):
+                    raise FileNotFoundError("convert.py not found")
+                if not os.path.exists(quantize_bin):
+                    raise FileNotFoundError("quantize binary not found")
+                logger.log_info(f"• Found convert.py: {convert_script}")
+                logger.log_info(f"• Found quantize: {quantize_bin}")
+            except (ImportError, FileNotFoundError) as e:
+                logger.log_error(f"• Failed to locate llama.cpp tools: {e}")
                 try:
+                    logger.log_info("• Attempting to install llama-cpp-python...")
                     subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "llama-cpp-python"])
                     import llama_cpp
                     llama_cpp_path = os.path.dirname(llama_cpp.__file__)
                     convert_script = os.path.join(llama_cpp_path, "convert.py")
-                    if not os.path.exists(convert_script):
-                        raise FileNotFoundError("convert.py not found after installation")
-                    logger.log_info("• Successfully installed and located convert.py")
-                except Exception as e:
-                    logger.log_error(f"• Failed to install/locate llama-cpp-python: {e}")
+                    quantize_bin = os.path.join(llama_cpp_path, "quantize")
+                    logger.log_info("• Successfully installed and located tools")
+                except Exception as inst_err:
                     raise RuntimeError(
-                        "Could not find or install llama-cpp-python. Please install manually:\n"
-                        "pip install llama-cpp-python --upgrade"
-                    )
+                        f"Could not find or install llama-cpp-python: {inst_err}\n"
+                        "Install manually: pip install llama-cpp-python --upgrade"
+                    ) from e
 
-            logger.log_info("")
-
-            # Build conversion command
-            logger.log_info("🛠️ Preparing Conversion Command:")
+            # Convert to FP16 GGUF
+            logger.log_info("🛠️ Converting to FP16 GGUF:")
             logger.log_info("-" * 40)
-
-            cmd = [
+            temp_gguf = f"{output_path}_temp_f16.gguf"
+            cmd_convert = [
                 sys.executable,
                 convert_script,
                 temp_dir,
-                "--outfile", output_path,
-                "--outtype", f"q{self.bits}" if self.bits < 16 else "f16" if self.bits == 16 else "f32",
+                "--outfile", temp_gguf,
+                "--outtype", "f16",
                 "--model-type", model_type
             ]
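+            # Note: the convert.py / quantize CLI surface varies across
+            # llama.cpp releases (newer trees ship convert_hf_to_gguf.py and
+            # llama-quantize); the flags above follow this repo's assumptions
+            # and may need adjusting for other versions.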
 
-            logger.log_info(f"• Command: {' '.join(cmd)}")
-            logger.log_info("")
+            logger.log_info(f"• Command: {' '.join(cmd_convert)}")
+            with tqdm(total=100, desc="Converting to FP16", unit="%") as pbar:
+                process = subprocess.Popen(
+                    cmd_convert,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    universal_newlines=True
+                )
+
+                while True:
+                    output = process.stdout.readline()
+                    if output == '' and process.poll() is not None:
+                        break
+                    if output and "Converting" in output:
+                        try:
+                            # Progress is parsed heuristically from tool output;
+                            # unparseable lines are simply skipped.
+                            progress = int(output.split("%")[0].split()[-1])
+                            pbar.n = progress
+                            pbar.refresh()
+                        except (ValueError, IndexError):
+                            pass
+                        logger.log_info(f"• {output.strip()}")
 
-            # Execute conversion
-            logger.log_info("🔄 Running GGUF Conversion:")
+            return_code = process.wait()
+            if return_code != 0:
+                error_output = process.stderr.read()
+                raise RuntimeError(f"FP16 GGUF conversion failed:\n{error_output}")
+
+            # Quantize to target type
+            logger.log_info("\n🔄 Quantizing GGUF:")
             logger.log_info("-" * 40)
+            cmd_quantize = [
+                quantize_bin,
+                temp_gguf,
+                output_path,
+                self.quant_type.lower()  # llama.cpp expects lowercase
+            ]
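+            # Example (illustrative): with quant_type="Q4_K_M" this runs
+            #   <llama_cpp_path>/quantize <output>_temp_f16.gguf <output> q4_k_m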
 
-            with tqdm(total=100, desc="Converting to GGUF", unit="%",
-                      bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]") as pbar:
+            logger.log_info(f"• Command: {' '.join(cmd_quantize)}")
+            with tqdm(total=100, desc="Quantizing GGUF", unit="%") as pbar:
                 process = subprocess.Popen(
-                    cmd,
+                    cmd_quantize,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                     universal_newlines=True
                 )
 
-                # Monitor conversion progress
                 while True:
                     output = process.stdout.readline()
                     if output == '' and process.poll() is not None:
                         break
-                    if output:
-                        if "Converting" in output:
-                            try:
-                                progress = int(output.split("%")[0].split()[-1])
-                                pbar.n = progress
-                                pbar.refresh()
-                            except:
-                                pass
+                    if output and "%" in output:
+                        try:
+                            progress = int(output.split("%")[0].split()[-1])
+                            pbar.n = progress
+                            pbar.refresh()
+                        except (ValueError, IndexError):
+                            pass
                         logger.log_info(f"• {output.strip()}")
 
-            # Check for errors
             return_code = process.wait()
             if return_code != 0:
                 error_output = process.stderr.read()
-                raise RuntimeError(f"GGUF conversion failed with error:\n{error_output}")
-
-            # Verify and report results
+                raise RuntimeError(f"GGUF quantization failed:\n{error_output}")
+
+            # Verify results
             if os.path.exists(output_path):
                 logger.log_info("\n✅ Conversion Results:")
                 logger.log_info("-" * 40)
@@ -448,17 +475,16 @@ def convert_to_gguf(self, output_path: str):
             logger.log_error("\n❌ Conversion Failed:")
             logger.log_error("-" * 40)
             logger.log_error(f"• Error: {str(e)}")
-            if temp_dir and os.path.exists(temp_dir):
-                shutil.rmtree(temp_dir, ignore_errors=True)
             raise RuntimeError(f"Failed to convert model to GGUF: {str(e)}") from e
 
         finally:
-            # Cleanup
             if temp_dir and os.path.exists(temp_dir):
                 logger.log_info("\n🧹 Cleaning Up:")
                 logger.log_info("-" * 40)
                 logger.log_info("• Removing temporary files...")
                 shutil.rmtree(temp_dir, ignore_errors=True)
+            if temp_gguf and os.path.exists(temp_gguf):
+                os.remove(temp_gguf)
             self._clear_memory()
 
     def _clear_memory(self):
@@ -467,5 +493,3 @@ def _clear_memory(self):
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.synchronize()
-
-
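+# Illustrative end-to-end usage (a sketch: the constructor arguments are
+# assumed from _initialize_model_and_tokenizer and _get_default_quant_type,
+# and the model id is a placeholder):
+#   quantizer = GGUFQuantizer("meta-llama/Llama-2-7b-hf", bits=4)
+#   quantizer.convert_to_gguf("llama-2-7b.Q4_K_M.gguf")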