Commit 2c064e8

Update gguf.py
1 parent 02932be commit 2c064e8

File tree

1 file changed

+108 -84 lines changed

quantllm/quant/gguf.py

Lines changed: 108 additions & 84 deletions
@@ -1,4 +1,4 @@
-"""GGUF (GGML Universal Format) quantization implementation."""
+"""GGUF (GGML Universal Format) quantization implementation with enhanced 2-bit support."""
 
 import gc
 import math
@@ -25,7 +25,7 @@
 except ImportError:
     CT_AVAILABLE = False
 
-# Updated GGUF quantization types with detailed configurations
+# Updated GGUF quantization types with modern 2-bit support
 SUPPORTED_GGUF_TYPES = {
     2: {
         "Q2_K": {
@@ -35,6 +35,18 @@
                 "feed_forward.w2": "Q4_K",
                 "default": "Q2_K"
             }
+        },
+        "IQ2_XXS": {
+            "description": "Importance-based 2-bit quantization (extra small)",
+            "tensor_configs": {
+                "default": "IQ2_XXS"
+            }
+        },
+        "IQ2_XS": {
+            "description": "Importance-based 2-bit quantization (small)",
+            "tensor_configs": {
+                "default": "IQ2_XS"
+            }
         }
     },
     3: {
@@ -79,8 +91,8 @@
         "Q4_K_M": {
             "description": "Uses Q6_K for half of attention.wv and feed_forward.w2, Q4_K for others",
             "tensor_configs": {
-                "attention.wv": ["Q6_K", "Q4_K"],  # Split tensors
-                "feed_forward.w2": ["Q6_K", "Q4_K"],  # Split tensors
+                "attention.wv": ["Q6_K", "Q4_K"],
+                "feed_forward.w2": ["Q6_K", "Q4_K"],
                 "default": "Q4_K"
             }
         },
@@ -107,8 +119,8 @@
         "Q5_K_M": {
             "description": "Uses Q6_K for half of attention.wv and feed_forward.w2, Q5_K for others",
             "tensor_configs": {
-                "attention.wv": ["Q6_K", "Q5_K"],  # Split tensors
-                "feed_forward.w2": ["Q6_K", "Q5_K"],  # Split tensors
+                "attention.wv": ["Q6_K", "Q5_K"],
+                "feed_forward.w2": ["Q6_K", "Q5_K"],
                 "default": "Q5_K"
             }
         },
@@ -141,7 +153,7 @@
 SUPPORTED_GGUF_BITS = list(SUPPORTED_GGUF_TYPES.keys())
 
 class GGUFQuantizer:
-    """GGUF-specific quantizer implementation."""
+    """GGUF-specific quantizer implementation with enhanced quantization support."""
 
     def __init__(
         self,
@@ -194,48 +206,43 @@ def __init__(
 
     def _get_default_quant_type(self, bits: int) -> str:
         """Select optimal GGUF quantization type based on bit width."""
+        preferences = {
+            2: "IQ2_XS",  # Prefer modern 2-bit quantization
+            3: "Q3_K_M",
+            4: "Q4_K_M",
+            5: "Q5_K_M",
+            6: "Q6_K",
+            8: "Q8_0"
+        }
        if bits in SUPPORTED_GGUF_TYPES:
            types = list(SUPPORTED_GGUF_TYPES[bits].keys())
-            # Prefer balanced options (e.g., Q4_K_M over Q4_K_S or Q4_0)
-            preferences = {
-                2: "Q2_K",
-                3: "Q3_K_M",
-                4: "Q4_K_M",
-                5: "Q5_K_M",
-                6: "Q6_K",
-                8: "Q8_0"
-            }
            return preferences.get(bits, types[0])
        raise ValueError(f"No supported GGUF types for {bits} bits")
 
     def get_tensor_quant_type(self, tensor_name: str) -> Union[str, List[str]]:
         """Get the quantization type for a specific tensor."""
-        # Check for exact match
         if tensor_name in self.tensor_configs:
             return self.tensor_configs[tensor_name]
 
-        # Check for partial matches (e.g., "attention.wv" in "model.attention.wv.weight")
         for key in self.tensor_configs:
             if key != "default" and key in tensor_name:
                 return self.tensor_configs[key]
 
-        # Return default if no specific config found
         return self.tensor_configs["default"]
 
     def _get_quant_description(self) -> str:
         """Get the description of the current quantization configuration."""
         return SUPPORTED_GGUF_TYPES[self.bits][self.quant_type]["description"]
 
     def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel]):
-        """Initialize model and tokenizer using BitsAndBytes."""
+        """Initialize model and tokenizer using BitsAndBytes for 4/8-bit or FP16 for others."""
         try:
             if isinstance(model_name, str):
                 logger.log_info(f"Loading tokenizer from: {model_name}")
                 self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-                logger.log_info(f"Loading model with BitsAndBytes configuration")
+                logger.log_info(f"Loading model with {'BitsAndBytes' if self.bits in [4, 8] else 'FP16'} configuration")
 
-                # Load model with quantization settings
                 load_kwargs = {
                     "device_map": self.device_map,
                     "max_memory": self.max_memory,
@@ -245,8 +252,7 @@ def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel
                     "trust_remote_code": True,
                 }
 
-                # Add quantization config if provided
-                if self.quantization_config:
+                if self.quantization_config and self.bits in [4, 8]:
                     load_kwargs["quantization_config"] = self.quantization_config
 
                 self.model = AutoModelForCausalLM.from_pretrained(
@@ -266,7 +272,6 @@ def _initialize_model_and_tokenizer(self, model_name: Union[str, PreTrainedModel
             else:
                 raise TypeError("model_name must be a string or PreTrainedModel instance")
 
-            # Enable gradient checkpointing if requested
             if self.use_gradient_checkpointing and hasattr(self.model, "gradient_checkpointing_enable"):
                 self.model.gradient_checkpointing_enable()
                 logger.log_info("Gradient checkpointing enabled for memory efficiency")
@@ -282,7 +287,7 @@ def _log_model_stats(self, model: PreTrainedModel, stage: str = ""):
         total_params = sum(p.numel() for p in model.parameters())
         param_size = sum(p.numel() * p.element_size() for p in model.parameters())
         buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
-        total_size = (param_size + buffer_size) / (1024 * 1024)  # MB
+        total_size = (param_size + buffer_size) / (1024 * 1024)
 
         prefix = f"{stage} " if stage else ""
         logger.log_info(f"\n{prefix}Model Statistics:")
@@ -293,18 +298,18 @@ def _log_model_stats(self, model: PreTrainedModel, stage: str = ""):
             logger.log_info(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / (1024 * 1024):.2f} MB")
 
     def convert_to_gguf(self, output_path: str):
-        """Convert quantized model to GGUF format using llama.cpp conversion tools."""
+        """Convert model to GGUF format with separate quantization step."""
         if not CT_AVAILABLE:
             raise ImportError("CTransformers is required for GGUF conversion")
 
         temp_dir = None
+        temp_gguf = None
         try:
-            # Print header
             logger.log_info("\n" + "="*80)
             logger.log_info("🚀 Starting GGUF Conversion Process".center(80))
             logger.log_info("="*80 + "\n")
 
-            # Model Information Section
+            # Model Information
             logger.log_info("📊 Model Information:")
             logger.log_info("-"*40)
             model_type = self.model.config.model_type if hasattr(self.model, 'config') else None
@@ -322,7 +327,7 @@ def convert_to_gguf(self, output_path: str):
             logger.log_info(f"• Model Size: {model_size:.2f} GB")
             logger.log_info("")
 
-            # Conversion Settings Section
+            # Conversion Settings
             logger.log_info("⚙️ Conversion Settings:")
             logger.log_info("-"*40)
             logger.log_info(f"• Output Path: {output_path}")
@@ -340,93 +345,115 @@ def convert_to_gguf(self, output_path: str):
             logger.log_info("• Checkpoint saved successfully")
             logger.log_info("")
 
-            # Find convert.py script
+            # Find llama.cpp tools
             logger.log_info("🔍 Locating GGUF Conversion Tools:")
             logger.log_info("-"*40)
 
-            # First try pip installation path
             try:
                 import llama_cpp
                 llama_cpp_path = os.path.dirname(llama_cpp.__file__)
-                potential_convert = os.path.join(llama_cpp_path, "convert.py")
-                if os.path.exists(potential_convert):
-                    convert_script = potential_convert
-                    logger.log_info(f"• Found convert.py in llama_cpp package: {convert_script}")
-                else:
-                    convert_script = None
-            except ImportError:
-                convert_script = None
-
-            if not convert_script:
-                logger.log_info("• Attempting to install llama-cpp-python...")
+                convert_script = os.path.join(llama_cpp_path, "convert.py")
+                quantize_bin = os.path.join(llama_cpp_path, "quantize")
+                if not os.path.exists(convert_script):
+                    raise FileNotFoundError("convert.py not found")
+                if not os.path.exists(quantize_bin):
+                    raise FileNotFoundError("quantize binary not found")
+                logger.log_info(f"• Found convert.py: {convert_script}")
+                logger.log_info(f"• Found quantize: {quantize_bin}")
+            except (ImportError, FileNotFoundError) as e:
+                logger.log_error(f"• Failed to locate llama.cpp tools: {e}")
                 try:
+                    logger.log_info("• Attempting to install llama-cpp-python...")
                     subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "llama-cpp-python"])
                     import llama_cpp
                     llama_cpp_path = os.path.dirname(llama_cpp.__file__)
                     convert_script = os.path.join(llama_cpp_path, "convert.py")
-                    if not os.path.exists(convert_script):
-                        raise FileNotFoundError("convert.py not found after installation")
-                    logger.log_info("• Successfully installed and located convert.py")
-                except Exception as e:
-                    logger.log_error(f"• Failed to install/locate llama-cpp-python: {e}")
+                    quantize_bin = os.path.join(llama_cpp_path, "quantize")
+                    logger.log_info("• Successfully installed and located tools")
+                except Exception as inst_err:
                     raise RuntimeError(
-                        "Could not find or install llama-cpp-python. Please install manually:\n"
-                        "pip install llama-cpp-python --upgrade"
-                    )
+                        f"Could not find or install llama-cpp-python: {inst_err}\n"
+                        "Install manually: pip install llama-cpp-python --upgrade"
+                    ) from e
 
-            logger.log_info("")
-
-            # Build conversion command
-            logger.log_info("🛠️ Preparing Conversion Command:")
+            # Convert to FP16 GGUF
+            logger.log_info("🛠️ Converting to FP16 GGUF:")
             logger.log_info("-"*40)
-
-            cmd = [
+            temp_gguf = f"{output_path}_temp_f16.gguf"
+            cmd_convert = [
                 sys.executable,
                 convert_script,
                 temp_dir,
-                "--outfile", output_path,
-                "--outtype", f"q{self.bits}" if self.bits < 16 else "f16" if self.bits == 16 else "f32",
+                "--outfile", temp_gguf,
+                "--outtype", "f16",
                 "--model-type", model_type
             ]
 
-            logger.log_info(f"• Command: {' '.join(cmd)}")
-            logger.log_info("")
+            logger.log_info(f"• Command: {' '.join(cmd_convert)}")
+            with tqdm(total=100, desc="Converting to FP16", unit="%") as pbar:
+                process = subprocess.Popen(
+                    cmd_convert,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    universal_newlines=True
+                )
+
+                while True:
+                    output = process.stdout.readline()
+                    if output == '' and process.poll() is not None:
+                        break
+                    if output and "Converting" in output:
+                        try:
+                            progress = int(output.split("%")[0].split()[-1])
+                            pbar.n = progress
+                            pbar.refresh()
+                        except:
+                            pass
+                        logger.log_info(f"• {output.strip()}")
 
-            # Execute conversion
-            logger.log_info("🔄 Running GGUF Conversion:")
+            return_code = process.wait()
+            if return_code != 0:
+                error_output = process.stderr.read()
+                raise RuntimeError(f"FP16 GGUF conversion failed:\n{error_output}")
+
+            # Quantize to target type
+            logger.log_info("\n🔄 Quantizing GGUF:")
             logger.log_info("-"*40)
+            cmd_quantize = [
+                quantize_bin,
+                temp_gguf,
+                output_path,
+                self.quant_type.lower()  # llama.cpp expects lowercase
+            ]
 
-            with tqdm(total=100, desc="Converting to GGUF", unit="%",
-                      bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]") as pbar:
+            logger.log_info(f"• Command: {' '.join(cmd_quantize)}")
+            with tqdm(total=100, desc="Quantizing GGUF", unit="%") as pbar:
                 process = subprocess.Popen(
-                    cmd,
+                    cmd_quantize,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                    universal_newlines=True
                )
 
-                # Monitor conversion progress
                 while True:
                     output = process.stdout.readline()
                     if output == '' and process.poll() is not None:
                         break
-                    if output:
-                        if "Converting" in output:
-                            try:
-                                progress = int(output.split("%")[0].split()[-1])
-                                pbar.n = progress
-                                pbar.refresh()
-                            except:
-                                pass
+                    if output and "%" in output:
+                        try:
+                            progress = int(output.split("%")[0].split()[-1])
+                            pbar.n = progress
+                            pbar.refresh()
+                        except:
+                            pass
                         logger.log_info(f"• {output.strip()}")
 
-            # Check for errors
             return_code = process.wait()
             if return_code != 0:
                 error_output = process.stderr.read()
-                raise RuntimeError(f"GGUF conversion failed with error:\n{error_output}")
-
-            # Verify and report results
+                raise RuntimeError(f"GGUF quantization failed:\n{error_output}")
+
+            # Verify results
             if os.path.exists(output_path):
                 logger.log_info("\n✅ Conversion Results:")
                 logger.log_info("-"*40)
@@ -448,17 +475,16 @@ def convert_to_gguf(self, output_path: str):
             logger.log_error("\n❌ Conversion Failed:")
             logger.log_error("-"*40)
             logger.log_error(f"• Error: {str(e)}")
-            if temp_dir and os.path.exists(temp_dir):
-                shutil.rmtree(temp_dir, ignore_errors=True)
             raise RuntimeError(f"Failed to convert model to GGUF: {str(e)}") from e
 
         finally:
-            # Cleanup
             if temp_dir and os.path.exists(temp_dir):
                 logger.log_info("\n🧹 Cleaning Up:")
                 logger.log_info("-"*40)
                 logger.log_info("• Removing temporary files...")
                 shutil.rmtree(temp_dir, ignore_errors=True)
+            if temp_gguf and os.path.exists(temp_gguf):
+                os.remove(temp_gguf)
             self._clear_memory()
 
     def _clear_memory(self):
@@ -467,5 +493,3 @@ def _clear_memory(self):
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.synchronize()
-
-
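Note (not part of the commit): below is a minimal usage sketch of the two-step flow that convert_to_gguf now implements, i.e. llama.cpp's convert.py producing a temporary FP16 GGUF followed by the quantize binary producing the final file. The GGUFQuantizer constructor arguments shown (model_name, bits, quant_type) are assumptions for illustration; only bits, quant_type, and convert_to_gguf(output_path) appear in the diff above.

# Hypothetical usage sketch, assuming GGUFQuantizer accepts model_name/bits/quant_type.
from quantllm.quant.gguf import GGUFQuantizer

quantizer = GGUFQuantizer(
    model_name="my-org/my-7b-model",  # assumed: an HF checkpoint id or a PreTrainedModel
    bits=2,                           # 2-bit selections now default to IQ2_XS
    quant_type="IQ2_XS",              # or "IQ2_XXS" / "Q2_K" from SUPPORTED_GGUF_TYPES[2]
)

# convert_to_gguf saves a temporary HF checkpoint, converts it to
# "<output>_temp_f16.gguf" with llama.cpp's convert.py, quantizes that file to
# the lowercased quant type with the quantize binary, and removes the
# temporary directory and temporary GGUF in the finally block.
quantizer.convert_to_gguf("my-7b-model.IQ2_XS.gguf")

Splitting conversion and quantization this way follows the usual llama.cpp workflow (convert to f16, then quantize), and it is what makes importance-based types such as IQ2_XS reachable here: the quantization type is applied by the quantize tool rather than by convert.py.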