Commit b0a38f2

code restructure trying to fix dense shaping
Parent: 92858f0

10 files changed: 91 additions & 104 deletions


keras/src/layers/core/dense.py

Lines changed: 20 additions & 7 deletions
@@ -323,8 +323,6 @@ def _check_load_own_variables(self, store):
                 f"Expected: {[v.name for v in all_vars]}"
             )
 
-    # Quantization-related (int8 and float8) methods
-
     def quantized_build(self, kernel_shape, mode):
         if mode == "int8":
             self._int8_build(kernel_shape)
@@ -553,10 +551,24 @@ def grad_fn(*args, upstream=None):
         return x
 
     def _gptq_call(self, inputs, training=None):
-        del training
-        x = ops.matmul(inputs, ops.subtract(self._kernel, self.zero_point))
-        x = ops.cast(x, self.compute_dtype)
-        x = ops.matmul(x, self.kernel_scale)
+        zero_point = self.zero_point
+        if self.gptq_config.symmetric:
+            zero_point = ops.zeros_like(self.zero_point, dtype="int8")
+
+        # Elementwise dequantization (works for per-weight or
+        # broadcastable S/ZP)
+        dequant_kernel = ops.multiply(
+            ops.subtract(self._kernel, zero_point), self.kernel_scale
+        )
+
+        # Standard Dense matmul
+        x = ops.matmul(inputs, dequant_kernel)
+
+        # Add bias/activation to mirror Dense.call behavior
+        if self.bias is not None:
+            x = ops.add(x, self.bias)
+        if self.activation is not None:
+            x = self.activation(x)
         return x
 
     def _float8_call(self, inputs, training=None):
@@ -650,7 +662,7 @@ def grad(*args, upstream=None, variables=None):
             x = self.activation(x)
         return x
 
-    def quantize(self, mode, type_check=True):
+    def quantize(self, mode, type_check=True, config=None):
         # Prevent quantization of the subclasses
         if type_check and (type(self) is not Dense):
             raise self._not_implemented_error(self.quantize)
@@ -689,6 +701,7 @@ def quantize(self, mode, type_check=True):
             self.quantized_build(kernel_shape, mode)
         elif mode == "gptq":
             del self._kernel
+            self.gptq_config = config
             self.quantized_build(kernel_shape, mode)
         else:
             raise self._quantization_mode_error(mode)
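
The new `_gptq_call` path dequantizes the stored integer kernel elementwise before the matmul, rather than applying the scale after the matmul as the old code did, and it now mirrors `Dense.call` by adding the bias and activation. A minimal standalone sketch of that order of operations using `keras.ops` with toy float tensors (names and shapes here are illustrative, not part of the commit):

import numpy as np

from keras import ops

def gptq_dense_forward(inputs, q_kernel, kernel_scale, zero_point, bias=None):
    # (W_q - zero_point) * scale recovers a float kernel; in symmetric mode
    # the zero point is pinned to 0, as in the diff above.
    dequant_kernel = ops.multiply(
        ops.subtract(q_kernel, zero_point), kernel_scale
    )
    x = ops.matmul(inputs, dequant_kernel)
    if bias is not None:
        x = ops.add(x, bias)
    return x

inputs = np.random.rand(2, 4).astype("float32")
q_kernel = np.random.randint(-8, 8, size=(4, 3)).astype("float32")
out = gptq_dense_forward(
    inputs, q_kernel, kernel_scale=np.float32(0.05), zero_point=np.float32(0.0)
)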

keras/src/layers/core/einsum_dense.py

Lines changed: 15 additions & 4 deletions
@@ -12,6 +12,7 @@
 from keras.src import quantizers
 from keras.src import regularizers
 from keras.src.api_export import keras_export
+from keras.src.backend.config import backend
 from keras.src.layers.input_spec import InputSpec
 from keras.src.layers.layer import Layer
 
@@ -608,11 +609,20 @@ def grad_fn(*args, upstream=None):
         return x
 
     def _gptq_call(self, inputs, training=None):
-        zero_point = self._adjust_scale_for_dequant(self.zero_point)
+        zero_point = self.zero_point
+        if self.gptq_config.symmetric:
+            # Constant zero-point (symmetric): integer 0
+            zero_point = ops.zeros_like(zero_point, dtype="int8")
 
-        dequantized_kernel = ops.subtract(self._kernel, zero_point)
+        zero_point = self._adjust_scale_for_dequant(zero_point)
 
-        x = ops.einsum(self.equation, inputs, dequantized_kernel)
+        # handle zero point with kernel
+        kernel = ops.subtract(self._kernel, zero_point)
+
+        # if backend is torch, do a cast
+        if backend() == "torch":
+            kernel = ops.cast(kernel, self.compute_dtype)
+        x = ops.einsum(self.equation, inputs, kernel)
         x = ops.cast(x, self.compute_dtype)
         x = ops.multiply(x, self.kernel_scale)
 
@@ -798,7 +808,7 @@ def grad(*args, upstream=None, variables=None):
             x = self.activation(x)
         return x
 
-    def quantize(self, mode, type_check=True):
+    def quantize(self, mode, type_check=True, config=None):
         # Prevent quantization of the subclasses
         if type_check and (type(self) is not EinsumDense):
             raise self._not_implemented_error(self.quantize)
@@ -834,6 +844,7 @@ def quantize(self, mode, type_check=True):
             del self._kernel
         elif mode == "gptq":
             del self._kernel
+            self.gptq_config = config
         self.quantized_build(kernel_shape, mode)
 
         # Assign values to the newly created variables.
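
For EinsumDense the ordering differs from Dense: the zero point is subtracted from the integer kernel, the einsum runs on that shifted kernel, and `kernel_scale` is applied to the output afterwards (with an extra cast on the torch backend). A rough standalone sketch of that sequence with `keras.ops`; the equation, shapes, and names are illustrative only:

import numpy as np

from keras import ops

equation = "ab,bc->ac"  # illustrative; a real layer uses its own equation
inputs = np.random.rand(2, 4).astype("float32")
q_kernel = np.random.randint(-8, 8, size=(4, 3)).astype("float32")
kernel_scale = np.float32(0.05)
zero_point = np.float32(0.0)  # symmetric mode forces this to zero

kernel = ops.subtract(q_kernel, zero_point)  # un-shift the integer kernel
x = ops.einsum(equation, inputs, kernel)     # einsum on the shifted kernel
x = ops.cast(x, "float32")
x = ops.multiply(x, kernel_scale)            # scale applied to the output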

keras/src/layers/core/embedding.py

Lines changed: 1 addition & 1 deletion
@@ -363,7 +363,7 @@ def _int8_call(self, inputs, training=None):
         )
         return outputs
 
-    def quantize(self, mode, type_check=True):
+    def quantize(self, mode, type_check=True, config=None):
         # Prevent quantization of the subclasses
         if type_check and (type(self) is not Embedding):
             raise self._not_implemented_error(self.quantize)

keras/src/layers/layer.py

Lines changed: 1 addition & 1 deletion
@@ -1268,7 +1268,7 @@ def _clear_losses(self):
     def quantized_build(self, input_shape, mode):
         raise self._not_implemented_error(self.quantized_build)
 
-    def quantize(self, mode, type_check=True):
+    def quantize(self, mode, type_check=True, config=None):
         raise self._not_implemented_error(self.quantize)
 
     def _check_quantize_args(self, mode, compute_dtype):
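
Since the base `Layer.quantize` signature now carries `config`, any subclass that overrides it should accept the argument as well, even if it ignores it. A hypothetical override, purely to illustrate the widened contract (not code from this commit):

from keras import layers

class MyQuantizableLayer(layers.Layer):
    def quantize(self, mode, type_check=True, config=None):
        # Hypothetical subclass: stash the config for a later quantized call,
        # mirroring what Dense/EinsumDense do for the "gptq" mode above.
        if mode != "gptq":
            raise NotImplementedError(f"Unsupported quantization mode: {mode}")
        self.gptq_config = config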

keras/src/models/model.py

Lines changed: 3 additions & 3 deletions
@@ -9,6 +9,7 @@
 from keras.src.layers.layer import Layer
 from keras.src.models.variable_mapping import map_saveable_variables
 from keras.src.quantizers.gptq_config import GPTQConfig
+from keras.src.quantizers.gptq_core import apply_gptq
 from keras.src.saving import saving_api
 from keras.src.trainers import trainer as base_trainer
 from keras.src.utils import summary_utils
@@ -421,7 +422,7 @@ def load_weights(self, filepath, skip_mismatch=False, **kwargs):
             **kwargs,
         )
 
-    def quantize(self, mode, config=None, **kwargs):
+    def quantize(self, mode, type_check=True, config=None, **kwargs):
         """Quantize the weights of the model.
 
         Note that the model must be built first before calling this method.
@@ -440,8 +441,7 @@ def quantize(self, mode, config=None, **kwargs):
                     "The `config` argument must be of type "
                     "`keras.quantizers.GPTQConfig`."
                 )
-            # The config object's own quantize method drives the process
-            config.quantize(self)
+            apply_gptq(self, config=config)
             return
 
         # For all other modes, verify that a config object was not passed.
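
`Model.quantize` now routes the GPTQ mode straight to `gptq_core.apply_gptq` instead of calling `config.quantize(self)`. A hedged usage sketch of the new entry point; the `dataset`/`tokenizer` keyword names below are assumptions inferred from `gptq_core.get_dataloader`, and the helper is only defined, not run:

from keras.quantizers import GPTQConfig

def quantize_with_gptq(model, tokenizer, calibration_texts):
    # Assumed constructor keywords; the fields read elsewhere in this diff
    # (weight_bits, group_size, symmetric, activation_order, num_samples,
    # hessian_damping) are known, dataset/tokenizer are inferred.
    config = GPTQConfig(
        dataset=calibration_texts,
        tokenizer=tokenizer,
        weight_bits=4,
        group_size=128,
        symmetric=False,
        activation_order=False,
    )
    # With this commit, the call below ends up in apply_gptq(model, config).
    model.quantize("gptq", config=config)
    return model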

keras/src/quantizers/gptq.py

Lines changed: 8 additions & 17 deletions
@@ -3,13 +3,15 @@
 from keras.src import ops
 from keras.src.layers import Dense
 from keras.src.layers import EinsumDense
+from keras.src.quantizers.gptq_quant import GPTQQuantizer
 
 
 class GPTQ:
-    def __init__(self, layer):
+    def __init__(self, layer, config):
         self.original_layer = layer
+        self.config = config
         self.num_samples = 0
-        self.quantizer = None
+        self.quantizer = GPTQQuantizer()
 
         # Explicitly handle each supported layer type
         if isinstance(layer, Dense) or (
@@ -63,9 +65,7 @@ def __init__(self, layer):
            raise ValueError(
                "The EinsumDense layer must be built before applying GPTQ. "
            )
-            # This populates self.original_layer with attributes like
-            # `_kernel_reduced_axes`, `_kernel_transpose_axes`, etc.
-            layer._set_quantization_info()
+
         self.hessian = ops.zeros((self.rows, self.rows), dtype="float32")
 
     def update_hessian_with_batch(self, input_batch):
@@ -186,7 +186,8 @@ def quantize_and_correct_block(
         based
         on their activation's second-order information.
         """
-        self.original_layer.quantize("gptq")
+
+        self.original_layer.quantize("gptq", config=self.config)
 
         weights_matrix = ops.transpose(ops.cast(self.layer.kernel, "float32"))
         hessian_matrix = ops.cast(self.hessian, "float32")
@@ -271,17 +272,13 @@ def quantize_and_correct_block(
                     group_slice = weights_matrix[:, group_start:group_end]
                     self.quantizer.find_params(group_slice, weight=True)
                 else:
-                    # Per-column params
                     self.quantizer.find_params(
                         ops.expand_dims(weight_column, 1), weight=True
                     )
 
                 # Quantize the current column and store the results
                 quantized_column = self.quantizer.quantize(
-                    ops.expand_dims(weight_column, 1),
-                    self.quantizer.scale,
-                    self.quantizer.zero,
-                    self.quantizer.maxq,
+                    ops.expand_dims(weight_column, 1)
                 )[:, 0]
 
                 # Write integer weights
@@ -302,16 +299,12 @@ def quantize_and_correct_block(
                     zero_col = ops.expand_dims(
                         ops.cast(self.quantizer.zero, "float32")[0, :], 1
                     )
-
                     scales = ops.slice_update(scales, (0, abs_col), scale_col)
                     zeros = ops.slice_update(zeros, (0, abs_col), zero_col)
 
                 # Dequantize back to float32 for error correction.
                 dequantized_column = self.quantizer.dequantize(
                     ops.expand_dims(weight_column, 1),
-                    self.quantizer.scale,
-                    self.quantizer.zero,
-                    self.quantizer.maxq,
                 )[:, 0]
 
                 quantization_error = ops.divide(
@@ -408,9 +401,7 @@ def quantize_and_correct_block(
         )
 
         self.original_layer.kernel_scale.assign(scale)
-
         self.original_layer.zero_point.assign(zero_point)
-
         self.original_layer._kernel.assign(quantized_kernel)
 
     def free(self):
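
The `quantize`/`dequantize` calls above no longer pass `scale`, `zero`, and `maxq` explicitly; `GPTQQuantizer` now holds the parameters found by `find_params`. The underlying per-column affine round-trip looks roughly like the generic NumPy sketch below (not the Keras implementation; the names and min/max parameter search are simplified):

import numpy as np

def find_params(w, bits=4):
    # Simplified asymmetric parameter search over one column.
    maxq = 2**bits - 1
    w_min, w_max = float(w.min()), float(w.max())
    scale = (w_max - w_min) / maxq if w_max > w_min else 1.0
    zero = round(-w_min / scale)
    return scale, zero, maxq

def quantize(w, scale, zero, maxq):
    return np.clip(np.round(w / scale) + zero, 0, maxq)

def dequantize(q, scale, zero):
    return scale * (q - zero)

column = np.random.randn(8, 1).astype("float32")
scale, zero, maxq = find_params(column)
q_column = quantize(column, scale, zero, maxq)
reconstructed = dequantize(q_column, scale, zero)
error = column - reconstructed  # this residual drives GPTQ's weight updates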

keras/src/quantizers/gptq_config.py

Lines changed: 0 additions & 13 deletions
@@ -1,7 +1,4 @@
-from absl import logging
-
 from keras.src.api_export import keras_export
-from keras.src.quantizers.gptq_core import quantize_model
 
 
 @keras_export("keras.quantizers.GPTQConfig")
@@ -157,13 +154,3 @@ def __init__(
         self.group_size = group_size
         self.symmetric = symmetric
         self.activation_order = activation_order
-
-    def quantize(self, model):
-        """
-        Applies GPTQ quantization to the provided model using this
-        configuration.
-        """
-        logging.info("Initiating quantization from GPTQConfig...")
-        # The core logic is now delegated to gptqutils, which will handle
-        # the dynamic imports and data loading.
-        quantize_model(model=model, config=self)

keras/src/quantizers/gptq_core.py

Lines changed: 16 additions & 38 deletions
@@ -9,7 +9,7 @@
 from keras.src.layers import EinsumDense
 from keras.src.layers import Embedding
 from keras.src.quantizers.gptq import GPTQ
-from keras.src.quantizers.gptq_quant import GPTQQuantization
+from keras.src.quantizers.gptq_quant import GPTQQuantizer
 
 
 def get_dataloader(tokenizer, sequence_length, dataset, num_samples=128):
@@ -93,16 +93,7 @@ def find_layers_in_block(block):
     return found_layers
 
 
-def apply_gptq_layerwise(
-    model,
-    dataloader,
-    num_samples,
-    hessian_damping,
-    group_size,
-    symmetric,
-    activation_order,
-    weight_bits,
-):
+def apply_gptq_layerwise(model, dataloader, config):
     """Applies GPTQ quantization layer-by-layer to a Keras model.
 
     This function is designed to work with common transformer architectures,
@@ -134,26 +125,21 @@ def apply_gptq_layerwise(
             attempt to automatically discover its structure.
         dataloader: An iterable providing calibration data. Each item should
             be a batch of token IDs suitable for the model's embedding layer.
-        num_samples: (int) The number of samples from the dataloader to use for
-            calibration.
-        hessian_damping: (float) The percentage of dampening to add to the
-            Hessian diagonal for stabilization during inverse calculation.
-            A value of 0.01 is common.
-        group_size: (int) The size of the groups to use for quantization. A
-            value of 128 means that 128 weights will share the same scaling
-            factor. Use -1 for per-channel quantization.
-        symmetric: (bool) If True, symmetric quantization is used. Otherwise,
-            asymmetric quantization is used.
-        activation_order: (bool) If True, reorders the weight columns based on
-            activation magnitude, which can improve quantization accuracy.
-        weight_bits: (int) The number of bits to use for the quantized weights,
-            e.g., 4 for 4-bit quantization.
+        config: A `GPTQConfiguration` object.
 
     Raises:
         ValueError: If the function cannot automatically find an embedding
             layer or any transformer-like blocks to quantize within the model.
     """
     logging.info("Starting model quantization...")
+
+    num_samples = config.num_samples
+    hessian_damping = config.hessian_damping
+    group_size = config.group_size
+    symmetric = config.symmetric
+    activation_order = config.activation_order
+    weight_bits = config.weight_bits
+
     embedding_layer = None
     transformer_blocks = []
     if hasattr(model, "backbone"):
@@ -221,7 +207,8 @@ def apply_gptq_layerwise(
         else:
             logging.info(f"Found layers: {list(sub_layers_map.keys())}")
         gptq_objects = {
-            name: GPTQ(layer) for name, layer in sub_layers_map.items()
+            name: GPTQ(layer, config)
+            for name, layer in sub_layers_map.items()
         }
 
         captured_inputs = {name: [] for name in sub_layers_map.keys()}
@@ -271,7 +258,7 @@ def hook(*args, **kwargs):
             input_reshaped = ops.reshape(layer_inputs, (-1, num_features))
             gptq_object.update_hessian_with_batch(input_reshaped)
 
-        quantizer = GPTQQuantization(
+        quantizer = GPTQQuantizer(
             weight_bits,
             per_channel=True,
             symmetric=symmetric,
@@ -304,7 +291,7 @@ def hook(*args, **kwargs):
     logging.info("Quantization process complete.")
 
 
-def quantize_model(model, config):
+def apply_gptq(model, config):
     """
     Top-level function to quantize a Keras model using GPTQ.
     """
@@ -323,13 +310,4 @@ def quantize_model(model, config):
     # is now a NumPy array, which can be sliced and reused.
     calibration_dataloader = full_dataloader[: config.num_samples]
 
-    apply_gptq_layerwise(
-        model,
-        calibration_dataloader,  # Use the calibration slice
-        config.num_samples,  # Use the configured number of samples
-        config.hessian_damping,
-        config.group_size,
-        config.symmetric,
-        config.activation_order,
-        config.weight_bits,
-    )
+    apply_gptq_layerwise(model, calibration_dataloader, config)
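
`apply_gptq_layerwise` now reads all hyperparameters from the config and, as before, accumulates a Hessian from the inputs captured for each target layer before quantizing it. The running X^T X-style update that `update_hessian_with_batch` performs is roughly the generic NumPy sketch below (an assumption-labeled illustration, not the library code):

import numpy as np

def update_hessian(hessian, seen_samples, input_batch):
    # input_batch: (batch, features), flattened the same way the hook above
    # reshapes captured inputs. The Hessian stays (features, features).
    batch = input_batch.shape[0]
    total = seen_samples + batch
    hessian *= seen_samples / total              # down-weight earlier batches
    x = (input_batch.T * np.sqrt(2.0 / total)).astype("float32")
    hessian += x @ x.T                           # running average of 2 * X X^T
    return hessian, total

features = 16
hessian = np.zeros((features, features), dtype="float32")
seen = 0
for _ in range(4):  # stand-in for calibration batches
    batch = np.random.randn(8, features).astype("float32")
    hessian, seen = update_hessian(hessian, seen, batch)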
