
Commit a3cd59d

Merge remote-tracking branch 'origin' into kylesayrs/transform-merge

2 parents: ba85784 + 5478b43

File tree: 12 files changed (+291, -23 lines)

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 10 additions & 4 deletions

```diff
@@ -400,7 +400,10 @@ def compress_model(self, model: Module):

             # in the future, support compression on same device
             with align_module_device(module, execution_device=exec_device):
-                state_dict = module.state_dict(prefix=f"{prefix}.")
+                state_dict = {
+                    f"{prefix}.{name}": param
+                    for name, param in module.named_parameters(recurse=False)
+                }

             # quantization first
             if prefix in module_to_scheme:
@@ -421,7 +424,7 @@ def compress_model(self, model: Module):

                 # remove any existing parameters
                 offload_device = get_offloaded_device(module)
-                for name, _ in list(module.named_parameters()):
+                for name, _ in list(module.named_parameters(recurse=False)):
                     delete_offload_parameter(module, name)

                 # replace with compressed parameters
@@ -458,7 +461,10 @@ def decompress_model(self, model: Module):
             if prefix in module_to_scheme or prefix in sparse_compression_targets:
                 # in the future, support decompression on same device
                 with align_module_device(module, execution_device="cpu"):
-                    state_dict = module.state_dict(prefix=f"{prefix}.")
+                    state_dict = {
+                        f"{prefix}.{name}": param
+                        for name, param in module.named_parameters(recurse=False)
+                    }

                 # sparsity first
                 if prefix in sparse_compression_targets:
@@ -483,7 +489,7 @@ def decompress_model(self, model: Module):
                 # remove any existing parameters
                 exec_device = get_execution_device(module)
                 offload_device = get_offloaded_device(module)
-                for name, _ in list(module.named_parameters()):
+                for name, _ in list(module.named_parameters(recurse=False)):
                     delete_offload_parameter(module, name)

                 # replace with decompressed parameters
```
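The switch from `module.state_dict(prefix=...)` to `named_parameters(recurse=False)` scopes (de)compression to a module's own parameters instead of sweeping in its submodules' tensors. A minimal plain-PyTorch sketch of the difference (the nested `child` module is hypothetical, for illustration only):

```python
from torch import nn

parent = nn.Linear(4, 4)
parent.child = nn.Linear(4, 4)  # hypothetical nested submodule

# state_dict() recurses into submodules, so the child's tensors leak in
print(sorted(parent.state_dict(prefix="layer.")))
# ['layer.bias', 'layer.child.bias', 'layer.child.weight', 'layer.weight']

# named_parameters(recurse=False) yields only this module's own parameters
own = {f"layer.{name}": p for name, p in parent.named_parameters(recurse=False)}
print(sorted(own))
# ['layer.bias', 'layer.weight']
```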

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 69 additions & 2 deletions

```diff
@@ -111,11 +111,22 @@ def dequantize(
         elif scale.ndim == 2:
             if scale.shape[1] == 1:
                 args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
-            else:
+            # Scale height matches input or is 1 -> group quantization across columns
+            #
+            # Example 1: scale.shape[0] == 1
+            #   x_q: (4, 8), scale: (1, 4) -> 2 columns per group
+            #
+            # Example 2: scale.shape[0] == x_q.shape[0]
+            #   x_q: (4, 8), scale: (4, 4) -> 2 elements per group (per row)
+            elif (scale.shape[0] == 1) or (scale.shape[0] == x_q.shape[0]):
                 group_size = int(x_q.shape[1] / scale.shape[1])
                 args = QuantizationArgs(
                     strategy=QuantizationStrategy.GROUP, group_size=group_size
                 )
+            else:
+                args = QuantizationArgs(
+                    strategy=QuantizationStrategy.BLOCK, block_structure=scale.shape
+                )
         else:
             raise ValueError(
                 f"Could not infer a quantization strategy from scale with {scale.ndim} "
@@ -189,7 +200,63 @@ def _process_quantization(
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size

-    if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+    # blockwise FP8: quantize per 2D block, supports block_structure for static block quant
+    if args.strategy == QuantizationStrategy.BLOCK:
+        original_shape = x.shape
+        rows, cols = x.shape[-2], x.shape[-1]
+        block_height, block_width = args.block_structure
+
+        # Ensure exact division (tensor dimensions must be divisible by block size)
+        if rows % block_height != 0:
+            raise ValueError(
+                f"Tensor height {rows} is not divisible by block_height {block_height}. "
+                f"Block quantization requires exact division."
+            )
+        if cols % block_width != 0:
+            raise ValueError(
+                f"Tensor width {cols} is not divisible by block_width {block_width}. "
+                f"Block quantization requires exact division."
+            )
+
+        # reshape into blocks and transpose to make each block contiguous
+        num_rows_blocks = rows // block_height
+        num_cols_blocks = cols // block_width
+        x_blocks = x.reshape(
+            num_rows_blocks,
+            block_height,
+            num_cols_blocks,
+            block_width,
+        ).transpose(1, 2)
+
+        # expand scale/zero_point for blocks
+        sb = scale.unsqueeze(-1).unsqueeze(-1)
+        zb = zero_point.unsqueeze(-1).unsqueeze(-1) if zero_point is not None else None
+        if do_quantize:
+            # quantize blocks
+            x_blocks = _quantize(
+                x=x_blocks,
+                scale=sb,
+                zero_point=zb,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
+                dtype=dtype,
+                global_scale=global_scale,
+            )
+        if do_dequantize:
+            # dequantize blocks
+            x_blocks = _dequantize(
+                x_q=x_blocks,
+                scale=sb,
+                zero_point=zb,
+                global_scale=global_scale,
+            )
+        # restore original shape
+        output = x_blocks.transpose(1, 2).reshape(original_shape)
+    elif args.strategy in (
+        QuantizationStrategy.GROUP,
+        QuantizationStrategy.TENSOR_GROUP,
+    ):
         n_dims = x.shape
         if len(n_dims) > 2:
             x = x.squeeze(0)
```
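For intuition, a standalone sketch of the reshape-and-broadcast pattern the new BLOCK branch uses: the tensor is viewed as `(num_rows_blocks, num_cols_blocks, block_height, block_width)`, and two trailing `unsqueeze` calls let one scale per block broadcast over its tile. Shapes and the FP8-E4M3 clamp value of 448 are illustrative assumptions; the real path goes through `_quantize`/`_dequantize`:

```python
import torch

rows, cols = 256, 512
block_height, block_width = 128, 128
x = torch.randn(rows, cols)

# view as (num_rows_blocks, num_cols_blocks, block_height, block_width)
x_blocks = x.reshape(
    rows // block_height, block_height, cols // block_width, block_width
).transpose(1, 2)  # -> (2, 4, 128, 128)

# one scale per 128x128 tile; unsqueeze twice so it broadcasts over the tile
scale = x_blocks.abs().amax(dim=(-2, -1)) / 448.0  # 448 ~ FP8 E4M3 max
sb = scale.unsqueeze(-1).unsqueeze(-1)             # (2, 4) -> (2, 4, 1, 1)

x_q = (x_blocks / sb).clamp(-448.0, 448.0)             # scale (no rounding here)
x_dq = (x_q * sb).transpose(1, 2).reshape(rows, cols)  # restore original shape

assert torch.allclose(x, x_dq)  # symmetric round trip
```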

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 31 additions & 1 deletion

```diff
@@ -15,6 +15,7 @@

 import logging
 import math
+import warnings
 from enum import Enum
 from typing import List, Optional

@@ -172,14 +173,43 @@ def _initialize_scale_zero_point(

     if base_name == "weight" and weight_shape is not None:
         if quantization_args.strategy == QuantizationStrategy.CHANNEL:
-            # (output_channels, 1)
+            # (output_channels, 1) - only for weights
            expected_shape = (weight_shape[0], 1)
         elif quantization_args.strategy in (
             QuantizationStrategy.TENSOR_GROUP,
             QuantizationStrategy.GROUP,
         ):
+            # GROUP/TENSOR_GROUP for both weights and activations
             num_groups = math.ceil(weight_shape[1] / quantization_args.group_size)
             expected_shape = (weight_shape[0], max(num_groups, 1))
+        elif quantization_args.strategy == QuantizationStrategy.BLOCK:
+            # For block quantization, scale shape should match number of blocks - only for weights
+            if quantization_args.block_structure is None:
+                raise ValueError(
+                    "Block quantization requires block_structure to be specified"
+                )
+            block_height, block_width = quantization_args.block_structure
+            rows, cols = weight_shape[-2], weight_shape[-1]
+            num_rows_blocks = math.ceil(rows / block_height)
+            num_cols_blocks = math.ceil(cols / block_width)
+
+            # Warn if dimensions don't divide evenly
+            if rows % block_height != 0 or cols % block_width != 0:
+                warnings.warn(
+                    f"Block quantization: tensor shape {weight_shape} does not divide evenly "
+                    f"by block structure {quantization_args.block_structure}. "
+                    f"Some blocks will be incomplete which may affect quantization quality.",
+                    UserWarning,
+                )
+
+            expected_shape = (num_rows_blocks, num_cols_blocks)
+    elif quantization_args.strategy == QuantizationStrategy.BLOCK:
+        warnings.warn(
+            f"BLOCK quantization not supported for {base_name} activations. "
+            f"Falling back to tensor-level quantization.",
+            UserWarning,
+        )
+        expected_shape = 1

     # 3. Identify quantization scale and zp dtype
     scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype
```
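A quick sketch of the scale-shape arithmetic above, with deliberately ragged hypothetical dimensions. Note the asymmetry with `_process_quantization` in forward.py: initialization only warns on inexact division, while quantization itself raises:

```python
import math

weight_shape = (300, 500)  # hypothetical, not block-aligned on purpose
block_height, block_width = 128, 128

num_rows_blocks = math.ceil(weight_shape[-2] / block_height)  # 3
num_cols_blocks = math.ceil(weight_shape[-1] / block_width)   # 4
expected_shape = (num_rows_blocks, num_cols_blocks)           # (3, 4)

# 300 % 128 == 44 and 500 % 128 == 116, so edge blocks are incomplete:
# _initialize_scale_zero_point emits a UserWarning here, but
# _process_quantization would raise for this weight at quantize time.
print(expected_shape)
```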

src/compressed_tensors/quantization/quant_args.py

Lines changed: 31 additions & 8 deletions

```diff
@@ -14,7 +14,7 @@

 import warnings
 from enum import Enum
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import torch
 from compressed_tensors.utils import Aliasable
@@ -153,8 +153,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     :param symmetric: whether or not quantization scale is symmetric about zero-point
     :param strategy: string id determining the scope of scale/zero-point to apply
     :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block strategy, must be
-        of the format "2x4", "8x16", etc.
+    :param block_structure: 2d block structure to use for the block strategy; must be
+        a list of two ints [rows, cols] like [128, 128].
     :param dynamic: set True to perform dynamic quantization - values will not be
         calibrated during calibration phase, instead during inference new quantization
         ranges will be observed with every sample. Defaults to False for static
@@ -169,7 +169,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     symmetric: bool = True
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[str] = None
+    block_structure: Optional[List[int]] = None
     dynamic: Union[DynamicType, bool] = False
     actorder: Union[ActivationOrdering, bool, None] = None
     observer: Optional[str] = Field(
@@ -207,6 +207,28 @@ def validate_group(cls, value) -> Union[int, None]:

         return value

+    @field_validator("block_structure", mode="before")
+    def validate_block_structure(cls, value) -> Optional[List[int]]:
+        if value is None:
+            return value
+        # For backward compatibility, allow string format "2x4", "8x16", etc.
+        if isinstance(value, str):
+            try:
+                return [int(x) for x in value.split("x")]
+            except Exception:
+                raise ValueError(
+                    f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+                )
+        if isinstance(value, (list, tuple)):
+            if len(value) != 2 or not all(isinstance(v, int) for v in value):
+                raise ValueError(
+                    f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+                )
+            return list(value)
+        raise ValueError(
+            f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+        )
+
     @field_validator("strategy", mode="before")
     def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
         if isinstance(value, str):
@@ -277,14 +299,15 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":

         # infer observer w.r.t. dynamic
         if dynamic:
-            if strategy not in (
+            supported_strategies = (
                 QuantizationStrategy.TOKEN,
                 QuantizationStrategy.TENSOR,
                 QuantizationStrategy.TENSOR_GROUP,
-            ):
+                QuantizationStrategy.GROUP,
+            )
+            if strategy not in supported_strategies:
                 raise ValueError(
-                    f"One of {(QuantizationStrategy.TOKEN, QuantizationStrategy.TENSOR, QuantizationStrategy.TENSOR_GROUP)} "
-                    "must be used for dynamic quantization",
+                    f"One of {supported_strategies} must be used for dynamic quantization"
                 )

         if (
```
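A short sketch of what the new validator accepts, assuming `QuantizationArgs` is importable from `compressed_tensors.quantization` as elsewhere in the library:

```python
from compressed_tensors.quantization import QuantizationArgs

# canonical form: a list of two ints [rows, cols]
args = QuantizationArgs(strategy="block", block_structure=[128, 128])

# legacy "RxC" strings are still parsed for backward compatibility
legacy = QuantizationArgs(strategy="block", block_structure="128x128")
assert legacy.block_structure == [128, 128]

# non-integer strings and wrong-length lists are rejected, e.g.:
# QuantizationArgs(strategy="block", block_structure="axb")      # ValueError
# QuantizationArgs(strategy="block", block_structure=[1, 2, 3])  # ValueError
```

As written, the string path does not re-check the length, so a bare `"128"` would slip through as `[128]`; only lists and tuples get the two-element check.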

src/compressed_tensors/quantization/quant_scheme.py

Lines changed: 42 additions & 0 deletions

```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import warnings
 from copy import deepcopy
 from typing import Any, Dict, List, Optional

@@ -52,6 +53,7 @@ class QuantizationScheme(BaseModel):
     def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
         inputs = model.input_activations
         outputs = model.output_activations
+        weights = model.weights

         if inputs is not None:
             if inputs.actorder is not None:
@@ -61,6 +63,22 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
             if outputs.actorder is not None:
                 raise ValueError("Cannot apply actorder to output activations")

+        if (
+            inputs
+            and weights
+            and weights.strategy == QuantizationStrategy.GROUP
+            and inputs.strategy == QuantizationStrategy.GROUP
+            and weights.group_size != inputs.group_size
+        ):
+            warnings.warn(
+                "Using GROUP strategy for both weights and input_activations "
+                f"with different group sizes ({weights.group_size} vs {inputs.group_size}) "
+                "may complicate fused kernel implementations. Consider using "
+                "TENSOR_GROUP strategy for both or matching group sizes.",
+                UserWarning,
+                stacklevel=2,
+            )
+
         return model

@@ -243,6 +261,29 @@ def is_preset_scheme(name: str) -> bool:
     ),
 )

+# Block-wise FP8 (deepseekv3-style quantization):
+# static 128x128 per-block weights and
+# dynamic per-token-group activations
+FP8_BLOCK = dict(
+    weights=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.BLOCK,
+        symmetric=True,
+        dynamic=False,
+        block_structure=[128, 128],
+    ),
+    input_activations=QuantizationArgs(
+        num_bits=8,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.GROUP,
+        symmetric=True,
+        dynamic=True,
+        observer=None,
+        group_size=128,
+    ),
+)
+
 PRESET_SCHEMES = {
     # Unquantized (no-op)
     "UNQUANTIZED": UNQUANTIZED,
@@ -257,6 +298,7 @@ def is_preset_scheme(name: str) -> bool:
     # Float weight and activation schemes
     "FP8": FP8,
     "FP8_DYNAMIC": FP8_DYNAMIC,
+    "FP8_BLOCK": FP8_BLOCK,
     "NVFP4A16": NVFP4A16,
     "NVFP4": NVFP4,
 }
```
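A brief usage sketch of the new preset, using `preset_name_to_scheme` from this module (the target name is an arbitrary example):

```python
from compressed_tensors.quantization import preset_name_to_scheme

# "Linear" as a target is illustrative only
scheme = preset_name_to_scheme("FP8_BLOCK", targets=["Linear"])

assert scheme.weights.block_structure == [128, 128]  # static per-block scales
assert scheme.input_activations.group_size == 128    # per-token-group scales
assert scheme.input_activations.dynamic              # computed at runtime
```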

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 11 additions & 2 deletions

```diff
@@ -171,7 +171,10 @@ def compute_dynamic_scales_and_zp(
         reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim)
     elif args.strategy == QuantizationStrategy.TENSOR:
         reduce_dims = None
-    elif args.strategy == QuantizationStrategy.TENSOR_GROUP:
+    elif args.strategy in (
+        QuantizationStrategy.TENSOR_GROUP,
+        QuantizationStrategy.GROUP,
+    ):
         if len(value.shape) > 2:
             value = value.squeeze(0)

@@ -187,9 +190,15 @@ def compute_dynamic_scales_and_zp(
             ),
         )
     else:
+        supported_strategies = (
+            QuantizationStrategy.TOKEN,
+            QuantizationStrategy.TENSOR,
+            QuantizationStrategy.TENSOR_GROUP,
+            QuantizationStrategy.GROUP,
+        )
         raise ValueError(
             "Dynamic quantization is only supported for ",
-            f"{QuantizationStrategy.TOKEN, QuantizationStrategy.TENSOR, QuantizationStrategy.TENSOR_GROUP}",
+            f"{supported_strategies}",
         )

     if not reduce_dims:
```
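For the GROUP case that now shares the TENSOR_GROUP path, reduction happens per group of columns. A minimal plain-torch sketch of the observed-extrema shapes (the 128-column groups match the FP8_BLOCK activation preset; the exact internals of `compute_dynamic_scales_and_zp` differ):

```python
import torch

group_size = 128
value = torch.randn(4, 512)                      # (tokens, hidden), hypothetical
grouped = value.unflatten(-1, (-1, group_size))  # -> (4, 4, 128)

# one extremum per token per group -> scales of shape (tokens, num_groups, 1)
observed_max = grouped.amax(dim=-1, keepdim=True)
observed_min = grouped.amin(dim=-1, keepdim=True)
print(observed_max.shape)  # torch.Size([4, 4, 1])
```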

src/compressed_tensors/transform/factory/base.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -29,6 +29,7 @@
     align_module_device,
     delete_offload_module,
     has_offloaded_params,
+    match_named_modules,
     patch_attr,
     register_offload_module,
     update_offload_parameter,
```

tests/test_examples/test_bitmask_compression_ipynb.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import nbformat
 import pytest
+
+
+nbformat = pytest.importorskip("nbformat")
 from nbconvert.preprocessors import ExecutePreprocessor
```
