Add quality check to CI and fix existing errors #408

Open · wants to merge 10 commits into base: main
29 changes: 29 additions & 0 deletions .github/workflows/quality-check.yaml
@@ -0,0 +1,29 @@
name: Quality Checks
on:
push:
branches:
- main
- 'release/*'
pull_request:
branches:
- main
- 'release/*'

jobs:
quality-check:
runs-on: ubuntu-24.04
steps:
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Set Env
run: |
pip3 install --upgrade pip && pip3 install --upgrade setuptools
- name: "⚙️ Install dependencies"
run: pip3 install .[dev]
- name: "🧹 Running quality checks"
run: make quality
3 changes: 3 additions & 0 deletions .github/workflows/test-check.yaml
@@ -4,9 +4,11 @@ on:
push:
branches:
- main
- 'release/*'
pull_request:
branches:
- main
- 'release/*'

jobs:
python-tests:
@@ -26,3 +28,4 @@ jobs:
run: pip3 install .[dev,accelerate]
- name: "🔬 Running tests"
run: make test

1 change: 1 addition & 0 deletions setup.cfg
@@ -5,6 +5,7 @@ ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
skip = src/compressed_tensors/version.py

line_length = 88
lines_after_imports = 2
@@ -42,7 +42,6 @@
load_pretrained_quantization_parameters,
)
from compressed_tensors.quantization.lifecycle import expand_target_names
from compressed_tensors.quantization.utils import is_module_quantized
from compressed_tensors.utils import (
align_module_device,
delete_offload_parameter,
@@ -195,7 +194,7 @@ def from_pretrained_model(

@staticmethod
def parse_sparsity_config(
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"]
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"],
) -> Union[Dict[str, Any], None]:
"""
Parse sparsity config from quantization/compression config. Sparsity
@@ -215,7 +214,7 @@ def parse_sparsity_config(

@staticmethod
def parse_quantization_config(
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"]
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"],
) -> Union[Dict[str, Any], None]:
"""
Parse quantization config from quantization/compression config. The
@@ -390,7 +389,6 @@ def compress_model(self, model: Module):
)

for prefix, module in tqdm(model.named_modules(), desc="Compressing model"):

if prefix in module_to_scheme or prefix in sparse_compression_targets:
module_device = get_execution_device(module)
is_meta = module_device.type == "meta"
@@ -562,11 +560,12 @@ def decompress(self, model_path: str, model: Module):
:param model_path: path to compressed weights
:param model: pytorch model to load decompressed weights into

Note: decompress makes use of both _replace_sparsity_weights and _replace_weights
The variations in these methods are a result of the subtle variations between the sparsity
and quantization compressors. Specifically, quantization compressors return not just the
decompressed weight, but the quantization parameters (e.g scales, zero_point) whereas sparsity
compressors only return the decompressed weight.
Note: decompress makes use of both _replace_sparsity_weights and
_replace_weights. The variations in these methods are a result of the subtle
variations between the sparsity and quantization compressors. Specifically,
quantization compressors return not just the decompressed weight, but the
quantization parameters (e.g scales, zero_point) whereas sparsity compressors
only return the decompressed weight.

"""
model_path = get_safetensors_folder(model_path)
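
Illustrative sketch (not part of this PR; the generator names and item shapes below are assumptions) of the asymmetry the note above describes:

import torch

def toy_sparsity_generator():
    # a sparsity compressor yields only the decompressed weight per module
    yield "model.layers.0.mlp.down_proj", torch.randn(16, 16)

def toy_quantization_generator():
    # a quantization compressor also yields the quantization parameters
    yield "model.layers.0.mlp.down_proj", {
        "weight": torch.randn(16, 16),
        "weight_scale": torch.ones(16, 1),
        "weight_zero_point": torch.zeros(16, 1, dtype=torch.int8),
    }

Under these assumptions, _replace_sparsity_weights only swaps a single parameter per module, while _replace_weights must also install the returned scales and zero points, which is why decompress keeps two code paths.
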
@@ -598,18 +597,17 @@ def decompress(self, model_path: str, model: Module):
with override_quantization_status(
self.quantization_config, QuantizationStatus.FROZEN
):

names_to_scheme = apply_quantization_config(
model, self.quantization_config
)
# Load activation scales/zp or any other quantization parameters
# Conditionally load the weight quantization parameters if we have a dense compressor
# Or if a sparsity compressor has already been applied
# Conditionally load the weight quantization parameters if we have a
# dense compressor or if a sparsity compressor has already been applied
load_pretrained_quantization_parameters(
model,
model_path,
# TODO: all weight quantization params will be moved to the compressor in a follow-up
# including initialization
# TODO: all weight quantization params will be moved to the
# compressor in a follow-up including initialization
load_weight_quantization=(
sparse_decompressed
or isinstance(self.quantization_compressor, DenseCompressor)
@@ -695,7 +693,6 @@ def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
:param model: The model whose weights are to be updated.
"""
for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):

split_name = name.split(".")
prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
module = operator.attrgetter(prefix)(model)
@@ -731,9 +728,10 @@ def _replace_weights(self, dense_weight_generator, model: Module):
for param_name, param_data in data.items():
if hasattr(module, param_name):
# If compressed, will have an incorrect dtype for transformers >4.49
# TODO: we can also just skip initialization of scales/zp if in decompression in init
# to be consistent with loading which happens later as well
# however, update_data does a good shape check - should be moved to the compressor
# TODO: we can also just skip initialization of scales/zp if in
# decompression in init to be consistent with loading which happens
# later as well however, update_data does a good shape check -
# should be moved to the compressor
if param_name == "weight":
delattr(module, param_name)
requires_grad = param_data.dtype in (
@@ -24,7 +24,6 @@
get_nested_weight_mappings,
merge_names,
)
from compressed_tensors.utils.safetensors_load import match_param_name
from safetensors import safe_open
from torch import Tensor
from tqdm import tqdm
@@ -107,7 +106,8 @@ def compress(
compressed_dict[name] = value.to(compression_device)
continue

# compress values on meta if loading from meta otherwise on cpu (memory movement too expensive)
# compress values on meta if loading from meta otherwise on cpu (memory
# movement too expensive)
module_path = prefix[:-1] if prefix.endswith(".") else prefix
quant_args = names_to_scheme[module_path].weights
compressed_values = self.compress_weight(
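
Illustrative sketch (helper name and shapes are hypothetical) of the device choice described in the comment above: values loaded on the meta device are compressed there, everything else is compressed on CPU to avoid expensive memory movement.

import torch

def pick_compression_device(value: torch.Tensor) -> str:
    # compress on meta when the weight was loaded on meta, otherwise on cpu
    return "meta" if value.device.type == "meta" else "cpu"

assert pick_compression_device(torch.empty(8, 8, device="meta")) == "meta"
assert pick_compression_device(torch.zeros(8, 8)) == "cpu"
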
@@ -15,7 +15,6 @@

from typing import Dict, Optional, Tuple

import numpy
import torch
from compressed_tensors.compressors.base import BaseCompressor
from compressed_tensors.compressors.quantized_compressors.base import (
@@ -71,7 +70,6 @@ def compress_weight(
zero_point: Optional[torch.Tensor] = None,
g_idx: Optional[torch.Tensor] = None,
) -> Dict[str, torch.Tensor]:

quantized_weight = quantize(
x=weight,
scale=scale,
@@ -91,7 +89,6 @@ def decompress_weight(
compressed_data: Dict[str, Tensor],
quantization_args: Optional[QuantizationArgs] = None,
) -> torch.Tensor:

weight = compressed_data["weight_packed"]
scale = compressed_data["weight_scale"]
global_scale = compressed_data["weight_global_scale"]
@@ -154,14 +151,16 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
[0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
)


# reference: https://github.com/vllm-project/vllm/pull/16362
def unpack_fp4_from_uint8(
a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16
) -> torch.Tensor:
"""
Unpacks uint8 values into fp4. Each uint8 consists of two fp4 values
(i.e. first four bits correspond to one fp4 value, last four corresond to a consecutive
fp4 value). The bits represent an index, which are mapped to an fp4 value.
(i.e. first four bits correspond to one fp4 value, last four correspond to a
consecutive fp4 value). The bits represent an index, which are mapped to an fp4
value.

:param a: tensor to unpack
:param m: original dim 0 size of the unpacked tensor
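
Illustrative sketch (not the library's implementation; the nibble order and sign-bit layout here are assumptions) of how two fp4 values unpack from one uint8 via the lookup table of values shown above:

import torch

# E2M1 magnitude lookup, matching the table of values shown above
FP4_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def toy_unpack(packed: torch.Tensor) -> torch.Tensor:
    # each uint8 holds two 4-bit indices; low-nibble-first is an assumption here
    low = packed & 0x0F
    high = (packed >> 4) & 0x0F
    idx = torch.stack([low, high], dim=-1).flatten()
    # low 3 bits index the magnitude table, the 4th bit is the sign
    sign = 1.0 - 2.0 * ((idx & 0x8) >> 3).float()
    return sign * FP4_VALUES[(idx & 0x7).long()]

print(toy_unpack(torch.tensor([0x21], dtype=torch.uint8)))  # tensor([0.5000, 1.0000])
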
@@ -14,7 +14,6 @@
import math
from typing import Dict, Literal, Optional, Tuple, Union

import numpy as np
import torch
from compressed_tensors.compressors.base import BaseCompressor
from compressed_tensors.compressors.quantized_compressors.base import (
@@ -135,7 +134,8 @@ def compress_weight(
compressed_dict["weight_shape"] = weight_shape
compressed_dict["weight_packed"] = packed_weight

# We typically don't compress zp; apart from when using the packed_compressor and when storing group/channel zp
# We typically don't compress zp; apart from when using the packed_compressor
# and when storing group/channel zp
if not quantization_args.symmetric and quantization_args.strategy in [
QuantizationStrategy.GROUP.value,
QuantizationStrategy.CHANNEL.value,
@@ -166,7 +166,8 @@ def decompress_weight(
num_bits = quantization_args.num_bits
unpacked = unpack_from_int32(weight, num_bits, original_shape)

# NOTE: this will fail decompression as we don't currently handle packed zp on decompression
# NOTE: this will fail decompression as we don't currently handle packed zp on
# decompression
if not quantization_args.symmetric and quantization_args.strategy in [
QuantizationStrategy.GROUP.value,
QuantizationStrategy.CHANNEL.value,
@@ -13,7 +13,7 @@
# limitations under the License.

from dataclasses import dataclass
from typing import Dict, Generator, List, Tuple, Union
from typing import Dict, List, Tuple, Union

import torch
from compressed_tensors.compressors.base import BaseCompressor
@@ -48,7 +48,7 @@ class Marlin24Compressor(BaseCompressor):

@staticmethod
def validate_quant_compatability(
names_to_scheme: Dict[str, QuantizationScheme]
names_to_scheme: Dict[str, QuantizationScheme],
) -> bool:
"""
Checks if every quantized module in the model is compatible with Marlin24
8 changes: 4 additions & 4 deletions src/compressed_tensors/quantization/lifecycle/apply.py
@@ -73,14 +73,14 @@ def load_pretrained_quantization_parameters(
Loads the quantization parameters (scale and zero point) from model_name_or_path to
a model that has already been initialized with a quantization config.

NOTE: Will always load inputs/output parameters.
Will conditioanlly load weight parameters, if load_weight_quantization is set to True.
NOTE: Will always load inputs/output parameters. Will conditionally load weight
parameters, if load_weight_quantization is set to True.

:param model: model to load pretrained quantization parameters to
:param model_name_or_path: Hugging Face stub or local folder containing a quantized
model, which is used to load quantization parameters
:param load_weight_quantization: whether or not the weight quantization parameters shoud
be laoded
:param load_weight_quantization: whether or not the weight quantization parameters
should be loaded
"""
model_path = get_safetensors_folder(model_name_or_path)
mapping = get_quantization_parameter_to_path_mapping(model_path)
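
Illustrative usage sketch only (the wrapper, the checkpoint path, and the choice to also load weight parameters are assumptions):

import torch
from compressed_tensors.quantization.lifecycle.apply import (
    load_pretrained_quantization_parameters,
)

def reload_quantization_parameters(model: torch.nn.Module, checkpoint_dir: str) -> None:
    # model must already be initialized with a quantization config;
    # input/output parameters are always loaded, weight parameters only on request
    load_pretrained_quantization_parameters(
        model,
        checkpoint_dir,
        load_weight_quantization=True,
    )
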
7 changes: 4 additions & 3 deletions src/compressed_tensors/quantization/lifecycle/forward.py
@@ -200,7 +200,8 @@ def _process_quantization(
q_min, q_max = calculate_range(args, x.device)
group_size = args.group_size

# blockwise FP8: quantize per 2D block, supports block_structure for static block quant
# blockwise FP8: quantize per 2D block, supports block_structure for static block
# quantization
if args.strategy == QuantizationStrategy.BLOCK:
original_shape = x.shape
rows, cols = x.shape[-2], x.shape[-1]
@@ -209,8 +210,8 @@
# Ensure exact division (tensor dimensions must be divisible by block size)
if rows % block_height != 0:
raise ValueError(
f"Tensor height {rows} is not divisible by block_height {block_height}. "
f"Block quantization requires exact division."
f"Tensor height {rows} is not divisible by block_height {block_height}."
f" Block quantization requires exact division."
)
if cols % block_width != 0:
raise ValueError(
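
Illustrative sketch (assumes a 2-D weight; the helper is not the code path above) of how a tensor splits into the 2-D blocks that this strategy quantizes:

import torch

def split_into_blocks(x: torch.Tensor, block_height: int, block_width: int) -> torch.Tensor:
    rows, cols = x.shape
    # exact division is required, mirroring the checks above
    assert rows % block_height == 0 and cols % block_width == 0
    blocks = x.reshape(rows // block_height, block_height, cols // block_width, block_width)
    return blocks.permute(0, 2, 1, 3)  # (row_blocks, col_blocks, block_height, block_width)

print(split_into_blocks(torch.randn(256, 384), 128, 128).shape)  # torch.Size([2, 3, 128, 128])
# one scale / zero point would then be computed per (i, j) block
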
13 changes: 7 additions & 6 deletions src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -17,7 +17,7 @@
import math
import warnings
from enum import Enum
from typing import List, Optional
from typing import Optional

import torch
from compressed_tensors.quantization.lifecycle.forward import (
@@ -87,7 +87,6 @@ def initialize_module_for_quantization(
_initialize_attn_scales(module)

else:

if scheme.input_activations is not None:
_initialize_scale_zero_point(
module,
@@ -183,7 +182,8 @@ def _initialize_scale_zero_point(
num_groups = math.ceil(weight_shape[1] / quantization_args.group_size)
expected_shape = (weight_shape[0], max(num_groups, 1))
elif quantization_args.strategy == QuantizationStrategy.BLOCK:
# For block quantization, scale shape should match number of blocks - only for weights
# For block quantization, scale shape should match number of blocks - only
# for weights
if quantization_args.block_structure is None:
raise ValueError(
"Block quantization requires block_structure to be specified"
@@ -196,9 +196,10 @@
# Warn if dimensions don't divide evenly
if rows % block_height != 0 or cols % block_width != 0:
warnings.warn(
f"Block quantization: tensor shape {weight_shape} does not divide evenly "
f"by block structure {quantization_args.block_structure}. "
f"Some blocks will be incomplete which may affect quantization quality.",
f"Block quantization: tensor shape {weight_shape} does not divide"
f"evenly by block structure {quantization_args.block_structure}. "
f"Some blocks will be incomplete which may affect quantization"
"quality.",
UserWarning,
)

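
Illustrative sketch of the scale-shape rule described above; the helper is hypothetical, and the use of ceil (so ragged tails still get a block, consistent with the warning rather than an error) is an assumption:

import math
from typing import List, Tuple

def block_scale_shape(weight_shape: Tuple[int, int], block_structure: List[int]) -> Tuple[int, int]:
    # one scale (and zero point) entry per block of the weight matrix
    rows, cols = weight_shape
    block_height, block_width = block_structure
    return math.ceil(rows / block_height), math.ceil(cols / block_width)

print(block_scale_shape((300, 512), [128, 128]))  # (3, 4); the 300-row dim leaves an incomplete block
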
12 changes: 7 additions & 5 deletions src/compressed_tensors/quantization/quant_args.py
@@ -217,16 +217,18 @@ def validate_block_structure(cls, value) -> Optional[List[int]]:
return [int(x) for x in value.split("x")]
except Exception:
raise ValueError(
f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
f"Invalid block_structure '{value}'. Must be a list of ints "
"[rows, cols]."
)
if isinstance(value, (list, tuple)):
if len(value) != 2 or not all(isinstance(v, int) for v in value):
raise ValueError(
f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
f"Invalid block_structure '{value}'. Must be a list of ints "
"[rows, cols]."
)
return list(value)
raise ValueError(
f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
f"Invalid block_structure '{value}'. Must be a list of ints [rows, cols]."
)

@field_validator("strategy", mode="before")
@@ -307,7 +309,7 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
)
if strategy not in supported_strategies:
raise ValueError(
f"One of {supported_strategies} must be used for dynamic quantization"
f"One of {supported_strategies} must be used for dynamic quant."
)

if (
@@ -322,7 +324,7 @@
observer != "memoryless"
): # avoid annoying users with old configs
warnings.warn(
"No observer is used for dynamic quantization, setting to None"
"No observer is used for dynamic quant., setting to None"
)
observer = None
else: