@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 from compressed_tensors.transform import TransformArgs, TransformScheme
@@ -26,7 +25,7 @@
 from compressed_tensors.utils import get_execution_device, get_offloaded_device
 from compressed_tensors.utils.helpers import ParameterizedDefaultDict
 from torch import Tensor, device, dtype
-from torch.nn import Linear, Module, Parameter
+from torch.nn import Module, Parameter
 
 
 @TransformFactory.register("hadamard")
@@ -54,14 +53,14 @@ def create_transform(self, module: Module, args: TransformArgs):
         """
         assert hasattr(module, "weight")
         size = get_transform_size(module, args.location, self.scheme.head_dim)
-        dtype = module.weight.dtype
+        dtype = self.scheme.precision
         device = get_offloaded_device(module)
         exec_device = get_execution_device(module)
 
         factory_kwargs = {"construct_device": exec_device}
         weight = self.weights.get(size, dtype, device, factory_kwargs=factory_kwargs)
         perm = self.perms[weight] if self.scheme.randomize else None
-        return HadamardTransform(weight, perm, args, type(module))
+        return HadamardTransform(weight, perm, self.scheme, args, type(module))
 
     def _create_weight(
         self,
@@ -85,15 +84,18 @@ def __init__(
         self,
         weight: Parameter,
         perm: Optional[Parameter],
+        scheme: TransformScheme,
         args: TransformArgs,
         module_type: type[torch.nn.Module],
     ):
         super().__init__()
         self.weight = weight
         self.perm = perm
+        self.scheme = scheme
         self.args = args
         self.module_type = module_type
-        self._scale = math.sqrt(weight.size(0))
+        self._scale = torch.tensor(weight.size(0), dtype=self.scheme.precision).sqrt()
+        self._precision = scheme.precision if args.is_online() else torch.float64
 
     def forward(self, value: Tensor) -> Tensor:
         weight = self.weight
@@ -105,6 +107,11 @@ def forward(self, value: Tensor) -> Tensor:
             weight = weight.T
 
         return (
-            apply_transform_weight(weight, value, self.args.location, self.module_type)
+            apply_transform_weight(
+                weight.to(self._precision),
+                value.to(self._precision),
+                self.args.location,
+                self.module_type,
+            )
             / self._scale
-        )
+        ).to(value.dtype)
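
The diff threads `TransformScheme.precision` through the factory: the Hadamard weight is now constructed at the scheme's precision rather than the module's weight dtype, online transforms apply at that precision while offline ones fall back to `torch.float64`, and `forward` casts its result back to the input dtype. Below is a minimal standalone sketch of that upcast/apply/downcast pattern; `sylvester_hadamard` and `apply_hadamard` are illustrative helpers, not part of the compressed-tensors API.

```python
import torch


def sylvester_hadamard(n: int) -> torch.Tensor:
    """Build an n x n Hadamard matrix via Sylvester's construction (n a power of two)."""
    h = torch.ones(1, 1)
    while h.size(0) < n:
        h = torch.cat([torch.cat([h, h], dim=1), torch.cat([h, -h], dim=1)], dim=0)
    return h


def apply_hadamard(value: torch.Tensor, precision: torch.dtype = torch.float64) -> torch.Tensor:
    # Upcast both operands to the working precision, rotate, normalize by
    # sqrt(n), then cast back to the caller's dtype, mirroring forward() above.
    weight = sylvester_hadamard(value.size(-1))
    scale = torch.tensor(value.size(-1), dtype=precision).sqrt()
    out = value.to(precision) @ weight.to(precision) / scale
    return out.to(value.dtype)


x = torch.randn(2, 8, dtype=torch.bfloat16)
y = apply_hadamard(x)
assert y.dtype == x.dtype
# H is symmetric and H @ H == n * I, so applying the normalized transform
# twice recovers the input up to bfloat16 rounding.
assert torch.allclose(apply_hadamard(y).float(), x.float(), atol=0.1)
```

Doing the matmul in a wider precision and downcasting only the final result keeps the rotation numerically orthonormal, which matters when the transform is fused offline into quantized weights.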