
Commit 45da803

Merge remote-tracking branch 'origin' into kylesayrs/gptq-actorder-default

2 parents: 0cec9f6 + 5b3ddff

File tree

6 files changed: +34 -69 lines changed

README.md

Lines changed: 11 additions & 1 deletion

@@ -1,4 +1,14 @@
-# <img width="40" alt="tool icon" src="https://github.com/user-attachments/assets/f9b86465-aefa-4625-a09b-54e158efcf96" /> LLM Compressor
+<div align="center">
+
+<h1>
+<img width="40" alt="tool icon" src="https://github.com/user-attachments/assets/f9b86465-aefa-4625-a09b-54e158efcf96" />
+<span style="font-size:80px;">LLM Compressor</span>
+</h1>
+
+[![docs](https://img.shields.io/badge/docs-LLM--Compressor-blue)](https://docs.vllm.ai/projects/llm-compressor/en/latest/) [![PyPI](https://img.shields.io/pypi/v/llmcompressor.svg)](https://pypi.org/project/llmcompressor/)
+
+</div>
+
 `llmcompressor` is an easy-to-use library for optimizing models for deployment with `vllm`, including:
 
 * Comprehensive set of quantization algorithms for weight-only and activation quantization

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 1 addition & 1 deletion

@@ -247,7 +247,7 @@ def calibrate_kv_cache_input_hook(
     kv_cache to singleton QuantizedKVParameterCache.
     """
     kv_cache = getattr(module, "kv_cache")
-    kwargs["past_key_value"] = kv_cache
+    kwargs["past_key_values"] = kv_cache
     kwargs["use_cache"] = False
     return args, kwargs
 
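The rename from `past_key_value` to `past_key_values` tracks the corresponding kwarg rename in recent `transformers` attention signatures. For readers unfamiliar with how a hook like this gets attached, here is a minimal self-contained sketch of the same kwarg-injection pattern using PyTorch forward pre-hooks; the `DummyAttention` module is illustrative only, not part of llmcompressor:

import torch

class DummyAttention(torch.nn.Module):
    def forward(self, hidden_states, past_key_values=None, use_cache=True):
        # A real attention block would read and update the cache here.
        return hidden_states, past_key_values

def calibrate_kv_cache_input_hook(module, args, kwargs):
    # Swap in the module-local quantized cache before forward runs.
    kwargs["past_key_values"] = getattr(module, "kv_cache")
    kwargs["use_cache"] = False
    return args, kwargs

attn = DummyAttention()
attn.kv_cache = object()  # stand-in for QuantizedKVParameterCache
attn.register_forward_pre_hook(calibrate_kv_cache_input_hook, with_kwargs=True)
_, cache = attn(torch.randn(1, 4, 8))
assert cache is attn.kv_cache  # the hook injected the quantized cache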

src/llmcompressor/modifiers/transform/spinquant/base.py

Lines changed: 18 additions & 4 deletions

@@ -104,7 +104,7 @@ class SpinQuantModifier(Modifier, use_enum_values=True):
     @field_validator("randomize", "learnable", mode="before")
     def validate_not_implemented(cls, value, info: ValidationInfo):
         if value:
-            raise NotImplementedError(f"{info.field_name} is not supported right now")
+            raise NotImplementedError(f"{info.field_name} is not supported as of now")
         return value
 
     @field_validator("rotations", mode="before")

@@ -237,10 +237,24 @@ def _create_r2_scheme(self, model: PreTrainedModel) -> TransformScheme:
 
     def _create_r3_scheme(self) -> TransformScheme:
         raise NotImplementedError(
-            "SpinQuant R3 and R4 rotations will be added in a future release"
+            "SpinQuant R3 rotations will be added in a future release"
         )
 
     def _create_r4_scheme(self) -> TransformScheme:
-        raise NotImplementedError(
-            "SpinQuant R3 and R4 rotations will be added in a future release"
+        return TransformScheme(
+            type=self.transform_type,
+            randomize=self.randomize,
+            requires_grad=self.learnable,
+            precision=self.precision,
+            apply=[
+                TransformArgs(
+                    targets=[*self.mappings.mlp_out],
+                    location="input",
+                ),
+                TransformArgs(
+                    targets=[*self.mappings.mlp_out],
+                    location="weight_input",
+                    inverse=True,
+                ),
+            ],
         )
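The new R4 scheme pairs an online rotation of the `mlp_out` (down-projection) input with its inverse folded into that layer's weights, so in exact arithmetic the pair cancels and only the value distribution seen by the quantizer changes. A small sketch of that algebra, assuming an orthogonal rotation (a generic QR-based `R` stands in here; the actual scheme uses Hadamard-type transforms):

import torch

# For orthogonal R: (W @ R^-1) @ (R @ x) == W @ x, and R^-1 == R.T.
# This is why location="input" paired with an inverse "weight_input"
# transform is lossless before quantization.
d = 8
W = torch.randn(4, d)  # stand-in for an mlp_out (down_proj) weight
x = torch.randn(d)
R, _ = torch.linalg.qr(torch.randn(d, d))  # random orthogonal matrix
W_fused = W @ R.T      # "weight_input" with inverse=True
x_rot = R @ x          # online "input" transform
assert torch.allclose(W_fused @ x_rot, W @ x, atol=1e-5)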

src/llmcompressor/transformers/finetune/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -2,4 +2,3 @@
 
 from .data import TextGenerationDataset
 from .session_mixin import SessionManagerMixIn
-from .text_generation import apply, oneshot, train

src/llmcompressor/transformers/finetune/text_generation.py

Lines changed: 0 additions & 54 deletions
This file was deleted.
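With `text_generation.py` deleted and its re-export dropped from the package `__init__`, code that imported `apply`, `oneshot`, or `train` from `llmcompressor.transformers.finetune` will now raise an ImportError. A hedged migration sketch, assuming the top-level entrypoint documented in current llmcompressor releases (the model name, dataset, and parameter values below are illustrative, not taken from this commit):

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# One-shot quantization via the top-level API; all values are placeholders.
oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    output_dir="TinyLlama-1.1B-W4A16",
    max_seq_length=2048,
    num_calibration_samples=512,
)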

tests/llmcompressor/transformers/kv_cache/test_kv_cache.py

Lines changed: 4 additions & 8 deletions

@@ -231,14 +231,10 @@ def test_kv_cache_gptq_model_state_dict_attr(kv_cache_fixture, tmp_path):
 
     output_dir, _ = next(kv_cache_fixture(recipe, tmp_path))
 
-    with init_empty_weights():
-        # TODO: There is a bug in `apply_quantization_config` which means that, if using
-        # CompressedLinears, the compression status is inferred to `compressed` and
-        # therefore the attention kvcache parameters never undergo initializations
-        model = AutoModelForCausalLM.from_pretrained(
-            output_dir,
-            quantization_config=CompressedTensorsConfig(run_compressed=False),
-        )
+    model = AutoModelForCausalLM.from_pretrained(
+        output_dir,
+        quantization_config=CompressedTensorsConfig(run_compressed=False),
+    )
 
     counts = 0
     for name, submodule in model.named_modules():
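The deleted TODO explained why the load used to be wrapped in `init_empty_weights()`; dropping the workaround suggests the `apply_quantization_config` issue it guarded against has been resolved. A standalone sketch of the load pattern the test now uses (the checkpoint path is a placeholder, and the `k_scale`/`v_scale` attribute names are assumptions drawn from what this test suite asserts):

from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig

# run_compressed=False asks transformers to decompress the checkpoint into
# ordinary Linear modules instead of keeping compressed inference wrappers.
model = AutoModelForCausalLM.from_pretrained(
    "./my-quantized-checkpoint",  # placeholder path
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

# Count attention modules carrying calibrated KV-cache scales.
counts = sum(
    1
    for _, submodule in model.named_modules()
    if hasattr(submodule, "k_scale") and hasattr(submodule, "v_scale")
)
print(counts)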
