
Commit 45da803

Merge remote-tracking branch 'origin' into kylesayrs/gptq-actorder-default

2 parents: 0cec9f6 + 5b3ddff

File tree

6 files changed: +34 -69 lines changed

README.md

Lines changed: 11 additions & 1 deletion

@@ -1,4 +1,14 @@
-# <img width="40" alt="tool icon" src="https://github.com/user-attachments/assets/f9b86465-aefa-4625-a09b-54e158efcf96" /> LLM Compressor
+<div align="center">
+
+<h1>
+<img width="40" alt="tool icon" src="https://github.com/user-attachments/assets/f9b86465-aefa-4625-a09b-54e158efcf96" />
+<span style="font-size:80px;">LLM Compressor</span>
+</h1>
+
+[![docs](https://img.shields.io/badge/docs-LLM--Compressor-blue)](https://docs.vllm.ai/projects/llm-compressor/en/latest/) [![PyPI](https://img.shields.io/pypi/v/llmcompressor.svg)](https://pypi.org/project/llmcompressor/)
+
+</div>
+
 `llmcompressor` is an easy-to-use library for optimizing models for deployment with `vllm`, including:
 
 * Comprehensive set of quantization algorithms for weight-only and activation quantization

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 1 addition & 1 deletion

@@ -247,7 +247,7 @@ def calibrate_kv_cache_input_hook(
     kv_cache to singleton QuantizedKVParameterCache.
     """
     kv_cache = getattr(module, "kv_cache")
-    kwargs["past_key_value"] = kv_cache
+    kwargs["past_key_values"] = kv_cache
     kwargs["use_cache"] = False
     return args, kwargs
 
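The rename from `past_key_value` to `past_key_values` tracks the corresponding kwarg rename in recent `transformers` attention signatures. For readers unfamiliar with how a hook like this gets attached, here is a minimal self-contained sketch of the same kwarg-injection pattern using PyTorch forward pre-hooks; the `DummyAttention` module is illustrative only, not part of llmcompressor:

import torch

class DummyAttention(torch.nn.Module):
    def forward(self, hidden_states, past_key_values=None, use_cache=True):
        # A real attention block would read and update the cache here.
        return hidden_states, past_key_values

def calibrate_kv_cache_input_hook(module, args, kwargs):
    # Swap in the module-local quantized cache before forward runs.
    kwargs["past_key_values"] = getattr(module, "kv_cache")
    kwargs["use_cache"] = False
    return args, kwargs

attn = DummyAttention()
attn.kv_cache = object()  # stand-in for QuantizedKVParameterCache
attn.register_forward_pre_hook(calibrate_kv_cache_input_hook, with_kwargs=True)
_, cache = attn(torch.randn(1, 4, 8))
assert cache is attn.kv_cache  # the hook injected the quantized cache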

src/llmcompressor/modifiers/transform/spinquant/base.py

Lines changed: 18 additions & 4 deletions

@@ -104,7 +104,7 @@ class SpinQuantModifier(Modifier, use_enum_values=True):
     @field_validator("randomize", "learnable", mode="before")
     def validate_not_implemented(cls, value, info: ValidationInfo):
         if value:
-            raise NotImplementedError(f"{info.field_name} is not supported right now")
+            raise NotImplementedError(f"{info.field_name} is not supported as of now")
         return value
 
     @field_validator("rotations", mode="before")

@@ -237,10 +237,24 @@ def _create_r2_scheme(self, model: PreTrainedModel) -> TransformScheme:
 
     def _create_r3_scheme(self) -> TransformScheme:
         raise NotImplementedError(
-            "SpinQuant R3 and R4 rotations will be added in a future release"
+            "SpinQuant R3 rotations will be added in a future release"
         )
 
     def _create_r4_scheme(self) -> TransformScheme:
-        raise NotImplementedError(
-            "SpinQuant R3 and R4 rotations will be added in a future release"
+        return TransformScheme(
+            type=self.transform_type,
+            randomize=self.randomize,
+            requires_grad=self.learnable,
+            precision=self.precision,
+            apply=[
+                TransformArgs(
+                    targets=[*self.mappings.mlp_out],
+                    location="input",
+                ),
+                TransformArgs(
+                    targets=[*self.mappings.mlp_out],
+                    location="weight_input",
+                    inverse=True,
+                ),
+            ],
         )
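The new R4 scheme pairs an online rotation of the `mlp_out` (down-projection) input with its inverse folded into that layer's weights, so in exact arithmetic the pair cancels and only the value distribution seen by the quantizer changes. A small sketch of that algebra, assuming an orthogonal rotation (a generic QR-based `R` stands in here; the actual scheme uses Hadamard-type transforms):

import torch

# For orthogonal R: (W @ R^-1) @ (R @ x) == W @ x, and R^-1 == R.T.
# This is why location="input" paired with an inverse "weight_input"
# transform is lossless before quantization.
d = 8
W = torch.randn(4, d)  # stand-in for an mlp_out (down_proj) weight
x = torch.randn(d)
R, _ = torch.linalg.qr(torch.randn(d, d))  # random orthogonal matrix
W_fused = W @ R.T      # "weight_input" with inverse=True
x_rot = R @ x          # online "input" transform
assert torch.allclose(W_fused @ x_rot, W @ x, atol=1e-5)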

src/llmcompressor/transformers/finetune/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -2,4 +2,3 @@
 
 from .data import TextGenerationDataset
 from .session_mixin import SessionManagerMixIn
-from .text_generation import apply, oneshot, train

src/llmcompressor/transformers/finetune/text_generation.py

Lines changed: 0 additions & 54 deletions
This file was deleted.
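With `text_generation.py` deleted and its re-export dropped from the package `__init__`, code that imported `apply`, `oneshot`, or `train` from `llmcompressor.transformers.finetune` will now raise an ImportError. A hedged migration sketch, assuming the top-level entrypoint documented in current llmcompressor releases (the model name, dataset, and parameter values below are illustrative, not taken from this commit):

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# One-shot quantization via the top-level API; all values are placeholders.
oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    output_dir="TinyLlama-1.1B-W4A16",
    max_seq_length=2048,
    num_calibration_samples=512,
)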

tests/llmcompressor/transformers/kv_cache/test_kv_cache.py

Lines changed: 4 additions & 8 deletions

@@ -231,14 +231,10 @@ def test_kv_cache_gptq_model_state_dict_attr(kv_cache_fixture, tmp_path):
 
     output_dir, _ = next(kv_cache_fixture(recipe, tmp_path))
 
-    with init_empty_weights():
-        # TODO: There is a bug in `apply_quantization_config` which means that, if using
-        # CompressedLinears, the compression status is inferred to `compressed` and
-        # therefore the attention kvcache parameters never undergo initializations
-        model = AutoModelForCausalLM.from_pretrained(
-            output_dir,
-            quantization_config=CompressedTensorsConfig(run_compressed=False),
-        )
+    model = AutoModelForCausalLM.from_pretrained(
+        output_dir,
+        quantization_config=CompressedTensorsConfig(run_compressed=False),
+    )
 
     counts = 0
     for name, submodule in model.named_modules():
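The deleted TODO explained why the load used to be wrapped in `init_empty_weights()`; dropping the workaround suggests the `apply_quantization_config` issue it guarded against has been resolved. A standalone sketch of the load pattern the test now uses (the checkpoint path is a placeholder, and the `k_scale`/`v_scale` attribute names are assumptions drawn from what this test suite asserts):

from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig

# run_compressed=False asks transformers to decompress the checkpoint into
# ordinary Linear modules instead of keeping compressed inference wrappers.
model = AutoModelForCausalLM.from_pretrained(
    "./my-quantized-checkpoint",  # placeholder path
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

# Count attention modules carrying calibrated KV-cache scales.
counts = sum(
    1
    for _, submodule in model.named_modules()
    if hasattr(submodule, "k_scale") and hasattr(submodule, "v_scale")
)
print(counts)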
