Commit 6bb7dc6

Experimental support for 4-bit compression with LUT per layer and per block (#3684)
### Changes

Implemented computation of the codebook based on the k-means algorithm.

### Reason for changes

### Related tickets

CVS-169609
CVS-180243 for leftovers

### Tests

https://github.com/openvinotoolkit/nncf/actions/runs/21363309569

---------

Co-authored-by: Aleksandr Suslov <alexander.suslov@intel.com>
1 parent ae7a584 commit 6bb7dc6
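
The description above notes that the codebook is computed with k-means. As orientation only, here is a minimal, hypothetical sketch of that idea (numpy-only, per-tensor); the actual NNCF implementation works per layer and per block and is not reproduced here:

```python
# Illustrative sketch only, not the NNCF implementation: fit a 16-entry (4-bit)
# codebook to a weight tensor with a few rounds of 1-D k-means (Lloyd's algorithm),
# then map every weight to its nearest codebook entry (the LUT indices).
import numpy as np


def fit_codebook(weights: np.ndarray, n_entries: int = 16, n_iters: int = 20):
    flat = weights.ravel().astype(np.float32)
    # Initialize centroids at evenly spaced quantiles of the weight distribution
    centers = np.quantile(flat, (np.arange(n_entries) + 0.5) / n_entries)
    for _ in range(n_iters):
        # Assignment step: nearest centroid for every weight
        idx = np.abs(flat[:, None] - centers[None, :]).argmin(axis=1)
        # Update step: move each centroid to the mean of its assigned weights
        for k in range(n_entries):
            if np.any(idx == k):
                centers[k] = flat[idx == k].mean()
    codebook = np.sort(centers).astype(np.float32)
    indices = np.abs(flat[:, None] - codebook[None, :]).argmin(axis=1)
    return codebook, indices.reshape(weights.shape)


weights = np.random.randn(256, 64).astype(np.float32)
codebook, indices = fit_codebook(weights)
dequantized = codebook[indices]  # reconstruct weights from the LUT and its indices
```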

File tree

21 files changed, +1337 -25 lines changed
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# Adaptive Codebook Compression for Large Language Models

This example demonstrates how to apply codebook compression to the [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) model. It can be useful for evaluation and early HW enablement purposes.

## Prerequisites

Before running this example, ensure you have Python 3.10+ installed and set up your environment:

### 1. Create and activate a virtual environment

```bash
python3 -m venv nncf_env
source nncf_env/bin/activate  # On Windows: nncf_env\Scripts\activate.bat
```

### 2. Install NNCF and other dependencies

```bash
python3 -m pip install ../../../../ -r requirements.txt
```

## Run Example

To run the example:

```bash
python main.py
```

This will automatically:

- Download the SmolLM2 model and dataset
- Apply weight compression using NNCF
- Save the optimized model
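
Once the example has finished, the saved model can be reloaded for inference. The sketch below is based on the calls made in `main.py`; the output directory name is an assumption taken from `COMPRESSED_MODEL_ID + "_adaptive"` in that script:

```python
# Sketch: reload the compressed model saved by main.py and run a short generation.
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

model_dir = "smollm2_360m_compressed_codebook_adaptive"  # assumed output directory
model = OVModelForCausalLM.from_pretrained(model_dir, export=False)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

inputs = tokenizer("What is the capital of France?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```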
Lines changed: 260 additions & 0 deletions
@@ -0,0 +1,260 @@
# Copyright (c) 2026 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#      http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
from functools import partial

import datasets
import numpy as np
from optimum.intel.openvino import OVModelForCausalLM
from scipy.stats import norm
from torch.jit import TracerWarning
from transformers import AutoTokenizer
from transformers import logging

import nncf
from nncf.quantization.advanced_parameters import AdvancedAdaptiveCodebookParameters

logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=TracerWarning)


MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
COMPRESSED_MODEL_ID = "smollm2_360m_compressed_codebook"


def get_input_shapes(model, batch_size=1) -> dict[str, list[int]]:
    """
    Extract input shapes from the model and configure them with the specified batch size.

    :param model: The model from which to extract input shapes.
    :param batch_size: The batch size to use for the input shapes. Defaults to 1.
    :return: A dictionary mapping input names to their shapes.
    """
    inputs = {}

    for val in model.model.inputs:
        name = val.any_name
        shape = list(val.partial_shape.get_min_shape())
        shape[0] = batch_size
        inputs[name] = shape

    return inputs


def preprocess_fn(example, tokenizer) -> dict[str, str]:
    """
    Preprocess an example by applying the chat template to its messages.

    :param example: The example containing messages to preprocess.
    :param tokenizer: The tokenizer to use for applying the chat template.
    :return: A dictionary with the processed text.
    """
    return {"text": tokenizer.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}


def transform_func(item, tokenizer, input_shapes, max_tokens=128) -> dict[str, np.ndarray]:
    """
    Transform a dataset item into model input format with tokenization and shape handling.

    :param item: The dataset item containing text to transform.
    :param tokenizer: The tokenizer to use for text tokenization.
    :param input_shapes: Dictionary of expected input shapes for the model.
    :param max_tokens: Maximum number of tokens to use from the tokenized text. Defaults to 128.
    :return: A dictionary containing transformed inputs ready for model inference.
    """
    text = item["text"]
    tokens = tokenizer(text)

    res = {
        "input_ids": np.expand_dims(np.array(tokens["input_ids"][:max_tokens]), 0),
        "attention_mask": np.expand_dims(np.array(tokens["attention_mask"][:max_tokens]), 0),
    }

    if "position_ids" in input_shapes:
        position_ids = np.cumsum(res["attention_mask"], axis=1) - 1
        position_ids[res["attention_mask"] == 0] = 1
        res["position_ids"] = position_ids
    batch_size = res["input_ids"].shape[0]

    if "beam_idx" in input_shapes:
        res["beam_idx"] = np.arange(batch_size, dtype=int)

    return res


def get_dataset(model, tokenizer) -> nncf.Dataset:
    """
    Create and prepare a quantization dataset for model compression.

    :param model: The model for which to prepare the dataset.
    :param tokenizer: The tokenizer to use for processing the dataset.
    :return: An NNCF dataset ready for quantization.
    """
    input_shapes = get_input_shapes(model, batch_size=1)

    num_samples = 2048
    dataset = datasets.load_dataset("neuralmagic/LLM_compression_calibration", split="train")
    dataset = dataset.shuffle(seed=42).select(range(num_samples))
    dataset = dataset.map(partial(preprocess_fn, tokenizer=tokenizer))

    quantization_dataset = nncf.Dataset(
        dataset, partial(transform_func, tokenizer=tokenizer, input_shapes=input_shapes)
    )
    return quantization_dataset


def create_normal_distributed_values(n_levels=8) -> np.ndarray:
    """
    Create a codebook of normally distributed values normalized to the range [-1, 1].

    :param n_levels: The number of quantization levels in the codebook. Defaults to 8.
    :return: A numpy array of normalized normally distributed values.
    """
    probs = (np.arange(n_levels) + 0.5) / n_levels

    # Inverse CDF (quantiles) of standard normal distribution
    values = norm.ppf(probs)

    # Normalize to [-1, 1]
    values = values / np.max(np.abs(values))

    return values.astype(np.float32)


def generate_answers(
    questions: list[str], model: OVModelForCausalLM, tokenizer: AutoTokenizer, max_new_tokens: int = 10
) -> dict[str, str]:
    """
    Generate answers for a list of questions using the provided model and tokenizer.

    :param questions: List of questions to be answered.
    :param model: The model to use for generating answers.
    :param tokenizer: The tokenizer to use for processing the input and output.
    :param max_new_tokens: Maximum number of new tokens to generate for each answer. Defaults to 10.
    :return: A dictionary mapping each question to its corresponding answer.
    """
    messages = [
        {"role": "system", "content": "You are a chatbot who always responds as short as possible."},
        {"role": "user", "content": "What is the capital of Spain?"},
        {"role": "assistant", "content": "Madrid."},
    ]
    answers_by_questions = {}

    for question in questions:
        messages.append({"role": "user", "content": question})
        input_ids = tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to(device=model.device)
        input_len = len(input_ids[0])

        output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0]
        answer = tokenizer.decode(output[input_len:], skip_special_tokens=True)
        answers_by_questions[question] = answer
        messages.append({"role": "assistant", "content": answer})

    return answers_by_questions


def print_answers(header: str, answers_by_questions: dict[str, str]) -> None:
    """
    Print the answers to the console.

    :param header: Header to print before the answers.
    :param answers_by_questions: Dictionary mapping questions to their answers.
    """
    print(header)
    for question, answer in answers_by_questions.items():
        print(f"Q: {question}\nA: {answer}\n")


QUESTIONS = [
    "What is the capital of France?",
    "What is the highest peak in the Alps?",
    "What is the largest city in Canada?",
    "What is the most visited city in Japan?",
]


def load_model_and_tokenizer(model_id: str, export=True) -> tuple[OVModelForCausalLM, AutoTokenizer]:
    """
    Load the model and tokenizer from the specified model ID.

    :param model_id: The identifier of the model to load.
    :param export: Whether to export the model for OpenVINO. Defaults to True.
    :return: A tuple containing the loaded model and tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
    model = OVModelForCausalLM.from_pretrained(
        model_id,
        export=export,
        load_in_8bit=False,
    )
    return model, tokenizer


def codebook_example(
    model_id: str, compressed_model_id: str, adaptive_codebook: bool = False, num_elements: int = 10
) -> list[str]:
    """
    Example of using the adaptive codebook compression.

    :param model_id: The identifier of the model to load.
    :param compressed_model_id: The identifier for the compressed model to save.
    :param adaptive_codebook: Whether to use adaptive codebook compression. Defaults to False.
    :param num_elements: Number of elements in the codebook. Defaults to 10.
    :return: A list of answers generated by the model after compression.
    """
    model, tokenizer = load_model_and_tokenizer(model_id)

    answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
    print_answers("Non-optimized model outputs:\n", answers_by_questions)

    codebook = create_normal_distributed_values(num_elements)

    adaptive_codebook_params = AdvancedAdaptiveCodebookParameters(
        num_elements=num_elements, value_type=nncf.tensor.TensorDataType.float16, across_blocks=False
    )
    quantization_dataset = get_dataset(model, tokenizer)

    model.model = nncf.compress_weights(
        model.model,
        mode=nncf.CompressWeightsMode.ADAPTIVE_CODEBOOK if adaptive_codebook else nncf.CompressWeightsMode.CODEBOOK,
        ratio=1.0,
        group_size=-1,
        scale_estimation=True,
        dataset=quantization_dataset,
        advanced_parameters=nncf.AdvancedCompressionParameters(
            codebook=codebook, adaptive_codebook_params=adaptive_codebook_params if adaptive_codebook else None
        ),
    )
    model.save_pretrained(compressed_model_id)
    tokenizer.save_pretrained(compressed_model_id)

    model, tokenizer = load_model_and_tokenizer(compressed_model_id, False)
    answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
    print_answers("Optimized model outputs:\n", answers_by_questions)

    return list(answers_by_questions.values())


def main() -> list[str]:
    """
    Main function that demonstrates adaptive codebook compression.

    :return: A list of answers generated by the compressed model.
    """
    res = codebook_example(MODEL_ID, COMPRESSED_MODEL_ID + "_adaptive", adaptive_codebook=True)
    return res


if __name__ == "__main__":
    main()
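
Note that `main()` only runs the adaptive path. The same `codebook_example` helper also covers the static case: with `adaptive_codebook=False` it compresses with `CompressWeightsMode.CODEBOOK` and the fixed, normally distributed codebook from `create_normal_distributed_values`. A possible invocation, assuming the file above is importable as `main`:

```python
# Hypothetical usage of the helper defined above: compress with the static,
# normally distributed codebook instead of the adaptive (k-means) one.
from main import COMPRESSED_MODEL_ID, MODEL_ID, codebook_example

answers = codebook_example(MODEL_ID, COMPRESSED_MODEL_ID, adaptive_codebook=False, num_elements=16)
```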
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
datasets==4.5.0
openvino==2025.4.1
optimum-intel[openvino]==1.27.0
optimum-onnx==0.1.0
optimum==2.1.0
transformers==4.53.0
onnx==1.17.0
torch==2.9.0
torchvision==0.24.0
pillow==12.0.0
