22 changes: 18 additions & 4 deletions examples/quantization_w4a4_fp4/llama3_example.py
@@ -6,8 +6,19 @@
from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID = "/data5/yliu7/HF_HOME/meta-llama/Llama-3.2-1B-Instruct/"
# MODEL_ID = "meta-llama/Llama-3.3-70B-Instruct"
scheme_name = "NVFP4"
scheme_name = "MXFP4"
# scheme_name = "MXFP8"
# scheme_name = "FP8"

SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + f"-{scheme_name}"
SAVE_DIR = f"/data5/yliu7/HF_HOME/{SAVE_DIR}"
print(f"Saving to {SAVE_DIR}")
Comment on lines +9 to +18

critical

This block contains hardcoded, user-specific paths, repeated redefinitions of MODEL_ID, scheme_name, and SAVE_DIR, and a debug print statement. It appears to be temporary development code that should be removed before merging, as it makes the example non-runnable for other users.
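For reference, a minimal sketch of what this header could look like after cleanup, assuming the example keeps the original public checkpoint and a single scheme selection (the local /data5 paths, the duplicate assignments, and the debug print are dropped):

# Public checkpoint from the Hugging Face Hub; no user-specific paths.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Scheme used by this example; switch to "MXFP4" in a local copy if desired.
scheme_name = "NVFP4"

# Derive the output directory from the model name and the chosen scheme.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + f"-{scheme_name}"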


# Load model.

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

@@ -17,7 +28,7 @@

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 20
NUM_CALIBRATION_SAMPLES = 4
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
@@ -55,7 +66,8 @@ def tokenize(sample):
# * quantize the weights to fp4 with per group 16 via ptq
# * calibrate a global_scale for activations, which will be used to
# quantize activations to fp4 on the fly
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

recipe = QuantizationModifier(targets="Linear", scheme=scheme_name, ignore=["lm_head"])

# Apply quantization.
oneshot(
@@ -69,15 +81,17 @@ def tokenize(sample):
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)

input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
model.device
)
output = model.generate(input_ids, max_new_tokens=100)
output = model.generate(input_ids, max_new_tokens=10)

print(tokenizer.decode(output[0]))
print("==========================================\n\n")


# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"

model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
133 changes: 133 additions & 0 deletions examples/quantizing_moe/deepseek_moe_w4a4_nvfp4.py
@@ -0,0 +1,133 @@
import torch
from datasets import load_dataset
from packaging.version import Version
from transformers import AutoModelForCausalLM, AutoTokenizer, __version__

from llmcompressor import oneshot
from llmcompressor.utils import dispatch_for_generation

# NOTE: transformers 4.49.0 has an attribute error with DeepSeek.
# Please consider either downgrading your transformers version to a
# previous version or upgrading to a version where this bug is fixed

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-V2.5"
MODEL_ID = "/data0/deepseek-ai/DeepSeek-V2-Lite"
MODEL_ID = "/data0/deepseek-ai/DeepSeek-R1"
MODEL_ID = "/data1/DeepSeek-R1-bf16"
Comment on lines +14 to +17

critical

This example script hardcodes multiple MODEL_IDs, including local, user-specific paths. Please use a single, public model ID from the Hugging Face Hub to ensure the example is runnable by others.

Suggested change
-MODEL_ID = "deepseek-ai/DeepSeek-V2.5"
-MODEL_ID = "/data0/deepseek-ai/DeepSeek-V2-Lite"
-MODEL_ID = "/data0/deepseek-ai/DeepSeek-R1"
-MODEL_ID = "/data1/DeepSeek-R1-bf16"
+MODEL_ID = "deepseek-ai/DeepSeek-V2-Lite"


model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2
MAX_SEQUENCE_LENGTH = 2048


# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
return {
"text": tokenizer.apply_chat_template(
example["messages"],
tokenize=False,
)
}


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
return tokenizer(
sample["text"],
padding=False,
max_length=MAX_SEQUENCE_LENGTH,
truncation=True,
add_special_tokens=False,
)


ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for W416 quantization
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
# recipe = "deepseek_recipe_w4a16.yaml"

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation
Comment on lines +66 to +67

medium

These imports should be moved to the top of the file to follow PEP 8 guidelines.
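A minimal sketch of the consolidated import block at the top of the file, assuming nothing else needs to be imported lazily (note that dispatch_for_generation is already imported at the top, so the deferred copy here is also redundant):

# All imports gathered at module top, per PEP 8.
import torch
from datasets import load_dataset
from packaging.version import Version
from transformers import AutoModelForCausalLM, AutoTokenizer, __version__

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation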


recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
save_compressed=True,
trust_remote_code_model=True,
)

# Confirm generations of the quantized model look sane.
# Generation is broken for deepseek models when using the latest transformers package
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
print("==========================================")
else:
print(
"WARNING: cannot perform sample generation of "
"deepseek models with transformers >= 4.48"
)

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)


# # Run the model on vLLM
# try:
# from vllm import LLM, SamplingParams

# vllm_installed = True
# except ImportError:
# vllm_installed = False

# if vllm_installed:
# print("vLLM installed, running using vLLM")
# sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
# llm = LLM(
# model=SAVE_DIR,
# tensor_parallel_size=2,
# trust_remote_code=True,
# max_model_len=1042,
# dtype=torch.half,
# )
# prompts = [
# "The capital of France is",
# "The president of the US is",
# "My name is",
# ]

# outputs = llm.generate(prompts, sampling_params)
# print("================= vLLM GENERATION ======================")
# for output in outputs:
# assert output
# prompt = output.prompt
# generated_text = output.outputs[0].text
# print("PROMPT", prompt)
# print("GENERATED TEXT", generated_text)
Comment on lines +102 to +133

medium

This large block of commented-out code appears to be leftover from development. It should be removed to keep the example clean and focused.

13 changes: 12 additions & 1 deletion src/llmcompressor/modifiers/quantization/calibration.py
@@ -9,6 +9,13 @@
QuantizationStrategy,
)
from compressed_tensors.quantization.lifecycle.forward import forward_quantize
from compressed_tensors.quantization.utils import (
is_fp4,
is_kv_cache_quant_scheme,
is_mx,
is_mxfp4,
)
from compressed_tensors.utils import align_module_device, update_parameter_data
from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
from compressed_tensors.utils import align_module_device, update_offload_parameter
Comment on lines +12 to 20

high

There are duplicate imports in this block. is_kv_cache_quant_scheme and align_module_device are imported twice. Additionally, update_parameter_data is imported but does not appear to be used in this file. Please consolidate and clean up the imports to improve code clarity and maintainability.

Suggested change
-from compressed_tensors.quantization.utils import (
-    is_fp4,
-    is_kv_cache_quant_scheme,
-    is_mx,
-    is_mxfp4,
-)
-from compressed_tensors.utils import align_module_device, update_parameter_data
-from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
-from compressed_tensors.utils import align_module_device, update_offload_parameter
+from compressed_tensors.quantization.utils import (
+    is_fp4,
+    is_kv_cache_quant_scheme,
+    is_mx,
+    is_mxfp4,
+)
+from compressed_tensors.utils import align_module_device, update_offload_parameter

from loguru import logger
@@ -140,6 +147,10 @@ def update_weight_global_scale(module: Module):
!= QuantizationStrategy.TENSOR_GROUP
):
return
weight_quant_args = getattr_chain(module, "quantization_scheme.weights")
if is_mx(quantization_args=weight_quant_args):
# MX schemes do not use global scale
return

call_observer(
module,
Expand Down Expand Up @@ -196,7 +207,7 @@ def calibrate_activations(module: Module, value: torch.Tensor, base_name: str):
if quantization_args is not None:
if quantization_args.dynamic in (True, DynamicType.LOCAL):
calculate_qparams = False
if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
if is_fp4(quantization_args=quantization_args):
calculate_gparam = True

call_observer(
3 changes: 3 additions & 0 deletions src/llmcompressor/modifiers/utils/helpers.py
@@ -2,6 +2,7 @@

import torch
from compressed_tensors.quantization import QuantizationStrategy
from compressed_tensors.quantization.utils import is_fp4, is_mxfp4
from compressed_tensors.utils import align_modules, update_parameter_data
from torch.nn import Linear, Module

@@ -49,6 +50,8 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):

if weight_quant_args.strategy != QuantizationStrategy.TENSOR_GROUP:
return False
if not is_fp4(quantization_args=weight_quant_args):
return False
return True

if _is_attention_module(submodule):
71 changes: 71 additions & 0 deletions src/llmcompressor/transformers/compression/quantization_format.py
@@ -24,6 +24,8 @@ def _get_quant_compression_format(
is_weight_only = weight_args is not None and input_args is None

if weight_args.num_bits == 4 and weight_args.type == QuantizationType.FLOAT.value:
if weight_args.is_mx:
return CompressionFormat.mxfp4_pack_quantized
return CompressionFormat.nvfp4_pack_quantized

if is_weight_only: # w4a16 and w8a16
@@ -55,6 +57,30 @@ def _get_quant_compression_format(
return CompressionFormat.naive_quantized


def _get_unique_quant_args(model):
"""
Gets a list of all the unique quantization settings present in model
"""
from compressed_tensors.quantization.utils import (
is_model_quantized,
is_module_quantized,
iter_named_leaf_modules,
)
quant_info_weight = []
quant_info_inputs = []
for _, submodule in iter_named_leaf_modules(model):
if is_module_quantized(submodule):
weight_scheme = submodule.quantization_scheme.weights
input_scheme = submodule.quantization_scheme.input_activations
if weight_scheme is not None:
if weight_scheme not in quant_info_weight:
quant_info_weight.append(weight_scheme)
if input_scheme is not None:
if input_scheme not in quant_info_inputs:
quant_info_inputs.append(input_scheme)

return quant_info_weight, quant_info_inputs

def infer_and_set_per_module_quantization_format(
model,
quantization_format: Optional[str] = None,
@@ -79,6 +105,51 @@ def infer_and_set_per_module_quantization_format(

if not save_compressed:
return None
# if save_compressed:
# weight_args, input_args = _get_unique_quant_args(model)
# is_24_structure = (
# SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
# )
# is_weight_only = len(input_args) == 0 and len(weight_args) > 0
# if (
# weight_args[0].num_bits == 4
# and weight_args[0].type == QuantizationType.FLOAT.value
# ):
# if weight_args[0].is_mx:
# return CompressionFormat.mxfp4_pack_quantized
# else:
# return CompressionFormat.nvfp4_pack_quantized

# if is_weight_only: # w4a16 and w8a16
# is_valid_pack = all(
# weight_arg.num_bits in [4, 8]
# and weight_arg.type == QuantizationType.INT.value
# for weight_arg in weight_args
# )
# if not is_valid_pack: # packing only valid for int4 and int 8
# return CompressionFormat.naive_quantized
# if is_24_structure:
# for arg in weight_args:
# if (
# arg.strategy is not QuantizationStrategy.CHANNEL.value
# and arg.strategy is not QuantizationStrategy.GROUP.value
# ):
# # marlin24 kernel only applicable for channel/group quantization
# return CompressionFormat.pack_quantized
# return CompressionFormat.marlin_24
# return CompressionFormat.pack_quantized
# else: # w8a8 float and int
# if len(weight_args) == 1:
# if (
# weight_args[0].type == QuantizationType.FLOAT.value
# and weight_args[0].num_bits == 8
# ):
# return CompressionFormat.float_quantized
# if weight_args[0].type == QuantizationType.INT.value:
# return CompressionFormat.int_quantized

# return CompressionFormat.naive_quantized


if quantization_format:
return [quantization_format]