diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py
index 63068fde12..96ab5d49d9 100644
--- a/examples/quantization_w4a4_fp4/llama3_example.py
+++ b/examples/quantization_w4a4_fp4/llama3_example.py
@@ -6,8 +6,19 @@
 from llmcompressor.utils import dispatch_for_generation
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "/data5/yliu7/HF_HOME/meta-llama/Llama-3.2-1B-Instruct/"
+# MODEL_ID = "meta-llama/Llama-3.3-70B-Instruct"
+scheme_name = "NVFP4"
+scheme_name = "MXFP4"
+# scheme_name = "MXFP8"
+# scheme_name = "FP8"
+
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + f"-{scheme_name}"
+SAVE_DIR = f"/data5/yliu7/HF_HOME/{SAVE_DIR}"
+print(f"Saving to {SAVE_DIR}")
 
 # Load model.
+
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
@@ -17,7 +28,7 @@
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 20
+NUM_CALIBRATION_SAMPLES = 4
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -55,7 +66,8 @@ def tokenize(sample):
 # * quantize the weights to fp4 with per group 16 via ptq
 # * calibrate a global_scale for activations, which will be used to
 #   quantize activations to fp4 on the fly
-recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])
+
+recipe = QuantizationModifier(targets="Linear", scheme=scheme_name, ignore=["lm_head"])
 
 # Apply quantization.
 oneshot(
@@ -69,15 +81,17 @@ def tokenize(sample):
 
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
+
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
     model.device
 )
-output = model.generate(input_ids, max_new_tokens=100)
+output = model.generate(input_ids, max_new_tokens=10)
+
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
 
 # Save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
+
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/examples/quantizing_moe/deepseek_moe_w4a4_nvfp4.py b/examples/quantizing_moe/deepseek_moe_w4a4_nvfp4.py
new file mode 100644
index 0000000000..d64f3a9772
--- /dev/null
+++ b/examples/quantizing_moe/deepseek_moe_w4a4_nvfp4.py
@@ -0,0 +1,133 @@
+import torch
+from datasets import load_dataset
+from packaging.version import Version
+from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
+
+from llmcompressor import oneshot
+from llmcompressor.utils import dispatch_for_generation
+
+# NOTE: transformers 4.49.0 has an attribute error with DeepSeek.
+# Please consider either downgrading your transformers version to a
+# previous version or upgrading to a version where this bug is fixed
+
+# select a Mixture of Experts model for quantization
+MODEL_ID = "deepseek-ai/DeepSeek-V2.5"
+MODEL_ID = "/data0/deepseek-ai/DeepSeek-V2-Lite"
+MODEL_ID = "/data0/deepseek-ai/DeepSeek-R1"
+MODEL_ID = "/data1/DeepSeek-R1-bf16"
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+NUM_CALIBRATION_SAMPLES = 2
+MAX_SEQUENCE_LENGTH = 2048
+
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# define an llmcompressor recipe for NVFP4 (W4A4) quantization
+# the lm_head layer is added to the ignore list so that it remains
+# at full precision
+# recipe = "deepseek_recipe_w4a16.yaml"
+
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
+
+recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])
+
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    save_compressed=True,
+    trust_remote_code_model=True,
+)
+
+# Confirm generations of the quantized model look sane.
+# Generation is broken for deepseek models when using the latest transformers package.
+if Version(__version__) < Version("4.48"):
+    print("========== SAMPLE GENERATION ==============")
+    dispatch_for_generation(model)
+    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+    output = model.generate(input_ids, max_new_tokens=20)
+    print(tokenizer.decode(output[0]))
+    print("==========================================")
+else:
+    print(
+        "WARNING: cannot perform sample generation of "
+        "deepseek models with transformers >= 4.48"
+    )
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
+
+# # Run the model on vLLM
+# try:
+#     from vllm import LLM, SamplingParams
+
+#     vllm_installed = True
+# except ImportError:
+#     vllm_installed = False
+
+# if vllm_installed:
+#     print("vLLM installed, running using vLLM")
+#     sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
+#     llm = LLM(
+#         model=SAVE_DIR,
+#         tensor_parallel_size=2,
+#         trust_remote_code=True,
+#         max_model_len=1042,
+#         dtype=torch.half,
+#     )
+#     prompts = [
+#         "The capital of France is",
+#         "The president of the US is",
+#         "My name is",
+#     ]
+
+#     outputs = llm.generate(prompts, sampling_params)
+#     print("================= vLLM GENERATION ======================")
+#     for output in outputs:
+#         assert output
+#         prompt = output.prompt
+#         generated_text = output.outputs[0].text
+#         print("PROMPT", prompt)
+#         print("GENERATED TEXT", generated_text)
diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py
index 2900f6bd3a..b212582859 100644
--- a/src/llmcompressor/modifiers/quantization/calibration.py
+++ b/src/llmcompressor/modifiers/quantization/calibration.py
@@ -9,6 +9,13 @@
     QuantizationStrategy,
 )
 from compressed_tensors.quantization.lifecycle.forward import forward_quantize
+from compressed_tensors.quantization.utils import (
+    is_fp4,
+    is_kv_cache_quant_scheme,
+    is_mx,
+    is_mxfp4,
+)
+from compressed_tensors.utils import align_module_device, update_parameter_data
 from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
 from compressed_tensors.utils import align_module_device, update_offload_parameter
 from loguru import logger
@@ -140,6 +147,10 @@ def update_weight_global_scale(module: Module):
         != QuantizationStrategy.TENSOR_GROUP
     ):
         return
+    weight_quant_args = getattr_chain(module, "quantization_scheme.weights")
+    if is_mx(quantization_args=weight_quant_args):
+        # MX schemes do not use global scale
+        return
 
     call_observer(
         module,
@@ -196,7 +207,7 @@ def calibrate_activations(module: Module, value: torch.Tensor, base_name: str):
     if quantization_args is not None:
         if quantization_args.dynamic in (True, DynamicType.LOCAL):
             calculate_qparams = False
-        if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
+        if is_fp4(quantization_args=quantization_args):
             calculate_gparam = True
 
     call_observer(
diff --git a/src/llmcompressor/modifiers/utils/helpers.py b/src/llmcompressor/modifiers/utils/helpers.py
index a10af31567..e489a1ed12 100644
--- a/src/llmcompressor/modifiers/utils/helpers.py
+++ b/src/llmcompressor/modifiers/utils/helpers.py
@@ -2,6 +2,7 @@
 
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
+from compressed_tensors.quantization.utils import is_fp4, is_mxfp4
 from compressed_tensors.utils import align_modules, update_parameter_data
 from torch.nn import Linear, Module
 
@@ -49,6 +50,8 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
 
             if weight_quant_args.strategy != QuantizationStrategy.TENSOR_GROUP:
                 return False
+            if not is_fp4(quantization_args=weight_quant_args):
+                return False
         return True
 
     if _is_attention_module(submodule):
diff --git a/src/llmcompressor/transformers/compression/quantization_format.py b/src/llmcompressor/transformers/compression/quantization_format.py
index f583533af3..650d7c4de1 100644
--- a/src/llmcompressor/transformers/compression/quantization_format.py
+++ b/src/llmcompressor/transformers/compression/quantization_format.py
@@ -24,6 +24,8 @@ def _get_quant_compression_format(
     is_weight_only = weight_args is not None and input_args is None
 
     if weight_args.num_bits == 4 and weight_args.type == QuantizationType.FLOAT.value:
+        if weight_args.is_mx:
+            return CompressionFormat.mxfp4_pack_quantized
         return CompressionFormat.nvfp4_pack_quantized
 
     if is_weight_only:  # w4a16 and w8a16
@@ -55,6 +57,30 @@
     return CompressionFormat.naive_quantized
 
 
+def _get_unique_quant_args(model):
+    """
+    Get a list of all the unique quantization settings present in the model.
+    """
+    from compressed_tensors.quantization.utils import (
+        is_model_quantized,
+        is_module_quantized,
+        iter_named_leaf_modules,
+    )
+    quant_info_weight = []
+    quant_info_inputs = []
+    for _, submodule in iter_named_leaf_modules(model):
+        if is_module_quantized(submodule):
+            weight_scheme = submodule.quantization_scheme.weights
+            input_scheme = submodule.quantization_scheme.input_activations
+            if weight_scheme is not None:
+                if weight_scheme not in quant_info_weight:
+                    quant_info_weight.append(weight_scheme)
+            if input_scheme is not None:
+                if input_scheme not in quant_info_inputs:
+                    quant_info_inputs.append(input_scheme)
+
+    return quant_info_weight, quant_info_inputs
+
 def infer_and_set_per_module_quantization_format(
     model,
     quantization_format: Optional[str] = None,
@@ -79,6 +105,51 @@ def infer_and_set_per_module_quantization_format(
     if not save_compressed:
         return None
 
+    # if save_compressed:
+    #     weight_args, input_args = _get_unique_quant_args(model)
+    #     is_24_structure = (
+    #         SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
+    #     )
+    #     is_weight_only = len(input_args) == 0 and len(weight_args) > 0
+    #     if (
+    #         weight_args[0].num_bits == 4
+    #         and weight_args[0].type == QuantizationType.FLOAT.value
+    #     ):
+    #         if weight_args[0].is_mx:
+    #             return CompressionFormat.mxfp4_pack_quantized
+    #         else:
+    #             return CompressionFormat.nvfp4_pack_quantized
+
+    #     if is_weight_only:  # w4a16 and w8a16
+    #         is_valid_pack = all(
+    #             weight_arg.num_bits in [4, 8]
+    #             and weight_arg.type == QuantizationType.INT.value
+    #             for weight_arg in weight_args
+    #         )
+    #         if not is_valid_pack:  # packing only valid for int4 and int 8
+    #             return CompressionFormat.naive_quantized
+    #         if is_24_structure:
+    #             for arg in weight_args:
+    #                 if (
+    #                     arg.strategy is not QuantizationStrategy.CHANNEL.value
+    #                     and arg.strategy is not QuantizationStrategy.GROUP.value
+    #                 ):
+    #                     # marlin24 kernel only applicable for channel/group quantization
+    #                     return CompressionFormat.pack_quantized
+    #             return CompressionFormat.marlin_24
+    #         return CompressionFormat.pack_quantized
+    #     else:  # w8a8 float and int
+    #         if len(weight_args) == 1:
+    #             if (
+    #                 weight_args[0].type == QuantizationType.FLOAT.value
+    #                 and weight_args[0].num_bits == 8
+    #             ):
+    #                 return CompressionFormat.float_quantized
+    #             if weight_args[0].type == QuantizationType.INT.value:
+    #                 return CompressionFormat.int_quantized
+
+    #     return CompressionFormat.naive_quantized
+
     if quantization_format:
         return [quantization_format]
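
For reference, a minimal self-contained sketch of the format-selection rule added to _get_quant_compression_format above. FP4Args and pick_fp4_format are illustrative stand-ins, not llm-compressor or compressed-tensors APIs; the real code consults the weight QuantizationArgs and CompressionFormat members shown in the hunk, with mxfp4_pack_quantized assumed to come from a companion compressed-tensors change.

from dataclasses import dataclass


@dataclass
class FP4Args:
    # Stand-in for the weight quantization args fields consulted by the patch.
    num_bits: int
    type: str    # "float" or "int"
    is_mx: bool  # True for MX block formats such as MXFP4


def pick_fp4_format(weight_args: FP4Args) -> str:
    # Mirror the 4-bit float branch: MX weights route to the MXFP4 packed
    # format, everything else keeps the existing NVFP4 packed format.
    if weight_args.num_bits == 4 and weight_args.type == "float":
        return "mxfp4_pack_quantized" if weight_args.is_mx else "nvfp4_pack_quantized"
    return "naive_quantized"  # the real function continues with further branches here


assert pick_fp4_format(FP4Args(num_bits=4, type="float", is_mx=True)) == "mxfp4_pack_quantized"
assert pick_fp4_format(FP4Args(num_bits=4, type="float", is_mx=False)) == "nvfp4_pack_quantized"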