
[MoE] Cleanup MoE examples #1576

Open: wants to merge 24 commits into base `main`

32 changes: 16 additions & 16 deletions examples/quantizing_moe/README.md
@@ -17,17 +17,17 @@ pip install -e .
The provided example script demonstrates an end-to-end process for applying the quantization algorithm:

```bash
-python3 mixtral_moe_w8a8_fp8.py
+python3 mixtral_example.py
```

## Creating a Quantized MoE Model

-This example leverages `llm-compressor` and `compressed-tensors` to create an FP8-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `open_platypus` dataset.
+This example leverages `llm-compressor` and `compressed-tensors` to create an FP8-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `ultrachat_200k` dataset.

You can follow the detailed steps below or simply run the example script with:

```bash
-python mixtral_moe_w8a8_fp8.py
+python mixtral_example.py
```
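
For orientation, a condensed version of what `mixtral_example.py` does might look like the sketch below (a minimal sketch, assuming the preset `FP8` scheme and the `ultrachat_200k` calibration set mentioned above; the exact recipe, ignore list, and `oneshot` arguments in the shipped script may differ):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# FP8 quantization of all Linear layers; the MoE gate layers are kept at full
# precision because they are sensitive to quantization (the ignore pattern here
# is illustrative, not copied from the shipped script).
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8",
    ignore=["lm_head", "re:.*block_sparse_moe.gate"],
)

# "ultrachat_200k" is assumed to be a registered dataset alias; a pre-tokenized
# datasets.Dataset can be passed instead, as the DeepSeek examples below do.
oneshot(
    model=model,
    dataset="ultrachat_200k",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)

# Save in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[-1] + "-FP8"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```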

### Step 1: Select a Model, Dataset, and Recipe
@@ -74,7 +74,7 @@ NOTE: Only per-tensor quantization is supported in vLLM as of now (`vllm==0.6.1`

The repository supports multiple quantization techniques configured via a recipe. Supported strategies include `tensor`, `group`, and `channel` quantization.

-In the above example, FP8 per-tensor quantization is used as specified by the `FP8` scheme. For other preset schemes, refer to the [quantization schemes](https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py) in the `compressed-tensors` library.
+In the above example, quantization is specified by the `W4A16` scheme. For other preset schemes, refer to the [quantization schemes](https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py) in the `compressed-tensors` library.
Member commented: Isn't this using the FP8 scheme above?

A custom scheme can also be specified using `config_groups`:

@@ -84,18 +84,18 @@ A custom scheme can also be specified using `config_groups`:
from llmcompressor.modifiers.quantization.gptq import GPTQModifier

config_groups = {
"group_0": {
"targets": ["Linear"],
"input_activations": None,
"output_activations": None,
"weights": {
"num_bits": 8,
"type": "int",
"symmetric": true,
"strategy": "group",
"group_size": 128,
}
}
"group_0": {
"targets": ["Linear"],
"input_activations": None,
"output_activations": None,
"weights": {
"num_bits": 8,
"type": "int",
"symmetric": true,
"strategy": "group",
"group_size": 128,
}
}
}

recipe = GPTQModifier(config_groups=config_groups)
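
Such a custom recipe is applied the same way as a preset scheme, by passing it to `oneshot`. The sketch below is a minimal illustration; `model` and `recipe` are assumed to be defined as in the snippet above, and the dataset alias and calibration settings are illustrative:

```python
from llmcompressor.transformers import oneshot

# `model` and `recipe` are assumed to be defined as in the snippet above;
# the dataset alias and calibration settings below are illustrative.
oneshot(
    model=model,
    dataset="ultrachat_200k",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```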
125 changes: 0 additions & 125 deletions examples/quantizing_moe/deepseek_moe_w4a16.py

This file was deleted.

8 changes: 0 additions & 8 deletions examples/quantizing_moe/deepseek_recipe_w4a16.yaml

This file was deleted.

@@ -12,18 +12,17 @@
# previous version or upgrading to a version where this bug is fixed

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
MODEL_ID = "deepseek-ai/DeepSeek-V2.5"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
Member commented on lines 17 to 19: Don't we need `device_map="auto"` to make this huge model fit? I think it would be nice to still keep around a small MoE example.

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# it's recommended to use more calibration samples for MoE models so each expert is hit
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
-NUM_CALIBRATION_SAMPLES = 2048
+NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


@@ -57,16 +56,12 @@ def tokenize(sample):

ds = ds.map(tokenize, remove_columns=ds.column_names)

-# define a llmcompressor recipe for INT8 W8A8 quantization
+# Configure the quantization algorithm to run.
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
-recipe = [
-    GPTQModifier(
-        targets="Linear",
-        scheme="W8A8",
-        ignore=["lm_head", "re:.*mlp.gate$"],
-    ),
-]
+recipe = GPTQModifier(
+    targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"]
+)

oneshot(
    model=model,
@@ -82,12 +77,10 @@ def tokenize(sample):
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================")
else:
    print(
@@ -96,6 +89,6 @@
    )

# Save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
88 changes: 88 additions & 0 deletions examples/quantizing_moe/deepseekv3_example.py
@@ -0,0 +1,88 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modeling import prepare_for_quantization
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
# For DeepSeekv3, we require a full precision model in order to properly calibrate
# `DeepSeek-V3-BF16` is a DeepSeek-V3 FP8 model which has been converted to BF16
model_id = "RedHatAI/DeepSeek-V3-BF16"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
Member commented: Don't we need to fit this across GPUs?

Collaborator (author) replied: No, #1263 landed.

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = prepare_for_quantization(model)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm to run.
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
recipe = GPTQModifier(
    targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"]
)

# Apply algorithms.
# due to the large size of DeepSeekV3, we specify sequential targets such that
# only one MLP is loaded into GPU memory at a time
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"],
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
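
Once saved with `save_compressed=True`, the checkpoint is stored in compressed-tensors format and can be loaded for inference, for example with vLLM. The snippet below is a minimal sketch, assuming a vLLM build with compressed-tensors W4A16 support; the directory name matches the `SAVE_DIR` produced above, and the parallelism setting is illustrative:

```python
from vllm import LLM, SamplingParams

# Load the compressed-tensors checkpoint produced by the script above.
# A model this large must be sharded across GPUs; the value here is illustrative.
llm = LLM(model="DeepSeek-V3-BF16-W4A16-G128", tensor_parallel_size=8)

outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```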