examples/quantization_w8a8_fp8/fp8_block_example.py (2 additions, 2 deletions)
@@ -3,7 +3,7 @@
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
-MODEL_ID = "Qwen/Qwen3-0.6B"
+MODEL_ID = "Qwen/Qwen3-30B-A3B"
 
 # Load model.
 model = AutoModelForCausalLM.from_pretrained(
@@ -16,7 +16,7 @@
 # * quantize the weights to fp8 with block-wise scales via ptq
 # * quantize the activations to fp8 with dynamic per token scales
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head"]
+    targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
 )
 
 # Apply quantization.
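Note on the expanded ignore list: Qwen3-30B-A3B is a mixture-of-experts model, and its router gates are small `Linear` layers whose outputs select experts, so they stay in high precision alongside `lm_head` (the `shared_expert_gate` pattern covers MoE variants that have shared experts). Entries prefixed with `re:` are matched as regular expressions against module names. A minimal sketch of how the patterns resolve; the module names below are assumed from typical Qwen MoE naming, not read from the checkpoint:

```python
import re

# Illustrative module names (assumptions, following Qwen MoE conventions).
module_names = [
    "model.layers.0.mlp.gate",                 # MoE router gate
    "model.layers.0.mlp.shared_expert_gate",   # shared-expert gate
    "model.layers.0.mlp.experts.7.gate_proj",  # expert projection (still quantized)
    "lm_head",
]

# The "re:" prefix in the recipe marks these entries as regex patterns.
patterns = [r".*mlp.gate$", r".*mlp.shared_expert_gate$"]

for name in module_names:
    ignored = name == "lm_head" or any(re.match(p, name) for p in patterns)
    print(f"{name}: {'ignored' if ignored else 'quantized'}")
```

The unescaped `.` in each pattern is a regex wildcard that happens to match the literal dot in the module path, while the `$` anchor keeps `gate_proj` layers inside the experts quantized.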
src/llmcompressor/observers/base.py (3 additions, 1 deletion)
@@ -233,8 +233,10 @@ def get_qparams(
                     c0 = j * block_cols
                     c1 = min((j + 1) * block_cols, cols)
                     # reduce across both dims to get one scale and zp per block
+                    # Use unique tensor_id for each block to maintain separate running stats
+                    block_tensor_id = f"block_{i}_{j}"
                     scale_bp, zp_bp = self.calculate_qparams(
-                        observed[r0:r1, c0:c1], reduce_dims=(0, 1)
+                        observed[r0:r1, c0:c1], reduce_dims=(0, 1), tensor_id=block_tensor_id
                     )
                     self._scale[i, j] = scale_bp
                     self._zero_point[i, j] = zp_bp
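The observer keeps running statistics keyed by `tensor_id`, so before this change every block updated the same default entry and min/max estimates were blended across unrelated blocks. A minimal standalone sketch of the idea, assuming a moving-average min/max observer; `RunningMinMax` is illustrative, not the llm-compressor class:

```python
import torch

class RunningMinMax:
    """Toy observer: one moving-average (min, max) pair per tensor_id."""

    def __init__(self, averaging_constant: float = 0.01):
        self.averaging_constant = averaging_constant
        self.stats = {}  # tensor_id -> (running_min, running_max)

    def update(self, observed: torch.Tensor, tensor_id: str):
        mn, mx = observed.min(), observed.max()
        if tensor_id not in self.stats:
            self.stats[tensor_id] = (mn, mx)
        else:
            old_mn, old_mx = self.stats[tensor_id]
            c = self.averaging_constant
            self.stats[tensor_id] = (
                old_mn + c * (mn - old_mn),  # moving average of the block min
                old_mx + c * (mx - old_mx),  # moving average of the block max
            )
        return self.stats[tensor_id]

obs = RunningMinMax()
weight = torch.randn(256, 256)
for i in range(2):
    for j in range(2):
        block = weight[i * 128:(i + 1) * 128, j * 128:(j + 1) * 128]
        # Unique id per block, mirroring block_tensor_id = f"block_{i}_{j}" above.
        obs.update(block, tensor_id=f"block_{i}_{j}")
print(sorted(obs.stats))  # four independent entries: block_0_0 ... block_1_1
```

Passing one shared id for all four blocks would collapse `obs.stats` to a single entry whose min/max mixes every block, which is exactly what the per-block `tensor_id` in the diff avoids.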