examples/quantization_w8a8_fp8/fp8_block_example.py (2 additions, 2 deletions)
@@ -3,7 +3,7 @@
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
-MODEL_ID = "Qwen/Qwen3-0.6B"
+MODEL_ID = "Qwen/Qwen3-30B-A3B"
 
 # Load model.
 model = AutoModelForCausalLM.from_pretrained(
@@ -16,7 +16,7 @@
 # * quantize the weights to fp8 with block-wise scales via ptq
 # * quantize the activations to fp8 with dynamic per token scales
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head"]
+    targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
 )
 
 # Apply quantization.
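Note on the expanded ignore list: Qwen3-30B-A3B is a mixture-of-experts model, and its router gates are small `Linear` layers whose outputs select experts, so they stay in high precision alongside `lm_head` (the `shared_expert_gate` pattern covers MoE variants that have shared experts). Entries prefixed with `re:` are matched as regular expressions against module names. A minimal sketch of how the patterns resolve; the module names below are assumed from typical Qwen MoE naming, not read from the checkpoint:

```python
import re

# Illustrative module names (assumptions, following Qwen MoE conventions).
module_names = [
    "model.layers.0.mlp.gate",                 # MoE router gate
    "model.layers.0.mlp.shared_expert_gate",   # shared-expert gate
    "model.layers.0.mlp.experts.7.gate_proj",  # expert projection (still quantized)
    "lm_head",
]

# The "re:" prefix in the recipe marks these entries as regex patterns.
patterns = [r".*mlp.gate$", r".*mlp.shared_expert_gate$"]

for name in module_names:
    ignored = name == "lm_head" or any(re.match(p, name) for p in patterns)
    print(f"{name}: {'ignored' if ignored else 'quantized'}")
```

The unescaped `.` in each pattern is a regex wildcard that happens to match the literal dot in the module path, while the `$` anchor keeps `gate_proj` layers inside the experts quantized.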
src/llmcompressor/observers/base.py (3 additions, 1 deletion)
@@ -233,8 +233,10 @@ def get_qparams(
                     c0 = j * block_cols
                     c1 = min((j + 1) * block_cols, cols)
                     # reduce across both dims to get one scale and zp per block
+                    # Use unique tensor_id for each block to maintain separate running stats
+                    block_tensor_id = f"block_{i}_{j}"
                     scale_bp, zp_bp = self.calculate_qparams(
-                        observed[r0:r1, c0:c1], reduce_dims=(0, 1)
+                        observed[r0:r1, c0:c1], reduce_dims=(0, 1), tensor_id=block_tensor_id
                     )
                     self._scale[i, j] = scale_bp
                     self._zero_point[i, j] = zp_bp
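The observer keeps running statistics keyed by `tensor_id`, so before this change every block updated the same default entry and min/max estimates were blended across unrelated blocks. A minimal standalone sketch of the idea, assuming a moving-average min/max observer; `RunningMinMax` is illustrative, not the llm-compressor class:

```python
import torch

class RunningMinMax:
    """Toy observer: one moving-average (min, max) pair per tensor_id."""

    def __init__(self, averaging_constant: float = 0.01):
        self.averaging_constant = averaging_constant
        self.stats = {}  # tensor_id -> (running_min, running_max)

    def update(self, observed: torch.Tensor, tensor_id: str):
        mn, mx = observed.min(), observed.max()
        if tensor_id not in self.stats:
            self.stats[tensor_id] = (mn, mx)
        else:
            old_mn, old_mx = self.stats[tensor_id]
            c = self.averaging_constant
            self.stats[tensor_id] = (
                old_mn + c * (mn - old_mn),  # moving average of the block min
                old_mx + c * (mx - old_mx),  # moving average of the block max
            )
        return self.stats[tensor_id]

obs = RunningMinMax()
weight = torch.randn(256, 256)
for i in range(2):
    for j in range(2):
        block = weight[i * 128:(i + 1) * 128, j * 128:(j + 1) * 128]
        # Unique id per block, mirroring block_tensor_id = f"block_{i}_{j}" above.
        obs.update(block, tensor_id=f"block_{i}_{j}")
print(sorted(obs.stats))  # four independent entries: block_0_0 ... block_1_1
```

Passing one shared id for all four blocks would collapse `obs.stats` to a single entry whose min/max mixes every block, which is exactly what the per-block `tensor_id` in the diff avoids.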