Skip to content
4 changes: 2 additions & 2 deletions examples/quantization_w8a8_fp8/fp8_block_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-0.6B"
MODEL_ID = "Qwen/Qwen3-30B-A3B"

# Load model.
model = AutoModelForCausalLM.from_pretrained(
Expand All @@ -16,7 +16,7 @@
# * quantize the weights to fp8 block-wise (one scale per weight block) via ptq
# * quantize the activations to fp8 with dynamic per token
recipe = QuantizationModifier(
targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head"]
targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head", "re:.*mlp.gate$"],
)

# Apply quantization.
Expand Down
12 changes: 11 additions & 1 deletion src/llmcompressor/observers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,18 @@ def calculate_qparams(
self,
observed: Tensor,
reduce_dims: Optional[Tuple[int]] = None,
tensor_id: Optional[Any] = None,
global_scale: Optional[Tensor] = None,
) -> Tuple[FloatTensor, IntTensor]:
"""
:param observed: observed tensor to calculate quantization parameters for
:param reduce_dims: optional tuple of dimensions to reduce along,
returned scale and zero point will be shaped (1,) along the
reduced dimensions
:param tensor_id: optional id used to track separate statistics for each
slice of the observed tensor that is passed in, e.g. when the tensor is
split by group_size or into blocks for block quantization
:param global_scale: optional scale to further scale local quantization scales
:return: tuple of scale and zero point derived from the observed tensor
"""
raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
Expand Down Expand Up @@ -233,8 +239,12 @@ def get_qparams(
c0 = j * block_cols
c1 = min((j + 1) * block_cols, cols)
# reduce across both dims to get one scale and zp per block
# Use unique tensor_id for each block to maintain separate stats
block_tensor_id = f"block_{i}_{j}"
scale_bp, zp_bp = self.calculate_qparams(
observed[r0:r1, c0:c1], reduce_dims=(0, 1)
observed[r0:r1, c0:c1],
reduce_dims=(0, 1),
tensor_id=block_tensor_id,
)
self._scale[i, j] = scale_bp
self._zero_point[i, j] = zp_bp
Expand Down
Loading