Skip to content

pass in tensor_id for calculate_qparams #1709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Aug 7, 2025
4 changes: 2 additions & 2 deletions examples/quantization_w8a8_fp8/fp8_block_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-0.6B"
MODEL_ID = "Qwen/Qwen3-30B-A3B"

# Load model.
model = AutoModelForCausalLM.from_pretrained(
Expand All @@ -16,7 +16,7 @@
# * quantize the weights to fp8 with block-wise scales via ptq
# * quantize the activations to fp8 with dynamic per-token scales
recipe = QuantizationModifier(
targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head"]
targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head", "re:.*mlp.gate$"],
)

# Apply quantization.
Expand Down
12 changes: 11 additions & 1 deletion src/llmcompressor/observers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,18 @@ def calculate_qparams(
self,
observed: Tensor,
reduce_dims: Optional[Tuple[int]] = None,
tensor_id: Optional[Any] = None,
global_scale: Optional[Tensor] = None,
) -> Tuple[FloatTensor, IntTensor]:
"""
:param observed: observed tensor to calculate quantization parameters for
:param reduce_dims: optional tuple of dimensions to reduce along,
returned scale and zero point will be shaped (1,) along the
reduced dimensions
:param tensor_id: optional id for tracking separate statistics when different
ranges of observed tensors are passed, useful for sharding tensors by
group_size or block quantization
:param global_scale: optional scale to further scale local quantization scales
:return: tuple of scale and zero point derived from the observed tensor
"""
raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
Expand Down Expand Up @@ -233,8 +239,12 @@ def get_qparams(
c0 = j * block_cols
c1 = min((j + 1) * block_cols, cols)
# reduce across both dims to get one scale and zp per block
# Use unique tensor_id for each block to maintain separate stats
block_tensor_id = f"block_{i}_{j}"
scale_bp, zp_bp = self.calculate_qparams(
observed[r0:r1, c0:c1], reduce_dims=(0, 1)
observed[r0:r1, c0:c1],
reduce_dims=(0, 1),
tensor_id=block_tensor_id,
)
self._scale[i, j] = scale_bp
self._zero_point[i, j] = zp_bp
Expand Down
Loading