diff --git a/examples/quantization_w8a8_fp8/fp8_block_example.py b/examples/quantization_w8a8_fp8/fp8_block_example.py
index b5d6ca1f9..e977110ad 100644
--- a/examples/quantization_w8a8_fp8/fp8_block_example.py
+++ b/examples/quantization_w8a8_fp8/fp8_block_example.py
@@ -3,7 +3,7 @@
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
-MODEL_ID = "Qwen/Qwen3-0.6B"
+MODEL_ID = "Qwen/Qwen3-30B-A3B"
 
 # Load model.
 model = AutoModelForCausalLM.from_pretrained(
@@ -16,7 +16,7 @@
 # * quantize the weights to fp8 with per channel via ptq
 # * quantize the activations to fp8 with dynamic per token
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head"]
+    targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head", "re:.*mlp.gate$"],
 )
 
 # Apply quantization.
diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py
index a435f3c73..aa9e1caab 100644
--- a/src/llmcompressor/observers/base.py
+++ b/src/llmcompressor/observers/base.py
@@ -63,12 +63,18 @@ def calculate_qparams(
         self,
         observed: Tensor,
         reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
+        global_scale: Optional[Tensor] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         :param observed: observed tensor to calculate quantization parameters for
         :param reduce_dims: optional tuple of dimensions to reduce along,
             returned scale and zero point will be shaped (1,) along the
             reduced dimensions
+        :param tensor_id: optional id for tracking separate statistics when different
+            ranges of observed tensors are passed, useful for sharding tensors by
+            group_size or block quantization
+        :param global_scale: optional scale to further scale local quantization scales
         :return: tuple of scale and zero point derived from the observed tensor
         """
         raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
@@ -233,8 +239,12 @@ def get_qparams(
                 c0 = j * block_cols
                 c1 = min((j + 1) * block_cols, cols)
                 # reduce across both dims to get one scale and zp per block
+                # Use unique tensor_id for each block to maintain separate stats
+                block_tensor_id = f"block_{i}_{j}"
                 scale_bp, zp_bp = self.calculate_qparams(
-                    observed[r0:r1, c0:c1], reduce_dims=(0, 1)
+                    observed[r0:r1, c0:c1],
+                    reduce_dims=(0, 1),
+                    tensor_id=block_tensor_id,
                 )
                 self._scale[i, j] = scale_bp
                 self._zero_point[i, j] = zp_bp