Commit 28b4136

Add NVFP4 QAT
**Summary:** This commit adds a QAT flow for NVFP4, following the numerics in `NVFP4Tensor` closely but without the dtype casting, swizzling, and the packing/unpacking. Users can call this flow as follows:

```
from torchao.quantization import quantize_
from torchao.quantization.qat import NVFP4FakeQuantizeConfig, QATConfig

qat_config = QATConfig(
    weight_config=NVFP4FakeQuantizeConfig(),
    step="prepare",
)
quantize_(model, qat_config)
```

**Test Plan:**

```
python test/quantization/test_qat.py -k test_qat_nvfp4
```

ghstack-source-id: 5548756
Pull Request resolved: #2666
1 parent 5c0d6a3 commit 28b4136

7 files changed: +127 -15 lines changed


docs/source/api_ref_qat.rst

Lines changed: 1 addition & 0 deletions
```diff
@@ -27,6 +27,7 @@ Custom QAT APIs
     FakeQuantizeConfigBase
     IntxFakeQuantizeConfig
     Float8FakeQuantizeConfig
+    NVFP4FakeQuantizeConfig
     FakeQuantizedLinear
     FakeQuantizedEmbedding
     FakeQuantizerBase
```

test/quantization/test_qat.py

Lines changed: 26 additions & 2 deletions
```diff
@@ -50,6 +50,7 @@
 from torchao.quantization.qat.fake_quantize_config import (
     Float8FakeQuantizeConfig,
     IntxFakeQuantizeConfig,
+    NVFP4FakeQuantizeConfig,
 )
 from torchao.quantization.qat.fake_quantizer import (
     Float8FakeQuantizer,
@@ -118,8 +119,8 @@ def __init__(self):
         self.sub = Sub()
         self.linear2 = torch.nn.Linear(256, 512, bias=False).to(torch.float)
 
-    def example_inputs(self):
-        return (torch.randn(1, 512).to(torch.float),)
+    def example_inputs(self, device: torch.device = None):
+        return (torch.randn((1, 512), device=device).to(torch.float),)
 
     def _get_all_weight_scales(self) -> List[torch.Tensor]:
         return [
@@ -1932,6 +1933,29 @@ def test_quantize_api_fp8_int4(self):
             target_convert_sqnr=float("inf"),
         )
 
+    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    @parametrize("use_per_tensor_scale", [True, False])
+    def test_qat_nvfp4(self, use_per_tensor_scale: bool):
+        """
+        Test QAT with `NVFP4FakeQuantizeConfig`.
+        """
+        torch.manual_seed(self.SEED)
+        m = M().cuda()
+        baseline_model = copy.deepcopy(m)
+        qat_config = QATConfig(
+            weight_config=NVFP4FakeQuantizeConfig(use_per_tensor_scale),
+            step="prepare",
+        )
+        quantize_(m, qat_config)
+
+        # Compare prepared values
+        torch.manual_seed(self.SEED)
+        x = m.example_inputs("cuda")
+        out = m(*x)
+        baseline_out = baseline_model(*x)
+        sqnr = compute_error(out, baseline_out).item()
+        self.assertGreater(sqnr, 100)
+
 
 instantiate_parametrized_tests(TestQAT)
```
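The new test asserts an SQNR above 100 dB between the prepared (fake-quantized) model and the baseline, which is plausible here because this QAT path skips the low-precision dtype casts and packing. For reference, a minimal sketch of the signal-to-quantization-noise ratio in dB; the helper below is illustrative and only assumed to match what `compute_error` reports:

```python
import torch

def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> float:
    # 20 * log10(||signal|| / ||noise||): higher means the two tensors are closer
    signal = torch.linalg.norm(reference.float())
    noise = torch.linalg.norm(reference.float() - candidate.float())
    return (20 * torch.log10(signal / noise)).item()

# mirrors `self.assertGreater(sqnr, 100)` in test_qat_nvfp4:
# assert sqnr_db(baseline_out, out) > 100
```
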

torchao/prototype/mx_formats/nvfp4_tensor.py

Lines changed: 25 additions & 10 deletions
```diff
@@ -310,10 +310,10 @@ def nvfp4_to_copy(func, types, args, kwargs):
 
     if dtype is not None:
         res = NVFP4Tensor(
+            tensor.qdata,
             tensor._scale_e4m3,
             tensor._per_tensor_scale,
             tensor._act_per_tensor_scale,
-            tensor._data,
             tensor._block_size,
             dtype,
             tensor._is_swizzled_scales,
@@ -749,13 +749,23 @@ def nvfp4_quantize(
         AssertionError: If input dtype is not supported, tensor size is not
             divisible by block_size, tensor is not contiguous, or block_size != 16
     """
+    return _nvfp4_quantize(data_hp, block_size, per_tensor_scale)
+
+
+def _nvfp4_quantize(
+    data_hp: torch.Tensor,
+    block_size: int = 16,
+    per_tensor_scale: Optional[torch.Tensor] = None,
+    skip_dtype_cast_and_packing: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
     assert data_hp.dtype in (torch.bfloat16, torch.float), (
         f"{data_hp.dtype} not supported"
     )
     assert data_hp.size(-1) % block_size == 0, "K dim must be divisible by block_size"
     assert data_hp.is_contiguous(), "Only support contiguous data for now"
     assert block_size == 16, "NVFP4 requires block_size=16"
 
+    orig_dtype = data_hp.dtype
     orig_shape = data_hp.shape
     # Convert to float32 early for consistent precision with Triton implementation
     data_hp = data_hp.float().reshape(orig_shape[0], -1, block_size)
@@ -767,9 +777,9 @@
     out_scales = None
     if per_tensor_scale is None:
         # We are doing single level scaling
-        block_scale_fp8 = torch.clamp(block_scale, min=E4M3_EPS, max=F8E4M3_MAX).to(
-            torch.float8_e4m3fn
-        )
+        block_scale_fp8 = torch.clamp(block_scale, min=E4M3_EPS, max=F8E4M3_MAX)
+        if not skip_dtype_cast_and_packing:
+            block_scale_fp8 = block_scale_fp8.to(torch.float8_e4m3fn)
         block_scale_fp32 = block_scale_fp8.to(torch.float32)
         data_scaled = data_hp / block_scale_fp32.unsqueeze(-1)
         out_scales = block_scale_fp8
@@ -782,7 +792,9 @@
         scaled_block_scales = block_scale_fp32 / per_tensor_scale
         scaled_block_scales_fp8 = torch.clamp(
             scaled_block_scales, min=E4M3_EPS, max=F8E4M3_MAX
-        ).to(torch.float8_e4m3fn)
+        )
+        if not skip_dtype_cast_and_packing:
+            scaled_block_scales_fp8 = scaled_block_scales_fp8.to(torch.float8_e4m3fn)
         scaled_block_scales_fp32 = scaled_block_scales_fp8.to(torch.float32)
         # We "temporarily" dequant the scaled_block_scales_fp32 to get the per_tensor_scale
         # To apply to data
@@ -792,8 +804,11 @@
 
     data_scaled = torch.clamp(data_scaled, -F4_E2M1_MAX, F4_E2M1_MAX)
     data_scaled = data_scaled.view(orig_shape)
-    data_lp = f32_to_f4_unpacked(data_scaled)
-    # TODO: NotImplementedError: "copy_kernel" not implemented for 'Float4_e2m1fn_x2'
-    # data_lp = pack_uint4(data_lp).view(torch.float4_e2m1fn_x2)
-    data_lp = pack_uint4(data_lp)
-    return out_scales, data_lp
+    if skip_dtype_cast_and_packing:
+        return out_scales, data_scaled.to(orig_dtype)
+    else:
+        data_lp = f32_to_f4_unpacked(data_scaled)
+        # TODO: NotImplementedError: "copy_kernel" not implemented for 'Float4_e2m1fn_x2'
+        # data_lp = pack_uint4(data_lp).view(torch.float4_e2m1fn_x2)
+        data_lp = pack_uint4(data_lp)
+        return out_scales, data_lp
```
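
For intuition: the QAT flow calls `_nvfp4_quantize(..., skip_dtype_cast_and_packing=True)` and then dequantizes, so values are scaled per 16-element block and clamped to the fp4 e2m1 range, but the scales are never cast to fp8 and the data is never rounded or packed to fp4. The sketch below illustrates that single-level path (`per_tensor_scale=None`); the block-scale formula and constants are assumptions for illustration, not the torchao implementation:

```python
import torch

F4_E2M1_MAX = 6.0                                   # assumed max magnitude of fp4 e2m1
F8E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max   # 448.0
E4M3_EPS = torch.finfo(torch.float8_e4m3fn).tiny    # smallest normal e4m3 value

def fake_quantize_nvfp4_single_level(x: torch.Tensor, block_size: int = 16) -> torch.Tensor:
    orig_shape = x.shape
    blocks = x.float().reshape(orig_shape[0], -1, block_size)
    # per-block scale chosen so the block amax maps onto the fp4 range
    block_amax = blocks.abs().amax(dim=-1, keepdim=True)
    block_scale = torch.clamp(block_amax / F4_E2M1_MAX, min=E4M3_EPS, max=F8E4M3_MAX)
    # "quantize": scale and clamp, skipping the cast to fp8 scales / packed fp4 data
    scaled = torch.clamp(blocks / block_scale, -F4_E2M1_MAX, F4_E2M1_MAX)
    # "dequantize": multiply back by the same block scales
    return (scaled * block_scale).reshape(orig_shape).to(x.dtype)

w = torch.randn(4, 64, dtype=torch.bfloat16)
print(fake_quantize_nvfp4_single_level(w).shape)  # torch.Size([4, 64])
```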

torchao/quantization/qat/__init__.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -17,12 +17,14 @@
     FakeQuantizeConfigBase,
     Float8FakeQuantizeConfig,
     IntxFakeQuantizeConfig,
+    NVFP4FakeQuantizeConfig,
 )
 from .fake_quantizer import (
     FakeQuantizer,
     FakeQuantizerBase,
     Float8FakeQuantizer,
     IntxFakeQuantizer,
+    NVFP4FakeQuantizer,
 )
 from .linear import (
     FakeQuantizedLinear,
@@ -40,6 +42,8 @@
     "Float8FakeQuantizer",
     "IntxFakeQuantizeConfig",
     "IntxFakeQuantizer",
+    "NVFP4FakeQuantizeConfig",
+    "NVFP4FakeQuantizer",
     "FakeQuantizedLinear",
     "FakeQuantizedEmbedding",
     # Prototype
```

torchao/quantization/qat/fake_quantize_config.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -77,6 +77,22 @@ def __post_init__(self):
             )
 
 
+@dataclass
+class NVFP4FakeQuantizeConfig(FakeQuantizeConfigBase):
+    """
+    Config for fake quantizing weights or activations to NVIDIA's NVFP4 format
+    according to https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/.
+
+    Fake quantization numerics follow `NVFP4Tensor` closely: https://github.com/pytorch/ao/blob/main/torchao/prototype/mx_formats/nvfp4_tensor.py.
+
+    Args:
+        use_per_tensor_scale (bool): Whether to use two-level per-tensor fp32 scaling
+            after the initial fp8 (e4m3) block-wise scaling (default True)
+    """
+
+    use_per_tensor_scale: bool = True
+
+
 @dataclass
 class IntxFakeQuantizeConfig(FakeQuantizeConfigBase):
     """
```

torchao/quantization/qat/fake_quantizer.py

Lines changed: 52 additions & 2 deletions
```diff
@@ -34,6 +34,7 @@
     FakeQuantizeConfigBase,
     Float8FakeQuantizeConfig,
     IntxFakeQuantizeConfig,
+    NVFP4FakeQuantizeConfig,
 )
 from .utils import (
     _fake_quantize_per_channel_group,
@@ -59,8 +60,10 @@ def __repr__(self) -> str:
     def from_config(config: FakeQuantizeConfigBase) -> "FakeQuantizerBase":
         if isinstance(config, IntxFakeQuantizeConfig):
             return IntxFakeQuantizer(config)
-        if isinstance(config, Float8FakeQuantizeConfig):
+        elif isinstance(config, Float8FakeQuantizeConfig):
             return Float8FakeQuantizer(config)
+        elif isinstance(config, NVFP4FakeQuantizeConfig):
+            return NVFP4FakeQuantizer(config)
         else:
             raise ValueError(f"Unknown config type: {config}")
 
@@ -73,6 +76,7 @@ class Float8FakeQuantizer(FakeQuantizerBase):
     def __init__(self, config: Float8FakeQuantizeConfig):
         super().__init__()
         self.config = config
+        torch._C._log_api_usage_once("torchao.quantization.qat.Float8FakeQuantizer")
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         original_dtype = x.dtype
@@ -91,14 +95,60 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return dq
 
 
+class NVFP4FakeQuantizer(FakeQuantizerBase):
+    """
+    Generic module for applying NVFP4 fake quantization to a tensor, as specified in the config.
+    """
+
+    def __init__(self, config: NVFP4FakeQuantizeConfig):
+        super().__init__()
+        torch._C._log_api_usage_once("torchao.quantization.qat.NVFP4FakeQuantizer")
+        self.config = config
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        from torchao.prototype.mx_formats.nvfp4_tensor import (
+            _nvfp4_quantize,
+            per_tensor_amax_to_scale,
+        )
+
+        block_size = 16
+        original_shape = x.shape
+        if x.dim() == 3:
+            x = x.view(-1, x.shape[-1])
+        if self.config.use_per_tensor_scale:
+            tensor_amax = torch.max(torch.abs(x))
+            per_tensor_scale = per_tensor_amax_to_scale(tensor_amax)
+        else:
+            per_tensor_scale = None
+
+        # quantize
+        scale, q = _nvfp4_quantize(
+            x,
+            block_size=block_size,
+            per_tensor_scale=per_tensor_scale,
+            skip_dtype_cast_and_packing=True,
+        )
+        if self.config.use_per_tensor_scale:
+            scale = scale * per_tensor_scale
+        assert q.dtype == x.dtype
+        assert scale.dtype == torch.float32
+
+        # dequantize
+        M, K = q.shape[0], q.shape[1]
+        q = q.view(M, K // block_size, block_size)
+        scale = scale.view(M, K // block_size, 1)
+        dq = q * scale
+        return dq.view(original_shape).to(x.dtype)
+
+
 class IntxFakeQuantizer(FakeQuantizerBase):
     """
     Generic module for applying integer fake quantization to a tensor, as specified in the config.
     """
 
     def __init__(self, config: IntxFakeQuantizeConfig):
         super().__init__()
-        torch._C._log_api_usage_once("torchao.quantization.qat.FakeQuantizer")
+        torch._C._log_api_usage_once("torchao.quantization.qat.IntxFakeQuantizer")
         self.config = config
         self.enabled = True
         self.scale: Optional[torch.Tensor] = None
```
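
The new fake quantizer can also be applied directly to a tensor, for example to inspect fake-quantized weight values outside a full QAT flow. A brief sketch (shapes and device are illustrative; the last dimension must be divisible by the NVFP4 block size of 16, and the input should be float32 or bfloat16):

```python
import torch
from torchao.quantization.qat import NVFP4FakeQuantizeConfig, NVFP4FakeQuantizer

fake_quantizer = NVFP4FakeQuantizer(NVFP4FakeQuantizeConfig(use_per_tensor_scale=True))
weight = torch.randn(256, 512, dtype=torch.bfloat16, device="cuda")
weight_fq = fake_quantizer(weight)  # same shape and dtype, NVFP4 fake-quantized values
assert weight_fq.shape == weight.shape and weight_fq.dtype == weight.dtype
```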

torchao/quantization/qat/linear.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -92,7 +92,9 @@ def __init__(
 
         # initialize weight fake quantizer
         if weight_config is not None:
-            if isinstance(weight_config.granularity, PerGroup):
+            if isinstance(weight_config, IntxFakeQuantizeConfig) and isinstance(
+                weight_config.granularity, PerGroup
+            ):
                 group_size = weight_config.group_size
                 if group_size is not None and in_features % group_size != 0:
                     raise ValueError(
```
