Commit 7a9fe90
Update on "New multi-step QAT API"
**Summary:** This commit adds a new multi-step QAT API with the main goal of simplifying the existing UX. The new API uses the same `QATConfig` for both the prepare and convert steps, and automatically infers the fake quantization configs based on a PTQ base config provided by the user:

```
from torchao.quantization import (
    quantize_,
    Int8DynamicActivationInt4WeightConfig,
)
from torchao.quantization.qat import QATConfig

# prepare
base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)
quantize_(m, QATConfig(base_config, step="prepare"))

# train (not shown)

# convert
quantize_(m, QATConfig(base_config, step="convert"))
```

The main improvements include:

- A single config for both the prepare and convert steps
- A single `quantize_` call for convert (instead of two)
- No chance of incompatible prepare vs. convert configs
- Much less boilerplate code for the most common use case
- Simpler config names

For less common use cases such as experimentation, users can still specify arbitrary fake quantization configs for activations and/or weights as before. This is still important since there may not always be a corresponding PTQ base config. For example:

```
import torch

from torchao.quantization import quantize_
from torchao.quantization.qat import IntxFakeQuantizeConfig, QATConfig

# prepare
activation_config = IntxFakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = IntxFakeQuantizeConfig(torch.int4, group_size=32)
qat_config = QATConfig(
    activation_config=activation_config,
    weight_config=weight_config,
    step="prepare",
)
quantize_(model, qat_config)

# train and convert same as above (not shown)
```

**BC-breaking notes:** This change by itself is technically not BC-breaking since we keep the old path around, but it will become BC-breaking when we deprecate and remove the old path in the future.

Before:

```
import torch

from torchao.quantization import quantize_, Int8DynamicActivationInt4WeightConfig
from torchao.quantization.qat import (
    FromIntXQuantizationAwareTrainingConfig,
    IntxFakeQuantizeConfig,
    IntXQuantizationAwareTrainingConfig,
)

# prepare
activation_config = IntxFakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = IntxFakeQuantizeConfig(torch.int4, group_size=32)
qat_config = IntXQuantizationAwareTrainingConfig(activation_config, weight_config)
quantize_(model, qat_config)

# train (not shown)

# convert
quantize_(model, FromIntXQuantizationAwareTrainingConfig())
quantize_(model, Int8DynamicActivationInt4WeightConfig(group_size=32))
```

After: (see above)

**Test Plan:**

```
python test/quantization/test_qat.py
```

[ghstack-poisoned]
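As an aside on the new `step` argument: this update also exports a `QATConfigStep` enum (see `torchao/quantization/qat/api.py` below). Based on the enum's `str` mixin and the `.lower()` normalization in `__post_init__`, the following equivalence sketch should hold; the variable names are illustrative:

```
from torchao.quantization import Int8DynamicActivationInt4WeightConfig
from torchao.quantization.qat import QATConfig, QATConfigStep

base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)

# `step` accepts the plain string (any case) or the enum member,
# since `QATConfigStep` mixes in `str` and the config lowercases it
config_from_str = QATConfig(base_config, step="PREPARE")
config_from_enum = QATConfig(base_config, step=QATConfigStep.PREPARE)
assert config_from_str.step == config_from_enum.step == "prepare"
```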
1 parent 8f56651

7 files changed: +40 −40 lines
docs/source/api_ref_qat.rst

Lines changed: 1 addition & 1 deletion

```diff
@@ -16,7 +16,7 @@ please refer to the `QAT README <https://github.com/pytorch/ao/blob/main/torchao
     :nosignatures:

     QATConfig
-
+    QATConfigStep

 Custom QAT APIs
 ---------------
```

docs/source/finetuning.rst

Lines changed: 11 additions & 24 deletions

```diff
@@ -205,21 +205,14 @@ because we are not actually casting the fake quantized values.

 .. code:: py

-    from torchao.quantization import (
-        quantize_,
-    )
-    from torchao.quantization.qat import (
-        FakeQuantizeConfig,
-        IntXQuantizationAwareTrainingConfig,
-    )
+    from torchao.quantization import quantize_, Int8DynamicActivationInt4WeightConfig
+    from torchao.quantization.qat import QATConfig

     model = get_model()

-    # prepare: insert fake quantization ops
-    # swaps `torch.nn.Linear` with `FakeQuantizedLinear`
-    activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
-    weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
-    qat_config = IntXQuantizationAwareTrainingConfig(activation_config, weight_config)
-    quantize_(model, qat_config)
+    # prepare: swap `torch.nn.Linear` -> `FakeQuantizedLinear`
+    base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)
+    quantize_(model, QATConfig(base_config, step="prepare"))

     # fine-tune
     train_loop(model)
@@ -232,18 +225,12 @@ The next step is to actually quantize the model:

 .. code:: py

-    from torchao.quantization import (
-        Int8DynamicActivationInt4WeightConfig,
-    )
-    from torchao.quantization.qat import (
-        FromIntXQuantizationAwareTrainingConfig,
-    )
+    from torchao.quantization import Int8DynamicActivationInt4WeightConfig

-    # convert: transform fake quantization ops into actual quantized ops
-    # swap `FakeQuantizedLinear` back to `torch.nn.Linear` and inserts
-    # quantized activation and weight tensor subclasses
-    quantize_(model, FromIntXQuantizationAwareTrainingConfig())
-    quantize_(model, Int8DynamicActivationInt4WeightConfig(group_size=32))
+    # convert: swap `FakeQuantizedLinear` -> `torch.nn.Linear`, then quantize using `base_config`
+    quantize_(model, QATConfig(base_config, step="convert"))
+
+    # inference or generate

 Now our model is ready for serving, and will typically have higher quantized
 accuracy than if we did not apply the prepare step (fake quantization) during
```
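Taken together, the two hunks above give the complete new fine-tuning flow. For readers who want to try it end to end, here is a self-contained sketch assembled from the updated docs; the toy model, batch, and single optimizer step are placeholders for a real fine-tuning setup:

```
import torch
import torch.nn as nn

from torchao.quantization import quantize_, Int8DynamicActivationInt4WeightConfig
from torchao.quantization.qat import QATConfig

# placeholder model; any model with `nn.Linear` layers works the same way
model = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, 64))

# prepare: swap `torch.nn.Linear` -> `FakeQuantizedLinear`
base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)
quantize_(model, QATConfig(base_config, step="prepare"))

# fine-tune: one illustrative step in place of a real train loop
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loss = model(torch.randn(8, 64)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()

# convert: swap `FakeQuantizedLinear` -> `torch.nn.Linear`, then quantize using `base_config`
quantize_(model, QATConfig(base_config, step="convert"))
```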

test/quantization/test_qat.py

Lines changed: 1 addition & 3 deletions

```diff
@@ -1281,9 +1281,7 @@ def test_qat_config_init(self):
         self.assertEqual(QATConfig(base_config, step="CONVERT").step, "convert")

         # Bad step
-        with self.assertRaisesRegex(
-            ValueError, "`step` must be either 'prepare' or 'convert'"
-        ):
+        with self.assertRaisesRegex(ValueError, "`step` must be one of"):
             QATConfig(base_config, step="blah")

         # Step was not a keyword arg
```
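Concretely, the regex now matches the message raised by `__post_init__` in `api.py` (shown later in this diff); a quick sketch of the failure the test asserts:

```
from torchao.quantization import Int8DynamicActivationInt4WeightConfig
from torchao.quantization.qat import QATConfig

base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)

# Raises ValueError: `step` must be one of ['prepare', 'convert']
# (list rendering follows Python's repr of the enum values)
QATConfig(base_config, step="blah")
```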

torchao/quantization/qat/README.md

Lines changed: 4 additions & 2 deletions

````diff
@@ -93,7 +93,8 @@ model = get_model()
 base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)
 quantize_(model, QATConfig(base_config, step="prepare"))

-# train (not shown)
+# train
+train_loop(model)

 # convert: swap `FakeQuantizedLinear` -> `torch.nn.Linear`, then quantize using `base_config`
 quantize_(model, QATConfig(base_config, step="convert"))
@@ -123,7 +124,8 @@ qat_config = QATConfig(
 )
 quantize_(model, qat_config)

-# train (not shown)
+# train
+train_loop(model)

 # convert: (not shown, same as before)
 ```
````
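The README snippets now call `train_loop(model)` without defining it. A minimal hypothetical stand-in, assuming an ordinary supervised fine-tuning setup; the optimizer, batch shape, and placeholder loss below are illustrative, not part of torchao:

```
import torch

def train_loop(model, num_steps=100):
    """Hypothetical stand-in for the README's `train_loop`."""
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    for _ in range(num_steps):
        inputs = torch.randn(8, 64)          # illustrative batch; use a real dataloader
        loss = model(inputs).pow(2).mean()   # placeholder loss for the sketch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
```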

torchao/quantization/qat/__init__.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -3,6 +3,7 @@
     FromIntXQuantizationAwareTrainingConfig,
     IntXQuantizationAwareTrainingConfig,
     QATConfig,
+    QATConfigStep,
     from_intx_quantization_aware_training,
     initialize_fake_quantizers,
     intx_quantization_aware_training,
@@ -25,12 +26,13 @@
 )

 __all__ = [
+    "QATConfig",
+    "QATConfigStep",
     "FakeQuantizeConfigBase",
+    "IntxFakeQuantizeConfig",
+    "FakeQuantizer",
     "FakeQuantizedLinear",
     "FakeQuantizedEmbedding",
-    "FakeQuantizer",
-    "IntxFakeQuantizeConfig",
-    "QATConfig",
     # Prototype
     "initialize_fake_quantizers",
     # Legacy quantizers
```

torchao/quantization/qat/api.py

Lines changed: 17 additions & 6 deletions

```diff
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 from dataclasses import dataclass
+from enum import Enum
 from typing import Any, List, Optional, Tuple

 import torch
@@ -25,6 +26,15 @@
 from .linear import FakeQuantizedLinear


+class QATConfigStep(str, Enum):
+    """
+    Enum value for the `step` field in :class:`~torchao.quantization.qat.QATConfig`.
+    """
+
+    PREPARE = "prepare"
+    CONVERT = "convert"
+
+
 @dataclass
 class QATConfig(AOBaseConfig):
     """
@@ -114,7 +124,7 @@ class QATConfig(AOBaseConfig):
     base_config: Optional[AOBaseConfig]
     activation_config: Optional[FakeQuantizeConfigBase]
     weight_config: Optional[FakeQuantizeConfigBase]
-    step: str
+    step: QATConfigStep

     # Express `step` as a keyword argument
     # TODO: Use `kw_only=True` instead, added in python 3.10
@@ -124,7 +134,7 @@ def __init__(
         activation_config: Optional[FakeQuantizeConfigBase] = None,
         weight_config: Optional[FakeQuantizeConfigBase] = None,
         *,
-        step: str = "prepare",
+        step: QATConfigStep = "prepare",
     ):
         self.base_config = base_config
         self.activation_config = activation_config
@@ -134,8 +144,9 @@ def __init__(

     def __post_init__(self):
         self.step = self.step.lower()
-        if self.step not in ["prepare", "convert"]:
-            raise ValueError("`step` must be either 'prepare' or 'convert'")
+        all_step_values = [s.value for s in QATConfigStep]
+        if self.step not in all_step_values:
+            raise ValueError("`step` must be one of %s" % all_step_values)
         if self.base_config is None and self.weight_config is None:
             raise ValueError(
                 "One of `base_config` or `weight_config` must be specified"
@@ -178,7 +189,7 @@ def _qat_config_transform(
     # Swap nn.Embedding -> FakeQuantizedEmbedding
     base_config = config.base_config
     step = config.step
-    if step == "prepare":
+    if step == QATConfigStep.PREPARE:
         if base_config is not None:
             (act_config, weight_config) = _infer_fake_quantize_configs(base_config)
         else:
@@ -201,7 +212,7 @@ def _qat_config_transform(
     # Swap FakeQuantizedLinear -> nn.Linear
     # Swap FakeQuantizedEmbedding -> nn.Embedding
     # Then apply the base config's transform function to quantize the model
-    assert step == "convert", "unexpected step '%s' in QATConfig" % step
+    assert step == QATConfigStep.CONVERT, "unexpected step '%s' in QATConfig" % step
     assert base_config is not None, "expected `base_config` in convert step"
     if isinstance(module, FakeQuantizedLinear):
         module = module.to_linear()
```
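The comparisons `step == QATConfigStep.PREPARE` and `step == QATConfigStep.CONVERT` work even though `__post_init__` stores a lowercased value, because str-mixin enum members compare equal to their string values. A standalone sketch of the semantics being relied on (not part of the diff):

```
from enum import Enum

class Step(str, Enum):  # same pattern as `QATConfigStep`
    PREPARE = "prepare"
    CONVERT = "convert"

assert Step.PREPARE == "prepare"          # enum member equals its string value
assert "CONVERT".lower() == Step.CONVERT  # normalized strings match members too
assert Step.CONVERT.lower() == "convert"  # str methods return plain strings
```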

torchao/quantization/qat/fake_quantize_config.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -287,7 +287,7 @@ def _infer_fake_quantize_configs(
         is_symmetric=base_config.act_mapping_type == MappingType.SYMMETRIC,
     )
     weight_config = IntxFakeQuantizeConfig(
-        dtype=torch.int4,
+        dtype=TorchAODType.INT4,
         group_size=base_config.group_size,
         is_symmetric=base_config.mapping_type == MappingType.SYMMETRIC,
     )
```
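To see the effect of this one-line change, a quick inspection sketch. Note that `_infer_fake_quantize_configs` is a private helper, so the import path and attribute access below are assumptions based on this diff rather than a stable API:

```
from torchao.quantization import Int8DynamicActivationInt4WeightConfig
from torchao.quantization.qat.fake_quantize_config import _infer_fake_quantize_configs

base_config = Int8DynamicActivationInt4WeightConfig(group_size=32)
act_config, weight_config = _infer_fake_quantize_configs(base_config)

print(weight_config.dtype)       # TorchAODType.INT4 after this change (was torch.int4)
print(weight_config.group_size)  # 32, inherited from the base config
```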
