Commit ab6261d

Refactor quantization and add quantization options to masked modeling (#115)

* Refactor quantization and add quantization options to masked modeling
* up
* up

1 parent eea657d  commit ab6261d

14 files changed: +121 -70 lines
optimum/exporters/executorch/quantization.py  (new file)

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Optional

import torch
import torchao
from packaging.version import parse


def quantize_model_(
    eager_model: torch.nn.Module, qlinear_config: Optional[str], qembedding_config: Optional[str]
) -> torch.nn.Module:
    if not (qlinear_config or qembedding_config):
        return

    # TODO: Update torchao to use 0.11.0 once released
    if parse(torchao.__version__) < parse("0.11.0.dev0"):
        raise RuntimeError("Quantization requires torchao >= 0.11.0. Please upgrade torchao.")

    from torchao.quantization.granularity import PerAxis, PerGroup
    from torchao.quantization.quant_api import (
        Int8DynamicActivationIntxWeightConfig,
        IntxWeightOnlyConfig,
        quantize_,
    )
    from torchao.utils import unwrap_tensor_subclass

    if qembedding_config:
        logging.info("Quantizing embedding layers.")
        embedding_config = {
            "4w": IntxWeightOnlyConfig(
                weight_dtype=torch.int4,
                granularity=PerGroup(32),
            ),
            "8w": IntxWeightOnlyConfig(
                weight_dtype=torch.int8,
                granularity=PerAxis(0),
            ),
        }[qembedding_config]

        # TODO: Should switch to `AOPerModuleConfig` once fix for tied weights is available.
        quantize_(
            eager_model,
            embedding_config,
            lambda m, fqn: isinstance(m, torch.nn.Embedding),
        )

    if qlinear_config:
        logging.info("Quantizing linear layers.")
        linear_config = {
            "8da4w": Int8DynamicActivationIntxWeightConfig(
                weight_dtype=torch.int4,
                weight_granularity=PerGroup(32),
            ),
            "4w": IntxWeightOnlyConfig(
                weight_dtype=torch.int4,
                granularity=PerGroup(32),
            ),
            "8w": IntxWeightOnlyConfig(
                weight_dtype=torch.int8,
                granularity=PerAxis(0),
            ),
        }[qlinear_config]
        quantize_(
            eager_model,
            linear_config,
        )

    unwrap_tensor_subclass(eager_model)
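For reference, a minimal usage sketch of the new helper outside the exporter (not part of this commit; it assumes the module above lands at optimum/exporters/executorch/quantization.py and that torchao >= 0.11.0.dev0 is installed):

from transformers import AutoModelForMaskedLM

from optimum.exporters.executorch.quantization import quantize_model_

# Load an eager model on CPU, then quantize it in place:
# "8da4w" -> int8 dynamic activations + int4 grouped weights for nn.Linear layers,
# "8w"    -> int8 weight-only (per-axis) quantization for nn.Embedding layers.
eager_model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased").to("cpu").eval()
quantize_model_(eager_model, qlinear_config="8da4w", qembedding_config="8w")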

optimum/exporters/executorch/tasks/causal_lm.py

Lines changed: 2 additions & 59 deletions

@@ -14,12 +14,11 @@

 import logging

-import torch
 import torchao
-from packaging.version import parse
 from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig

 from ..integrations import CausalLMExportableModule
+from ..quantization import quantize_model_
 from ..task_registry import register_task


@@ -130,64 +129,8 @@ def _load_eager_pretrained(
         if isinstance(param, torchao.utils.TorchAOBaseTensor):
             param.requires_grad = False

-    # TODO: Move quantization recipe out for better composability.
-    # TODO: Should switch to `TorchAoConfig` once the quant issue on final lm_head layer is fixed.
     qlinear_config = kwargs.get("qlinear", None)
     qembedding_config = kwargs.get("qembedding", None)
-    if qlinear_config or qembedding_config:
-        # TODO: Update torchao to use 0.11.0 once released
-        if parse(torchao.__version__) < parse("0.11.0.dev0"):
-            raise RuntimeError("Quantization 8da4w requires torchao >= 0.11.0. Please upgrade torchao.")
-
-        from torchao.quantization.granularity import PerAxis, PerGroup
-        from torchao.quantization.quant_api import (
-            Int8DynamicActivationIntxWeightConfig,
-            IntxWeightOnlyConfig,
-            quantize_,
-        )
-        from torchao.utils import unwrap_tensor_subclass
-
-        if qembedding_config:
-            logging.info("Quantizing embedding layers.")
-            embedding_config = {
-                "4w": IntxWeightOnlyConfig(
-                    weight_dtype=torch.int4,
-                    granularity=PerGroup(32),
-                ),
-                "8w": IntxWeightOnlyConfig(
-                    weight_dtype=torch.int8,
-                    granularity=PerAxis(0),
-                ),
-            }[qembedding_config]
-
-            # TODO: Should switch to `AOPerModuleConfig` once fix for tied weights is available.
-            quantize_(
-                eager_model,
-                embedding_config,
-                lambda m, fqn: isinstance(m, torch.nn.Embedding),
-            )
-
-        if qlinear_config:
-            logging.info("Quantizing linear layers.")
-            linear_config = {
-                "8da4w": Int8DynamicActivationIntxWeightConfig(
-                    weight_dtype=torch.int4,
-                    weight_granularity=PerGroup(32),
-                ),
-                "4w": IntxWeightOnlyConfig(
-                    weight_dtype=torch.int4,
-                    granularity=PerGroup(32),
-                ),
-                "8w": IntxWeightOnlyConfig(
-                    weight_dtype=torch.int8,
-                    granularity=PerAxis(0),
-                ),
-            }[qlinear_config]
-            quantize_(
-                eager_model,
-                linear_config,
-            )
-
-        unwrap_tensor_subclass(eager_model)
+    quantize_model_(eager_model, qlinear_config=qlinear_config, qembedding_config=qembedding_config)

     return CausalLMExportableModule(eager_model, use_custom_kv_cache, use_custom_sdpa)
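The task loader now forwards the string-valued qlinear / qembedding kwargs straight into quantize_model_. A hedged sketch of the corresponding Python-side call, mirroring the updated tests below (the model id is only an illustration; note that the loader reads the key "qembedding"):

from optimum.executorch import ExecuTorchModelForCausalLM

model = ExecuTorchModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M",  # hypothetical example id; any causal LM should work
    recipe="xnnpack",
    **{"qlinear": "8da4w", "qembedding": "8w"},
)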

optimum/exporters/executorch/tasks/masked_lm.py

Lines changed: 7 additions & 1 deletion

@@ -15,6 +15,7 @@
 from transformers import AutoModelForMaskedLM

 from ..integrations import MaskedLMExportableModule
+from ..quantization import quantize_model_
 from ..task_registry import register_task


@@ -38,5 +39,10 @@ def load_masked_lm_model(model_name_or_path: str, **kwargs) -> MaskedLMExportabl
         An instance of `MaskedLMExportableModule` for exporting and lowering to ExecuTorch.
     """

-    eager_model = AutoModelForMaskedLM.from_pretrained(model_name_or_path, **kwargs).to("cpu").eval()
+    eager_model = AutoModelForMaskedLM.from_pretrained(model_name_or_path).to("cpu").eval()
+
+    qlinear_config = kwargs.get("qlinear", None)
+    qembedding_config = kwargs.get("qembedding", None)
+    quantize_model_(eager_model, qlinear_config=qlinear_config, qembedding_config=qembedding_config)
+
     return MaskedLMExportableModule(eager_model)
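With this change, fill-mask exports accept the same quantization options. A rough sketch of driving the task loader directly (shown only for illustration; in practice the task registry and the CLI invoke it):

from optimum.exporters.executorch.tasks.masked_lm import load_masked_lm_model

exportable = load_masked_lm_model(
    "google-bert/bert-base-uncased",
    qlinear="8da4w",      # int8 dynamic activation / int4 weight for linear layers
    qembedding="8w",      # int8 weight-only for embedding layers
)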

tests/models/test_modeling_bert.py

Lines changed: 20 additions & 0 deletions

@@ -20,13 +20,19 @@
 import unittest

 import pytest
+import torchao
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from packaging.version import parse
 from transformers import AutoTokenizer
 from transformers.testing_utils import slow

 from optimum.executorch import ExecuTorchModelForMaskedLM


+@pytest.mark.skipif(
+    parse(torchao.__version__) < parse("0.11.0.dev0"),
+    reason="Only available on torchao >= 0.11.0.dev0",
+)
 class ExecuTorchModelIntegrationTest(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

@@ -45,6 +51,20 @@ def test_bert_export_to_executorch(self):
             )
             self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte"))

+    @slow
+    @pytest.mark.run_slow
+    def test_bert_export_to_executorch_quantized(self):
+        model_id = "google-bert/bert-base-uncased"
+        task = "fill-mask"
+        recipe = "xnnpack"
+        with tempfile.TemporaryDirectory() as tempdir:
+            subprocess.run(
+                f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --qlinear 8da4w --output_dir {tempdir}/executorch",
+                shell=True,
+                check=True,
+            )
+            self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte"))

     def _helper_bert_fill_mask(self, recipe: str):
         model_id = "google-bert/bert-base-uncased"
         tokenizer = AutoTokenizer.from_pretrained(model_id)
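For comparison, the same quantized fill-mask export could presumably be driven through the Python API rather than optimum-cli; a sketch under the assumption that ExecuTorchModelForMaskedLM.from_pretrained accepts the same recipe and quantization kwargs as the causal-LM tests below:

from optimum.executorch import ExecuTorchModelForMaskedLM

model = ExecuTorchModelForMaskedLM.from_pretrained(
    "google-bert/bert-base-uncased",
    recipe="xnnpack",
    **{"qlinear": "8da4w", "qembedding": "8w"},
)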

tests/models/test_modeling_codegen.py

Lines changed: 1 addition & 1 deletion

@@ -54,7 +54,7 @@ def test_codegen_text_generation_with_8da4w_8we(self):
             model_id,
             config=config,
             recipe="xnnpack",
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembeeding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

tests/models/test_modeling_glm.py

Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ def test_glm_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
             recipe="xnnpack",
             attn_implementation="custom_sdpa",
             use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembeeding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

tests/models/test_modeling_gpt2.py

Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ def test_gpt2sw3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
             recipe="xnnpack",
             attn_implementation="custom_sdpa",
             use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembeeding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

tests/models/test_modeling_gptj.py

Lines changed: 1 addition & 1 deletion

@@ -54,7 +54,7 @@ def test_gptj_text_generation_with_8da4w_8we(self):
             model_id,
             config=config,
             recipe="xnnpack",
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembeeding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

tests/models/test_modeling_gptneox.py

Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ def test_gpt2neox_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
             recipe="xnnpack",
             attn_implementation="custom_sdpa",
             use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembeeding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

tests/models/test_modeling_gptneoxjapanese.py

Lines changed: 1 addition & 1 deletion

@@ -57,7 +57,7 @@ def test_gptneoxjapanese_text_generation_with_8da4w_8we(self):
             model_id,
             config=config,
             recipe="xnnpack",
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembeeding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)
