
Commit 1f5c4d8

Make quant scheme configurable (#109)

* Make quant scheme configurable
* up

1 parent b62cba1 · commit 1f5c4d8

13 files changed (+73, −43 lines)

README.md

Lines changed: 3 additions & 3 deletions
@@ -63,7 +63,7 @@ model = ExecuTorchModelForCausalLM.from_pretrained(
     recipe="xnnpack",
     attn_implementation="custom_sdpa", # Use custom SDPA implementation for better performance
     use_custom_kv_cache=True, # Use custom KV cache for better performance
-    **{"qlinear": True, "qembeeding": True}, # Quantize linear and embedding layers
+    **{"qlinear": "8da4w", "qembedding": "8w"}, # Quantize linear and embedding layers
 )

 # Generate text right away
@@ -90,8 +90,8 @@ optimum-cli export executorch \
     --recipe "xnnpack" \
     --use_custom_sdpa \
     --use_custom_kv_cache \
-    --qlinear \
-    --qembedding \
+    --qlinear 8da4w \
+    --qembedding 8w \
     --output_dir="hf_smollm2"
 ```
 Explore the various export options by running the command: `optimum-cli export executorch --help`
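
For readers skimming the diff: the `**{...}` splat in the README snippet is ordinary Python keyword unpacking, so the new string-valued options can equally be passed as plain keyword arguments. A minimal sketch, assuming `from_pretrained` forwards these extra kwargs to the export step (the model id is hypothetical, chosen only to match the README's `hf_smollm2` output dir):

```python
from optimum.executorch import ExecuTorchModelForCausalLM

# Equivalent to **{"qlinear": "8da4w", "qembedding": "8w"}: dict-splatting
# is plain Python, not a separate API surface.
model = ExecuTorchModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M",  # hypothetical model id for illustration
    recipe="xnnpack",
    attn_implementation="custom_sdpa",
    use_custom_kv_cache=True,
    qlinear="8da4w",    # 8-bit dynamic activation, 4-bit weight, group_size = 32
    qembedding="8w",    # 8-bit weight only, per channel
)
```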

optimum/commands/export/executorch.py

Lines changed: 17 additions & 4 deletions
@@ -69,15 +69,28 @@ def parse_args_executorch(parser):
     )
     required_group.add_argument(
         "--qlinear",
+        type=str,
+        choices=["8da4w", "4w", "8w"],
         required=False,
-        action="store_true",
-        help="Quantization config for linear layers. If set, defaults to '8da4w' w/ groupsize 32.",
+        help=(
+            "Quantization config for linear layers.\n\n"
+            "Options:\n"
+            "  8da4w - 8-bit dynamic activation, 4-bit weight with group_size = 32\n"
+            "  4w - 4-bit weight only, per group with group_size = 32\n"
+            "  8w - 8-bit weight only, per channel"
+        ),
     )
     required_group.add_argument(
         "--qembedding",
+        type=str,
+        choices=["4w", "8w"],
         required=False,
-        action="store_true",
-        help="Quantization config for embedding. If set, defaults to int8 channelwise.",
+        help=(
+            "Quantization config for embedding layer.\n\n"
+            "Options:\n"
+            "  4w - 4-bit weight only, per group with group_size = 32\n"
+            "  8w - 8-bit weight only, per channel"
+        ),
     )
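
Because both flags now take a value constrained by `choices`, argparse rejects unsupported schemes at parse time instead of the old boolean flags silently enabling a single default. A minimal standalone sketch of that behavior (the parser setup here is illustrative, not the project's actual CLI wiring; the flag names and choices mirror the diff):

```python
import argparse

# Mirror of the two flags added above; an unsupported value now fails fast.
parser = argparse.ArgumentParser()
parser.add_argument("--qlinear", type=str, choices=["8da4w", "4w", "8w"], required=False)
parser.add_argument("--qembedding", type=str, choices=["4w", "8w"], required=False)

args = parser.parse_args(["--qlinear", "8da4w", "--qembedding", "8w"])
print(args.qlinear, args.qembedding)  # -> 8da4w 8w

# parser.parse_args(["--qlinear", "int4"])  # would exit: invalid choice: 'int4'
```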

optimum/exporters/executorch/tasks/causal_lm.py

Lines changed: 25 additions & 8 deletions
@@ -149,11 +149,18 @@ def _load_eager_pretrained(

     if qembedding_config:
         logging.info("Quantizing embedding layers.")
+        embedding_config = {
+            "4w": IntxWeightOnlyConfig(
+                weight_dtype=torch.int4,
+                granularity=PerGroup(32),
+            ),
+            "8w": IntxWeightOnlyConfig(
+                weight_dtype=torch.int8,
+                granularity=PerAxis(0),
+            ),
+        }[qembedding_config]
+
         # TODO: Should switch to `AOPerModuleConfig` once fix for tied weights is available.
-        embedding_config = IntxWeightOnlyConfig(
-            weight_dtype=torch.int8,
-            granularity=PerAxis(0),
-        )
         quantize_(
             eager_model,
             embedding_config,
@@ -162,10 +169,20 @@ def _load_eager_pretrained(

     if qlinear_config:
         logging.info("Quantizing linear layers.")
-        linear_config = Int8DynamicActivationIntxWeightConfig(
-            weight_dtype=torch.int4,
-            weight_granularity=PerGroup(32),
-        )
+        linear_config = {
+            "8da4w": Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int4,
+                weight_granularity=PerGroup(32),
+            ),
+            "4w": IntxWeightOnlyConfig(
+                weight_dtype=torch.int4,
+                granularity=PerGroup(32),
+            ),
+            "8w": IntxWeightOnlyConfig(
+                weight_dtype=torch.int8,
+                granularity=PerAxis(0),
+            ),
+        }[qlinear_config]
         quantize_(
             eager_model,
             linear_config,
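
The core of the change is this dict dispatch from scheme string to a torchao quantization config. A self-contained sketch of the same mapping applied to a toy module (the helper name and toy model are illustrative and not part of the commit; the config constructions are copied from the diff):

```python
import torch
import torch.nn as nn
from torchao.quantization import (
    Int8DynamicActivationIntxWeightConfig,
    IntxWeightOnlyConfig,
    quantize_,
)
from torchao.quantization.granularity import PerAxis, PerGroup

def linear_config_for(scheme: str):
    # Same mapping the commit introduces for the qlinear schemes.
    return {
        "8da4w": Int8DynamicActivationIntxWeightConfig(
            weight_dtype=torch.int4,
            weight_granularity=PerGroup(32),
        ),
        "4w": IntxWeightOnlyConfig(weight_dtype=torch.int4, granularity=PerGroup(32)),
        "8w": IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
    }[scheme]

toy = nn.Sequential(nn.Linear(64, 64))  # toy stand-in for the eager model
quantize_(toy, linear_config_for("8da4w"))  # swaps in quantized weights in place
```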

tests/models/test_modeling_gemma.py

Lines changed: 3 additions & 3 deletions
@@ -56,8 +56,8 @@ def test_gemma_export_to_executorch(self):
                     --recipe {recipe} \
                     --output_dir {tempdir}/executorch \
                     --use_custom_sdpa \
-                    --qlinear \
-                    --qembedding",
+                    --qlinear 8da4w \
+                    --qembedding 8w",
             shell=True,
             check=True,
         )
@@ -76,7 +76,7 @@ def test_gemma_text_generation_with_custom_sdpa_8da4w_8we(self):
         # model_id = "google/gemma-2b"
         model_id = "weqweasdas/RM-Gemma-2B"
         # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
-        kwargs = {"qlinear": True, "qembedding": True}
+        kwargs = {"qlinear": "8da4w", "qembedding": "8w"}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",

tests/models/test_modeling_gemma2.py

Lines changed: 3 additions & 3 deletions
@@ -61,8 +61,8 @@ def test_gemma2_export_to_executorch(self):
                     --recipe {recipe} \
                     --output_dir {tempdir}/executorch \
                     --use_custom_sdpa \
-                    --qlinear \
-                    --qembedding",
+                    --qlinear 8da4w \
+                    --qembedding 8w",
             shell=True,
             check=True,
         )
@@ -81,7 +81,7 @@ def test_gemma2_text_generation_with_custom_sdpa_8da4w_8we(self):
         # model_id = "google/gemma-2-2b"
         model_id = "unsloth/gemma-2-2b-it"
         # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
-        kwargs = {"qlinear": True, "qembedding": True}
+        kwargs = {"qlinear": "8da4w", "qembedding": "8w"}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",

tests/models/test_modeling_gemma3.py

Lines changed: 4 additions & 4 deletions
@@ -66,8 +66,8 @@ def test_gemma3_export_to_executorch(self):
                     --recipe {recipe} \
                     --output_dir {tempdir}/executorch \
                     --use_custom_sdpa \
-                    --qlinear \
-                    --qembedding",
+                    --qlinear 8da4w \
+                    --qembedding 8w",
             shell=True,
             check=True,
         )
@@ -202,7 +202,7 @@ def test_gemma3_text_generation_with_custom_sdpa_8da4w_8we(self):
         prompt = "Write a poem about a machine learning."

         # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
-        kwargs = {"qlinear": True, "qembedding": True}
+        kwargs = {"qlinear": "8da4w", "qembedding": "8w"}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",
@@ -241,7 +241,7 @@ def test_gemma3_text_generation_with_custom_sdpa_kv_cache_8da4w_8we(self):
         prompt = "Write a poem about a machine learning."

         # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
-        kwargs = {"qlinear": True, "qembedding": True}
+        kwargs = {"qlinear": "8da4w", "qembedding": "8w"}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",

tests/models/test_modeling_llama.py

Lines changed: 4 additions & 4 deletions
@@ -55,8 +55,8 @@ def test_llama3_2_1b_export_to_executorch(self):
                     --recipe {recipe} \
                     --use_custom_sdpa \
                     --use_custom_kv_cache \
-                    --qlinear \
-                    --qembedding \
+                    --qlinear 8da4w \
+                    --qembedding 8w \
                     --output_dir {tempdir}/executorch",
             shell=True,
             check=True,
@@ -74,7 +74,7 @@ def test_llama3_2_1b_export_to_executorch(self):
     def test_llama_text_generation_with_custom_sdpa_8da4w_8we(self):
         # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
         model_id = "NousResearch/Llama-3.2-1B"
-        kwargs = {"qlinear": True, "qembedding": True}
+        kwargs = {"qlinear": "8da4w", "qembedding": "8w"}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",
@@ -109,7 +109,7 @@ def test_llama_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
             recipe="xnnpack",
             attn_implementation="custom_sdpa",
             use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembedding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

tests/models/test_modeling_olmo.py

Lines changed: 4 additions & 4 deletions
@@ -58,8 +58,8 @@ def test_olmo_export_to_executorch(self):
                     --recipe {recipe} \
                     --output_dir {tempdir}/executorch \
                     --use_custom_sdpa \
-                    --qlinear \
-                    --qembedding",
+                    --qlinear 8da4w \
+                    --qembedding 8w",
             shell=True,
             check=True,
         )
@@ -95,7 +95,7 @@ def test_olmo_text_generation_with_xnnpack(self):
     def test_olmo_text_generation_with_custom_sdpa_8da4w_8we(self):
         # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
         model_id = "allenai/OLMo-1B-hf"
-        kwargs = {"qlinear": True, "qembedding": True}
+        kwargs = {"qlinear": "8da4w", "qembedding": "8w"}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",
@@ -130,7 +130,7 @@ def test_olmo_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
             recipe="xnnpack",
             attn_implementation="custom_sdpa",
             use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembedding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

tests/models/test_modeling_phi4.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def test_phi4_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
             recipe="xnnpack",
             attn_implementation="custom_sdpa",
             use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembedding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

tests/models/test_modeling_qwen3.py

Lines changed: 4 additions & 4 deletions
@@ -62,8 +62,8 @@ def test_qwen3_export_to_executorch(self):
                     --recipe {recipe} \
                     --output_dir {tempdir}/executorch \
                     --use_custom_sdpa \
-                    --qlinear \
-                    --qembedding",
+                    --qlinear 8da4w \
+                    --qembedding 8w",
             shell=True,
             check=True,
         )
@@ -188,7 +188,7 @@ def test_qwen3_text_generation_with_custom_sdpa_8da4w_8we(self):
         tokenizer = AutoTokenizer.from_pretrained(model_id)

         # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization
-        kwargs = {"qlinear": True, "qembedding": True}
+        kwargs = {"qlinear": "8da4w", "qembedding": "8w"}
         model = ExecuTorchModelForCausalLM.from_pretrained(
             model_id,
             recipe="xnnpack",
@@ -262,7 +262,7 @@ def test_qwen3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
             recipe="xnnpack",
             attn_implementation="custom_sdpa",
             use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
+            **{"qlinear": "8da4w", "qembedding": "8w"},
         )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)
