19 changes: 19 additions & 0 deletions examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static.yaml
@@ -0,0 +1,19 @@
quantization_stage:
quantization_modifiers:
QuantizationModifier:
ignore: ["lm_head", "re:.*mlp\\.gate.*"]
config_groups:
group_0:
weights:
num_bits: 8
type: "float"
symmetric: true
strategy: "channel"
dynamic: false
targets: ["Linear"]
input_activations:
num_bits: 8
type: "float"
symmetric: true
strategy: "tensor"
dynamic: false
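The recipe above applies FP8 weight quantization with per-channel scales and static per-tensor FP8 activation scales to all Linear layers, skipping `lm_head` and the MoE gate modules. For reference, a minimal sketch of how a recipe file like this is consumed, assuming llmcompressor's `oneshot` entry point and its built-in `open_platypus` calibration set (the model id and paths are placeholders):

from transformers import AutoModelForCausalLM
from llmcompressor import oneshot

# Placeholder model; any causal LM supported by llmcompressor works here.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base", torch_dtype="auto")

oneshot(
    model=model,
    dataset="open_platypus",                   # small calibration set (assumed available)
    recipe="fp8_weight_pcs_act_static.yaml",   # path to the recipe file in this directory
    max_seq_length=2048,
    num_calibration_samples=512,
)

Because the activation strategy is "tensor" with dynamic: false, calibration data is required so that a single static activation scale can be fixed per layer.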
25 changes: 25 additions & 0 deletions examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static_llama4.yaml
@@ -0,0 +1,25 @@
quantization_stage:
quantization_modifiers:
QuantizationModifier:
ignore: [
"re:.*lm_head",
"re:.*self_attn",
"re:.*router",
"re:vision_model.*",
"re:multi_modal_projector.*",
"Llama4TextAttention"]
config_groups:
group_0:
weights:
num_bits: 8
type: "float"
symmetric: true
strategy: "channel"
dynamic: false
targets: ["Linear"]
input_activations:
num_bits: 8
type: "float"
symmetric: true
strategy: "tensor"
dynamic: false
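As in the previous recipe, entries prefixed with `re:` are regular expressions matched against submodule names, while a bare entry such as "Llama4TextAttention" targets a module class. A rough illustration of the intended name matching, assuming plain Python `re` semantics rather than the library's exact resolution logic:

import re

ignore = [
    "re:.*lm_head",
    "re:.*self_attn",
    "re:.*router",
    "re:vision_model.*",
    "re:multi_modal_projector.*",
]

def is_ignored(module_name: str) -> bool:
    # Strip the "re:" prefix and match each pattern against the module's dotted name.
    return any(
        re.match(pattern.removeprefix("re:"), module_name)
        for pattern in ignore
        if pattern.startswith("re:")
    )

print(is_ignored("language_model.model.layers.0.self_attn.q_proj"))                  # True
print(is_ignored("language_model.model.layers.0.feed_forward.experts.gate_up_proj")) # False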
6 changes: 4 additions & 2 deletions examples/quantization_w4a4_fp4/llama3_example.py
@@ -6,6 +6,8 @@
from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID = "/models/Qwen3-0.6B-Base"
MODEL_ID = "/models/Qwen3-15B-A2B-Base/"

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
@@ -17,7 +19,7 @@

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 20
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
@@ -56,7 +58,7 @@ def tokenize(sample):
# * calibrate a global_scale for activations, which will be used to
# quantize activations to fp4 on the fly
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

recipe = "fp8_weight_pcs_act_static.yaml"
# Apply quantization.
oneshot(
model=model,
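After `oneshot` returns, the example scripts usually run a short generation as a sanity check; a minimal sketch, assuming the tokenizer loaded earlier in the script and the `dispatch_for_generation` helper imported at the top of the file:

# Move the calibrated model onto GPU(s) for generation.
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**sample, max_new_tokens=50)
print(tokenizer.decode(output[0]))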
30 changes: 16 additions & 14 deletions examples/quantization_w4a4_fp4/llama4_example.py
@@ -8,6 +8,7 @@

# Select model and load it.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model_id = "/models/Llama-4-Maverick-17B-128E-Instruct/"
model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = Llama4Processor.from_pretrained(model_id)
# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
@@ -60,19 +61,20 @@ def data_collator(batch):


# Configure the quantization algorithm to run.
recipe = QuantizationModifier(
targets="Linear",
scheme="NVFP4",
ignore=[
"re:.*lm_head",
"re:.*self_attn",
"re:.*router",
"re:vision_model.*",
"re:multi_modal_projector.*",
"Llama4TextAttention",
],
)

# recipe = QuantizationModifier(
# targets="Linear",
# scheme="NVFP4",
# ignore=[
# "re:.*lm_head",
# "re:.*self_attn",
# "re:.*router",
# "re:vision_model.*",
# "re:multi_modal_projector.*",
# "Llama4TextAttention",
# ],
# )

recipe = "fp8_weight_pcs_act_static_llama4.yaml"
# Apply algorithms.
# due to the large size of Llama4, we specify sequential targets such that
# only one MLP is loaded into GPU memory at a time
@@ -88,6 +90,6 @@ def data_collator(batch):


# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
SAVE_DIR = f'/data2/yiliu4/{model_id.rstrip("/").split("/")[-1]}' + "-NVFP4"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
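The SAVE_DIR expression derives the output directory name from the last component of model_id; the rstrip("/") keeps split("/")[-1] from returning an empty string when a local path ends in a slash. For example:

model_id = "/models/Llama-4-Maverick-17B-128E-Instruct/"
print(model_id.split("/")[-1])                          # "" (trailing slash leaves an empty last segment)
print(model_id.rstrip("/").split("/")[-1] + "-NVFP4")   # "Llama-4-Maverick-17B-128E-Instruct-NVFP4"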