diff --git a/examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static.yaml b/examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static.yaml
new file mode 100644
index 0000000000..b8257b7c80
--- /dev/null
+++ b/examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static.yaml
@@ -0,0 +1,19 @@
+quantization_stage:
+  quantization_modifiers:
+    QuantizationModifier:
+      ignore: ["lm_head", "re:.*mlp\\.gate.*"]
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 8
+            type: "float"
+            symmetric: true
+            strategy: "channel"
+            dynamic: false
+          targets: ["Linear"]
+          input_activations:
+            num_bits: 8
+            type: "float"
+            symmetric: true
+            strategy: "tensor"
+            dynamic: false
\ No newline at end of file
diff --git a/examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static_llama4.yaml b/examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static_llama4.yaml
new file mode 100644
index 0000000000..ca89b9d05a
--- /dev/null
+++ b/examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static_llama4.yaml
@@ -0,0 +1,25 @@
+quantization_stage:
+  quantization_modifiers:
+    QuantizationModifier:
+      ignore: [
+        "re:.*lm_head",
+        "re:.*self_attn",
+        "re:.*router",
+        "re:vision_model.*",
+        "re:multi_modal_projector.*",
+        "Llama4TextAttention"]
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 8
+            type: "float"
+            symmetric: true
+            strategy: "channel"
+            dynamic: false
+          targets: ["Linear"]
+          input_activations:
+            num_bits: 8
+            type: "float"
+            symmetric: true
+            strategy: "tensor"
+            dynamic: false
\ No newline at end of file
diff --git a/examples/quantization_w4a4_fp4/llama3_example.py b/examples/quantization_w4a4_fp4/llama3_example.py
index 63068fde12..60603aed62 100644
--- a/examples/quantization_w4a4_fp4/llama3_example.py
+++ b/examples/quantization_w4a4_fp4/llama3_example.py
@@ -6,6 +6,8 @@
 from llmcompressor.utils import dispatch_for_generation
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "/models/Qwen3-0.6B-Base"
+MODEL_ID = "/models/Qwen3-15B-A2B-Base/"
 
 # Load model.
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
@@ -17,7 +19,7 @@
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 20
+NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -56,7 +58,7 @@ def tokenize(sample):
 # * calibrate a global_scale for activations, which will be used to
 #   quantize activations to fp4 on the fly
 recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])
-
+recipe = "fp8_weight_pcs_act_static.yaml"
 # Apply quantization.
 oneshot(
     model=model,
diff --git a/examples/quantization_w4a4_fp4/llama4_example.py b/examples/quantization_w4a4_fp4/llama4_example.py
index 28b57dda9b..a94f7777d2 100644
--- a/examples/quantization_w4a4_fp4/llama4_example.py
+++ b/examples/quantization_w4a4_fp4/llama4_example.py
@@ -8,6 +8,7 @@
 
 # Select model and load it.
 model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+model_id = "/models/Llama-4-Maverick-17B-128E-Instruct/"
 model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
 processor = Llama4Processor.from_pretrained(model_id)
 # We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
@@ -60,19 +61,20 @@ def data_collator(batch):
 
 
 # Configure the quantization algorithm to run.
-recipe = QuantizationModifier(
-    targets="Linear",
-    scheme="NVFP4",
-    ignore=[
-        "re:.*lm_head",
-        "re:.*self_attn",
-        "re:.*router",
-        "re:vision_model.*",
-        "re:multi_modal_projector.*",
-        "Llama4TextAttention",
-    ],
-)
-
+# recipe = QuantizationModifier(
+#     targets="Linear",
+#     scheme="NVFP4",
+#     ignore=[
+#         "re:.*lm_head",
+#         "re:.*self_attn",
+#         "re:.*router",
+#         "re:vision_model.*",
+#         "re:multi_modal_projector.*",
+#         "Llama4TextAttention",
+#     ],
+# )
+
+recipe = "fp8_weight_pcs_act_static_llama4.yaml"
 # Apply algorithms.
 # due to the large size of Llama4, we specify sequential targets such that
 # only one MLP is loaded into GPU memory at a time
@@ -88,6 +90,6 @@ def data_collator(batch):
 
 
 # Save to disk compressed.
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
+SAVE_DIR = f'/data2/yiliu4/{model_id.rstrip("/").split("/")[-1]}' + "-NVFP4"
 model.save_pretrained(SAVE_DIR)
 processor.save_pretrained(SAVE_DIR)
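
Note: both examples now rely on oneshot() accepting a recipe file path in place of an
inline QuantizationModifier, which is how the two new YAML files are wired in. Below
is a minimal sketch (not part of the diff) of what the llama3 recipe contains once
parsed. It assumes PyYAML is installed and that it is run from
examples/quantization_w4a4_fp4/; the key paths mirror fp8_weight_pcs_act_static.yaml
as added above, and sanity_check_recipe.py is a hypothetical helper name.

    # sanity_check_recipe.py -- hypothetical helper, not part of the diff
    import yaml  # assumes PyYAML is available alongside llmcompressor

    with open("fp8_weight_pcs_act_static.yaml") as f:
        cfg = yaml.safe_load(f)

    modifier = cfg["quantization_stage"]["quantization_modifiers"]["QuantizationModifier"]
    group = modifier["config_groups"]["group_0"]

    # fp8 weights with one static scale per output channel
    assert group["weights"]["strategy"] == "channel"
    assert group["weights"]["dynamic"] is False

    # fp8 input activations with a single static per-tensor scale from calibration
    assert group["input_activations"]["strategy"] == "tensor"
    assert group["input_activations"]["dynamic"] is False

The llama4 variant differs only in its ignore list, which additionally excludes
attention, router, vision, and projector modules from quantization.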