19 changes: 19 additions & 0 deletions examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static.yaml
@@ -0,0 +1,19 @@
quantization_stage:
quantization_modifiers:
QuantizationModifier:
ignore: ["lm_head", "re:.*mlp\\.gate.*"]
config_groups:
group_0:
weights:
num_bits: 8
type: "float"
symmetric: true
strategy: "channel"
dynamic: false
targets: ["Linear"]
input_activations:
num_bits: 8
type: "float"
symmetric: true
strategy: "tensor"
dynamic: false
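The recipe above applies FP8 weight quantization with per-channel scales and static per-tensor FP8 activation scales to all Linear layers, skipping `lm_head` and the MoE gate modules. For reference, a minimal sketch of how a recipe file like this is consumed, assuming llmcompressor's `oneshot` entry point and its built-in `open_platypus` calibration set (the model id and paths are placeholders):

from transformers import AutoModelForCausalLM
from llmcompressor import oneshot

# Placeholder model; any causal LM supported by llmcompressor works here.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base", torch_dtype="auto")

oneshot(
    model=model,
    dataset="open_platypus",                   # small calibration set (assumed available)
    recipe="fp8_weight_pcs_act_static.yaml",   # path to the recipe file in this directory
    max_seq_length=2048,
    num_calibration_samples=512,
)

Because the activation strategy is "tensor" with dynamic: false, calibration data is required so that a single static activation scale can be fixed per layer.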
25 changes: 25 additions & 0 deletions examples/quantization_w4a4_fp4/fp8_weight_pcs_act_static_llama4.yaml
@@ -0,0 +1,25 @@
quantization_stage:
quantization_modifiers:
QuantizationModifier:
ignore: [
"re:.*lm_head",
"re:.*self_attn",
"re:.*router",
"re:vision_model.*",
"re:multi_modal_projector.*",
"Llama4TextAttention"]
config_groups:
group_0:
weights:
num_bits: 8
type: "float"
symmetric: true
strategy: "channel"
dynamic: false
targets: ["Linear"]
input_activations:
num_bits: 8
type: "float"
symmetric: true
strategy: "tensor"
dynamic: false
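As in the previous recipe, entries prefixed with `re:` are regular expressions matched against submodule names, while a bare entry such as "Llama4TextAttention" targets a module class. A rough illustration of the intended name matching, assuming plain Python `re` semantics rather than the library's exact resolution logic:

import re

ignore = [
    "re:.*lm_head",
    "re:.*self_attn",
    "re:.*router",
    "re:vision_model.*",
    "re:multi_modal_projector.*",
]

def is_ignored(module_name: str) -> bool:
    # Strip the "re:" prefix and match each pattern against the module's dotted name.
    return any(
        re.match(pattern.removeprefix("re:"), module_name)
        for pattern in ignore
        if pattern.startswith("re:")
    )

print(is_ignored("language_model.model.layers.0.self_attn.q_proj"))                  # True
print(is_ignored("language_model.model.layers.0.feed_forward.experts.gate_up_proj")) # False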
6 changes: 4 additions & 2 deletions examples/quantization_w4a4_fp4/llama3_example.py
@@ -6,6 +6,8 @@
from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID = "/models/Qwen3-0.6B-Base"
MODEL_ID = "/models/Qwen3-15B-A2B-Base/"

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
@@ -17,7 +19,7 @@

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 20
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
@@ -56,7 +58,7 @@ def tokenize(sample):
# * calibrate a global_scale for activations, which will be used to
# quantize activations to fp4 on the fly
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

recipe = "fp8_weight_pcs_act_static.yaml"
# Apply quantization.
oneshot(
model=model,
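After `oneshot` returns, the example scripts usually run a short generation as a sanity check; a minimal sketch, assuming the tokenizer loaded earlier in the script and the `dispatch_for_generation` helper imported at the top of the file:

# Move the calibrated model onto GPU(s) for generation.
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**sample, max_new_tokens=50)
print(tokenizer.decode(output[0]))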
30 changes: 16 additions & 14 deletions examples/quantization_w4a4_fp4/llama4_example.py
@@ -8,6 +8,7 @@

# Select model and load it.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model_id = "/models/Llama-4-Maverick-17B-128E-Instruct/"
model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = Llama4Processor.from_pretrained(model_id)
# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
@@ -60,19 +61,20 @@ def data_collator(batch):


# Configure the quantization algorithm to run.
recipe = QuantizationModifier(
targets="Linear",
scheme="NVFP4",
ignore=[
"re:.*lm_head",
"re:.*self_attn",
"re:.*router",
"re:vision_model.*",
"re:multi_modal_projector.*",
"Llama4TextAttention",
],
)

# recipe = QuantizationModifier(
# targets="Linear",
# scheme="NVFP4",
# ignore=[
# "re:.*lm_head",
# "re:.*self_attn",
# "re:.*router",
# "re:vision_model.*",
# "re:multi_modal_projector.*",
# "Llama4TextAttention",
# ],
# )

recipe = "fp8_weight_pcs_act_static_llama4.yaml"
# Apply algorithms.
# due to the large size of Llama4, we specify sequential targets such that
# only one MLP is loaded into GPU memory at a time
@@ -88,6 +90,6 @@ def data_collator(batch):


# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
SAVE_DIR = f'/data2/yiliu4/{model_id.rstrip("/").split("/")[-1]}' + "-NVFP4"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
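The SAVE_DIR expression derives the output directory name from the last component of model_id; the rstrip("/") keeps split("/")[-1] from returning an empty string when a local path ends in a slash. For example:

model_id = "/models/Llama-4-Maverick-17B-128E-Instruct/"
print(model_id.split("/")[-1])                          # "" (trailing slash leaves an empty last segment)
print(model_id.rstrip("/").split("/")[-1] + "-NVFP4")   # "Llama-4-Maverick-17B-128E-Instruct-NVFP4"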