Commit 08b1591

Add AWQ-INT4 option to release script (#2906)
Summary:

Test Plan:

```
python quantize_and_upload.py --model_id Qwen/Qwen3-8B --quant AWQ-INT4 --push_to_hub --task bbh --calibration_limit 2
python quantize_and_upload.py --model_id microsoft/Phi-4-mini-instruct --quant AWQ-INT4 --push_to_hub --task mmlu_pro --calibration_limit 2
```

https://huggingface.co/pytorch/Qwen3-8B-AWQ-INT4
https://huggingface.co/pytorch/Phi-4-mini-instruct-AWQ-INT4

```
export TASK=bbh
export MODEL=pytorch/Qwen3-8B-AWQ-INT4
lm_eval --model hf --model_args pretrained=$MODEL --tasks $TASK --device cuda:0 --batch_size auto --limit 50
export MODEL=jerryzh168/Qwen3-8B-INT4
lm_eval --model hf --model_args pretrained=$MODEL --tasks $TASK --device cuda:0 --batch_size auto --limit 50
```

Qwen3-8B-INT4

```
hf (pretrained=jerryzh168/Qwen3-8B-INT4), gen_kwargs: (None), limit: 50.0, num_fewshot: None, batch_size: auto
| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|----------------------------------------------------------|------:|----------|-----:|-----------|---|-----:|---|-----:|
|bbh | 3|get-answer| |exact_match|↑ |0.7444|± |0.0107|
| - bbh_cot_fewshot_boolean_expressions | 3|get-answer| 3|exact_match|↑ |0.9400|± |0.0339|
| - bbh_cot_fewshot_causal_judgement | 3|get-answer| 3|exact_match|↑ |0.5600|± |0.0709|
| - bbh_cot_fewshot_date_understanding | 3|get-answer| 3|exact_match|↑ |0.7600|± |0.0610|
| - bbh_cot_fewshot_disambiguation_qa | 3|get-answer| 3|exact_match|↑ |0.5600|± |0.0709|
| - bbh_cot_fewshot_dyck_languages | 3|get-answer| 3|exact_match|↑ |0.3000|± |0.0655|
| - bbh_cot_fewshot_formal_fallacies | 3|get-answer| 3|exact_match|↑ |0.6400|± |0.0686|
| - bbh_cot_fewshot_geometric_shapes | 3|get-answer| 3|exact_match|↑ |0.5400|± |0.0712|
| - bbh_cot_fewshot_hyperbaton | 3|get-answer| 3|exact_match|↑ |0.9800|± |0.0200|
| - bbh_cot_fewshot_logical_deduction_five_objects | 3|get-answer| 3|exact_match|↑ |0.6600|± |0.0677|
| - bbh_cot_fewshot_logical_deduction_seven_objects | 3|get-answer| 3|exact_match|↑ |0.3000|± |0.0655|
| - bbh_cot_fewshot_logical_deduction_three_objects | 3|get-answer| 3|exact_match|↑ |0.9400|± |0.0339|
| - bbh_cot_fewshot_movie_recommendation | 3|get-answer| 3|exact_match|↑ |0.6400|± |0.0686|
| - bbh_cot_fewshot_multistep_arithmetic_two | 3|get-answer| 3|exact_match|↑ |1.0000|± |0.0000|
| - bbh_cot_fewshot_navigate | 3|get-answer| 3|exact_match|↑ |0.8800|± |0.0464|
| - bbh_cot_fewshot_object_counting | 3|get-answer| 3|exact_match|↑ |0.8200|± |0.0549|
| - bbh_cot_fewshot_penguins_in_a_table | 3|get-answer| 3|exact_match|↑ |0.9000|± |0.0429|
| - bbh_cot_fewshot_reasoning_about_colored_objects | 3|get-answer| 3|exact_match|↑ |0.9000|± |0.0429|
| - bbh_cot_fewshot_ruin_names | 3|get-answer| 3|exact_match|↑ |0.7000|± |0.0655|
| - bbh_cot_fewshot_salient_translation_error_detection | 3|get-answer| 3|exact_match|↑ |0.5200|± |0.0714|
| - bbh_cot_fewshot_snarks | 3|get-answer| 3|exact_match|↑ |0.6000|± |0.0700|
| - bbh_cot_fewshot_sports_understanding | 3|get-answer| 3|exact_match|↑ |0.8200|± |0.0549|
| - bbh_cot_fewshot_temporal_sequences | 3|get-answer| 3|exact_match|↑ |0.9200|± |0.0388|
| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 3|get-answer| 3|exact_match|↑ |0.8600|± |0.0496|
| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 3|get-answer| 3|exact_match|↑ |0.8200|± |0.0549|
| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 3|get-answer| 3|exact_match|↑ |0.9400|± |0.0339|
| - bbh_cot_fewshot_web_of_lies | 3|get-answer| 3|exact_match|↑ |1.0000|± |0.0000|
| - bbh_cot_fewshot_word_sorting | 3|get-answer| 3|exact_match|↑ |0.6000|± |0.0700|

|Groups|Version| Filter |n-shot| Metric | |Value | |Stderr|
|------|------:|----------|------|-----------|---|-----:|---|-----:|
|bbh | 3|get-answer| |exact_match|↑ |0.7444|± |0.0107|
```

AWQ-INT4

```
hf (pretrained=jerryzh168/Qwen3-8B-AWQ-INT4), gen_kwargs: (None), limit: 50.0, num_fewshot: None, batch_size: auto
| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|----------------------------------------------------------|------:|----------|-----:|-----------|---|-----:|---|-----:|
|bbh | 3|get-answer| |exact_match|↑ |0.7844|± |0.0101|
| - bbh_cot_fewshot_boolean_expressions | 3|get-answer| 3|exact_match|↑ |1.0000|± |0.0000|
| - bbh_cot_fewshot_causal_judgement | 3|get-answer| 3|exact_match|↑ |0.5800|± |0.0705|
| - bbh_cot_fewshot_date_understanding | 3|get-answer| 3|exact_match|↑ |0.8000|± |0.0571|
| - bbh_cot_fewshot_disambiguation_qa | 3|get-answer| 3|exact_match|↑ |0.5600|± |0.0709|
| - bbh_cot_fewshot_dyck_languages | 3|get-answer| 3|exact_match|↑ |0.5600|± |0.0709|
| - bbh_cot_fewshot_formal_fallacies | 3|get-answer| 3|exact_match|↑ |0.6000|± |0.0700|
| - bbh_cot_fewshot_geometric_shapes | 3|get-answer| 3|exact_match|↑ |0.4200|± |0.0705|
| - bbh_cot_fewshot_hyperbaton | 3|get-answer| 3|exact_match|↑ |0.9600|± |0.0280|
| - bbh_cot_fewshot_logical_deduction_five_objects | 3|get-answer| 3|exact_match|↑ |0.7000|± |0.0655|
| - bbh_cot_fewshot_logical_deduction_seven_objects | 3|get-answer| 3|exact_match|↑ |0.4000|± |0.0700|
| - bbh_cot_fewshot_logical_deduction_three_objects | 3|get-answer| 3|exact_match|↑ |0.9600|± |0.0280|
| - bbh_cot_fewshot_movie_recommendation | 3|get-answer| 3|exact_match|↑ |0.7000|± |0.0655|
| - bbh_cot_fewshot_multistep_arithmetic_two | 3|get-answer| 3|exact_match|↑ |1.0000|± |0.0000|
| - bbh_cot_fewshot_navigate | 3|get-answer| 3|exact_match|↑ |0.9400|± |0.0339|
| - bbh_cot_fewshot_object_counting | 3|get-answer| 3|exact_match|↑ |0.9200|± |0.0388|
| - bbh_cot_fewshot_penguins_in_a_table | 3|get-answer| 3|exact_match|↑ |0.8200|± |0.0549|
| - bbh_cot_fewshot_reasoning_about_colored_objects | 3|get-answer| 3|exact_match|↑ |0.9200|± |0.0388|
| - bbh_cot_fewshot_ruin_names | 3|get-answer| 3|exact_match|↑ |0.7400|± |0.0627|
| - bbh_cot_fewshot_salient_translation_error_detection | 3|get-answer| 3|exact_match|↑ |0.6400|± |0.0686|
| - bbh_cot_fewshot_snarks | 3|get-answer| 3|exact_match|↑ |0.6800|± |0.0666|
| - bbh_cot_fewshot_sports_understanding | 3|get-answer| 3|exact_match|↑ |0.8400|± |0.0524|
| - bbh_cot_fewshot_temporal_sequences | 3|get-answer| 3|exact_match|↑ |0.9400|± |0.0339|
| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 3|get-answer| 3|exact_match|↑ |0.9600|± |0.0280|
| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 3|get-answer| 3|exact_match|↑ |0.9400|± |0.0339|
| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 3|get-answer| 3|exact_match|↑ |0.9600|± |0.0280|
| - bbh_cot_fewshot_web_of_lies | 3|get-answer| 3|exact_match|↑ |1.0000|± |0.0000|
| - bbh_cot_fewshot_word_sorting | 3|get-answer| 3|exact_match|↑ |0.6400|± |0.0686|

|Groups|Version| Filter |n-shot| Metric | |Value | |Stderr|
|------|------:|----------|------|-----------|---|-----:|---|-----:|
|bbh | 3|get-answer| |exact_match|↑ |0.7844|± |0.0101|
```

Reviewers:

Subscribers:

Tasks:

Tags:
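The test-plan tables above show AWQ-INT4 improving over plain INT4 on bbh (exact_match 0.7844 vs 0.7444 at limit 50). Beyond lm_eval, a released checkpoint can also be smoke-tested directly through transformers; a minimal sketch, assuming torchao and a recent transformers are installed (the prompt and generation settings are illustrative, not from this commit):

```python
# Hypothetical smoke test: load a checkpoint released by this script and generate.
# The checkpoint id is one of the links in the test plan above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "pytorch/Qwen3-8B-AWQ-INT4"
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("What is 2 + 2?", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```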
1 parent 83a20c7 commit 08b1591

File tree

1 file changed: +125 −17 lines changed

.github/scripts/torchao_model_releases/quantize_and_upload.py

Lines changed: 125 additions & 17 deletions
```diff
@@ -10,6 +10,10 @@
 from huggingface_hub import ModelCard, get_token, whoami
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
+from torchao._models._eval import TransformerEvalWrapper
+from torchao.prototype.awq import (
+    AWQConfig,
+)
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
     Int4WeightOnlyConfig,
@@ -19,6 +23,7 @@
     PerAxis,
     PerGroup,
     PerRow,
+    quantize_,
 )
 
 
@@ -103,8 +108,6 @@ def _untie_weights_and_save_locally(model_id):
 model_to_quantize = "{untied_model}"
 
 {quant_code}
-quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 # Push to hub
 USER_ID = "YOUR_USER_ID"
@@ -204,12 +207,16 @@ def _untie_weights_and_save_locally(model_id):
 from torchao.quantization import Int4WeightOnlyConfig
 quant_config = Int4WeightOnlyConfig(group_size=128, use_hqq=True)
 quantization_config = TorchAoConfig(quant_type=quant_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 """
 
 _fp8_quant_code = """
 from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
 quant_config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
 quantization_config = TorchAoConfig(quant_type=quant_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 """
 
 _int8_int4_quant_code = """
@@ -230,8 +237,46 @@ def _untie_weights_and_save_locally(model_id):
 )
 quant_config = ModuleFqnToConfig({{"_default": linear_config, "model.embed_tokens": embedding_config}})
 quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True, untie_embedding_weights=True, modules_to_not_convert=[])
+quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+"""
+
+_awq_int4_quant_code = """
+from torchao.quantization import Int4WeightOnlyConfig, quantize_
+from torchao.prototype.awq import (
+    AWQConfig,
+)
+from torchao._models._eval import TransformerEvalWrapper
+model = AutoModelForCausalLM.from_pretrained(
+    model_to_quantize,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+base_config = Int4WeightOnlyConfig(group_size=128, version=2)
+quant_config = AWQConfig(base_config, step="prepare")
+quantize_(
+    model,
+    quant_config,
+)
+TransformerEvalWrapper(
+    model=model,
+    tokenizer=tokenizer,
+    max_seq_length=max_seq_length,
+).run_eval(
+    tasks=tasks,
+    limit=calibration_limit,
+)
+quant_config = AWQConfig(base_config, step="convert")
+quantize_(model, quant_config)
+
+quantized_model = model
+quant_config = AWQConfig(base_config, step="prepare_for_loading")
+quantized_model.config.quantization_config = TorchAoConfig(quant_config)
 """
 
+
 _server_inference_recipe = """
 # Inference with vLLM
 Install vllm nightly and torchao nightly to get some recent changes:
@@ -568,7 +613,9 @@ def _untie_weights_and_save_locally(model_id):
 """
 
 
-def quantize_and_upload(model_id, quant, push_to_hub):
+def quantize_and_upload(
+    model_id, quant, tasks, calibration_limit, max_seq_length, push_to_hub
+):
     _int8_int4_linear_config = Int8DynamicActivationIntxWeightConfig(
         weight_dtype=torch.int4,
         weight_granularity=PerGroup(32),
@@ -580,7 +627,7 @@ def quantize_and_upload(model_id, quant, push_to_hub):
     )
     quant_to_config = {
         "FP8": Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
-        "INT4": Int4WeightOnlyConfig(group_size=128),
+        "INT4": Int4WeightOnlyConfig(group_size=128, version=2),
         "INT8-INT4": ModuleFqnToConfig(
             {
                 "_default": _int8_int4_linear_config,
@@ -593,23 +640,58 @@ def quantize_and_upload(model_id, quant, push_to_hub):
         "FP8": _fp8_quant_code,
         "INT4": _int4_quant_code,
         "INT8-INT4": _int8_int4_quant_code,
+        "AWQ-INT4": _awq_int4_quant_code,
     }
 
-    assert quant in quant_to_config, f"Unsupported quant option: {quant}"
-    quant_config = quant_to_config[quant]
-
+    # preparation
     model_to_quantize = model_id
     if quant == "INT8-INT4":
         model_to_quantize = _untie_weights_and_save_locally(model_to_quantize)
 
-    quantization_config = TorchAoConfig(quant_type=quant_config)
-    quantized_model = AutoModelForCausalLM.from_pretrained(
-        model_to_quantize,
-        device_map="auto",
-        torch_dtype=torch.bfloat16,
-        quantization_config=quantization_config,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    # quantization
+
+    if "AWQ" in quant:
+        # awq will use torchao API directly
+        assert quant == "AWQ-INT4", "Only support AWQ-INT4 for now"
+        model = AutoModelForCausalLM.from_pretrained(
+            model_to_quantize,
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        base_config = Int4WeightOnlyConfig(group_size=128, version=2)
+        quant_config = AWQConfig(base_config, step="prepare")
+        quantize_(
+            model,
+            quant_config,
+        )
+        TransformerEvalWrapper(
+            model=model,
+            tokenizer=tokenizer,
+            max_seq_length=max_seq_length,
+        ).run_eval(
+            tasks=tasks,
+            limit=calibration_limit,
+        )
+        quant_config = AWQConfig(base_config, step="convert")
+        quantize_(model, quant_config)
+
+        quantized_model = model
+        quant_config = AWQConfig(base_config, step="prepare_for_loading")
+        quantized_model.config.quantization_config = TorchAoConfig(quant_config)
+    else:
+        # other quantization are integrated with `from_pretrained` in huggingface transformers
+        assert quant in quant_to_config, f"Unsupported quant option: {quant}"
+        quant_config = quant_to_config[quant]
+        quantization_config = TorchAoConfig(quant_type=quant_config)
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            model_to_quantize,
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+            quantization_config=quantization_config,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     username = _get_username()
 
@@ -702,7 +784,26 @@ def quantize_and_upload(model_id, quant, push_to_hub):
     parser.add_argument(
         "--quant",
         type=str,
-        help="Quantization method. Options are FP8, INT4, INT8_INT4, AWQ-INT4",
+        help="Quantization method. Options are FP8, INT4, INT8-INT4, AWQ-INT4",
+    )
+    parser.add_argument(
+        "--tasks",
+        nargs="+",
+        type=str,
+        help="lm-eval task to optimize for in awq, we'll select a sample from the task dataset and run awq calibration based on that",
+        default=["gsm8k"],
+    )
+    parser.add_argument(
+        "--calibration_limit",
+        type=int,
+        default=10,
+        help="Number of samples to use for calibration. Default is 10.",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        type=int,
+        default=2048,
+        help="Maximum sequence length of examples to calibrate and evaluate model on. Default is 2048",
     )
     parser.add_argument(
         "--push_to_hub",
@@ -711,4 +812,11 @@ def quantize_and_upload(model_id, quant, push_to_hub):
         help="Flag to indicate whether push to huggingface hub or not",
     )
     args = parser.parse_args()
-    quantize_and_upload(args.model_id, args.quant, args.push_to_hub)
+    quantize_and_upload(
+        args.model_id,
+        args.quant,
+        args.tasks,
+        args.calibration_limit,
+        args.max_seq_length,
+        args.push_to_hub,
+    )
```
