@fynnsu commented Aug 29, 2025

SUMMARY:
We're currently missing an abs() call when computing activation means. This results in negative mean values, which then interfere with the scale calculation, causing the scales to be set to nan.
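A minimal illustration of the failure mode (the values and the `alpha` exponent here are hypothetical, not the actual llm-compressor code): during the scale search, AWQ raises the activation mean to a fractional power, and in PyTorch a negative base with a fractional exponent evaluates to nan.

```python
import torch

x_mean = torch.tensor(-0.5)      # a negative activation mean, possible without abs()
alpha = 0.5                      # fractional exponent used during the scale search

print(x_mean.pow(alpha))         # tensor(nan) -- propagates into the scales
print(x_mean.abs().pow(alpha))   # tensor(0.7071) -- well-defined once abs() is applied
```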

As seen here, this abs() call existed in the AutoAWQ repo but was lost at some stage.
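For context, a sketch of the shape of the fix, assuming calibration accumulates a running per-channel activation statistic (the function and variable names are illustrative, not the actual llm-compressor internals):

```python
import torch

def accumulate_channel_means(running_sum: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """Accumulate per-channel activation magnitudes over calibration batches.

    The caller is assumed to divide by the batch count afterwards.
    """
    # abs() is the fix: it keeps the accumulated statistic non-negative, matching
    # the original AutoAWQ behavior. Without it, channels whose activations average
    # out negative later yield nan when the mean is raised to a fractional power.
    return running_sum + x.abs().mean(dim=0)
```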

TEST PLAN:
After the fix, we applied AWQ to Qwen3-Coder-30B-A3B-Instruct and evaluated on humaneval and humaneval+:

| Algorithm | humaneval@1 | humaneval@10 | humaneval+@1 | humaneval+@10 |
|-----------|-------------|--------------|--------------|---------------|
| base      | 0.930       | 0.939        | 0.887        | 0.898         |
| GPTQ      | 0.927       | 0.945        | 0.885        | 0.905         |
| RTN       | 0.924       | 0.939        | 0.870       | 0.885         |
| AWQ       | 0.937       | 0.945        | 0.893        | 0.902         |
**awq script**

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"

SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym-main-abs2-no-duo-scaling"


# Configure the quantization algorithm to run.
recipe = [
    AWQModifier(
        duo_scaling=False,
        ignore=[
            "lm_head",
            "re:.*mlp.gate$",
            "re:.*mlp.shared_expert_gate$",
            "re:visual.*",
        ],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

# Select calibration dataset.
DATASET_ID = "codeparrot/self-instruct-starcoder"
DATASET_SPLIT = "curated"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    from datasets import load_dataset

    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*10}]",
    )

    def preprocess(example):
        chat_messages = [
            {"role": "user", "content": example["instruction"].strip()},
            {"role": "assistant", "content": example["output"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(
            chat_messages, tokenize=True
        )
        return {"input_ids": tokenized_messages}

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )

    return ds


if __name__ == "__main__":
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ###
    ### Apply algorithms.
    ###
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        log_dir=None,
        trust_remote_code_model=True,
    )

    # Save to disk compressed.
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
```

**gptq script**

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"

SAVE_DIR = MODEL_ID.split("/")[-1] + "-gptq-sym"


# Configure the quantization algorithm to run.
recipe = [
    GPTQModifier(
        ignore=[
            "lm_head",
            "re:.*mlp.gate$",
            "re:.*mlp.shared_expert_gate$",
            "re:visual.*",
        ],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

# Select calibration dataset.
DATASET_ID = "codeparrot/self-instruct-starcoder"
DATASET_SPLIT = "curated"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    from datasets import load_dataset

    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*10}]",
    )

    def preprocess(example):
        chat_messages = [
            {"role": "user", "content": example["instruction"].strip()},
            {"role": "assistant", "content": example["output"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(
            chat_messages, tokenize=True
        )
        return {"input_ids": tokenized_messages}

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )

    return ds


if __name__ == "__main__":
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ###
    ### Apply algorithms.
    ###
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        log_dir=None,
        trust_remote_code_model=True,
    )


    # Save to disk compressed.
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
```

**rtn script**

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"

SAVE_DIR = MODEL_ID.split("/")[-1] + "-rtn-sym"


# Configure the quantization algorithm to run.
recipe = [
    QuantizationModifier(
        ignore=[
            "lm_head",
            "re:.*mlp.gate$",
            "re:.*mlp.shared_expert_gate$",
            "re:visual.*",
        ],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

# Select calibration dataset.
DATASET_ID = "codeparrot/self-instruct-starcoder"
DATASET_SPLIT = "curated"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    from datasets import load_dataset

    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*10}]",
    )

    def preprocess(example):
        chat_messages = [
            {"role": "user", "content": example["instruction"].strip()},
            {"role": "assistant", "content": example["output"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(
            chat_messages, tokenize=True
        )
        return {"input_ids": tokenized_messages}

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )

    return ds


if __name__ == "__main__":
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ###
    ### Apply algorithms.
    ###
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        log_dir=None,
        trust_remote_code_model=True,
    )

    # Save to disk compressed.
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)

```


@gemini-code-assist bot left a comment


**Summary of Changes**

Hello @fynnsu, I'm Gemini Code Assist! I'm currently reviewing this pull request and will post my feedback shortly. In the meantime, here's a summary to help you and other reviewers quickly get up to speed!

This pull request addresses a critical bug in the AWQ quantization implementation that resulted in incorrect scale calculations due to negative activation values. By reintroducing a missing absolute value operation, the change ensures numerical stability and improves the overall accuracy and robustness of the quantization process, as validated by comprehensive evaluation tests.

**Highlights**

- Corrected AWQ Activation Mean Calculation: Implemented abs() calls to ensure activation means are always positive, resolving an issue where negative means led to NaN values in AWQ scale calculations.
- Improved Quantization Stability: This fix enhances the stability and reliability of the AWQ quantization process, as demonstrated by improved evaluation metrics for the Qwen3-Coder-30B-A3B-Instruct model.

@gemini-code-assist bot left a comment


**Code Review**

This pull request correctly identifies and fixes a bug where negative activation values could lead to nan scales in the AWQ algorithm by adding an abs() call during the accumulation of activation means. This aligns with the original AutoAWQ implementation, and the test results confirm the fix is effective.

I've added one comment regarding a potentially redundant abs() call that could be removed to improve code clarity.


@brian-dellabetta left a comment


this looks good after removing the redundant line

@fynnsu force-pushed the awq_abs_fix branch 2 times, most recently from dcaf3c0 to 09b2eeb, on September 2, 2025 14:31

@brian-dellabetta left a comment


thanks!


@kylesayrs left a comment


Great catch, nice job
