17 changes: 17 additions & 0 deletions docs/features/quantization/torchao.md
@@ -41,3 +41,20 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
```

Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.

## Online Quantization in vLLM

To perform online quantization with TorchAO in vLLM, use `--quantization torchao`
and pass the TorchAO config through `--hf-overrides`.

You can inline the overrides as JSON:

```bash
vllm serve meta-llama/Meta-Llama-3-8B \
--quantization torchao \
--hf-overrides '{"quantization_config_file": "/path/to/torchao_config.json"}'
```
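
The overrides can also be loaded from a standalone JSON file by prefixing the path with `@`. The file must contain a single JSON object (the paths below are placeholders):

```bash
# /path/to/hf_overrides.json contains:
# {"quantization_config_file": "/path/to/torchao_config.json"}
vllm serve meta-llama/Meta-Llama-3-8B \
--quantization torchao \
--hf-overrides @/path/to/hf_overrides.json
```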

When you need to skip specific modules (for example, excluding
`vocab_parallel_embedding`), configure that in the TorchAO config with
`FqnToConfig` rather than changing vLLM model code.
67 changes: 67 additions & 0 deletions tests/engine/test_arg_utils.py
@@ -523,3 +523,70 @@ def test_human_readable_model_len():
for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
with pytest.raises(ArgumentError):
parser.parse_args(["--max-model-len", invalid])


def test_hf_overrides_json_string_normalized():
"""Test that hf_overrides JSON string is normalized into a dict."""
json_overrides = '{"architectures": ["TestModel"]}'
engine_args = EngineArgs(model="test-model", hf_overrides=json_overrides)
assert engine_args.hf_overrides == {"architectures": ["TestModel"]}


def test_hf_overrides_file_normalized(tmp_path):
"""Test that hf_overrides can be loaded from a JSON file."""
override_file = tmp_path / "hf_overrides.json"
override_file.write_text('{"quantization_config_file": "foo.json"}')
engine_args = EngineArgs(
model="test-model",
hf_overrides=f"@{override_file}",
)
assert engine_args.hf_overrides == {"quantization_config_file": "foo.json"}


def test_hf_overrides_file_invalid_json(tmp_path):
"""Test that invalid JSON file raises a clear error."""
override_file = tmp_path / "hf_overrides.json"
override_file.write_text('{"not": }')
with pytest.raises(ValueError):
EngineArgs(model="test-model", hf_overrides=f"@{override_file}")


def test_hf_overrides_invalid_string():
"""Test that non-JSON string raises a clear error."""
with pytest.raises(ValueError):
EngineArgs(model="test-model", hf_overrides="not-json")


def test_hf_overrides_json_string_from_cli_normalized():
"""Test hf_overrides JSON string normalization through CLI path."""
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args(
[
"--model",
"test-model",
"--hf-overrides",
'{"quantization_config_file": "/tmp/torchao.json"}',
]
)

engine_args = EngineArgs.from_cli_args(args)
assert engine_args.hf_overrides == {"quantization_config_file": "/tmp/torchao.json"}


def test_hf_overrides_file_from_cli_normalized(tmp_path):
"""Test hf_overrides @file loading through CLI path."""
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
override_file = tmp_path / "hf_overrides.json"
override_file.write_text('{"quantization_config_file": "/tmp/torchao.json"}')

args = parser.parse_args(
[
"--model",
"test-model",
"--hf-overrides",
f"@{override_file}",
@jerryzh168 jerryzh168 Feb 13, 2026

do we need to have this option? does passing around '{"quantization_config_file": "/tmp/torchao.json"}' on the command line work?

]
)

engine_args = EngineArgs.from_cli_args(args)
assert engine_args.hf_overrides == {"quantization_config_file": "/tmp/torchao.json"}
42 changes: 42 additions & 0 deletions vllm/engine/arg_utils.py
@@ -609,6 +609,7 @@ def __post_init__(self):
self.weight_transfer_config = WeightTransferConfig(
**self.weight_transfer_config
)
self._normalize_hf_overrides()
# Setup plugins
from vllm.plugins import load_general_plugins

@@ -634,6 +635,47 @@ def __post_init__(self):
self.tokenizer,
)

def _normalize_hf_overrides(self) -> None:
"""Normalize hf_overrides to a dict or callable.

Supports JSON strings and JSON files when prefixed with '@'.
"""
if self.hf_overrides is None or callable(self.hf_overrides):
return
if isinstance(self.hf_overrides, dict):
return
if not isinstance(self.hf_overrides, str):
raise TypeError(
"hf_overrides must be a dict, a callable, or a JSON string."
)

raw = self.hf_overrides.strip()
if raw.startswith("@"):
path = raw[1:]
if not path:
raise ValueError("hf_overrides file path is empty.")
try:
with open(path, encoding="utf-8") as handle:
self.hf_overrides = json.load(handle)
Contributor review comment (high):

The loaded JSON from the file should be validated to be a dictionary. The hf_overrides argument is expected to be a dictionary of overrides. Currently, if the file contains a JSON primitive (like a string or a number), it will be parsed successfully but will cause an AttributeError later when .items() is called on it. This can be confusing for the user. It's better to fail early with a clear error message.

                    loaded_json = json.load(handle)
                    if not isinstance(loaded_json, dict):
                        raise ValueError(
                            f"hf_overrides file must contain a JSON object: {path}"
                        )
                    self.hf_overrides = loaded_json

except FileNotFoundError as exc:
raise FileNotFoundError(f"hf_overrides file not found: {path}") from exc
except json.JSONDecodeError as exc:
raise ValueError(
f"hf_overrides file is not valid JSON: {path}"
) from exc
return

if re.match(r"(?s)^\s*{.*}\s*$", raw):
try:
self.hf_overrides = json.loads(raw)
Contributor review comment (high):

Similar to the file loading case, the loaded JSON from the string should be validated to be a dictionary. If the string is a valid JSON primitive (e.g., "a string" or 123), it will be parsed successfully but will cause an AttributeError later. It's better to fail early with a clear error message ensuring the provided string is a JSON object.

                loaded_json = json.loads(raw)
                if not isinstance(loaded_json, dict):
                    raise ValueError("hf_overrides string must be a JSON object.")
                self.hf_overrides = loaded_json

return
except json.JSONDecodeError as exc:
raise ValueError("hf_overrides is not valid JSON.") from exc

raise ValueError(
"hf_overrides must be a JSON object string or '@' followed by a JSON file."
)

@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"""Shared CLI arguments for vLLM engine."""
Expand Down