17 changes: 17 additions & 0 deletions docs/features/quantization/torchao.md
@@ -41,3 +41,20 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
```

Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.

## Online Quantization in vLLM

To perform online quantization with TorchAO in vLLM, use `--quantization torchao`
and pass the TorchAO config through `--hf-overrides`.

You can inline the overrides as JSON:

```bash
vllm serve meta-llama/Meta-Llama-3-8B \
--quantization torchao \
--hf-overrides '{"quantization_config_file": "/path/to/torchao_config.json"}'
```
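
The overrides can also be loaded from a standalone JSON file by prefixing the path with `@`. The file must contain a single JSON object (the paths below are placeholders):

```bash
# /path/to/hf_overrides.json contains:
# {"quantization_config_file": "/path/to/torchao_config.json"}
vllm serve meta-llama/Meta-Llama-3-8B \
--quantization torchao \
--hf-overrides @/path/to/hf_overrides.json
```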

When you need to skip specific modules (for example, excluding
`vocab_parallel_embedding`), configure that in the TorchAO config with
`FqnToConfig` rather than changing vLLM model code.
67 changes: 67 additions & 0 deletions tests/engine/test_arg_utils.py
@@ -523,3 +523,70 @@ def test_human_readable_model_len():
for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
with pytest.raises(ArgumentError):
parser.parse_args(["--max-model-len", invalid])


def test_hf_overrides_json_string_normalized():
"""Test that hf_overrides JSON string is normalized into a dict."""
json_overrides = '{"architectures": ["TestModel"]}'
engine_args = EngineArgs(model="test-model", hf_overrides=json_overrides)
assert engine_args.hf_overrides == {"architectures": ["TestModel"]}


def test_hf_overrides_file_normalized(tmp_path):
"""Test that hf_overrides can be loaded from a JSON file."""
override_file = tmp_path / "hf_overrides.json"
override_file.write_text('{"quantization_config_file": "foo.json"}')
engine_args = EngineArgs(
model="test-model",
hf_overrides=f"@{override_file}",
)
assert engine_args.hf_overrides == {"quantization_config_file": "foo.json"}


def test_hf_overrides_file_invalid_json(tmp_path):
"""Test that invalid JSON file raises a clear error."""
override_file = tmp_path / "hf_overrides.json"
override_file.write_text('{"not": }')
with pytest.raises(ValueError):
EngineArgs(model="test-model", hf_overrides=f"@{override_file}")


def test_hf_overrides_invalid_string():
"""Test that non-JSON string raises a clear error."""
with pytest.raises(ValueError):
EngineArgs(model="test-model", hf_overrides="not-json")


def test_hf_overrides_json_string_from_cli_normalized():
"""Test hf_overrides JSON string normalization through CLI path."""
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args(
[
"--model",
"test-model",
"--hf-overrides",
'{"quantization_config_file": "/tmp/torchao.json"}',
]
)

engine_args = EngineArgs.from_cli_args(args)
assert engine_args.hf_overrides == {"quantization_config_file": "/tmp/torchao.json"}


def test_hf_overrides_file_from_cli_normalized(tmp_path):
"""Test hf_overrides @file loading through CLI path."""
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
override_file = tmp_path / "hf_overrides.json"
override_file.write_text('{"quantization_config_file": "/tmp/torchao.json"}')

args = parser.parse_args(
[
"--model",
"test-model",
"--hf-overrides",
f"@{override_file}",
@jerryzh168 jerryzh168 Feb 13, 2026

do we need to have this option? does passing around '{"quantization_config_file": "/tmp/torchao.json"}' on the command line work?

]
)

engine_args = EngineArgs.from_cli_args(args)
assert engine_args.hf_overrides == {"quantization_config_file": "/tmp/torchao.json"}
42 changes: 42 additions & 0 deletions vllm/engine/arg_utils.py
@@ -609,6 +609,7 @@ def __post_init__(self):
self.weight_transfer_config = WeightTransferConfig(
**self.weight_transfer_config
)
self._normalize_hf_overrides()
# Setup plugins
from vllm.plugins import load_general_plugins

@@ -634,6 +635,47 @@ def __post_init__(self):
self.tokenizer,
)

def _normalize_hf_overrides(self) -> None:
"""Normalize hf_overrides to a dict or callable.

Supports JSON strings and JSON files when prefixed with '@'.
"""
if self.hf_overrides is None or callable(self.hf_overrides):
return
if isinstance(self.hf_overrides, dict):
return
if not isinstance(self.hf_overrides, str):
raise TypeError(
"hf_overrides must be a dict, a callable, or a JSON string."
)

raw = self.hf_overrides.strip()
if raw.startswith("@"):
path = raw[1:]
if not path:
raise ValueError("hf_overrides file path is empty.")
try:
with open(path, encoding="utf-8") as handle:
self.hf_overrides = json.load(handle)
Contributor review comment (high):

The loaded JSON from the file should be validated to be a dictionary. The hf_overrides argument is expected to be a dictionary of overrides. Currently, if the file contains a JSON primitive (like a string or a number), it will be parsed successfully but will cause an AttributeError later when .items() is called on it. This can be confusing for the user. It's better to fail early with a clear error message.

                    loaded_json = json.load(handle)
                    if not isinstance(loaded_json, dict):
                        raise ValueError(
                            f"hf_overrides file must contain a JSON object: {path}"
                        )
                    self.hf_overrides = loaded_json

except FileNotFoundError as exc:
raise FileNotFoundError(f"hf_overrides file not found: {path}") from exc
except json.JSONDecodeError as exc:
raise ValueError(
f"hf_overrides file is not valid JSON: {path}"
) from exc
return

if re.match(r"(?s)^\s*{.*}\s*$", raw):
try:
self.hf_overrides = json.loads(raw)
Contributor review comment (high):

Similar to the file loading case, the loaded JSON from the string should be validated to be a dictionary. If the string is a valid JSON primitive (e.g., "a string" or 123), it will be parsed successfully but will cause an AttributeError later. It's better to fail early with a clear error message ensuring the provided string is a JSON object.

                loaded_json = json.loads(raw)
                if not isinstance(loaded_json, dict):
                    raise ValueError("hf_overrides string must be a JSON object.")
                self.hf_overrides = loaded_json

return
except json.JSONDecodeError as exc:
raise ValueError("hf_overrides is not valid JSON.") from exc

raise ValueError(
"hf_overrides must be a JSON object string or '@' followed by a JSON file."
)

@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"""Shared CLI arguments for vLLM engine."""
Expand Down