From a5a7cb2fe3cdf1855350afe4bea8125387e1952a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 22 Jul 2025 01:22:08 +0000 Subject: [PATCH 01/15] add speculators support Signed-off-by: Dipika Sikka --- .../configs/speculators/__init__.py | 0 .../configs/speculators/base.py | 188 ++++++++++++++++++ .../configs/speculators/eagle.py | 35 ++++ .../configs/speculators/eagle3.py | 41 ++++ 4 files changed, 264 insertions(+) create mode 100644 vllm/transformers_utils/configs/speculators/__init__.py create mode 100644 vllm/transformers_utils/configs/speculators/base.py create mode 100644 vllm/transformers_utils/configs/speculators/eagle.py create mode 100644 vllm/transformers_utils/configs/speculators/eagle3.py diff --git a/vllm/transformers_utils/configs/speculators/__init__.py b/vllm/transformers_utils/configs/speculators/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py new file mode 100644 index 000000000000..2e780ad4b5f5 --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from typing import Any, Union + +from transformers import PretrainedConfig + +from vllm.transformers_utils.configs.speculators import ( + Eagle3SpeculatorsConfig, EagleSpeculatorsConfig) + +DEFAULT_NUM_LOOKAHEAD_TOKENS = 5 +SPECULATORS_WEIGHT_MAP = { + "fusion_fc.weight": "fc.weight", + "fusion_fc.bias": "fc.bias", + "embedding_layernorm.weight": "embedding_layernorm.weight", + "pre_lm_head_layernorm.weight": "hidden_states_layernorm.weight", +} + +SUPPORTED_SPECULATORS_TYPES = { + "eagle": EagleSpeculatorsConfig, + "eagle3": Eagle3SpeculatorsConfig +} + + +class SpeculatorsConfig(PretrainedConfig): + + def __init__(self, config=None, **kwargs): + super().__init__(**kwargs) + self.config = config or {} + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + **kwargs, + ) -> "SpeculatorsConfig": + """Load speculators Eagle config and convert to vLLM format.""" + config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, + **kwargs) + + speculators_type = config_dict.get("speculators_model_type") + if speculators_type not in SUPPORTED_SPECULATORS_TYPES: + return super().from_pretrained(pretrained_model_name_or_path, + **kwargs) + + spec_class = SUPPORTED_SPECULATORS_TYPES.get(speculators_type) + spec_class_instance = spec_class(config_dict) + + # Validate + spec_class_instance.validate_speculators_config() + num_lookahead_tokens = spec_class_instance.extract_num_lookahead_tokens( # noqa: E501 + ) + vllm_config, transformer_config = spec_class_instance.convert_speculators_to_vllm( # noqa: E501 + num_lookahead_tokens) + + # Process / Update + spec_class_instance.update_defaults(transformer_config, vllm_config) + spec_class_instance.ensure_transformer_architectures( + transformer_config) # Is this needed? + vllm_config = spec_class_instance.preserve_additional_fields( + vllm_config) + + # Create + return cls(**vllm_config) + + def extract_num_lookahead_tokens(self) -> int: + """ + Extract number of lookahead tokens from proposal methods. + + Args: + config: Speculators config dictionary + + Returns: + Number of speculative tokens + + Note: + Currently only supports the first proposal method. + Future versions may support multiple proposal methods. 
+ """ + speculators_cfg = self.config["speculators_config"] + proposal_methods = speculators_cfg["proposal_methods"] + + # Currently we only support one proposal method + first_method = proposal_methods[0] + num_lookahead_tokens = first_method.get("speculative_tokens") + + if num_lookahead_tokens is None: + raise ValueError( + "Missing 'speculative_tokens' in proposal method. " + f"Got: {first_method}") + + return num_lookahead_tokens + + def validate_speculators_config(self) -> None: + """Validate required speculators format fields.""" + # Check required top-level fields + if "speculators_model_type" not in self.config_dict: + raise ValueError( + "Missing 'speculators_model_type' in config. " + f"Expected one of: {sorted(SUPPORTED_SPECULATORS_TYPES)}. " + "Please ensure you're loading a speculators-format Eagle model." + ) + + model_type = self.config["speculators_model_type"] + if model_type not in SUPPORTED_SPECULATORS_TYPES: + raise ValueError( + f"Unsupported speculators_model_type: '{model_type}'. " + f"Supported types: {sorted(SUPPORTED_SPECULATORS_TYPES)}") + + # Check transformer config + if "transformer_layer_config" not in self.config_dict: + raise ValueError( + "Missing 'transformer_layer_config' in speculators config. " + "This field should contain the transformer architecture " + "configuration.") + + # Check proposal methods + speculators_cfg = self.config.get("speculators_config", {}) + if not isinstance(speculators_cfg, dict): + raise ValueError("'speculators_config' must be a dictionary. " + f"Got: {type(speculators_cfg).__name__}") + + proposal_methods = speculators_cfg.get("proposal_methods", []) + if not proposal_methods: + raise ValueError( + "No proposal methods found in speculators_config. " + "Expected: {'speculators_config': {'proposal_methods': " + "[{'speculative_tokens': N}]}}. " + "Check that your model config follows the speculators format.") + + def convert_speculators_to_vllm( + self, num_lookahead_tokens: int) -> dict[str, Any]: + """ + Convert speculators Eagle config format to vLLM format. + + This method handles the translation of field names and structure + between speculators and vLLM formats. It supports both Eagle-1 + and Eagle-3 variants based on speculators_model_type. 
+
+        Args:
+            speculators_config: Dictionary containing speculators format config
+
+        Returns:
+            Dictionary with vLLM-compatible Eagle configuration
+        """
+        speculators_model_type = self.config_dict["speculators_model_type"]
+        transformer_config = self.config_dict["transformer_layer_config"]
+
+        # Build base vLLM config
+        vllm_config = {
+            "model": transformer_config,
+            "method":
+            speculators_model_type,  # Use speculators_model_type as method
+            "num_lookahead_tokens": num_lookahead_tokens,
+        }
+        return vllm_config, transformer_config
+
+    def ensure_transformer_architectures(
+            self, transformer_config: dict[str, Any]) -> None:
+        """Ensure transformer config has required architecture field."""
+        if "architectures" not in transformer_config:
+            default_arch = "LlamaDecoderLayer"
+            arch = self.config.get("transformer_layer_architecture",
+                                   default_arch)
+            if arch == "LlamaDecoderLayer":
+                transformer_config["architectures"] = ["LlamaForCausalLM"]
+            else:
+                transformer_config["architectures"] = [arch]
+        return transformer_config
+
+    def preserve_additional_fields(self, vllm_config: dict[str, Any]) -> None:
+        """Preserve additional fields for forward compatibility."""
+        handled_fields = {
+            "speculators_model_type",
+            "transformer_layer_config",
+            "speculators_config",
+            "layernorms",
+            "fusion_bias",
+            "architectures",
+            "draft_vocab_size",
+            "target_hidden_size",
+            "norm_before_residual",
+        }
+
+        for key, value in self.config.items():
+            if key not in handled_fields:
+                vllm_config[key] = value
+        return vllm_config
diff --git a/vllm/transformers_utils/configs/speculators/eagle.py b/vllm/transformers_utils/configs/speculators/eagle.py
new file mode 100644
index 000000000000..be5ddba209ed
--- /dev/null
+++ b/vllm/transformers_utils/configs/speculators/eagle.py
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from vllm.transformers_utils.configs.speculators import SpeculatorsConfig
+
+
+class EagleSpeculatorsConfig(SpeculatorsConfig):
+
+    def update_defaults(self, transformer_config: dict[str, Any],
+                        vllm_config: dict[str, Any]) -> None:
+        """
+        Apply Eagle-1 specific configuration transformations. 
+ + Eagle-1 specific fields: + - fusion_bias → eagle_fc_bias + - layernorms → add_para_norm (for HASH variant) + - Uses truncated_vocab_size + """ + # Handle HASH variant with additional layernorms + if self.config.get("layernorms", False): + transformer_config["add_para_norm"] = True + # When using extra layernorms, ensure skip flags are set correctly + # to maintain the expected architecture behavior + transformer_config["skip_prenorm"] = False + transformer_config["skip_output_norm"] = False + + if self.config.get("fusion_bias", False): + # If fusion_bias is set, add it to the transformer config + transformer_config["fusion_bias"] = True + + # Map Eagle-1 specific fields + vocab_size = transformer_config.get("vocab_size") + vllm_config["truncated_vocab_size"] = vocab_size + vllm_config["architectures"] = ["EAGLEModel"] diff --git a/vllm/transformers_utils/configs/speculators/eagle3.py b/vllm/transformers_utils/configs/speculators/eagle3.py new file mode 100644 index 000000000000..62d4e681b6fe --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/eagle3.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +from vllm.transformers_utils.configs.speculators import SpeculatorsConfig + + +class Eagle3SpeculatorsConfig(SpeculatorsConfig): + + def update_defaults(self, transformer_config: dict[str, Any], + vllm_config: dict[str, Any]) -> None: + """ + Apply Eagle-3 specific configuration transformations. + + Eagle-3 specific fields: + - draft_vocab_size: Size of the draft model's vocabulary + - target_hidden_size: Hidden size of the target model + - norm_before_residual: Whether to apply norm before residual connection + """ + # Copy Eagle-3 specific fields + if self.config.get("draft_vocab_size") is not None: + draft_vocab_size = self.config["draft_vocab_size"] + vllm_config["draft_vocab_size"] = draft_vocab_size + + # Handle target_hidden_size + if self.config.get("target_hidden_size") is not None: + target_hidden_size = self.config["target_hidden_size"] + vllm_config["target_hidden_size"] = target_hidden_size + else: + # Default to the draft model's hidden size + # In practice, this should match the target model's hidden size + vllm_config["target_hidden_size"] = transformer_config.get( + "hidden_size") + + if "norm_before_residual" in self.config: + # Add to transformer config which becomes the model config + transformer_config["norm_before_residual"] = self.config[ + "norm_before_residual"] + + # Eagle-3 uses a different architecture + vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"] From 2748f013768b14037daa5f0bbb0c33fc68dfcebd Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 23 Jul 2025 01:02:21 +0000 Subject: [PATCH 02/15] fix imports Signed-off-by: Dipika Sikka --- vllm/transformers_utils/configs/__init__.py | 2 + .../configs/speculators/__init__.py | 2 + .../configs/speculators/base.py | 69 +++++++++++-------- .../configs/speculators/eagle.py | 15 ++-- .../configs/speculators/eagle3.py | 9 ++- 5 files changed, 54 insertions(+), 43 deletions(-) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 5d84d648f1c5..8a9967941345 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -27,6 +27,7 @@ from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig from 
vllm.transformers_utils.configs.solar import SolarConfig +from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.telechat2 import Telechat2Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig @@ -53,6 +54,7 @@ "NVLM_D_Config", "OvisConfig", "SkyworkR1VChatConfig", + "SpeculatorsConfig", "SolarConfig", "Telechat2Config", "UltravoxConfig", diff --git a/vllm/transformers_utils/configs/speculators/__init__.py b/vllm/transformers_utils/configs/speculators/__init__.py index e69de29bb2d1..208f01a7cb5e 100644 --- a/vllm/transformers_utils/configs/speculators/__init__.py +++ b/vllm/transformers_utils/configs/speculators/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py index 2e780ad4b5f5..5b9a23d55330 100644 --- a/vllm/transformers_utils/configs/speculators/base.py +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -5,9 +5,6 @@ from transformers import PretrainedConfig -from vllm.transformers_utils.configs.speculators import ( - Eagle3SpeculatorsConfig, EagleSpeculatorsConfig) - DEFAULT_NUM_LOOKAHEAD_TOKENS = 5 SPECULATORS_WEIGHT_MAP = { "fusion_fc.weight": "fc.weight", @@ -16,10 +13,7 @@ "pre_lm_head_layernorm.weight": "hidden_states_layernorm.weight", } -SUPPORTED_SPECULATORS_TYPES = { - "eagle": EagleSpeculatorsConfig, - "eagle3": Eagle3SpeculatorsConfig -} +__all__ = ["SpeculatorsConfig"] class SpeculatorsConfig(PretrainedConfig): @@ -39,6 +33,17 @@ def from_pretrained( **kwargs) speculators_type = config_dict.get("speculators_model_type") + + from vllm.transformers_utils.configs.speculators.eagle import ( + EagleSpeculatorsConfig) + from vllm.transformers_utils.configs.speculators.eagle3 import ( + Eagle3SpeculatorsConfig) + + SUPPORTED_SPECULATORS_TYPES = { + "eagle": EagleSpeculatorsConfig, + "eagle3": Eagle3SpeculatorsConfig + } + if speculators_type not in SUPPORTED_SPECULATORS_TYPES: return super().from_pretrained(pretrained_model_name_or_path, **kwargs) @@ -46,21 +51,24 @@ def from_pretrained( spec_class = SUPPORTED_SPECULATORS_TYPES.get(speculators_type) spec_class_instance = spec_class(config_dict) - # Validate - spec_class_instance.validate_speculators_config() + # Validate that the config contains the correct fields + # TODO: use speculators to validate in the future + spec_class_instance.validate_speculators_config( + list(SUPPORTED_SPECULATORS_TYPES.keys())) + + # Extract the number of look ahead tokens num_lookahead_tokens = spec_class_instance.extract_num_lookahead_tokens( # noqa: E501 ) - vllm_config, transformer_config = spec_class_instance.convert_speculators_to_vllm( # noqa: E501 - num_lookahead_tokens) - # Process / Update - spec_class_instance.update_defaults(transformer_config, vllm_config) - spec_class_instance.ensure_transformer_architectures( - transformer_config) # Is this needed? 
- vllm_config = spec_class_instance.preserve_additional_fields( - vllm_config) + # Build a vllm_config using the required fields + vllm_config = spec_class_instance.convert_speculators_to_vllm( + num_lookahead_tokens=num_lookahead_tokens) - # Create + # Update method specific defaults + spec_class_instance.update_defaults(vllm_config=vllm_config) + # Ensure all required field are present + spec_class_instance.preserve_additional_fields(vllm_config=vllm_config) + # Create using proper vllm_config return cls(**vllm_config) def extract_num_lookahead_tokens(self) -> int: @@ -91,24 +99,24 @@ def extract_num_lookahead_tokens(self) -> int: return num_lookahead_tokens - def validate_speculators_config(self) -> None: + def validate_speculators_config(self, supported_types) -> None: """Validate required speculators format fields.""" # Check required top-level fields - if "speculators_model_type" not in self.config_dict: + if "speculators_model_type" not in self.config: raise ValueError( "Missing 'speculators_model_type' in config. " - f"Expected one of: {sorted(SUPPORTED_SPECULATORS_TYPES)}. " + f"Expected one of: {supported_types}. " "Please ensure you're loading a speculators-format Eagle model." ) model_type = self.config["speculators_model_type"] - if model_type not in SUPPORTED_SPECULATORS_TYPES: + if model_type not in supported_types: raise ValueError( f"Unsupported speculators_model_type: '{model_type}'. " - f"Supported types: {sorted(SUPPORTED_SPECULATORS_TYPES)}") + f"Supported types: {supported_types}") # Check transformer config - if "transformer_layer_config" not in self.config_dict: + if "transformer_layer_config" not in self.config: raise ValueError( "Missing 'transformer_layer_config' in speculators config. " "This field should contain the transformer architecture " @@ -143,8 +151,8 @@ def convert_speculators_to_vllm( Returns: Dictionary with vLLM-compatible Eagle configuration """ - speculators_model_type = self.config_dict["speculators_model_type"] - transformer_config = self.config_dict["transformer_layer_config"] + speculators_model_type = self.config["speculators_model_type"] + transformer_config = self.config["transformer_layer_config"] # Build base vLLM config vllm_config = { @@ -153,11 +161,13 @@ def convert_speculators_to_vllm( speculators_model_type, # Use speculators_model_type as method "num_lookahead_tokens": num_lookahead_tokens, } - return vllm_config, transformer_config + return vllm_config - def ensure_transformer_architectures( - self, transformer_config: dict[str, Any]) -> None: + # TODO: update / fix for Qwen - this is wrong + def ensure_transformer_architectures(self, vllm_config: dict[str, + Any]) -> None: """Ensure transformer config has required architecture field.""" + transformer_config = vllm_config["model"] if "architectures" not in transformer_config: default_arch = "LlamaDecoderLayer" arch = self.config.get("transformer_layer_architecture", @@ -166,7 +176,6 @@ def ensure_transformer_architectures( transformer_config["architectures"] = ["LlamaForCausalLM"] else: transformer_config["architectures"] = [arch] - return transformer_config def preserve_additional_fields(self, vllm_config: dict[str, Any]) -> None: """Preserve additional fields for forward compatibility.""" diff --git a/vllm/transformers_utils/configs/speculators/eagle.py b/vllm/transformers_utils/configs/speculators/eagle.py index be5ddba209ed..5799ffa6d403 100644 --- a/vllm/transformers_utils/configs/speculators/eagle.py +++ b/vllm/transformers_utils/configs/speculators/eagle.py @@ -2,13 +2,12 
@@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
-from vllm.transformers_utils.configs.speculators import SpeculatorsConfig
+from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
 
 
 class EagleSpeculatorsConfig(SpeculatorsConfig):
 
-    def update_defaults(self, transformer_config: dict[str, Any],
-                        vllm_config: dict[str, Any]) -> None:
+    def update_defaults(self, vllm_config: dict[str, Any]) -> None:
         """
         Apply Eagle-1 specific configuration transformations.
@@ -19,17 +18,17 @@ def update_defaults(self, transformer_config: dict[str, Any],
         """
         # Handle HASH variant with additional layernorms
         if self.config.get("layernorms", False):
-            transformer_config["add_para_norm"] = True
+            vllm_config["model"]["add_para_norm"] = True
             # When using extra layernorms, ensure skip flags are set correctly
             # to maintain the expected architecture behavior
-            transformer_config["skip_prenorm"] = False
-            transformer_config["skip_output_norm"] = False
+            vllm_config["model"]["skip_prenorm"] = False
+            vllm_config["model"]["skip_output_norm"] = False
 
         if self.config.get("fusion_bias", False):
             # If fusion_bias is set, add it to the transformer config
-            transformer_config["fusion_bias"] = True
+            vllm_config["model"]["fusion_bias"] = True
 
         # Map Eagle-1 specific fields
-        vocab_size = transformer_config.get("vocab_size")
+        vocab_size = vllm_config["model"].get("vocab_size")
         vllm_config["truncated_vocab_size"] = vocab_size
         vllm_config["architectures"] = ["EAGLEModel"]
diff --git a/vllm/transformers_utils/configs/speculators/eagle3.py b/vllm/transformers_utils/configs/speculators/eagle3.py
index 62d4e681b6fe..b6c2dc0ef299 100644
--- a/vllm/transformers_utils/configs/speculators/eagle3.py
+++ b/vllm/transformers_utils/configs/speculators/eagle3.py
@@ -2,13 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
-from vllm.transformers_utils.configs.speculators import SpeculatorsConfig
+from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
 
 
 class Eagle3SpeculatorsConfig(SpeculatorsConfig):
 
-    def update_defaults(self, transformer_config: dict[str, Any],
-                        vllm_config: dict[str, Any]) -> None:
+    def update_defaults(self, vllm_config: dict[str, Any]) -> None:
         """
         Apply Eagle-3 specific configuration transformations. 
@@ -29,12 +28,12 @@ def update_defaults(self, transformer_config: dict[str, Any], else: # Default to the draft model's hidden size # In practice, this should match the target model's hidden size - vllm_config["target_hidden_size"] = transformer_config.get( + vllm_config["target_hidden_size"] = vllm_config["model"].get( "hidden_size") if "norm_before_residual" in self.config: # Add to transformer config which becomes the model config - transformer_config["norm_before_residual"] = self.config[ + vllm_config["model"]["norm_before_residual"] = self.config[ "norm_before_residual"] # Eagle-3 uses a different architecture From e2238325a449c07fa0cc2abeb2ca8347201c37e3 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 25 Jul 2025 00:18:08 +0000 Subject: [PATCH 03/15] update --- vllm/engine/arg_utils.py | 13 ++++++- vllm/transformers_utils/config.py | 35 ++++++++++++++++++- .../configs/speculators/base.py | 6 +++- .../configs/speculators/eagle.py | 34 ------------------ 4 files changed, 51 insertions(+), 37 deletions(-) delete mode 100644 vllm/transformers_utils/configs/speculators/eagle.py diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 28b1c1c363a7..df5ca293918d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -39,6 +39,7 @@ from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file +from vllm.transformers_utils.config import maybe_override_with_speculators_configs from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -983,8 +984,18 @@ def create_speculative_config( provided as a JSON string input via CLI arguments or directly as a dictionary from the engine. 
""" + + from vllm.transformers_utils.config import maybe_fetch_verifier_config + if self.speculative_config is None: - return None + # TODO: we need a condition here + hf_config = maybe_fetch_verifier_config(self.hf_config_path or self.model, runner="draft") + # We create one since we dont create one + self.speculative_config = {} + self.speculative_config["num_speculative_tokens"] = hf_config.num_lookahead_tokens + self.speculative_config["model"] = self.model + self.speculative_config["method"] = hf_config.method + # return None # Note(Shangming): These parameters are not obtained from the cli arg # '--speculative-config' and must be passed in when creating the engine diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2e66dc16b47a..7d6a363a89e9 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -40,6 +40,7 @@ NemotronConfig, NVLM_D_Config, OvisConfig, RWConfig, SkyworkR1VChatConfig, SolarConfig, + SpeculatorsConfig, Telechat2Config, UltravoxConfig) # yapf: enable from vllm.transformers_utils.configs.mistral import adapt_config_dict @@ -87,6 +88,7 @@ def _get_hf_token() -> Optional[str]: "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, "eagle": EAGLEConfig, + "speculators": SpeculatorsConfig, "exaone": ExaoneConfig, "exaone4": Exaone4Config, "minimax_text_01": MiniMaxText01Config, @@ -299,6 +301,33 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: return config +def maybe_override_with_speculators_configs(model, tokenizer): + config_dict, _ = PretrainedConfig.get_config_dict( + model, + token=_get_hf_token(), + ) + spec_config = config_dict["speculators_config"] + # Return the target model + if spec_config is not None: + model = tokenizer = spec_config["verifier"]["name_or_path"] + return model, tokenizer + + +def maybe_fetch_verifier_config(config, runner): + if isinstance(config, str): + config = SpeculatorsConfig.from_pretrained( + config, + token=_get_hf_token(), + ) + + if runner != "draft": + config = AutoConfig.from_pretrained( + config.target_model, + token=_get_hf_token(), + ) + return config + + def get_config( model: Union[str, Path], trust_remote_code: bool, @@ -308,6 +337,7 @@ def get_config( hf_overrides_kw: Optional[dict[str, Any]] = None, hf_overrides_fn: Optional[Callable[[PretrainedConfig], PretrainedConfig]] = None, + runner: Optional[str] = None, **kwargs, ) -> PretrainedConfig: # Separate model folder from file path for GGUF models @@ -357,9 +387,9 @@ def get_config( token=_get_hf_token(), **kwargs, ) - # Use custom model class if it's in our registry model_type = config_dict.get("model_type") + model_type = "speculators" if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] config = config_class.from_pretrained( @@ -369,6 +399,9 @@ def get_config( token=_get_hf_token(), **kwargs, ) + if model_type == "speculators": + config = maybe_fetch_verifier_config(config, runner) + return config else: try: config = AutoConfig.from_pretrained( diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py index 5b9a23d55330..a2ebd9da6da7 100644 --- a/vllm/transformers_utils/configs/speculators/base.py +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -17,6 +17,7 @@ class SpeculatorsConfig(PretrainedConfig): + model_type = "speculators" def __init__(self, config=None, **kwargs): super().__init__(**kwargs) @@ -66,6 +67,8 @@ def from_pretrained( # Update method specific 
defaults spec_class_instance.update_defaults(vllm_config=vllm_config) + spec_class_instance.ensure_transformer_architectures( + vllm_config=vllm_config) # Ensure all required field are present spec_class_instance.preserve_additional_fields(vllm_config=vllm_config) # Create using proper vllm_config @@ -153,13 +156,14 @@ def convert_speculators_to_vllm( """ speculators_model_type = self.config["speculators_model_type"] transformer_config = self.config["transformer_layer_config"] - + speculators_cfg = self.config.get("speculators_config", {}) # Build base vLLM config vllm_config = { "model": transformer_config, "method": speculators_model_type, # Use speculators_model_type as method "num_lookahead_tokens": num_lookahead_tokens, + "target_model": speculators_cfg["verifier"]["name_or_path"] } return vllm_config diff --git a/vllm/transformers_utils/configs/speculators/eagle.py b/vllm/transformers_utils/configs/speculators/eagle.py deleted file mode 100644 index 5799ffa6d403..000000000000 --- a/vllm/transformers_utils/configs/speculators/eagle.py +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any - -from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig - - -class EagleSpeculatorsConfig(SpeculatorsConfig): - - def update_defualts(self, vllm_config: dict[str, Any]) -> None: - """ - Apply Eagle-1 specific configuration transformations. - - Eagle-1 specific fields: - - fusion_bias → eagle_fc_bias - - layernorms → add_para_norm (for HASH variant) - - Uses truncated_vocab_size - """ - # Handle HASH variant with additional layernorms - if self.config.get("layernorms", False): - vllm_config["model"]["add_para_norm"] = True - # When using extra layernorms, ensure skip flags are set correctly - # to maintain the expected architecture behavior - vllm_config["model"]["skip_prenorm"] = False - vllm_config["model"]["skip_output_norm"] = False - - if self.config.get("fusion_bias", False): - # If fusion_bias is set, add it to the transformer config - vllm_config["model"]["fusion_bias"] = True - - # Map Eagle-1 specific fields - vocab_size = vllm_config["model"].get("vocab_size") - vllm_config["truncated_vocab_size"] = vocab_size - vllm_config["architectures"] = ["EAGLEModel"] From 2fe7dd2e39b46dea36af8ddfd790f17cc6c5abf7 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 25 Jul 2025 03:48:09 +0000 Subject: [PATCH 04/15] updatE Signed-off-by: Dipika Sikka --- vllm/config.py | 48 ++++++++++++++----- vllm/engine/arg_utils.py | 26 ++++++---- vllm/model_executor/models/llama_eagle3.py | 25 ++++++++-- vllm/transformers_utils/config.py | 21 ++++---- .../configs/speculators/base.py | 15 +----- .../configs/speculators/eagle3.py | 25 ++++++---- vllm/v1/spec_decode/eagle.py | 9 +++- 7 files changed, 110 insertions(+), 59 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 3e6aa2a93e6a..46829decd15f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -38,8 +38,8 @@ ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - try_get_generation_config, try_get_safetensors_metadata, - try_get_tokenizer_config, uses_mrope) + maybe_override_with_speculators_configs, try_get_generation_config, + try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, 
maybe_model_redirect # yapf conflicts with isort for this block @@ -468,6 +468,15 @@ def __post_init__(self) -> None: "affect the random state of the Python process that " "launched vLLM.", self.seed) + draft_model = None + if self.runner != "draft": + # If we're not runnign the draft model, + # assume we're running the target / config model + # override self.model with the target model stub + draft_model = self.model + self.model, self.tokenizer = maybe_override_with_speculators_configs( # noqa: E501 + model=self.model, tokenizer=self.tokenizer) + # Keep set served_model_name before maybe_model_redirect(self.model) self.served_model_name = get_served_model_name(self.model, self.served_model_name) @@ -532,15 +541,25 @@ def __post_init__(self) -> None: if isinstance(self.config_format, str): self.config_format = ConfigFormat(self.config_format) - hf_config = get_config(self.hf_config_path or self.model, - self.trust_remote_code, - self.revision, - self.code_revision, - self.config_format, - hf_overrides_kw=hf_overrides_kw, - hf_overrides_fn=hf_overrides_fn) - self.hf_config = hf_config + # Check if we're running the draft model or target model + if draft_model is not None: + hf_config = get_config(draft_model, + self.revision, + self.code_revision, + self.config_format, + hf_overrides_kw=hf_overrides_kw, + hf_overrides_fn=hf_overrides_fn, + runner=self.runner) + else: + hf_config = get_config(self.hf_config_path or self.model, + self.revision, + self.code_revision, + self.config_format, + hf_overrides_kw=hf_overrides_kw, + hf_overrides_fn=hf_overrides_fn, + runner=self.runner) + self.hf_config = hf_config self.hf_text_config = get_hf_text_config(self.hf_config) self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None) @@ -2835,8 +2854,13 @@ def __post_init__(self): from vllm.transformers_utils.configs.eagle import ( EAGLEConfig) - if isinstance(self.draft_model_config.hf_config, - EAGLEConfig): + from vllm.transformers_utils.configs.speculators.base import ( # noqa: E501 + SpeculatorsConfig) + + # TODO: use isinstance + if type(self.draft_model_config.hf_config) in [ + EAGLEConfig, SpeculatorsConfig + ]: pass else: eagle_config = EAGLEConfig( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index df5ca293918d..09ee05ada61b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -39,7 +39,6 @@ from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file -from vllm.transformers_utils.config import maybe_override_with_speculators_configs from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -986,16 +985,25 @@ def create_speculative_config( """ from vllm.transformers_utils.config import maybe_fetch_verifier_config + from vllm.transformers_utils.configs.speculators.base import ( + SpeculatorsConfig) if self.speculative_config is None: - # TODO: we need a condition here - hf_config = maybe_fetch_verifier_config(self.hf_config_path or self.model, runner="draft") - # We create one since we dont create one - self.speculative_config = {} - self.speculative_config["num_speculative_tokens"] = hf_config.num_lookahead_tokens - self.speculative_config["model"] = self.model - self.speculative_config["method"] = hf_config.method - # return None + hf_config = maybe_fetch_verifier_config(self.hf_config_path + or self.model, + runner="draft") + + # if loading a 
SpeculatorsConfig, load the specualtive_config + # details from the config directly - no user input required + if isinstance(hf_config, SpeculatorsConfig): + # We create one since we dont create one + self.speculative_config = {} + self.speculative_config[ + "num_speculative_tokens"] = hf_config.num_lookahead_tokens + self.speculative_config["model"] = self.model + self.speculative_config["method"] = hf_config.method + else: + return None # Note(Shangming): These parameters are not obtained from the cli arg # '--speculative-config' and must be passed in when creating the engine diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 7fc9fe2ebb6f..edea780cb744 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -51,6 +51,21 @@ def __init__( self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if getattr(config, "norm_before_residual", False): + self._residual_norm = self._norm_before_residual + else: + self._residual_norm = self._norm_after_residual + + def _norm_before_residual(self, hidden_states: torch.Tensor): + hidden_states = self.hidden_norm(hidden_states) + residual = hidden_states + return hidden_states, residual + + def _norm_after_residual(self, hidden_states: torch.Tensor): + residual = hidden_states + hidden_states = self.hidden_norm(hidden_states) + return hidden_states, residual + def forward( self, positions: torch.Tensor, @@ -59,9 +74,10 @@ def forward( residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: - residual = hidden_states embeds = self.input_layernorm(embeds) - hidden_states = self.hidden_norm(hidden_states) + + hidden_states, residual = self._residual_norm( + hidden_states=hidden_states) hidden_states = torch.cat([embeds, hidden_states], dim=-1) # Self Attention @@ -102,7 +118,8 @@ def __init__( self.layers = nn.ModuleList([ LlamaDecoderLayer( - self.config, + # ToDo: condition + config=LlamaConfig.from_dict(self.config.model), prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"), ) ]) @@ -116,7 +133,7 @@ def __init__( bias=False) self.norm = RMSNorm( self.config.hidden_size, - eps=self.config.rms_norm_eps, + eps=self.config.model.get("rms_norm_eps"), ) def forward( diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 7d6a363a89e9..6d941fccea15 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -314,13 +314,13 @@ def maybe_override_with_speculators_configs(model, tokenizer): def maybe_fetch_verifier_config(config, runner): - if isinstance(config, str): - config = SpeculatorsConfig.from_pretrained( - config, - token=_get_hf_token(), - ) - - if runner != "draft": + if runner == "draft": + if isinstance(config, str): + config = SpeculatorsConfig.from_pretrained( + config, + token=_get_hf_token(), + ) + else: config = AutoConfig.from_pretrained( config.target_model, token=_get_hf_token(), @@ -389,7 +389,11 @@ def get_config( ) # Use custom model class if it's in our registry model_type = config_dict.get("model_type") - model_type = "speculators" + # TODO: why isn't model_type loading correctly + if model_type is None: + model_type = "speculators" if config_dict.get( + "speculators_config") is not None else model_type + if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] config = config_class.from_pretrained( @@ -401,7 +405,6 @@ def get_config( ) if model_type == "speculators": config = 
maybe_fetch_verifier_config(config, runner) - return config else: try: config = AutoConfig.from_pretrained( diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py index a2ebd9da6da7..ca0b9da0eeb0 100644 --- a/vllm/transformers_utils/configs/speculators/base.py +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -5,14 +5,6 @@ from transformers import PretrainedConfig -DEFAULT_NUM_LOOKAHEAD_TOKENS = 5 -SPECULATORS_WEIGHT_MAP = { - "fusion_fc.weight": "fc.weight", - "fusion_fc.bias": "fc.bias", - "embedding_layernorm.weight": "embedding_layernorm.weight", - "pre_lm_head_layernorm.weight": "hidden_states_layernorm.weight", -} - __all__ = ["SpeculatorsConfig"] @@ -35,15 +27,10 @@ def from_pretrained( speculators_type = config_dict.get("speculators_model_type") - from vllm.transformers_utils.configs.speculators.eagle import ( - EagleSpeculatorsConfig) from vllm.transformers_utils.configs.speculators.eagle3 import ( Eagle3SpeculatorsConfig) - SUPPORTED_SPECULATORS_TYPES = { - "eagle": EagleSpeculatorsConfig, - "eagle3": Eagle3SpeculatorsConfig - } + SUPPORTED_SPECULATORS_TYPES = {"eagle3": Eagle3SpeculatorsConfig} if speculators_type not in SUPPORTED_SPECULATORS_TYPES: return super().from_pretrained(pretrained_model_name_or_path, diff --git a/vllm/transformers_utils/configs/speculators/eagle3.py b/vllm/transformers_utils/configs/speculators/eagle3.py index b6c2dc0ef299..ec27c62dc11b 100644 --- a/vllm/transformers_utils/configs/speculators/eagle3.py +++ b/vllm/transformers_utils/configs/speculators/eagle3.py @@ -16,25 +16,30 @@ def update_defaults(self, vllm_config: dict[str, Any]) -> None: - target_hidden_size: Hidden size of the target model - norm_before_residual: Whether to apply norm before residual connection """ + # The way we store hidden size and vocab size is confusing in out config + # we store taarget_hidden_size and hidden_size + # Copy Eagle-3 specific fields - if self.config.get("draft_vocab_size") is not None: - draft_vocab_size = self.config["draft_vocab_size"] + draft_vocab_size = self.config.get("draft_vocab_size", None) + if draft_vocab_size is not None: vllm_config["draft_vocab_size"] = draft_vocab_size - # Handle target_hidden_size + # Target vocab size + vllm_config["vocab_size"] = vllm_config["model"]["vocab_size"] + + # Handle target_hidden_size - if different than the draft hidden size if self.config.get("target_hidden_size") is not None: - target_hidden_size = self.config["target_hidden_size"] - vllm_config["target_hidden_size"] = target_hidden_size + vllm_config["target_hidden_size"] = self.config[ + "target_hidden_size"] else: # Default to the draft model's hidden size # In practice, this should match the target model's hidden size - vllm_config["target_hidden_size"] = vllm_config["model"].get( + vllm_config["hidden_size"] = vllm_config["model"].get( "hidden_size") - if "norm_before_residual" in self.config: - # Add to transformer config which becomes the model config - vllm_config["model"]["norm_before_residual"] = self.config[ - "norm_before_residual"] + # Norm before residual + vllm_config["model"]["norm_before_residual"] = self.config.get( + "norm_before_residual", True) # Eagle-3 uses a different architecture vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"] diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 967847c02ff2..59f6e68431cc 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -49,7 +49,14 @@ def __init__( # We need to 
get the hidden size from the draft model config because # the draft model's hidden size can be different from the target model's # hidden size (e.g., Llama 3.3 70B). - self.hidden_size = self.draft_model_config.get_hidden_size() + from vllm.transformers_utils.configs.speculators.base import ( + SpeculatorsConfig) + + if isinstance(self.draft_model_config.hf_config, SpeculatorsConfig): + self.hidden_size = self.draft_model_config.hf_config.model.get( + "hidden_size") + else: + self.hidden_size = self.draft_model_config.get_hidden_size() self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and From 7c718cac1da75ea061f7821053cc30faa822fcf0 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 25 Jul 2025 17:23:01 +0000 Subject: [PATCH 05/15] make it a lot simpler Signed-off-by: Dipika Sikka --- vllm/model_executor/models/llama_eagle3.py | 5 ++--- .../configs/speculators/base.py | 18 +----------------- .../configs/speculators/eagle3.py | 10 +--------- vllm/v1/spec_decode/eagle.py | 9 +-------- 4 files changed, 5 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index edea780cb744..05105271811d 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -118,8 +118,7 @@ def __init__( self.layers = nn.ModuleList([ LlamaDecoderLayer( - # ToDo: condition - config=LlamaConfig.from_dict(self.config.model), + config=self.config, prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"), ) ]) @@ -133,7 +132,7 @@ def __init__( bias=False) self.norm = RMSNorm( self.config.hidden_size, - eps=self.config.model.get("rms_norm_eps"), + eps=self.config.rms_norm_eps, ) def forward( diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py index ca0b9da0eeb0..162e40999264 100644 --- a/vllm/transformers_utils/configs/speculators/base.py +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -54,8 +54,6 @@ def from_pretrained( # Update method specific defaults spec_class_instance.update_defaults(vllm_config=vllm_config) - spec_class_instance.ensure_transformer_architectures( - vllm_config=vllm_config) # Ensure all required field are present spec_class_instance.preserve_additional_fields(vllm_config=vllm_config) # Create using proper vllm_config @@ -146,28 +144,14 @@ def convert_speculators_to_vllm( speculators_cfg = self.config.get("speculators_config", {}) # Build base vLLM config vllm_config = { - "model": transformer_config, "method": speculators_model_type, # Use speculators_model_type as method "num_lookahead_tokens": num_lookahead_tokens, "target_model": speculators_cfg["verifier"]["name_or_path"] } + vllm_config.update(transformer_config) return vllm_config - # TODO: update / fix for Qwen - this is wrong - def ensure_transformer_architectures(self, vllm_config: dict[str, - Any]) -> None: - """Ensure transformer config has required architecture field.""" - transformer_config = vllm_config["model"] - if "architectures" not in transformer_config: - default_arch = "LlamaDecoderLayer" - arch = self.config.get("transformer_layer_architecture", - default_arch) - if arch == "LlamaDecoderLayer": - transformer_config["architectures"] = ["LlamaForCausalLM"] - else: - transformer_config["architectures"] = [arch] - def preserve_additional_fields(self, vllm_config: dict[str, Any]) -> None: """Preserve additional fields for forward compatibility.""" handled_fields = { diff --git 
a/vllm/transformers_utils/configs/speculators/eagle3.py b/vllm/transformers_utils/configs/speculators/eagle3.py index ec27c62dc11b..5227157e927f 100644 --- a/vllm/transformers_utils/configs/speculators/eagle3.py +++ b/vllm/transformers_utils/configs/speculators/eagle3.py @@ -24,21 +24,13 @@ def update_defaults(self, vllm_config: dict[str, Any]) -> None: if draft_vocab_size is not None: vllm_config["draft_vocab_size"] = draft_vocab_size - # Target vocab size - vllm_config["vocab_size"] = vllm_config["model"]["vocab_size"] - # Handle target_hidden_size - if different than the draft hidden size if self.config.get("target_hidden_size") is not None: vllm_config["target_hidden_size"] = self.config[ "target_hidden_size"] - else: - # Default to the draft model's hidden size - # In practice, this should match the target model's hidden size - vllm_config["hidden_size"] = vllm_config["model"].get( - "hidden_size") # Norm before residual - vllm_config["model"]["norm_before_residual"] = self.config.get( + vllm_config["norm_before_residual"] = self.config.get( "norm_before_residual", True) # Eagle-3 uses a different architecture diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 59f6e68431cc..967847c02ff2 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -49,14 +49,7 @@ def __init__( # We need to get the hidden size from the draft model config because # the draft model's hidden size can be different from the target model's # hidden size (e.g., Llama 3.3 70B). - from vllm.transformers_utils.configs.speculators.base import ( - SpeculatorsConfig) - - if isinstance(self.draft_model_config.hf_config, SpeculatorsConfig): - self.hidden_size = self.draft_model_config.hf_config.model.get( - "hidden_size") - else: - self.hidden_size = self.draft_model_config.get_hidden_size() + self.hidden_size = self.draft_model_config.get_hidden_size() self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and From d1e490c18b17970d12db64c16a6fe7884dfb09db Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 25 Jul 2025 21:55:36 +0000 Subject: [PATCH 06/15] clean-up + simplify spec config Signed-off-by: Dipika Sikka --- vllm/config.py | 4 +- .../configs/speculators/algos.py | 32 ++++ .../configs/speculators/base.py | 175 +++++------------- .../configs/speculators/eagle3.py | 37 ---- 4 files changed, 80 insertions(+), 168 deletions(-) create mode 100644 vllm/transformers_utils/configs/speculators/algos.py delete mode 100644 vllm/transformers_utils/configs/speculators/eagle3.py diff --git a/vllm/config.py b/vllm/config.py index 46829decd15f..9c8eb2942d4f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2852,10 +2852,10 @@ def __post_init__(self): "Chunked prefill and EAGLE are not compatible " "when using V0.") + from vllm.transformers_utils.configs import ( + SpeculatorsConfig) from vllm.transformers_utils.configs.eagle import ( EAGLEConfig) - from vllm.transformers_utils.configs.speculators.base import ( # noqa: E501 - SpeculatorsConfig) # TODO: use isinstance if type(self.draft_model_config.hf_config) in [ diff --git a/vllm/transformers_utils/configs/speculators/algos.py b/vllm/transformers_utils/configs/speculators/algos.py new file mode 100644 index 000000000000..efc87b6bcf26 --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/algos.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +SUPPORTED_SPECULATORS_TYPES = {} + + +def 
register_speculator(name): + + def decorator(fn): + SUPPORTED_SPECULATORS_TYPES[name] = fn + return fn + + return decorator + + +@register_speculator("eagle3") +def update_eagle3(config_dict: dict, vllm_config: dict) -> None: + """ + Apply Eagle-3 specific configuration transformations. + + Eagle-3 specific fields: + - draft_vocab_size: Size of the draft model's vocabulary + - target_hidden_size: Hidden size of the target model + - norm_before_residual: Whether to apply norm before residual connection + """ + + vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size") + if config_dict.get("target_hidden_size") is not None: + vllm_config["target_hidden_size"] = config_dict["target_hidden_size"] + vllm_config["norm_before_residual"] = config_dict.get( + "norm_before_residual", True) + vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"] diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py index 162e40999264..1fb22b131205 100644 --- a/vllm/transformers_utils/configs/speculators/base.py +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -5,16 +5,15 @@ from transformers import PretrainedConfig +from vllm.transformers_utils.configs.speculators.algos import ( + SUPPORTED_SPECULATORS_TYPES) + __all__ = ["SpeculatorsConfig"] class SpeculatorsConfig(PretrainedConfig): model_type = "speculators" - def __init__(self, config=None, **kwargs): - super().__init__(**kwargs) - self.config = config or {} - @classmethod def from_pretrained( cls, @@ -25,59 +24,54 @@ def from_pretrained( config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - speculators_type = config_dict.get("speculators_model_type") - - from vllm.transformers_utils.configs.speculators.eagle3 import ( - Eagle3SpeculatorsConfig) - - SUPPORTED_SPECULATORS_TYPES = {"eagle3": Eagle3SpeculatorsConfig} - - if speculators_type not in SUPPORTED_SPECULATORS_TYPES: - return super().from_pretrained(pretrained_model_name_or_path, - **kwargs) - - spec_class = SUPPORTED_SPECULATORS_TYPES.get(speculators_type) - spec_class_instance = spec_class(config_dict) - - # Validate that the config contains the correct fields - # TODO: use speculators to validate in the future - spec_class_instance.validate_speculators_config( - list(SUPPORTED_SPECULATORS_TYPES.keys())) - - # Extract the number of look ahead tokens - num_lookahead_tokens = spec_class_instance.extract_num_lookahead_tokens( # noqa: E501 - ) - - # Build a vllm_config using the required fields - vllm_config = spec_class_instance.convert_speculators_to_vllm( - num_lookahead_tokens=num_lookahead_tokens) - - # Update method specific defaults - spec_class_instance.update_defaults(vllm_config=vllm_config) - # Ensure all required field are present - spec_class_instance.preserve_additional_fields(vllm_config=vllm_config) - # Create using proper vllm_config + speculators_model_type = config_dict.get("speculators_model_type") + if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES: + raise ValueError( + f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. 
" + "Please ensure you're loading a speculators-format model.") + + # validate fields + # TODO: @dsikka - use speculators pydantic model to validate + cls.validate_speculators_config(config_dict=config_dict) + # Convert from speculators config -> format that can be ingested by vLLM + vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict) + # Apply anything specific to the supported algorithm + algo_updater = SUPPORTED_SPECULATORS_TYPES.get(speculators_model_type) + algo_updater(config_dict=config_dict, vllm_config=vllm_config) return cls(**vllm_config) - def extract_num_lookahead_tokens(self) -> int: + @classmethod + def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None: + try: + spec_config = config_dict["speculators_config"] + methods = spec_config["proposal_methods"] + first_method = methods[0] + _ = first_method["speculative_tokens"] + _ = spec_config["verifier"]["name_or_path"] + _ = config_dict["speculators_model_type"] + except (KeyError, IndexError, TypeError) as e: + raise ValueError("Invalid speculators config structure") from e + + if "transformer_layer_config" in config_dict and not isinstance( + config_dict["transformer_layer_config"], dict): + raise TypeError( + "'transformer_layer_config' must be a dictionary if provided") + + @classmethod + def convert_speculators_to_vllm( + cls, config_dict: dict[str, Any]) -> dict[str, Any]: """ - Extract number of lookahead tokens from proposal methods. + Convert speculators config format to vLLM format. - Args: - config: Speculators config dictionary + This method handles the translation of field names and structure + between speculators and vLLM formats. Returns: - Number of speculative tokens - - Note: - Currently only supports the first proposal method. - Future versions may support multiple proposal methods. + Dictionary with vLLM-compatible configuration """ - speculators_cfg = self.config["speculators_config"] - proposal_methods = speculators_cfg["proposal_methods"] - # Currently we only support one proposal method - first_method = proposal_methods[0] + spec_config = config_dict["speculators_config"] + first_method = spec_config.get("proposal_methods")[0] num_lookahead_tokens = first_method.get("speculative_tokens") if num_lookahead_tokens is None: @@ -85,88 +79,11 @@ def extract_num_lookahead_tokens(self) -> int: "Missing 'speculative_tokens' in proposal method. " f"Got: {first_method}") - return num_lookahead_tokens - - def validate_speculators_config(self, supported_types) -> None: - """Validate required speculators format fields.""" - # Check required top-level fields - if "speculators_model_type" not in self.config: - raise ValueError( - "Missing 'speculators_model_type' in config. " - f"Expected one of: {supported_types}. " - "Please ensure you're loading a speculators-format Eagle model." - ) - - model_type = self.config["speculators_model_type"] - if model_type not in supported_types: - raise ValueError( - f"Unsupported speculators_model_type: '{model_type}'. " - f"Supported types: {supported_types}") - - # Check transformer config - if "transformer_layer_config" not in self.config: - raise ValueError( - "Missing 'transformer_layer_config' in speculators config. " - "This field should contain the transformer architecture " - "configuration.") - - # Check proposal methods - speculators_cfg = self.config.get("speculators_config", {}) - if not isinstance(speculators_cfg, dict): - raise ValueError("'speculators_config' must be a dictionary. 
" - f"Got: {type(speculators_cfg).__name__}") - - proposal_methods = speculators_cfg.get("proposal_methods", []) - if not proposal_methods: - raise ValueError( - "No proposal methods found in speculators_config. " - "Expected: {'speculators_config': {'proposal_methods': " - "[{'speculative_tokens': N}]}}. " - "Check that your model config follows the speculators format.") - - def convert_speculators_to_vllm( - self, num_lookahead_tokens: int) -> dict[str, Any]: - """ - Convert speculators Eagle config format to vLLM format. - - This method handles the translation of field names and structure - between speculators and vLLM formats. It supports both Eagle-1 - and Eagle-3 variants based on speculators_model_type. - - Args: - speculators_config: Dictionary containing speculators format config - - Returns: - Dictionary with vLLM-compatible Eagle configuration - """ - speculators_model_type = self.config["speculators_model_type"] - transformer_config = self.config["transformer_layer_config"] - speculators_cfg = self.config.get("speculators_config", {}) # Build base vLLM config vllm_config = { - "method": - speculators_model_type, # Use speculators_model_type as method + "method": config_dict.get("speculators_model_type"), "num_lookahead_tokens": num_lookahead_tokens, - "target_model": speculators_cfg["verifier"]["name_or_path"] + "target_model": spec_config.get("verifier")["name_or_path"] } - vllm_config.update(transformer_config) - return vllm_config - - def preserve_additional_fields(self, vllm_config: dict[str, Any]) -> None: - """Preserve additional fields for forward compatibility.""" - handled_fields = { - "speculators_model_type", - "transformer_layer_config", - "speculators_config", - "layernorms", - "fusion_bias", - "architectures", - "draft_vocab_size", - "target_hidden_size", - "norm_before_residual", - } - - for key, value in self.config.items(): - if key not in handled_fields: - vllm_config[key] = value + vllm_config.update(config_dict.get("transformer_layer_config")) return vllm_config diff --git a/vllm/transformers_utils/configs/speculators/eagle3.py b/vllm/transformers_utils/configs/speculators/eagle3.py deleted file mode 100644 index 5227157e927f..000000000000 --- a/vllm/transformers_utils/configs/speculators/eagle3.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any - -from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig - - -class Eagle3SpeculatorsConfig(SpeculatorsConfig): - - def update_defaults(self, vllm_config: dict[str, Any]) -> None: - """ - Apply Eagle-3 specific configuration transformations. 
- - Eagle-3 specific fields: - - draft_vocab_size: Size of the draft model's vocabulary - - target_hidden_size: Hidden size of the target model - - norm_before_residual: Whether to apply norm before residual connection - """ - # The way we store hidden size and vocab size is confusing in out config - # we store taarget_hidden_size and hidden_size - - # Copy Eagle-3 specific fields - draft_vocab_size = self.config.get("draft_vocab_size", None) - if draft_vocab_size is not None: - vllm_config["draft_vocab_size"] = draft_vocab_size - - # Handle target_hidden_size - if different than the draft hidden size - if self.config.get("target_hidden_size") is not None: - vllm_config["target_hidden_size"] = self.config[ - "target_hidden_size"] - - # Norm before residual - vllm_config["norm_before_residual"] = self.config.get( - "norm_before_residual", True) - - # Eagle-3 uses a different architecture - vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"] From d46cc42e79f6c79836e7853edc9a718522a84055 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 25 Jul 2025 23:03:54 +0000 Subject: [PATCH 07/15] update Signed-off-by: Dipika Sikka --- vllm/config.py | 35 ++++++++++--------------------- vllm/engine/arg_utils.py | 11 +++++----- vllm/transformers_utils/config.py | 25 +++++----------------- 3 files changed, 22 insertions(+), 49 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 9c8eb2942d4f..2b79d7e6b7de 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -38,7 +38,7 @@ ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - maybe_override_with_speculators_configs, try_get_generation_config, + maybe_override_with_speculators_target_model, try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect @@ -468,13 +468,10 @@ def __post_init__(self) -> None: "affect the random state of the Python process that " "launched vLLM.", self.seed) - draft_model = None if self.runner != "draft": - # If we're not runnign the draft model, - # assume we're running the target / config model - # override self.model with the target model stub - draft_model = self.model - self.model, self.tokenizer = maybe_override_with_speculators_configs( # noqa: E501 + # If we're not running the draft model, check for speculators config + # If speculators config, set model / tokenizer to be target model + self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 model=self.model, tokenizer=self.tokenizer) # Keep set served_model_name before maybe_model_redirect(self.model) @@ -541,24 +538,14 @@ def __post_init__(self) -> None: if isinstance(self.config_format, str): self.config_format = ConfigFormat(self.config_format) - # Check if we're running the draft model or target model + hf_config = get_config(self.hf_config_path or self.model, + self.trust_remote_code, + self.revision, + self.code_revision, + self.config_format, + hf_overrides_kw=hf_overrides_kw, + hf_overrides_fn=hf_overrides_fn) - if draft_model is not None: - hf_config = get_config(draft_model, - self.revision, - self.code_revision, - self.config_format, - hf_overrides_kw=hf_overrides_kw, - hf_overrides_fn=hf_overrides_fn, - runner=self.runner) - else: - hf_config = get_config(self.hf_config_path or self.model, - self.revision, - self.code_revision, - 
From d46cc42e79f6c79836e7853edc9a718522a84055 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 25 Jul 2025 23:03:54 +0000
Subject: [PATCH 07/15] update

Signed-off-by: Dipika Sikka

---
 vllm/config.py                    | 35 ++++++++++---------------------
 vllm/engine/arg_utils.py          | 11 +++++-----
 vllm/transformers_utils/config.py | 25 +++++-----------------
 3 files changed, 22 insertions(+), 49 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 9c8eb2942d4f..2b79d7e6b7de 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -38,7 +38,7 @@
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder,
-    maybe_override_with_speculators_configs, try_get_generation_config,
+    maybe_override_with_speculators_target_model, try_get_generation_config,
     try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
@@ -468,13 +468,10 @@ def __post_init__(self) -> None:
                 "affect the random state of the Python process that "
                 "launched vLLM.", self.seed)

-        draft_model = None
         if self.runner != "draft":
-            # If we're not running the draft model,
-            # assume we're running the target / config model
-            # override self.model with the target model stub
-            draft_model = self.model
-            self.model, self.tokenizer = maybe_override_with_speculators_configs(  # noqa: E501
+            # If we're not running the draft model, check for speculators config
+            # If speculators config, set model / tokenizer to be target model
+            self.model, self.tokenizer = maybe_override_with_speculators_target_model(  # noqa: E501
                 model=self.model, tokenizer=self.tokenizer)

         # Keep set served_model_name before maybe_model_redirect(self.model)
         self.served_model_name = get_served_model_name(self.model,
@@ -541,24 +538,14 @@ def __post_init__(self) -> None:
         if isinstance(self.config_format, str):
             self.config_format = ConfigFormat(self.config_format)

-        # Check if we're running the draft model or target model
+        hf_config = get_config(self.hf_config_path or self.model,
+                               self.trust_remote_code,
+                               self.revision,
+                               self.code_revision,
+                               self.config_format,
+                               hf_overrides_kw=hf_overrides_kw,
+                               hf_overrides_fn=hf_overrides_fn)

-        if draft_model is not None:
-            hf_config = get_config(draft_model,
-                                   self.revision,
-                                   self.code_revision,
-                                   self.config_format,
-                                   hf_overrides_kw=hf_overrides_kw,
-                                   hf_overrides_fn=hf_overrides_fn,
-                                   runner=self.runner)
-        else:
-            hf_config = get_config(self.hf_config_path or self.model,
-                                   self.revision,
-                                   self.code_revision,
-                                   self.config_format,
-                                   hf_overrides_kw=hf_overrides_kw,
-                                   hf_overrides_fn=hf_overrides_fn,
-                                   runner=self.runner)
         self.hf_config = hf_config
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.attention_chunk_size = getattr(self.hf_text_config,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 09ee05ada61b..401b3d74043a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -984,17 +984,18 @@ def create_speculative_config(
             dictionary from the engine.
         """

-        from vllm.transformers_utils.config import maybe_fetch_verifier_config
+        from vllm.transformers_utils.config import get_config
         from vllm.transformers_utils.configs.speculators.base import (
             SpeculatorsConfig)

         if self.speculative_config is None:
-            hf_config = maybe_fetch_verifier_config(self.hf_config_path
-                                                    or self.model,
-                                                    runner="draft")
+            hf_config = get_config(self.hf_config_path or self.model,
+                                   self.trust_remote_code, self.revision,
+                                   self.code_revision, self.config_format)

             # if loading a SpeculatorsConfig, load the speculative_config
-            # details from the config directly - no user input required
+            # details from the config directly
+            # no user input required / expected
             if isinstance(hf_config, SpeculatorsConfig):
                 # Create an empty dict since the user did not provide one
                 self.speculative_config = {}
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 6d941fccea15..99b035137b81 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -301,33 +301,21 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig:
     return config


-def maybe_override_with_speculators_configs(model, tokenizer):
+def maybe_override_with_speculators_target_model(model: str, tokenizer: str):
+    """
+    If running a speculators config, override running model with target model
+    """
     config_dict, _ = PretrainedConfig.get_config_dict(
         model,
         token=_get_hf_token(),
     )
-    spec_config = config_dict["speculators_config"]
+    spec_config = config_dict.get("speculators_config")
     # Return the target model
     if spec_config is not None:
         model = tokenizer = spec_config["verifier"]["name_or_path"]
     return model, tokenizer


-def maybe_fetch_verifier_config(config, runner):
-    if runner == "draft":
-        if isinstance(config, str):
-            config = SpeculatorsConfig.from_pretrained(
-                config,
-                token=_get_hf_token(),
-            )
-        else:
-            config = AutoConfig.from_pretrained(
-                config.target_model,
-                token=_get_hf_token(),
-            )
-    return config
-
-
 def get_config(
     model: Union[str, Path],
     trust_remote_code: bool,
@@ -337,7 +325,6 @@ def get_config(
     hf_overrides_kw: Optional[dict[str, Any]] = None,
     hf_overrides_fn: Optional[Callable[[PretrainedConfig],
                                        PretrainedConfig]] = None,
-    runner: Optional[str] = None,
     **kwargs,
 ) -> PretrainedConfig:
     # Separate model folder from file path for GGUF models
@@ -403,8 +390,6 @@ def get_config(
             token=_get_hf_token(),
             **kwargs,
         )
-        if model_type == "speculators":
-            config = maybe_fetch_verifier_config(config, runner)
     else:
         try:
             config = AutoConfig.from_pretrained(
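The helper introduced here keeps the user-facing flow to a single model argument: the speculators checkpoint is passed as the model, and vLLM swaps in the verifier underneath. A rough usage sketch, assuming a hypothetical speculators-format checkpoint path:

from vllm.transformers_utils.config import (
    maybe_override_with_speculators_target_model)

# Hypothetical speculators-format checkpoint; its config.json carries
# speculators_config.verifier.name_or_path pointing at the target model.
model, tokenizer = maybe_override_with_speculators_target_model(
    model="nm-testing/some-speculators-eagle3-model",
    tokenizer="nm-testing/some-speculators-eagle3-model")
# For a speculators checkpoint both now name the verifier model;
# for any other checkpoint they are returned unchanged.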
From f30541989994480827653f4af8f3ab5a57b5d21a Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 25 Jul 2025 23:52:57 +0000
Subject: [PATCH 08/15] use instance

Signed-off-by: Dipika Sikka

---
 vllm/config.py                                      | 6 ++----
 vllm/transformers_utils/config.py                   | 1 -
 vllm/transformers_utils/configs/speculators/base.py | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 2b79d7e6b7de..3213450bfe4e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2844,10 +2844,8 @@ def __post_init__(self):
                 from vllm.transformers_utils.configs.eagle import (
                     EAGLEConfig)

-                # TODO: use isinstance
-                if type(self.draft_model_config.hf_config) in [
-                        EAGLEConfig, SpeculatorsConfig
-                ]:
+                if isinstance(self.draft_model_config.hf_config,
+                              (EAGLEConfig, SpeculatorsConfig)):
                     pass
                 else:
                     eagle_config = EAGLEConfig(
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 99b035137b81..92e14d6dc5be 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -376,7 +376,6 @@ def get_config(
     )
     # Use custom model class if it's in our registry
     model_type = config_dict.get("model_type")
-    # TODO: why isn't model_type loading correctly
     if model_type is None:
         model_type = "speculators" if config_dict.get(
             "speculators_config") is not None else model_type
diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py
index 1fb22b131205..cbf35a01975a 100644
--- a/vllm/transformers_utils/configs/speculators/base.py
+++ b/vllm/transformers_utils/configs/speculators/base.py
@@ -36,7 +36,7 @@ def from_pretrained(
         # Convert from speculators config -> format that can be ingested by vLLM
         vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict)
         # Apply anything specific to the supported algorithm
-        algo_updater = SUPPORTED_SPECULATORS_TYPES.get(speculators_model_type)
+        algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
         algo_updater(config_dict=config_dict, vllm_config=vllm_config)
         return cls(**vllm_config)

From bbc0a56638d4f5d6c1508a9d553a2583c9c3d549 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Sat, 26 Jul 2025 00:10:44 +0000
Subject: [PATCH 09/15] update

Signed-off-by: Dipika Sikka

---
 vllm/config.py                    | 5 ++++-
 vllm/transformers_utils/config.py | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 3213450bfe4e..462f3fe53a67 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -472,7 +472,10 @@ def __post_init__(self) -> None:
             # If we're not running the draft model, check for speculators config
             # If speculators config, set model / tokenizer to be target model
             self.model, self.tokenizer = maybe_override_with_speculators_target_model(  # noqa: E501
-                model=self.model, tokenizer=self.tokenizer)
+                model=self.model,
+                tokenizer=self.tokenizer,
+                revision=self.revision,
+                trust_remote_code=self.trust_remote_code)

         # Keep set served_model_name before maybe_model_redirect(self.model)
         self.served_model_name = get_served_model_name(self.model,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 92e14d6dc5be..6549983f2471 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -301,12 +301,16 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig:
     return config


-def maybe_override_with_speculators_target_model(model: str, tokenizer: str):
+def maybe_override_with_speculators_target_model(model: str, tokenizer: str,
+                                                 revision: str,
+                                                 trust_remote_code: bool):
     """
     If running a speculators config, override running model with target model
     """
     config_dict, _ = PretrainedConfig.get_config_dict(
         model,
+        revision=revision,
+        trust_remote_code=trust_remote_code,
         token=_get_hf_token(),
     )
     spec_config = config_dict.get("speculators_config")
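The model_type fallback touched in PATCH 08 above reduces to a few lines. Roughly: a checkpoint that omits model_type but carries a speculators_config section is routed to SpeculatorsConfig (a sketch, not the verbatim source; config_dict stands in for the parsed config.json of a hypothetical checkpoint):

# Sketch of the routing in get_config after PATCH 08.
config_dict = {"speculators_config": {"verifier": {"name_or_path": "..."}}}

model_type = config_dict.get("model_type")
if model_type is None and config_dict.get("speculators_config") is not None:
    model_type = "speculators"

assert model_type == "speculators"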
From 94699746757f9d68e2f4bd89ad2c4b10ed962457 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Sat, 26 Jul 2025 00:18:25 +0000
Subject: [PATCH 10/15] update

Signed-off-by: Dipika Sikka

---
 vllm/transformers_utils/configs/speculators/base.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py
index cbf35a01975a..d7c16e180c70 100644
--- a/vllm/transformers_utils/configs/speculators/base.py
+++ b/vllm/transformers_utils/configs/speculators/base.py
@@ -52,8 +52,10 @@ def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
         except (KeyError, IndexError, TypeError) as e:
             raise ValueError("Invalid speculators config structure") from e

-        if "transformer_layer_config" in config_dict and not isinstance(
-                config_dict["transformer_layer_config"], dict):
+        if "transformer_layer_config" not in config_dict:
+            raise ValueError("Must provide transformer_layer_config")
+
+        if not isinstance(config_dict["transformer_layer_config"], dict):
             raise TypeError(
                 "'transformer_layer_config' must be a dictionary if provided")

@@ -85,5 +87,5 @@ def convert_speculators_to_vllm(
             "num_lookahead_tokens": num_lookahead_tokens,
             "target_model": spec_config.get("verifier")["name_or_path"]
         }
-        vllm_config.update(config_dict.get("transformer_layer_config"))
+        vllm_config.update(config_dict["transformer_layer_config"])
         return vllm_config

From b1577d25223e643bc8ec52c822f29bc2eefb71e3 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Sat, 26 Jul 2025 00:22:59 +0000
Subject: [PATCH 11/15] update

Signed-off-by: Dipika Sikka

---
 vllm/transformers_utils/config.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 6549983f2471..73e4e5f06861 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -301,9 +301,11 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig:
     return config


-def maybe_override_with_speculators_target_model(model: str, tokenizer: str,
-                                                 revision: str,
-                                                 trust_remote_code: bool):
+def maybe_override_with_speculators_target_model(
+    model: str,
+    tokenizer: str,
+    trust_remote_code: bool,
+    revision: Optional[str] = None):
     """
     If running a speculators config, override running model with target model
     """
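PATCH 10 above makes transformer_layer_config mandatory rather than optional. A minimal sketch of the failure mode it guards against (field values are illustrative, not from a real checkpoint):

import pytest

from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig

# Illustrative config that passes the structural checks but omits the
# now-required transformer_layer_config section.
bad_config = {
    "speculators_model_type": "eagle3",
    "speculators_config": {
        "proposal_methods": [{"speculative_tokens": 5}],
        "verifier": {"name_or_path": "meta-llama/Llama-3.1-8B"},
    },
}

with pytest.raises(ValueError, match="transformer_layer_config"):
    SpeculatorsConfig.validate_speculators_config(config_dict=bad_config)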
From f9c0a8b943fec2b128c703cdebaf14fc226b0177 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Tue, 29 Jul 2025 13:06:27 +0000
Subject: [PATCH 12/15] update return types; add smoke test

Signed-off-by: Dipika Sikka

---
 .../speculators/test_eagle3.py             | 16 ++++++++++++++++
 vllm/model_executor/models/llama_eagle3.py |  8 ++++++--
 vllm/transformers_utils/config.py          |  2 +-
 3 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 tests/speculative_decoding/speculators/test_eagle3.py

diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py
new file mode 100644
index 000000000000..c58fc8c0dc5f
--- /dev/null
+++ b/tests/speculative_decoding/speculators/test_eagle3.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+
+@pytest.mark.parametrize(
+    "model_path",
+    [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717"),
+     ("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")])
+def test_llama(vllm_runner, example_prompts, model_path):
+    with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                  max_tokens=20)
+        print(vllm_outputs)
+        assert vllm_outputs
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 05105271811d..184a49c4d0fb 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -56,12 +56,16 @@ def __init__(
         else:
             self._residual_norm = self._norm_after_residual

-    def _norm_before_residual(self, hidden_states: torch.Tensor):
+    def _norm_before_residual(
+            self,
+            hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         hidden_states = self.hidden_norm(hidden_states)
         residual = hidden_states
         return hidden_states, residual

-    def _norm_after_residual(self, hidden_states: torch.Tensor):
+    def _norm_after_residual(
+            self,
+            hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         residual = hidden_states
         hidden_states = self.hidden_norm(hidden_states)
         return hidden_states, residual
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 73e4e5f06861..5b0af0e249a6 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -305,7 +305,7 @@ def maybe_override_with_speculators_target_model(
     model: str,
     tokenizer: str,
     trust_remote_code: bool,
-    revision: Optional[str] = None):
+    revision: Optional[str] = None) -> tuple[str, str]:
     """
     If running a speculators config, override running model with target model
     """

From cb7f2cae708165eed5a119b24117ed6ae742a396 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Wed, 30 Jul 2025 12:58:38 +0000
Subject: [PATCH 13/15] format post rebase

Signed-off-by: Dipika Sikka

---
 vllm/transformers_utils/config.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index fc72110f739f..ce9a960cebce 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -35,8 +35,7 @@
                                        MllamaConfig, MLPSpeculatorConfig,
                                        Nemotron_Nano_VL_Config,
                                        NemotronConfig, RWConfig,
-                                       SpeculatorsConfig,
-                                       UltravoxConfig)
+                                       SpeculatorsConfig, UltravoxConfig)
 # yapf: enable
 from vllm.transformers_utils.configs.mistral import adapt_config_dict
 from vllm.transformers_utils.utils import check_gguf_file

From 7f627bbce64a09c59152f2e4709715e59ec57855 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Wed, 30 Jul 2025 20:50:50 +0000
Subject: [PATCH 14/15] format

Signed-off-by: Dipika Sikka

---
 vllm/transformers_utils/config.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 45e5f7b44224..a0a60e44dd4a 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -37,7 +37,6 @@
                                        NemotronConfig, NVLM_D_Config,
                                        RWConfig, SpeculatorsConfig,
                                        UltravoxConfig)
-
 # yapf: enable
 from vllm.transformers_utils.configs.mistral import adapt_config_dict
 from vllm.transformers_utils.utils import check_gguf_file

From 65a93f1e57c97d6e6934622271e82df9bc7e47ff Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Thu, 31 Jul 2025 21:15:35 +0000
Subject: [PATCH 15/15] format

Signed-off-by: Dipika Sikka

---
 vllm/transformers_utils/config.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index ef9aea3fffda..0e633c2c0b6a 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -36,9 +36,8 @@
                                        Nemotron_Nano_VL_Config,
                                        NemotronConfig, NVLM_D_Config,
                                        RWConfig, SpeculatorsConfig,
-                                       Step3TextConfig, Step3VLConfig,
+                                       Step3TextConfig, Step3VLConfig,
                                        UltravoxConfig)
-
 # yapf: enable
 from vllm.transformers_utils.configs.mistral import adapt_config_dict
 from vllm.transformers_utils.utils import check_gguf_file
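With the series applied end to end, a speculators-format Eagle-3 checkpoint can be passed as the top-level model, and the verifier plus speculative settings are derived from its config; this is the path the smoke test above exercises. A minimal offline-inference sketch (model name taken from the test; prompt and sampling settings are illustrative):

from vllm import LLM, SamplingParams

# Speculators-format Eagle-3 checkpoint; vLLM resolves the verifier
# (target model) and the speculative settings from its config.json.
llm = LLM(model="nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717",
          dtype="bfloat16")

outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=20))
print(outputs[0].outputs[0].text)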