
Commit b53dda1

[Model]: Support Eagle3 for HunYuan Model.
Signed-off-by: Asher Zhang <[email protected]>
1 parent: 97608dc

File tree: 7 files changed (+321, -29 lines)


tests/models/registry.py

Lines changed: 5 additions & 1 deletion
@@ -521,7 +521,11 @@ def check_available_online(
                                     is_available_online=False),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                     trust_remote_code=True,
-                                    speculative_model="XiaomiMiMo/MiMo-7B-RL")
+                                    speculative_model="XiaomiMiMo/MiMo-7B-RL"),
+    "Eagle3HunYuanDenseV1ForCausalLM": _HfExamplesInfo(
+        "tencent/Hunyuan-1.8B-Instruct",
+        speculative_model="AngelSlim/Hunyuan-1.8B-Instruct_eagle3",
+        tokenizer="tencent/Hunyuan-1.8B-Instruct"),
 }

 _TRANSFORMERS_BACKEND_MODELS = {
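
For reference, the new registry entry pairs the HunYuan target model with its Eagle3 draft checkpoint. A minimal offline sketch of exercising the same pairing through vLLM's `LLM` entry point (the `speculative_config` keys follow upstream vLLM examples; the draft length of 3 is an illustrative choice, not taken from this commit):

# Minimal sketch: pair tencent/Hunyuan-1.8B-Instruct with the Eagle3 draft
# registered above. Keys mirror vLLM's documented speculative_config dict;
# num_speculative_tokens is an illustrative value.
from vllm import LLM, SamplingParams

llm = LLM(
    model="tencent/Hunyuan-1.8B-Instruct",
    trust_remote_code=True,
    speculative_config={
        "method": "eagle3",
        "model": "AngelSlim/Hunyuan-1.8B-Instruct_eagle3",
        "num_speculative_tokens": 3,
    },
)
outputs = llm.generate(["Explain speculative decoding in one sentence."],
                       SamplingParams(temperature=0, max_tokens=64))
print(outputs[0].outputs[0].text)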

tests/v1/e2e/test_spec_decode.py

Lines changed: 23 additions & 18 deletions
@@ -123,24 +123,29 @@ def test_ngram_correctness(
     cleanup_dist_env_and_memory()


-@pytest.mark.parametrize(
-    ["model_setup", "mm_enabled"], [
-        (("eagle", "meta-llama/Llama-3.1-8B-Instruct",
-          "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
-        (("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
-          "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
-        pytest.param(
-            ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-             "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-            False,
-            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-        pytest.param(
-            ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-             "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-            True,
-            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-    ],
-    ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle", "llama4_eagle_mm"])
+@pytest.mark.parametrize(["model_setup", "mm_enabled"], [
+    (("eagle", "meta-llama/Llama-3.1-8B-Instruct",
+      "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
+    (("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
+      "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
+    (("eagle3", "tencent/Hunyuan-1.8B-Instruct",
+      "AngelSlim/Hunyuan-1.8B-Instruct_eagle3", 1), False),
+    pytest.param(
+        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+        False,
+        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
+    pytest.param(
+        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+        True,
+        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
+],
+                         ids=[
+                             "llama3_eagle", "llama3_eagle3",
+                             "hunyuan_v1_eagle3", "llama4_eagle",
+                             "llama4_eagle_mm"
+                         ])
 def test_eagle_correctness(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
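
Each `model_setup` tuple reads as `(method, target_model, draft_model, tp_size)`, so the new `hunyuan_v1_eagle3` case runs the Eagle3 correctness check against HunYuan on a single GPU. A sketch of the shape the test body presumably unpacks (the variable names are assumptions, not shown in this hunk):

# Assumed unpacking of the new parametrize case; mirrors the tuple
# order used by the Llama cases above.
method, model_name, spec_model_name, tp_size = (
    "eagle3",
    "tencent/Hunyuan-1.8B-Instruct",
    "AngelSlim/Hunyuan-1.8B-Instruct_eagle3",
    1,
)

The new case can be selected on its own with `pytest tests/v1/e2e/test_spec_decode.py -k hunyuan_v1_eagle3`.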

vllm/config.py

Lines changed: 0 additions & 6 deletions
@@ -3154,12 +3154,6 @@ def _verify_args(self) -> Self:
                 "speculative decoding is > 1, but got "
                 f"{self.disable_by_batch_size=}")

-        if self.method == "eagle3" and self.target_model_config and \
-                "llama" not in self.target_model_config.hf_text_config.model_type:
-            raise ValueError(
-                "Eagle3 is only supported for Llama models. "
-                f"Got {self.target_model_config.hf_text_config.model_type=}")
-
         return self

     @property
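
With this guard removed, Eagle3 eligibility is no longer keyed off `model_type`; it instead depends on the target model implementing the aux-hidden-state hooks this commit adds to HunYuan below. An illustrative duck-typed check of that contract (a sketch only, not vLLM's actual dispatch code):

# Sketch: any target model exposing these two hooks can feed an Eagle3
# draft head; vLLM's real capability check may differ.
def supports_eagle3(model) -> bool:
    return (hasattr(model, "set_aux_hidden_state_layers")
            and hasattr(model, "get_eagle3_aux_hidden_state_layers"))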

vllm/model_executor/models/hunyuan_v1.py

Lines changed: 21 additions & 1 deletion
@@ -37,6 +37,7 @@
 from vllm.distributed import (get_pp_group,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
+from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -60,6 +61,8 @@
 from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
                     make_layers)

+logger = init_logger(__name__)
+

 def _is_moe(config: PretrainedConfig) -> bool:
     num_experts = getattr(config, "num_experts", None)
@@ -215,7 +218,7 @@ def forward(
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
         kv_states: Optional[tuple[torch.Tensor]] = None,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
@@ -596,6 +599,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         else:
             self.norm = PPMissingLayer()

+        self.aux_hidden_state_layers: tuple[int] = tuple()
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)

@@ -619,8 +624,13 @@ def forward(

         cla_factor = _get_cla_factor(self.config)
         prev_kv_states = None
+        aux_hidden_states = []
         for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
+            if i in self.aux_hidden_state_layers:
+                aux_hidden_states.append(hidden_states if residual is
+                                         None else hidden_states + residual)
+
             hidden_states, residual, kv_states = layer(
                 positions,
                 hidden_states,
@@ -641,6 +651,9 @@ def forward(
             })

         hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) > 0:
+            return hidden_states, aux_hidden_states
         return hidden_states

     def _split_qkv_weight(self, qkv: torch.Tensor):
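
The loop above snapshots the residual-recombined hidden state just before each designated layer and returns those snapshots alongside the final output, which is what the Eagle3 draft head consumes. A self-contained toy version of the capture pattern (fake layer updates and arbitrary indices, not real decoder blocks):

# Toy reproduction of the aux-hidden-state capture above; the update at
# the bottom of the loop stands in for a real decoder layer.
import torch

aux_hidden_state_layers = (2, 5, 9)  # illustrative indices
aux_hidden_states: list[torch.Tensor] = []
hidden_states, residual = torch.zeros(1, 8), None

for i in range(12):
    if i in aux_hidden_state_layers:
        # Recombine the residual stream so the snapshot is the full
        # pre-layer activation, matching the diff above.
        aux_hidden_states.append(hidden_states if residual is None
                                 else hidden_states + residual)
    hidden_states, residual = hidden_states + 1.0, hidden_states

assert len(aux_hidden_states) == 3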
@@ -928,6 +941,13 @@ def load_weights(self, weights: Iterable[tuple[str,
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return self.model.get_expert_mapping()

+    def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None:
+        self.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]:
+        num_layers = len(self.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
+

 class HunYuanDenseV1ForCausalLM(HunYuanV1Base):
     pass
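
`get_eagle3_aux_hidden_state_layers` defaults to one early, one middle, and one late layer, in line with Eagle3's practice of fusing features from three depths of the target network. A quick worked example for a hypothetical 32-layer model (the real HunYuan layer count comes from its config):

# Worked example of the default selection above for a hypothetical
# 32-layer target model.
num_layers = 32
assert (2, num_layers // 2, num_layers - 3) == (2, 16, 29)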
