
Commit 8428e0c

[Bugfix] Fix MTP support for lmhead_tensor_parallel_size (vllm-project#3915)
### What this PR does / why we need it?

Fixes a hang during inference when MTP is enabled together with `lmhead_tensor_parallel_size=16`.

- vLLM version: v0.11.0
- vLLM main: vllm-project/vllm@83f478b

Signed-off-by: wyh145 <[email protected]>
Signed-off-by: luolun <[email protected]>
1 parent c497b9e commit 8428e0c

File tree

2 files changed: +3 −2 lines changed


vllm_ascend/ops/vocab_parallel_embedding.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -51,7 +51,7 @@ def __init__(self,
                  prefix: str = ""):
         nn.Module.__init__(self)
 
-        if lmhead_tp_enable() and prefix.find("lm_head") != -1:
+        if lmhead_tp_enable() and prefix.find("head") != -1:
             self.comm_group = get_lmhead_tp_group()
         else:
             self.comm_group = get_tp_group()
```
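The effect of broadening the substring check can be illustrated with a small sketch. The drafter-head prefix used below (`mtp.head`) is a hypothetical example, not a name taken from the repo:

```python
# Minimal sketch of the broadened prefix check. Assumes the MTP drafter's
# head module is constructed with a prefix that contains "head" but not
# "lm_head" (the exact prefix "mtp.head" is illustrative only).
def uses_lmhead_tp_group(prefix: str, lmhead_tp_enabled: bool = True) -> bool:
    # New check from this commit: any prefix containing "head" qualifies.
    return lmhead_tp_enabled and prefix.find("head") != -1

# Under the old check (prefix.find("lm_head") != -1), a drafter head would
# fall through to the default TP group while the main lm_head used the
# lm-head TP group -- a group mismatch that can stall collective ops.
assert uses_lmhead_tp_group("model.lm_head")  # matched by old and new check
assert uses_lmhead_tp_group("mtp.head")       # matched only by the new check
assert "mtp.head".find("lm_head") == -1       # the old check misses it
```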

vllm_ascend/worker/model_runner_v1.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -2913,7 +2913,8 @@ def dummy_compute_logits(hidden_states):
                 aclgraph_runtime_mode=aclgraph_runtime_mode,
                 batch_descriptor=batch_descriptor)
             if need_dummy_logits:
-                dummy_compute_logits(hidden_states)
+                self.drafter.model.compute_logits(
+                    hidden_states[dummy_indices])
             if self.in_profile_run and self.dynamic_eplb:
                 self.model.clear_all_moe_loads()
             if not self.in_profile_run and self.dynamic_eplb:
```
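The intent of this hunk, running the drafter's own logits computation over the selected dummy rows, can be sketched without torch. All shapes and the toy projection below are assumptions for illustration; only the names `dummy_indices` and `compute_logits` come from the diff:

```python
# Dependency-free sketch of the fixed dummy-logits call. Sizes are made up.
hidden_size, vocab_size = 4, 3
hidden_states = [[0.5] * hidden_size for _ in range(6)]
dummy_indices = [0, 1]

def compute_logits(rows):
    # Placeholder for self.drafter.model.compute_logits: a plain projection
    # onto a tiny vocabulary (all-ones weights for simplicity).
    return [[sum(row) for _ in range(vocab_size)] for row in rows]

# Mirrors the new lines in the diff: compute logits on the dummy rows via
# the drafter's head, so the dummy pass issues the same lm-head collectives
# as a real decode step and no rank is left waiting on its peers.
logits = compute_logits([hidden_states[i] for i in dummy_indices])
assert len(logits) == len(dummy_indices)
assert all(len(row) == vocab_size for row in logits)
```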
