From 0d9eb4cfb9d8ea266b007a077d6d137627de338c Mon Sep 17 00:00:00 2001
From: weijinqian_v1
Date: Wed, 16 Jul 2025 13:28:51 +0800
Subject: [PATCH 1/8] [BUGFIX][v0.9.1] fix enable_multistream_moe bug when DBO is enabled (#1727)

Signed-off-by: weijinqian_v1
---
 vllm_ascend/models/deepseek_dbo.py | 41 ------------------------------
 1 file changed, 41 deletions(-)

diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py
index 20dafdf7ac..a33a69b80a 100644
--- a/vllm_ascend/models/deepseek_dbo.py
+++ b/vllm_ascend/models/deepseek_dbo.py
@@ -154,47 +154,6 @@ def __init__(
         CustomDeepseekDBOMoE.top_k = config.num_experts_per_tok
         self.config = config
 
-    def forward(
-            self,
-            hidden_states: torch.Tensor,
-            attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
-        forward_context = get_forward_context()
-        if attn_metadata is None:
-            attn_metadata = forward_context.attn_metadata
-
-        # when profile runs, force experts to load balanced tokens
-        # to avoid high memory consumption on a single rank.
-        enable_force_load_balance = forward_context.in_profile_run
-
-        is_prefill = forward_context.with_prefill
-        # If this node is kv_consumer, we force the moe always runs in decode path to make sure
-        # the behaviour aligned between dummy_run and normal model_execute.
-        if self.kv_consumer:
-            is_prefill = False
-
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-
-        experts_hidden_states = self.experts(
-            hidden_states=hidden_states,
-            router_logits=router_logits,
-            is_prefill=is_prefill,
-            top_k=CustomDeepseekDBOMoE.top_k,
-            enable_force_load_balance=enable_force_load_balance,
-            shared_experts=self.shared_experts)
-
-        shared_experts_hidden = experts_hidden_states[1]
-        if not (self.shared_experts.down_proj.reduce_results
-                and self.shared_experts.down_proj.tp_size > 1):
-            shared_experts_hidden = tensor_model_parallel_all_reduce(
-                shared_experts_hidden)
-
-        hidden_states = (
-            experts_hidden_states[0] * self.routed_scaling_factor +
-            shared_experts_hidden)
-
-        return hidden_states
-
     # ----------------------------------------- TBO-related --------------------------------------------
     def _forward_ms_op_shared_expert(
         self,

From 8da5ac1ccc6646215d9e567345de220895e7c919 Mon Sep 17 00:00:00 2001
From: weijinqian_v1
Date: Wed, 16 Jul 2025 15:52:11 +0800
Subject: [PATCH 2/8] [BUGFIX][v0.9.1] fix enable_multistream_moe bug when DBO is enabled (#1727)

Signed-off-by: weijinqian_v1
---
 vllm_ascend/ops/fused_moe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index 3fa9c8be74..57a798ab2a 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1266,6 +1266,8 @@ def forward(
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                 shared_hidden_states = shared_experts(hidden_states)
+                if not shared_experts.down_proj.reduce_results and self.shared_experts.down_proj.tp_size > 1:
+                    shared_hidden_states = tensor_model_parallel_all_reduce(shared_hidden_states)
 
         mc2_mask = forward_context.mc2_mask
         tp_size = get_tensor_model_parallel_world_size()

From 9c4903fba37e22e6d0f14e817dc1190cd54cf1cd Mon Sep 17 00:00:00 2001
From: weijinqian_v1
Date: Wed, 16 Jul 2025 16:02:03 +0800
Subject: [PATCH 3/8] [BUGFIX][v0.9.1] fix enable_multistream_moe bug when DBO is enabled (#1727)

Signed-off-by: weijinqian_v1
---
 vllm_ascend/ops/fused_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index 57a798ab2a..3b58f81d97 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1266,7 +1266,7 @@ def forward(
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                 shared_hidden_states = shared_experts(hidden_states)
-                if not shared_experts.down_proj.reduce_results and self.shared_experts.down_proj.tp_size > 1:
+                if not shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1:
                     shared_hidden_states = tensor_model_parallel_all_reduce(shared_hidden_states)
 
         mc2_mask = forward_context.mc2_mask

From e6cdf862674b161fabf94370f28108bca4fa5afa Mon Sep 17 00:00:00 2001
From: weijinqian_v1
Date: Wed, 16 Jul 2025 16:14:18 +0800
Subject: [PATCH 4/8] [BUGFIX][v0.9.1] fix enable_multistream_moe bug when DBO is enabled (#1727)

Signed-off-by: weijinqian_v1
---
 vllm_ascend/ops/fused_moe.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index 3b58f81d97..d2ce4114f4 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1267,7 +1267,8 @@ def forward(
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                 shared_hidden_states = shared_experts(hidden_states)
                 if not shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1:
-                    shared_hidden_states = tensor_model_parallel_all_reduce(shared_hidden_states)
+                    shared_hidden_states = tensor_model_parallel_all_reduce(
+                        shared_hidden_states)
 
         mc2_mask = forward_context.mc2_mask
         tp_size = get_tensor_model_parallel_world_size()

From 6b3b082775722f91dbb0d2c45c82a652a908e5cb Mon Sep 17 00:00:00 2001
From: weijinqian_v1
Date: Thu, 17 Jul 2025 16:29:19 +0800
Subject: [PATCH 5/8] [BUGFIX][v0.9.1] fix enable_multistream_moe bug when DBO is enabled (#1727)

Signed-off-by: weijinqian_v1
---
 vllm_ascend/ops/fused_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index d2ce4114f4..d0578458b0 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1266,7 +1266,7 @@ def forward(
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                 shared_hidden_states = shared_experts(hidden_states)
-                if not shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1:
+                if not (shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1):
                     shared_hidden_states = tensor_model_parallel_all_reduce(
                         shared_hidden_states)
 

From db7887c5194543dc8ea0955fc2c118d35c7e8389 Mon Sep 17 00:00:00 2001
From: weijinqian_v1
Date: Thu, 17 Jul 2025 16:48:12 +0800
Subject: [PATCH 6/8] [BUGFIX][v0.9.1] fix enable_multistream_moe bug when DBO is enabled (#1727)

Signed-off-by: weijinqian_v1
---
 vllm_ascend/ops/fused_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index d0578458b0..d2ce4114f4 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1266,7 +1266,7 @@ def forward(
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                 shared_hidden_states = shared_experts(hidden_states)
-                if not (shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1):
+                if not shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1:
                     shared_hidden_states = tensor_model_parallel_all_reduce(
                         shared_hidden_states)
 

From ca62bc0cc46f65efb1e06859a282655719b04c30 Mon Sep 17 00:00:00 2001
From: weijinqian_v1
Date: Fri, 18 Jul 2025 09:11:24 +0800
Subject: [PATCH 7/8] [BUGFIX][v0.9.1] ep_group is not equal to word_size in some cases.

Signed-off-by: weijinqian_v1
---
 vllm_ascend/ascend_forward_context.py |  4 ++--
 vllm_ascend/models/deepseek_dbo.py    | 41 +++++++++++++++++++++++++++
 vllm_ascend/ops/fused_moe.py          |  3 ---
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index e4a9b5adce..a8c4dfae5b 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -5,7 +5,7 @@
 
 import torch
 from vllm.config import VllmConfig
-from vllm.distributed import get_dp_group, get_tp_group
+from vllm.distributed import get_dp_group, get_tp_group, get_ep_group
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.platforms import current_platform
 
@@ -63,7 +63,7 @@ def set_ascend_forward_context(
 ):
     forward_context = get_forward_context()
     forward_context.with_prefill = with_prefill
-    ep_size = (torch.distributed.get_world_size() if
+    ep_size = (get_ep_group().world_size if
                vllm_config.parallel_config.enable_expert_parallel else 1)
 
     fused_moe_state = get_fused_moe_state(ep_size, with_prefill)
diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py
index a33a69b80a..20dafdf7ac 100644
--- a/vllm_ascend/models/deepseek_dbo.py
+++ b/vllm_ascend/models/deepseek_dbo.py
@@ -154,6 +154,47 @@ def __init__(
         CustomDeepseekDBOMoE.top_k = config.num_experts_per_tok
         self.config = config
 
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
+        forward_context = get_forward_context()
+        if attn_metadata is None:
+            attn_metadata = forward_context.attn_metadata
+
+        # when profile runs, force experts to load balanced tokens
+        # to avoid high memory consumption on a single rank.
+        enable_force_load_balance = forward_context.in_profile_run
+
+        is_prefill = forward_context.with_prefill
+        # If this node is kv_consumer, we force the moe always runs in decode path to make sure
+        # the behaviour aligned between dummy_run and normal model_execute.
+        if self.kv_consumer:
+            is_prefill = False
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+
+        experts_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            is_prefill=is_prefill,
+            top_k=CustomDeepseekDBOMoE.top_k,
+            enable_force_load_balance=enable_force_load_balance,
+            shared_experts=self.shared_experts)
+
+        shared_experts_hidden = experts_hidden_states[1]
+        if not (self.shared_experts.down_proj.reduce_results
+                and self.shared_experts.down_proj.tp_size > 1):
+            shared_experts_hidden = tensor_model_parallel_all_reduce(
+                shared_experts_hidden)
+
+        hidden_states = (
+            experts_hidden_states[0] * self.routed_scaling_factor +
+            shared_experts_hidden)
+
+        return hidden_states
+
     # ----------------------------------------- TBO-related --------------------------------------------
     def _forward_ms_op_shared_expert(
         self,
diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index d2ce4114f4..3fa9c8be74 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1266,9 +1266,6 @@ def forward(
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                 shared_hidden_states = shared_experts(hidden_states)
-                if not shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1:
-                    shared_hidden_states = tensor_model_parallel_all_reduce(
-                        shared_hidden_states)
 
         mc2_mask = forward_context.mc2_mask
         tp_size = get_tensor_model_parallel_world_size()

From dd08fc34fdac7623813caa024995cde86c2c3bdc Mon Sep 17 00:00:00 2001
From: weijinqian_v1
Date: Fri, 18 Jul 2025 09:13:52 +0800
Subject: [PATCH 8/8] [BUGFIX][v0.9.1] clean code.

Signed-off-by: weijinqian_v1
---
 vllm_ascend/ascend_forward_context.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index a8c4dfae5b..aec82d426b 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -5,7 +5,7 @@
 
 import torch
 from vllm.config import VllmConfig
-from vllm.distributed import get_dp_group, get_tp_group, get_ep_group
+from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.platforms import current_platform
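
Net effect of the series: the conditional all-reduce of the shared-expert output ends up back in CustomDeepseekDBOMoE.forward (the fused_moe.py variant introduced in patches 2-6 is reverted by patch 7), and ep_size in set_ascend_forward_context is now taken from the expert-parallel group rather than torch.distributed.get_world_size(), since the EP group size is not always equal to the world size. The sketch below condenses that final logic, reusing the names that appear in the diffs (down_proj.reduce_results, down_proj.tp_size, tensor_model_parallel_all_reduce, get_ep_group); the two helper functions are illustrative only and are not part of any patch.

# Sketch only: summarizes the behaviour the series converges on.
from vllm.distributed import get_ep_group, tensor_model_parallel_all_reduce


def maybe_all_reduce_shared_output(shared_experts, shared_output):
    # All-reduce the shared-expert output only when its down projection
    # has not already reduced it across a tensor-parallel group of size > 1
    # (same condition as the restored CustomDeepseekDBOMoE.forward).
    if not (shared_experts.down_proj.reduce_results
            and shared_experts.down_proj.tp_size > 1):
        shared_output = tensor_model_parallel_all_reduce(shared_output)
    return shared_output


def compute_ep_size(parallel_config):
    # Mirrors patch 7: size the fused-MoE state by the expert-parallel
    # group, not the global world size, since the two can differ.
    return (get_ep_group().world_size
            if parallel_config.enable_expert_parallel else 1)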