diff --git a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu
index 369a92ee2eb..4416d204517 100644
--- a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu
+++ b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu
@@ -58,6 +58,11 @@
       __VA_ARGS__                                  \
       break;                                       \
     }                                              \
+    case 20: {                                     \
+      constexpr size_t NUM_EXPERTS_PER_RANK = 20;  \
+      __VA_ARGS__                                  \
+      break;                                       \
+    }                                              \
     case 32: {                                     \
       constexpr size_t NUM_EXPERTS_PER_RANK = 32;  \
       __VA_ARGS__                                  \
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
index d87894b8105..4802a6aab48 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
@@ -146,8 +146,10 @@ def apply_ep_prefill(
             recv_topk_weights,
             recv_num_tokens_per_expert_list,
             handle,
-            _,
+            event,
         ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights)
+        if self.ep_prefill_runner.ep_engine.async_finish:
+            event.current_stream_wait()
         token_all_num = sum(recv_num_tokens_per_expert_list)
 
         # 3. Compute ffn
diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py
index a095b7b0435..1100347807d 100644
--- a/fastdeploy/model_executor/models/glm4_moe.py
+++ b/fastdeploy/model_executor/models/glm4_moe.py
@@ -499,7 +499,7 @@ def empty_input_forward(self):
         empty_input_forward
         """
         fake_hidden_states = paddle.ones(
-            shape=[1, self.fd_config.model_config.hidden_size],
+            shape=[0, self.fd_config.model_config.hidden_size],
             dtype=paddle.get_default_dtype(),
         )
         for i in range(
diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py
index 3e9a72d7688..74adb5cc3b9 100644
--- a/fastdeploy/model_executor/models/qwen3moe.py
+++ b/fastdeploy/model_executor/models/qwen3moe.py
@@ -421,7 +421,7 @@ def empty_input_forward(self):
         empty_input_forward
         """
         fake_hidden_states = paddle.empty(
-            shape=[1, self.fd_config.model_config.hidden_size],
+            shape=[0, self.fd_config.model_config.hidden_size],
             dtype=paddle.get_default_dtype(),
         )
         for i in range(