
Commit 00d0877

Runnable, incorrect output
1 parent: 1ab07eb


3 files changed: +374 −120 lines


vllm/model_executor/layers/fused_moe/config.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -192,7 +192,8 @@ def use_deepep_ll_kernels(self):
     @property
     def use_flashinfer_cutlass_kernels(self):
         return (envs.VLLM_USE_FLASHINFER_MOE_FP4
-                and has_flashinfer_cutlass_fused_moe())
+                and has_flashinfer_cutlass_fused_moe()
+                and envs.VLLM_FLASHINFER_MOE_BACKEND=="flashinfer_moe_high_throughput")

     @staticmethod
     def make(tp_size_: int, dp_size_: int,
```
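The net effect of this hunk: the CUTLASS MoE path now also requires the FlashInfer backend env var to name the high-throughput backend. A minimal, self-contained sketch of the new guard (the `_Envs` stand-in and the stubbed capability check are illustrative placeholders, not vLLM's actual definitions):

```python
# Illustrative sketch only: `_Envs` and the stubbed capability check
# stand in for vllm.envs and the real FlashInfer probe.
class _Envs:
    VLLM_USE_FLASHINFER_MOE_FP4 = True
    VLLM_FLASHINFER_MOE_BACKEND = "flashinfer_moe_high_throughput"

envs = _Envs()

def has_flashinfer_cutlass_fused_moe() -> bool:
    return True  # placeholder for the real installed-kernel check

def use_flashinfer_cutlass_kernels() -> bool:
    # After this commit all three conditions must hold; changing the
    # backend string disables the CUTLASS path even with FP4 enabled.
    return (envs.VLLM_USE_FLASHINFER_MOE_FP4
            and has_flashinfer_cutlass_fused_moe()
            and envs.VLLM_FLASHINFER_MOE_BACKEND
            == "flashinfer_moe_high_throughput")

assert use_flashinfer_cutlass_kernels()
envs.VLLM_FLASHINFER_MOE_BACKEND = "other_backend"
assert not use_flashinfer_cutlass_kernels()
```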

vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py

Lines changed: 47 additions & 45 deletions
```diff
@@ -10,8 +10,8 @@
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceDelegate)
 from vllm.model_executor.layers.fused_moe.utils import extract_required_args
-from vllm.utils.flashinfer import (has_flashinfer_trtllm_fused_moe,
-                                   trtllm_fp4_block_scale_moe)
+from vllm.utils.flashinfer import (has_flashinfer_trtllm_fused_moe)#,
+                                   # trtllm_fp4_block_scale_moe)

 logger = init_logger(__name__)
```
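This hunk disables the `trtllm_fp4_block_scale_moe` import by commenting it out inline; the trailing `)#,` keeps the parenthesized import syntactically valid while only the capability check survives. A tidier equivalent of the new import line (a sketch of the same intent, not the committed form):

```python
# Only the capability check remains importable; the kernel entry point
# trtllm_fp4_block_scale_moe is no longer brought into this module.
from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
```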

```diff
@@ -180,47 +180,49 @@ def apply(
             w2_scale.view(torch.int32),
             g2_alphas,
         ]
-        out = trtllm_fp4_block_scale_moe(
-            routing_logits,
-            routing_bias,
-            hidden_states,
-            topk_ids.to(torch.int),
-            topk_weights,
-            # FlashInfer API requires weight to be long for nvfp4
-            w1.view(torch.long),
-            w2.view(torch.long),
-            output_dtype=out_dtype,
-            quant_scales=quant_scales,
-            input_sf=a1q_scale,
-            tp_size=self.tp_size,
-            tp_rank=self.tp_rank,
-            ep_size=self.ep_size,
-            ep_rank=self.ep_rank,
-            output=output,
-        )
+        # out = trtllm_fp4_block_scale_moe(
+        #     routing_logits,
+        #     routing_bias,
+        #     hidden_states,
+        #     topk_ids.to(torch.int),
+        #     topk_weights,
+        #     # FlashInfer API requires weight to be long for nvfp4
+        #     w1.view(torch.long),
+        #     w2.view(torch.long),
+        #     output_dtype=out_dtype,
+        #     quant_scales=quant_scales,
+        #     input_sf=a1q_scale,
+        #     tp_size=self.tp_size,
+        #     tp_rank=self.tp_rank,
+        #     ep_size=self.ep_size,
+        #     ep_rank=self.ep_rank,
+        #     output=output,
+        # )
+        out = output
         output.copy_(out)
-
-        return get_trtllm_moe_sm100_module().trtllm_fp4_block_scale_moe(
-            routing_logits,
-            routing_bias,
-            hidden_states,
-            hidden_states_scale,
-            gemm1_weights,
-            gemm1_weights_scale,
-            gemm2_weights,
-            gemm2_weights_scale,
-            output1_scale_scalar,
-            output1_scale_gate_scalar,
-            output2_scale_scalar,
-            num_experts,
-            top_k,
-            n_group,
-            topk_group,
-            intermediate_size,
-            local_expert_offset,
-            local_num_experts,
-            routed_scaling_factor,
-            tile_tokens_dim,
-            routing_method_type,
-            do_finalize,
-        )
+        return None
+
+        # return get_trtllm_moe_sm100_module().trtllm_fp4_block_scale_moe(
+        #     routing_logits,
+        #     routing_bias,
+        #     hidden_states,
+        #     hidden_states_scale,
+        #     gemm1_weights,
+        #     gemm1_weights_scale,
+        #     gemm2_weights,
+        #     gemm2_weights_scale,
+        #     output1_scale_scalar,
+        #     output1_scale_gate_scalar,
+        #     output2_scale_scalar,
+        #     num_experts,
+        #     top_k,
+        #     n_group,
+        #     topk_group,
+        #     intermediate_size,
+        #     local_expert_offset,
+        #     local_num_experts,
+        #     routed_scaling_factor,
+        #     tile_tokens_dim,
+        #     routing_method_type,
+        #     do_finalize,
+        # )
```
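With both kernel calls commented out, `apply` degenerates into a no-op that matches the commit message ("Runnable, incorrect output"): it executes without error, but `output` never receives fused-MoE results and the caller gets `None`. A hypothetical, self-contained reproduction of just that control flow (`apply_stub` is an illustrative name; the real method takes many more arguments):

```python
import torch

def apply_stub(output: torch.Tensor) -> None:
    # trtllm_fp4_block_scale_moe is commented out upstream, so no MoE
    # computation runs and `output` keeps whatever it already held.
    out = output        # alias of the preallocated buffer, not a copy
    output.copy_(out)   # self-copy: effectively a no-op
    return None         # the fused kernel's result is never produced

buf = torch.full((4,), 7.0)
assert apply_stub(buf) is None
assert torch.equal(buf, torch.full((4,), 7.0))  # buffer unchanged
```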
