CR Fixes

amirkl94 · amirkl94 · commit 6582abc6da94 · 2025-07-29T18:02:07.000+03:00
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1147,30 +1147,25 @@ def flashinfer_fused_moe_blockscale_fp8_fake(
 
 
 def flashinfer_fused_moe_per_tensor_scale_fp8(
-    routing_logits: torch.Tensor,
-    routing_bias: Optional[torch.Tensor],
-    hidden_states: torch.Tensor,
-    input_scale: torch.Tensor,
-    gemm1_weights: torch.Tensor,
-    gemm1_weights_scale: torch.Tensor,
-    activation_scale: torch.Tensor,
-    gemm2_weights: torch.Tensor,
-    gemm2_weights_scale: torch.Tensor,
-    num_experts: int,
-    top_k: int,
-    num_expert_group: Optional[int],
-    topk_group: Optional[int],
-    intermediate_size: int,
-    local_expert_offset: int,
-    local_num_experts: int,
-    use_routing_scales_on_input: bool,
-    routed_scaling_factor: float = 1.0,
-    routing_method_type: int = 3  # Llama4-styled routing method
-) -> torch.Tensor:
-    if routing_bias is None:
-        routing_bias = torch.zeros(num_experts,
-                                   dtype=torch.bfloat16,
-                                   device=hidden_states.device)
+        routing_logits: torch.Tensor,
+        routing_bias: Optional[torch.Tensor],
+        hidden_states: torch.Tensor,
+        input_scale: torch.Tensor,
+        gemm1_weights: torch.Tensor,
+        gemm1_weights_scale: torch.Tensor,
+        activation_scale: torch.Tensor,
+        gemm2_weights: torch.Tensor,
+        gemm2_weights_scale: torch.Tensor,
+        num_experts: int,
+        top_k: int,
+        num_expert_group: Optional[int],
+        topk_group: Optional[int],
+        intermediate_size: int,
+        local_expert_offset: int,
+        local_num_experts: int,
+        use_routing_scales_on_input: bool,
+        routing_method_type: int,
+        routed_scaling_factor: float = 1.0) -> torch.Tensor:
     num_expert_group = num_expert_group if num_expert_group is not None else 0
     topk_group = topk_group if topk_group is not None else 0
 
@@ -1205,7 +1200,8 @@ def flashinfer_fused_moe_per_tensor_scale_fp8(
         local_num_experts=local_num_experts,
         routed_scaling_factor=routed_scaling_factor,
         use_routing_scales_on_input=use_routing_scales_on_input,
-        tile_tokens_dim=8,
+        tile_tokens_dim=calculate_tile_tokens_dim(hidden_states.shape[0],
+                                                  top_k, num_experts),
         routing_method_type=routing_method_type)
 
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -6,6 +6,8 @@
 
 import torch
 import torch.nn.functional as F
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    apply_flashinfer_per_tensor_scale_fp8)
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 
@@ -1024,26 +1026,16 @@ def apply(
             else:
                 assert (not renormalize
                         and custom_routing_function is not None)
-                return torch.ops.vllm.flashinfer_fused_moe_per_tensor_scale_fp8(
-                    routing_logits=router_logits,
-                    routing_bias=e_score_correction_bias,
+                return apply_flashinfer_per_tensor_scale_fp8(
+                    layer=layer,
                     hidden_states=x,
-                    input_scale=layer.w13_input_scale,
-                    gemm1_weights=layer.w13_weight,
-                    gemm1_weights_scale=layer.w13_weight_scale,
-                    gemm2_weights=layer.w2_weight,
-                    gemm2_weights_scale=layer.w2_weight_scale,
-                    activation_scale=layer.w2_input_scale,
-                    num_experts=global_num_experts,
+                    router_logits=router_logits,
+                    routing_bias=e_score_correction_bias,
+                    global_num_experts=global_num_experts,
                     top_k=top_k,
                     num_expert_group=num_expert_group,
                     topk_group=topk_group,
-                    intermediate_size=layer.intermediate_size_per_partition,
-                    local_expert_offset=layer.ep_rank *
-                    layer.local_num_experts,
-                    local_num_experts=layer.local_num_experts,
-                    use_routing_scales_on_input=apply_router_weight_on_input,
-                )
+                    apply_router_weight_on_input=apply_router_weight_on_input)
         else:
             return self.fused_experts(
                 hidden_states=x,
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
@@ -4,6 +4,8 @@
 from typing import Any, Callable, Optional, Union
 
 import torch
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    apply_flashinfer_per_tensor_scale_fp8)
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 
@@ -453,25 +455,16 @@ def apply(
         if self.flashinfer_moe_enabled:
             assert activation == 'silu'
             assert not renormalize
-            return torch.ops.vllm.flashinfer_fused_moe_per_tensor_scale_fp8(
-                routing_logits=router_logits,
-                routing_bias=e_score_correction_bias,
+            return apply_flashinfer_per_tensor_scale_fp8(
+                layer=layer,
                 hidden_states=x,
-                input_scale=layer.w13_input_scale,
-                gemm1_weights=layer.w13_weight,
-                gemm1_weights_scale=layer.w13_weight_scale,
-                gemm2_weights=layer.w2_weight,
-                gemm2_weights_scale=layer.w2_weight_scale,
-                activation_scale=layer.w2_input_scale,
-                num_experts=global_num_experts,
+                router_logits=router_logits,
+                routing_bias=e_score_correction_bias,
+                global_num_experts=global_num_experts,
                 top_k=top_k,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-                intermediate_size=layer.intermediate_size_per_partition,
-                local_expert_offset=layer.ep_rank * layer.local_num_experts,
-                local_num_experts=layer.local_num_experts,
-                use_routing_scales_on_input=apply_router_weight_on_input,
-            )
+                apply_router_weight_on_input=apply_router_weight_on_input)
 
         # Expert selection
         topk_weights, topk_ids = FusedMoE.select_experts(
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+
 import torch
 
 
@@ -58,3 +60,41 @@ def rotate_flashinfer_fp8_moe_weights(gemm1_weights: torch.Tensor,
         torch.float8_e4m3fn)
     gemm2_weights.data = torch.stack(gemm2_weights_fp8_shuffled).view(
         torch.float8_e4m3fn)
+
+
+def apply_flashinfer_per_tensor_scale_fp8(
+    layer: torch.nn.Module,
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    routing_bias: Optional[torch.Tensor],
+    top_k: int,
+    num_expert_group: Optional[int],
+    topk_group: Optional[int],
+    global_num_experts: int,
+    apply_router_weight_on_input: bool,
+) -> torch.Tensor:
+    from flashinfer.fused_moe import RoutingMethodType
+
+    from vllm.model_executor.models.llama4 import Llama4MoE
+    assert layer.custom_routing_function == Llama4MoE.custom_routing_function, \
+        "FusedMoE flashinfer kernels are only supported for Llama4"
+    return torch.ops.vllm.flashinfer_fused_moe_per_tensor_scale_fp8(
+        routing_logits=router_logits,
+        routing_bias=routing_bias,
+        hidden_states=hidden_states,
+        input_scale=layer.w13_input_scale,
+        gemm1_weights=layer.w13_weight,
+        gemm1_weights_scale=layer.w13_weight_scale,
+        gemm2_weights=layer.w2_weight,
+        gemm2_weights_scale=layer.w2_weight_scale,
+        activation_scale=layer.w2_input_scale,
+        num_experts=global_num_experts,
+        top_k=top_k,
+        num_expert_group=num_expert_group,
+        topk_group=topk_group,
+        intermediate_size=layer.intermediate_size_per_partition,
+        local_expert_offset=layer.ep_rank * layer.local_num_experts,
+        local_num_experts=layer.local_num_experts,
+        use_routing_scales_on_input=apply_router_weight_on_input,
+        routing_method_type=RoutingMethodType.Llama4,
+    )