
Commit ae31f1d

wenscarl and xinli-sw authored and committed
Support Tensorrt-LLM MoE fp4 for low-latency (vllm-project#21331)
Signed-off-by: Shu Wang <[email protected]>
Signed-off-by: Po-Han Huang <[email protected]>
Signed-off-by: Shu Wang. <[email protected]>
Signed-off-by: XIn Li <[email protected]>
Co-authored-by: XIn Li <[email protected]>
1 parent 0404d3c commit ae31f1d

File tree

7 files changed: +288 -43 lines changed


vllm/envs.py

Lines changed: 15 additions & 0 deletions
@@ -129,6 +129,7 @@
     VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
+    VLLM_FLASHINFER_MOE_BACKEND: str = "throughput"
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
@@ -982,6 +983,20 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_ALL2ALL_BACKEND":
     lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),

+    # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both
+    # require compute capability 10.0 or above.
+    # Available options:
+    # - "throughput": [default]
+    #   Uses CUTLASS kernels optimized for high-throughput batch inference.
+    # - "latency":
+    #   Uses TensorRT-LLM kernels optimized for low-latency inference.
+    # To set this backend, define the environment variable:
+    #     export VLLM_FLASHINFER_MOE_BACKEND=latency
+    # If not set, defaults to "throughput".
+    "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv(
+        "VLLM_FLASHINFER_MOE_BACKEND", "throughput"
+    ),
+
     # Control the maximum number of tokens per expert supported by the
     # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
     # the blockscale tensor of activations NVFP4 Quantization.
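
For reference, a minimal usage sketch (not part of the diff): the new variable is resolved through vllm.envs when accessed, so it can be set in the shell or in-process before vLLM reads it. The lazy lookup assumed below reflects how the registered lambda in envs.py is expected to be evaluated; consult envs.py for the authoritative behavior.

    # Minimal sketch: opting into the TensorRT-LLM low-latency MoE kernels.
    # Assumes vllm.envs resolves variables lazily via the registered lambdas.
    import os

    os.environ["VLLM_FLASHINFER_MOE_BACKEND"] = "latency"

    import vllm.envs as envs

    # When the variable is unset, the getenv default "throughput" applies.
    print(envs.VLLM_FLASHINFER_MOE_BACKEND)  # -> "latency"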

vllm/model_executor/layers/fused_moe/config.py

Lines changed: 2 additions & 1 deletion
@@ -192,7 +192,8 @@ def use_deepep_ll_kernels(self):
     @property
     def use_flashinfer_cutlass_kernels(self):
         return (envs.VLLM_USE_FLASHINFER_MOE_FP4
-                and has_flashinfer_cutlass_fused_moe())
+                and has_flashinfer_cutlass_fused_moe()
+                and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput")

     @staticmethod
     def make(tp_size_: int, dp_size_: int,
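
To make the new gating concrete, here is a small illustrative sketch of the same predicate (not vLLM's code): has_flashinfer_cutlass_fused_moe is stubbed out, and in the real property both the FP4 flag and the backend come from vllm.envs.

    # Sketch of the dispatch after this change: the FlashInfer CUTLASS path is
    # taken only for the "throughput" backend; "latency" is left to the
    # TensorRT-LLM kernels selected elsewhere.
    def has_flashinfer_cutlass_fused_moe() -> bool:
        return True  # stub standing in for vLLM's availability check

    def use_flashinfer_cutlass_kernels(use_fp4: bool, backend: str) -> bool:
        return (use_fp4
                and has_flashinfer_cutlass_fused_moe()
                and backend == "throughput")

    assert use_flashinfer_cutlass_kernels(True, "throughput")
    assert not use_flashinfer_cutlass_kernels(True, "latency")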

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 4 additions & 5 deletions
@@ -105,7 +105,7 @@ def __init__(self):
             detect_nvfp4_moe_support)
         _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
         self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
-        self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass
+        self.allow_flashinfer = _nvfp4.allow_flashinfer
         self.use_marlin = _nvfp4.use_marlin
         self.group_size = 16
         self.fused_experts = None  # type: ignore[assignment]
@@ -212,7 +212,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                                                requires_grad=False)

         # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
-        if self.allow_flashinfer_cutlass:
+        if self.allow_flashinfer:
             w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data,
                                         layer.w13_weight_scale.data,
                                         dim=-2)
@@ -266,7 +266,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             (layer.w2_input_global_scale), requires_grad=False)

     def maybe_swap_experts_impl(self, moe_parallel_config):
-        if not self.allow_flashinfer_cutlass:
+        if not self.allow_flashinfer:
             return
         self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel(
             moe_parallel_config)
@@ -277,8 +277,7 @@ def select_gemm_impl(self, prepare_finalize, moe):
         from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (  # noqa: E501
             select_nvfp4_gemm_impl)

-        return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe,
-                                      logger)
+        return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger)

     def apply(
         self,
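
The rename from allow_flashinfer_cutlass to allow_flashinfer reflects that the flag now gates FlashInfer usage in general, while the concrete kernel family is chosen separately by the backend variable. A hypothetical illustration of that two-step decision (not vLLM's actual selection code; the kernel names below are made up for clarity):

    import os

    def pick_moe_kernel(allow_flashinfer: bool) -> str:
        # Step 1: is FlashInfer allowed at all for this platform/config?
        if not allow_flashinfer:
            return "non-flashinfer-fallback"
        # Step 2: which FlashInfer kernel family does the backend ask for?
        backend = os.getenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
        if backend == "latency":
            return "flashinfer-trtllm"   # TensorRT-LLM low-latency kernels
        return "flashinfer-cutlass"      # CUTLASS high-throughput kernels

    print(pick_moe_kernel(True))  # -> "flashinfer-cutlass" unless the env var is "latency"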
