Commit c9f3d52

Address comments
1 parent 26f9079 commit c9f3d52

3 files changed (+123, -114 lines)

benchmarks/routines/flashinfer_benchmark_utils.py

Lines changed: 1 addition & 11 deletions

@@ -232,17 +232,7 @@ def dtype_str_to_torch_dtype(dtype_str):
         "10.3": ["cudnn", "cublas", "cutlass"],
         "12.0": ["cudnn", "cublas"],
     },
-    "mm_fp4": {
-        "7.5": [],
-        "8.0": [],
-        "8.6": [],
-        "8.9": [],
-        "9.0": [],
-        "10.0": ["cudnn", "trtllm", "cutlass", "auto"],
-        "10.3": ["cudnn", "trtllm", "cutlass", "auto"],
-        "12.0": ["cudnn", "cutlass", "auto"],
-        "12.1": ["cudnn", "cutlass", "auto"],
-    },
+    # Note: mm_fp4 uses support checkers to filter backends, so it is not listed here
     # MOE
     "trtllm_fp4_block_scale_moe": {
         "7.5": [],

benchmarks/routines/gemm.py

Lines changed: 76 additions & 61 deletions

@@ -793,65 +793,11 @@ def testMmFp4(args):
     autotune_supported_backends = ["cudnn", "cutlass", "trtllm", "auto"]
     res = []

-    backends = filter_backends_by_compute_capability(backends, args.routine, device)
-
     res_dtype = dtype_str_to_torch_dtype(args.out_dtype)
     if res_dtype not in [torch.bfloat16, torch.float16]:
         raise ValueError(
             f"Unsupported res dtype: {res_dtype}. Supported dtypes are bfloat16 and float16."
         )
-    ## Done parsing input arguments
-
-    if "trtllm" in backends:
-        remove_trtllm = False
-        if res_dtype == torch.float16:
-            print("[INFO] trtllm backend does not support float16 output")
-            remove_trtllm = True
-        if remove_trtllm:
-            backends.remove("trtllm")
-        if not use_nvfp4:
-            print(
-                "[INFO] trtllm backend does not support mxfp4 quantization (use_nvfp4=False)"
-            )
-            backends.remove("trtllm")
-    if "cutlass" in backends:
-        remove_cutlass = False
-        if not use_128x4_sf_layout:
-            print("[INFO] cutlass backend does not support use_128x4_sf_layout=False")
-            remove_cutlass = True
-        if not use_nvfp4:
-            print(
-                "[INFO] cutlass backend does not support mxfp4 quantization (use_nvfp4=False)"
-            )
-            remove_cutlass = True
-        if remove_cutlass:
-            backends.remove("cutlass")
-    if "cudnn" in backends:
-        remove_cudnn = False
-        if not use_128x4_sf_layout:
-            print("[INFO] cudnn backend does not support use_128x4_sf_layout=False")
-            remove_cudnn = True
-        if remove_cudnn:
-            backends.remove("cudnn")
-    if "auto" in backends:
-        remove_auto = False
-        if not use_128x4_sf_layout:
-            print("[INFO] auto backend does not support use_128x4_sf_layout=False")
-            remove_auto = True
-        if remove_auto:
-            backends.remove("auto")
-    if getattr(args, "autotune", False):
-        backends_to_remove = []
-        for cur_backend in backends:
-            if cur_backend not in autotune_supported_backends:
-                print(f"[INFO] {cur_backend} backend does not support autotune")
-                backends_to_remove.append(cur_backend)
-        for cur_backend in backends_to_remove:
-            backends.remove(cur_backend)
-
-    if len(backends) == 0:
-        print("[ERROR] No backends to test. Exiting.")
-        return

     input = torch.randn([m, k], device=device, dtype=torch.bfloat16)
     mat2 = torch.randn([n, k], device=device, dtype=torch.bfloat16)

@@ -893,7 +839,77 @@ def testMmFp4(args):
     print(f"[VVERBOSE] {mat2_fp4.dtype = }")

     alpha = 1.0 / (global_sf_input * global_sf_mat2) if use_nvfp4 else None
-    # res = torch.empty([m, n], device="cuda", dtype=res_dtype)
+    # Completed preparing inputs. Now programmatically filter backends
+    block_size = 16 if use_nvfp4 else 32
+    backends_to_remove = []
+
+    for backend in backends:
+        # Skip autotune check for now (handled separately below)
+        if (
+            getattr(args, "autotune", False)
+            and backend not in autotune_supported_backends
+        ):
+            print(f"[INFO] {backend} backend does not support autotune")
+            backends_to_remove.append(backend)
+            continue
+
+        try:
+            from flashinfer.gemm import (
+                _mm_fp4_backend_checkers,
+                _check_mm_fp4_problem_size,
+            )
+
+            # Choose correct tensors for this backend
+            if backend == "trtllm":
+                b_tensor = mat2_fp4_trtllm.T
+                b_descale = mat2_inv_s_trtllm.T
+            else:
+                b_tensor = mat2_fp4.T
+                b_descale = mat2_inv_s.T
+
+            # Validate common requirements
+            _check_mm_fp4_problem_size(
+                input_fp4,
+                b_tensor,
+                input_inv_s,
+                b_descale,
+                alpha,
+                res_dtype,
+                None,  # out
+                block_size,
+                not use_128x4_sf_layout,  # use_8x4_sf_layout
+                backend,
+                use_nvfp4,
+            )
+
+            # Validate backend-specific requirements
+            if backend in _mm_fp4_backend_checkers:
+                _mm_fp4_backend_checkers[backend](
+                    input_fp4,
+                    b_tensor,
+                    input_inv_s,
+                    b_descale,
+                    alpha,
+                    res_dtype,
+                    None,  # out
+                    block_size,
+                    not use_128x4_sf_layout,
+                    backend,
+                    use_nvfp4,
+                )
+        except Exception as e:
+            print(
+                f"[INFO] {backend} backend does not support this configuration: {type(e).__name__}: {e}"
+            )
+            backends_to_remove.append(backend)
+
+    # Remove unsupported backends
+    for backend in backends_to_remove:
+        backends.remove(backend)
+
+    if len(backends) == 0:
+        print("[ERROR] No backends passed validation. Exiting.")
+        return

     def run_backend(backend):
         if backend in ["cudnn", "trtllm", "cutlass", "auto"]:
@@ -924,12 +940,11 @@ def run_backend(backend):
             args.dry_run_iters if args.dry_run_iters and args.dry_run_iters > 0 else 10
         )
         for cur_backend in backends:
-            if cur_backend in autotune_supported_backends:
-                if args.verbose >= 1:
-                    print(f"[INFO] Autotune warmup for mm_fp4: {warmup_iters} iters")
-                with autotune(True):
-                    for _ in range(warmup_iters):
-                        run_backend(cur_backend)
+            if args.verbose >= 1:
+                print(f"[INFO] Autotune warmup for mm_fp4: {warmup_iters} iters")
+            with autotune(True):
+                for _ in range(warmup_iters):
+                    run_backend(cur_backend)

     # Storage for timing results and outputs
     backend_times = {backend: [] for backend in backends}
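
Because backends that cannot autotune are already removed during filtering, the warmup loop above no longer needs the per-backend membership test. A minimal sketch of the resulting flow; the autotune context manager below is a stub standing in for FlashInfer's real one, and warmup is a hypothetical wrapper around the benchmark's inline loop:

from contextlib import contextmanager


@contextmanager
def autotune(enabled: bool):
    # Stub standing in for the real autotune context manager.
    yield


def warmup(backends, run_backend, warmup_iters=10, verbose=False):
    # Every surviving backend is assumed to support autotuning at this point.
    for backend in backends:
        if verbose:
            print(f"[INFO] Autotune warmup: {warmup_iters} iters for {backend}")
        with autotune(True):
            for _ in range(warmup_iters):
                run_backend(backend)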

flashinfer/gemm.py

Lines changed: 46 additions & 42 deletions

@@ -457,6 +457,10 @@ def forward(
             _,
             workspace_buffer,
         ) = inputs
+        if a.dtype == torch.uint8 and a_descale.dtype == torch.float8_e4m3fn:
+            a_descale = a_descale.view(torch.uint8)
+        if b.dtype == torch.uint8 and b_descale.dtype == torch.float8_e4m3fn:
+            b_descale = b_descale.view(torch.uint8)
         module.fp4_gemm(
             a, b.T, a_descale, b_descale.T, alpha, out, workspace_buffer, tactic
         )

@@ -1963,7 +1967,7 @@ def _cutlass_gemm_fp4_requirement(
     return True


-@supported_compute_capability([100, 103, 110, 120])
+@supported_compute_capability([100, 103, 110, 120, 121])
 def _auto_gemm_fp4_requirement(
     a: torch.Tensor,
     b: torch.Tensor,

@@ -2001,14 +2005,16 @@ def _auto_gemm_fp4_requirement(
     return False


+_mm_fp4_backend_checkers = {
+    "cudnn": _cudnn_gemm_fp4_requirement,
+    "trtllm": _trtllm_gemm_fp4_requirement,
+    "cutlass": _cutlass_gemm_fp4_requirement,
+    "auto": _auto_gemm_fp4_requirement,
+}
+
+
 @backend_requirement(
-    {
-        "cudnn": _cudnn_gemm_fp4_requirement,  # Each backend has its own requirement function
-        "trtllm": _trtllm_gemm_fp4_requirement,
-        "cutlass": _cutlass_gemm_fp4_requirement,
-        "auto": _auto_gemm_fp4_requirement,  # Auto backend requires at least one backend to be supported on the current device
-    },
-    common_check=_check_mm_fp4_problem_size,  # Shape checks common to all backends
+    backend_checks=_mm_fp4_backend_checkers, common_check=_check_mm_fp4_problem_size
 )
 def mm_fp4(
     a: torch.Tensor,
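
Moving the checker functions into the module-level _mm_fp4_backend_checkers dict lets both the decorator and external callers (such as the benchmark above) reuse them. A simplified sketch of how a backend_requirement-style decorator could combine a common check with per-backend checks; this illustrates the idea under assumed semantics and is not FlashInfer's actual implementation:

from functools import wraps
from typing import Callable, Dict


def backend_requirement(backend_checks: Dict[str, Callable[..., bool]],
                        common_check: Callable[..., bool]):
    def decorate(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            backend = kwargs.get("backend", "auto")
            if backend != "auto":
                # Validate shared constraints first, then the backend-specific ones,
                # before the decorated function runs.
                common_check(*args, **kwargs)
                backend_checks[backend](*args, **kwargs)
            return fn(*args, **kwargs)
        return wrapper
    return decorate
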
@@ -2103,7 +2109,7 @@ def mm_fp4(
         cc_major, cc_minor = get_compute_capability(a.device)
         # If cuda version is 13 or greater:
         # cudnn is more performant if cudnn version is 9.14 or greater.
-        if cuda_major >= 13 and cudnn.backend_version() >= 91400:
+        if CUDNN_AVAILABLE and cuda_major >= 13 and cudnn.backend_version() >= 91400:
             candidate_backends = ("cudnn", "cutlass")
         # Otherwise, prioritize cutlass
         else:

@@ -2114,11 +2120,11 @@ def mm_fp4(
         backends = []
         for candidate in candidate_backends:
             # mypy requires explicit type casting for the backend literal
-            backend_literal = cast(
-                Literal["cudnn", "trtllm", "cutlass", "auto"], candidate
-            )
+            backend_literal = cast(Literal["cudnn", "trtllm", "cutlass"], candidate)
             try:
-                _check_mm_fp4_problem_size(
+                # Check both common constraints and backend-specific requirements
+                # to find all compatible backends for this problem instance
+                if _check_mm_fp4_problem_size(
                     a,
                     b,
                     a_descale,

@@ -2130,41 +2136,39 @@ def mm_fp4(
                     use_8x4_sf_layout,
                     backend_literal,
                     use_nvfp4,
-                )
-                backends.append(candidate)
+                ) and _mm_fp4_backend_checkers[candidate](
+                    a,
+                    b,
+                    a_descale,
+                    b_descale,
+                    alpha,
+                    out_dtype,
+                    out,
+                    block_size,
+                    use_8x4_sf_layout,
+                    backend_literal,
+                    use_nvfp4,
+                ):
+                    backends.append(candidate)
             except Exception:
                 pass
     else:
         backends = [backend]

     # At this point, backends contains a supported backend if specified, or all supported backends if backend='auto'.
-    runners = []
-    for cur_backend in backends:
-        if cur_backend == "cudnn":
-            runners.append(_cudnn_gemm_fp4_runner())
-        elif cur_backend == "trtllm":
-            runners.append(
-                get_trtllm_fp4_gemm_module().trtllm_fp4_gemm_runner(use_8x4_sf_layout)
-            )
-        elif cur_backend == "cutlass":
-            if a.dtype == torch.uint8 and a_descale.dtype == torch.float8_e4m3fn:
-                a_descale = a_descale.view(torch.uint8)
-            if b.dtype == torch.uint8 and b_descale.dtype == torch.float8_e4m3fn:
-                b_descale = b_descale.view(torch.uint8)
-
-            # Dispatch to the correct module based on device architecture
-            major, _ = get_compute_capability(a.device)
-            if major == 12:
-                runners.append(
-                    get_gemm_sm120_module_cutlass_fp4().cutlass_fp4_gemm_runner()
-                )
-            else:
-                runners.append(
-                    get_gemm_sm100_module_cutlass_fp4().cutlass_fp4_gemm_runner()
-                )
-        else:
-            # Should not reach this
-            raise ValueError(f"Unsupported backend: {cur_backend}")
+    # Lazy initialization of runners to avoid overhead of creating a new runner that will not be used
+    major, _ = get_compute_capability(a.device)
+
+    backend_to_runner_factory = {
+        "cudnn": lambda: _cudnn_gemm_fp4_runner(),
+        "trtllm": lambda: get_trtllm_fp4_gemm_module().trtllm_fp4_gemm_runner(
+            use_8x4_sf_layout
+        ),
+        "cutlass": lambda: get_gemm_sm120_module_cutlass_fp4().cutlass_fp4_gemm_runner()
+        if major == 12
+        else get_gemm_sm100_module_cutlass_fp4().cutlass_fp4_gemm_runner(),
+    }
+    runners = [backend_to_runner_factory[cur_backend]() for cur_backend in backends]

     # Now we have a list of runners for desired & supported backends.
     tuner = AutoTuner.get()
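
The dictionary of lambdas above defers runner construction until a backend has actually been selected, so runners for unused backends are never built. A minimal, generic sketch of the same lazy-factory idea with placeholder classes (Runner and build_runners are illustrative names only):

from typing import Callable, Dict, List


class Runner:
    def __init__(self, name: str) -> None:
        self.name = name


def build_runners(backends: List[str]) -> List[Runner]:
    factories: Dict[str, Callable[[], Runner]] = {
        "cudnn": lambda: Runner("cudnn"),
        "trtllm": lambda: Runner("trtllm"),
        "cutlass": lambda: Runner("cutlass"),
    }
    # Only the factories for the requested backends are invoked.
    return [factories[name]() for name in backends]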
