Skip to content

Commit 0fb5e42

Browse files
committed
refactor: separate SM100 and legacy TRT-LLM comm modules
Restructure the compilation of the TensorRT-LLM communication module to improve hardware compatibility and portability. Previously, the module was compiled with SM100-specific flags only if a compatible GPU was detected during the build process. This made a single build non-portable across different GPU generations. This change introduces two distinct modules: - `trtllm_comm`: compiled with SM100 optimizations for SM100-class (Blackwell, compute capability 10.x) GPUs. - `trtllm_comm_legacy`: a fallback version for older GPU architectures. At runtime, `get_trtllm_comm_module` now detects the GPU's compute capability and dynamically loads the appropriate module. This allows a single FlashInfer build to support a wider range of NVIDIA GPUs and gracefully handles CPU-only environments. Signed-off-by: Emilien Macchi <[email protected]>
1 parent 28741b7 commit 0fb5e42

File tree

2 files changed

+24
-6
lines changed

2 files changed

+24
-6
lines changed

flashinfer/aot.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from .activation import act_func_def_str, gen_act_and_mul_module
1212
from .cascade import gen_cascade_module
1313
from .comm import gen_trtllm_comm_module, gen_vllm_comm_module
14+
from .comm.nvshmem import gen_nvshmem_module
15+
from .comm.trtllm_ar import gen_trtllm_comm_legacy_module
1416
from .fp4_quantization import gen_fp4_quantization_sm100_module
1517
from .fused_moe import gen_fused_moe_sm100_module
1618
from .gemm import gen_gemm_module, gen_gemm_sm90_module, gen_gemm_sm100_module
@@ -325,7 +327,8 @@ def gen_all_modules(
325327
jit_specs.append(gen_fused_moe_sm100_module())
326328
jit_specs.append(gen_fp4_quantization_sm100_module())
327329
jit_specs.append(gen_gemm_sm100_module())
328-
jit_specs.append(gen_trtllm_comm_module())
330+
331+
jit_specs.append(gen_trtllm_comm_module(sm100=has_sm100))
329332

330333
jit_specs += [
331334
gen_cascade_module(),

flashinfer/comm/trtllm_ar.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,22 +95,37 @@ class FP4QuantizationSFLayout:
9595
LINEAR = 1
9696

9797

98-
def gen_trtllm_comm_module(sm100: bool = True) -> JitSpec:
    """Build the JIT spec for the TensorRT-LLM communication kernels.

    When *sm100* is True the module is named 'trtllm_comm' and compiled
    with the SM100-specific nvcc flags; otherwise a flag-free fallback
    module named 'trtllm_comm_legacy' is produced so a single build can
    serve older GPU architectures.
    """
    # Pick name and compile flags together so they can never disagree.
    if sm100:
        module_name = "trtllm_comm"
        cuda_flags = sm100a_nvcc_flags
    else:
        module_name = "trtllm_comm_legacy"
        cuda_flags = []
    # Both variants compile the same kernel sources from the csrc tree.
    sources = [
        jit_env.FLASHINFER_CSRC_DIR / source
        for source in (
            "trtllm_allreduce.cu",
            "trtllm_allreduce_fusion.cu",
            "trtllm_moe_allreduce_fusion.cu",
        )
    ]
    return gen_jit_spec(module_name, sources, extra_cuda_cflags=cuda_flags)
109113

110114

111115
@functools.cache
112116
def get_trtllm_comm_module():
113-
module = gen_trtllm_comm_module().build_and_load()
117+
# Select the appropriate module based on device capability
118+
try:
119+
major, minor = torch.cuda.get_device_capability()
120+
use_sm100_module = major >= 10 and minor >= 0
121+
except RuntimeError:
122+
# If CUDA is not available (e.g., CPU-only mode), default to legacy module
123+
use_sm100_module = False
124+
125+
if use_sm100_module:
126+
module = gen_trtllm_comm_module().build_and_load()
127+
else:
128+
module = gen_trtllm_comm_legacy_module().build_and_load()
114129

115130
@register_custom_op(
116131
"flashinfer::trtllm_lamport_initialize", mutates_args=["buffer"]

0 commit comments

Comments
 (0)