Commit 3ae082c

Authored by dongbo910220, yewentao256, and DarkLight1337
[Chore] Separate out optional dependency checks from vllm.utils (#27207)
Signed-off-by: dongbo910220 <[email protected]>
Co-authored-by: Wentao Ye <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
1 parent 49c00fe commit 3ae082c
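This commit relocates the optional-dependency probes (has_deep_gemm, has_deep_ep, has_pplx, has_triton_kernels) from the top-level vllm.utils namespace into vllm.utils.import_utils; the diffs below only touch import sites. For orientation, a minimal sketch of what such a probe helper might look like, assuming it just checks whether the optional package is importable and caches the answer — an illustration, not necessarily the actual vllm.utils.import_utils implementation:

# Illustrative sketch only (assumed implementation, not taken from this commit):
# a cached probe that reports whether an optional package can be imported.
import importlib.util
from functools import cache


@cache
def _has_module(module_name: str) -> bool:
    """Return True if `module_name` is importable in the current environment."""
    return importlib.util.find_spec(module_name) is not None


def has_deep_gemm() -> bool:
    """Whether the optional `deep_gemm` package is available."""
    return _has_module("deep_gemm")


def has_deep_ep() -> bool:
    """Whether the optional `deep_ep` package is available."""
    return _has_module("deep_ep")

Call sites are unchanged (for example, dg_available = has_deep_gemm() in tests/kernels/moe/test_block_fp8.py); only the import path moves.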

File tree: 22 files changed, +60 -64 lines changed


tests/kernels/attention/test_deepgemm_attention.py

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,7 @@
 import torch

 from vllm.platforms import current_platform
-from vllm.utils import cdiv, has_deep_gemm
+from vllm.utils import cdiv
 from vllm.utils.deep_gemm import (
     _ceil_to_ue8m0,
     calc_diff,
@@ -15,6 +15,7 @@
     get_num_sms,
     get_paged_mqa_logits_metadata,
 )
+from vllm.utils.import_utils import has_deep_gemm


 def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor:

tests/kernels/moe/modular_kernel_tools/common.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
     FusedMoEQuantConfig,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx

 from .mk_objects import (
     TestMoEQuantConfig,

tests/kernels/moe/modular_kernel_tools/mk_objects.py

Lines changed: 1 addition & 1 deletion
@@ -35,9 +35,9 @@
     cutlass_fp8_supported,
 )
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
 from vllm.utils.deep_gemm import is_deep_gemm_supported
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx


 @dataclass

tests/kernels/moe/parallel_utils.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 from torch.multiprocessing import spawn  # pyright: ignore[reportPrivateImportUsage]
 from typing_extensions import ParamSpec

-from vllm.utils import has_deep_ep
+from vllm.utils.import_utils import has_deep_ep
 from vllm.utils.network_utils import get_open_port

 if has_deep_ep():

tests/kernels/moe/test_block_fp8.py

Lines changed: 1 addition & 1 deletion
@@ -21,11 +21,11 @@
     modular_triton_fused_moe,
 )
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_gemm
 from vllm.utils.deep_gemm import (
     get_mk_alignment_for_contiguous_layout,
     is_deep_gemm_e8m0_used,
 )
+from vllm.utils.import_utils import has_deep_gemm

 dg_available = has_deep_gemm()

tests/kernels/moe/test_deepep_deepgemm_moe.py

Lines changed: 1 addition & 1 deletion
@@ -21,8 +21,8 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_ep, has_deep_gemm
 from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm

 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch

tests/kernels/moe/test_deepep_moe.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
     per_token_group_quant_fp8,
 )
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_ep
+from vllm.utils.import_utils import has_deep_ep

 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch

tests/kernels/moe/test_gpt_oss_triton_kernels.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 import torch
 import torch.nn.functional as F

-from vllm.utils import has_triton_kernels
+from vllm.utils.import_utils import has_triton_kernels

 if not has_triton_kernels():
     pytest.skip(
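The module-level guard above is truncated in the rendered diff; the usual pattern is to skip the entire test module when the optional dependency is missing. A hypothetical sketch of that usage (the actual skip reason string is not shown above):

# Hypothetical usage sketch; the real skip message is elided in the diff above.
import pytest

from vllm.utils.import_utils import has_triton_kernels

if not has_triton_kernels():
    pytest.skip(
        "triton_kernels is not installed",  # placeholder reason
        allow_module_level=True,
    )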

tests/kernels/moe/test_modular_kernel_combinations.py

Lines changed: 1 addition & 1 deletion
@@ -13,8 +13,8 @@
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
+from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
 from vllm.utils.torch_utils import cuda_device_count_stateless

 from .modular_kernel_tools.common import (

tests/kernels/quantization/test_block_fp8.py

Lines changed: 1 addition & 1 deletion
@@ -18,12 +18,12 @@
     w8a8_triton_block_scaled_mm,
 )
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_gemm
 from vllm.utils.deep_gemm import (
     fp8_gemm_nt,
     get_col_major_tma_aligned_tensor,
     per_block_cast_to_fp8,
 )
+from vllm.utils.import_utils import has_deep_gemm

 if current_platform.get_device_capability() < (9, 0):
     pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
