Skip to content

Commit e018aed

Browse files
842974287amd-xiaoyu12
authored and committed
add an env var for path to pre-downloaded flashinfer cubin files (vllm-project#22675)
Signed-off-by: Xiao Yu <[email protected]>
1 parent 57cf9b6 commit e018aed

File tree

2 files changed

+11
-0
lines changed

2 files changed

+11
-0
lines changed

vllm/envs.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@
158158
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
159159
VLLM_ENABLE_RESPONSES_API_STORE: bool = False
160160
VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
161+
VLLM_HAS_FLASHINFER_CUBIN: bool = False
161162
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
162163
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
163164
VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
@@ -1105,6 +1106,11 @@ def get_vllm_port() -> Optional[int]:
11051106
"VLLM_USE_TRTLLM_ATTENTION":
11061107
lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None),
11071108

1109+
# If set, it means we pre-downloaded cubin files and flashinfer will
1110+
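# read the cubin files directly. NOTE(review): the raw string from os.getenv is returned, so any non-empty value (even "0") is treated as set - confirm this is intended.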
1111+
"VLLM_HAS_FLASHINFER_CUBIN":
1112+
lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
1113+
11081114
# If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
11091115
# Otherwise, uses the first available of: flashinfer cutlass GEMM,
11101116
# vllm cutlass GEMM, marlin GEMM.

vllm/utils/flashinfer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,11 @@ def has_nvidia_artifactory() -> bool:
132132
This checks connectivity to the kernel inference library artifactory
133133
which is required for downloading certain cubin kernels like TRTLLM FHMA.
134134
"""
135+
# Since FLASHINFER_CUBIN_DIR defines the pre-downloaded cubins path, when
136+
# VLLM_HAS_FLASHINFER_CUBIN is set, we can assume the cubins are available.
137+
if envs.VLLM_HAS_FLASHINFER_CUBIN:
138+
return True
139+
135140
try:
136141
# Use a short timeout to avoid blocking for too long
137142
response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5)

0 commit comments

Comments
 (0)