2 files changed, +11 −0 lines changed

The first file registers the new VLLM_HAS_FLASHINFER_CUBIN environment variable, with a False default and a lazy getter alongside the existing entries:

@@ -158,6 +158,7 @@
 VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
 VLLM_ENABLE_RESPONSES_API_STORE: bool = False
 VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
+VLLM_HAS_FLASHINFER_CUBIN: bool = False
 VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
 VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
 VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None

@@ -1105,6 +1106,11 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_USE_TRTLLM_ATTENTION":
     lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None),
 
+    # If set, the cubin files were pre-downloaded and flashinfer will
+    # read them directly.
+    "VLLM_HAS_FLASHINFER_CUBIN":
+    lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
+
     # If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
     # Otherwise, uses the first available of: flashinfer cutlass GEMM,
     # vllm cutlass GEMM, marlin GEMM.
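For readers unfamiliar with how these entries are consumed, here is a minimal, self-contained sketch of the lazy env-var registry pattern the hunk extends. It is an illustration only, not vLLM's actual envs module; note that the raw os.getenv call returns the string value whenever the variable is set, so any non-empty value (even "0") reads as truthy.

```python
# Illustrative sketch (not vLLM's envs module): a lazy env-var registry in the
# style of the hunk above -- names map to zero-arg lambdas that read the
# process environment on attribute access.
import os
from typing import Any, Callable

_ENV_VARS: dict[str, Callable[[], Any]] = {
    # Mirrors the added entry: os.getenv returns the raw string when the
    # variable is set, so any non-empty value (even "0") is truthy.
    "VLLM_HAS_FLASHINFER_CUBIN":
    lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
}


def __getattr__(name: str) -> Any:
    # PEP 562 module-level __getattr__: evaluate the lambda at access time so
    # the value tracks the current environment rather than import-time state.
    if name in _ENV_VARS:
        return _ENV_VARS[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

Saved as a module (say, a hypothetical my_envs.py) and imported, the value is read as an attribute, e.g. my_envs.VLLM_HAS_FLASHINFER_CUBIN, which is the same access style the second changed file uses via envs.VLLM_HAS_FLASHINFER_CUBIN.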
The second file short-circuits the artifactory connectivity check when the cubins are already on disk:

@@ -132,6 +132,11 @@ def has_nvidia_artifactory() -> bool:
     This checks connectivity to the kernel inference library artifactory
     which is required for downloading certain cubin kernels like TRTLLM FHMA.
     """
+    # VLLM_HAS_FLASHINFER_CUBIN indicates the cubins were pre-downloaded to
+    # the path given by FLASHINFER_CUBIN_DIR, so assume they are available.
+    if envs.VLLM_HAS_FLASHINFER_CUBIN:
+        return True
+
     try:
         # Use a short timeout to avoid blocking for too long
         response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5)
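Below is a standalone sketch of the function after this change, assuming the surrounding pieces (the repository URL constant and a 200-status check) behave as named here; the URL is a placeholder, and the env var is read directly instead of through vllm.envs so the snippet runs on its own:

```python
# Sketch of the guarded connectivity check. The URL is a placeholder, not the
# real FLASHINFER_CUBINS_REPOSITORY value.
import os

import requests

FLASHINFER_CUBINS_REPOSITORY = "https://example.invalid/flashinfer-cubins/"  # placeholder


def has_nvidia_artifactory() -> bool:
    """Return True if the required cubin kernels can be obtained.

    Either the cubins were pre-downloaded (VLLM_HAS_FLASHINFER_CUBIN set, with
    FLASHINFER_CUBIN_DIR pointing at them), or the repository is reachable.
    """
    # Operator declared that cubins are already on disk: no network needed.
    if os.getenv("VLLM_HAS_FLASHINFER_CUBIN"):
        return True
    try:
        # Short timeout so startup is not blocked when the endpoint is slow.
        response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False
```

With cubins baked into an image or pre-fetched to FLASHINFER_CUBIN_DIR, an operator would set VLLM_HAS_FLASHINFER_CUBIN=1 and the network probe is skipped, avoiding a needless round-trip at startup.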