Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions vllm/platforms/xpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if cache_config and cache_config.block_size is None:
cache_config.block_size = 64

# FIXME: Temporarily forcing eager mode
# remove after t.compile support stabilizes.
if (envs.VLLM_USE_V1 and model_config is not None
and not vllm_config.model_config.enforce_eager):
from vllm.config import CompilationLevel
vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION # noqa: E501

# Instances created using VllmConfig() typically have model_config set
# to None by default, so check for None before inspecting and updating
# the model config to avoid a null dereference.
Expand All @@ -114,6 +107,22 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
"disabling cudagraphs.")
compilation_config.cudagraph_mode = CUDAGraphMode.NONE

# disable cuda graphs since they are not supported on XPU platform
compilation_config.use_cudagraph = False
compilation_config.cudagraph_capture_sizes = []

from vllm.config import CompilationLevel
if compilation_config.level == CompilationLevel.PIECEWISE:
logger.info("Piecewise compilation level is not supported on XPU, "
"switching to Dynamo(DYNAMO_ONECE) compilation with "
"the Inductor backend.")
compilation_config.level = CompilationLevel.DYNAMO_ONCE
compilation_config.backend = "inductor"

if compilation_config.use_inductor:
# disable all custom ops for piecewise compilation
compilation_config.custom_ops = ["none"]

# check and update parallel config
parallel_config = vllm_config.parallel_config
parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"
Expand Down