Merged
Changes from all commits (49 commits)
d6bf991 [WIP][RC] Update PyTorch to 2.8.0 (huydhn, Jul 2, 2025)
456985c Handle xformers (huydhn, Jul 2, 2025)
4838d53 Some more tweaks (huydhn, Jul 8, 2025)
ca21216 Attempt to fix xformers build (huydhn, Jul 8, 2025)
0c43174 Silly typo (huydhn, Jul 9, 2025)
14c85d1 Few more tweaks for a greener CI (huydhn, Jul 10, 2025)
ad98d10 Attempt to offload to CPU to avoid OOM in CI (huydhn, Jul 11, 2025)
316f116 Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 11, 2025)
460ed09 Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 13, 2025)
7df288f Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 15, 2025)
6a08113 Fix lint (huydhn, Jul 15, 2025)
44f07c0 Run all test_sequence_parallel again (huydhn, Jul 16, 2025)
29fb5a0 Typo (huydhn, Jul 16, 2025)
6a7e3f8 Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 17, 2025)
c5d8940 Try to reproduce OOM after recent rebase (huydhn, Jul 17, 2025)
f320d9d Match xformers version (huydhn, Jul 17, 2025)
a5999e1 Not sure why building xformers 0.0.31 fails (huydhn, Jul 18, 2025)
d1dbb4e Remove some doc changes what are not needed (huydhn, Jul 18, 2025)
6f394f5 Tweak some tests (huydhn, Jul 19, 2025)
f62f6cf Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 22, 2025)
f1a6642 Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 23, 2025)
aa1d8c1 Lower memory usage for test_chatglm3_lora_tp4_fully_sharded_loras (huydhn, Jul 23, 2025)
6f2c684 Merge branch 'main' into pytorch-2.8.0 (huydhn, Jul 26, 2025)
91ce20f Build mamba_ssm from source (huydhn, Jul 26, 2025)
5ce81c7 Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 4, 2025)
c789827 Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 6, 2025)
93eb498 Ready 2.8.0 (huydhn, Aug 6, 2025)
c741b0e Update cpu.txt (huydhn, Aug 6, 2025)
f948f41 Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 12, 2025)
1543b92 Resolve xformers and mamba_ssm (huydhn, Aug 13, 2025)
8de151c Ready to land (huydhn, Aug 13, 2025)
bbf1ce3 xformers v0.0.32 is almost here (huydhn, Aug 14, 2025)
653ccd1 Use xformers 0.0.32 (huydhn, Aug 15, 2025)
bcb7ffc Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 15, 2025)
684c24d Install ao from cu128 (huydhn, Aug 15, 2025)
456d284 Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 19, 2025)
338d1b5 Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 20, 2025)
984ff0f Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 21, 2025)
90d455e Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 22, 2025)
be370a8 Merge branch 'main' into pytorch-2.8.0 (mgoin, Aug 26, 2025)
d8de108 Tweaking jason9693/Qwen2.5-1.5B-apeach memory usage (huydhn, Aug 26, 2025)
047e295 Same model, different test (huydhn, Aug 26, 2025)
a409dc2 Revert "Same model, different test" (huydhn, Aug 27, 2025)
b1c83ec Revert "Tweaking jason9693/Qwen2.5-1.5B-apeach memory usage" (huydhn, Aug 27, 2025)
cafb50d Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 27, 2025)
56739de Merge branch 'main' into pytorch-2.8.0 (huydhn, Aug 28, 2025)
3526230 Switch to XFORMERS backend for some tests (huydhn, Aug 28, 2025)
6bae6f2 Revert "Switch to XFORMERS backend for some tests" (huydhn, Aug 28, 2025)
102d0d7 Apply #23853 (huydhn, Aug 28, 2025)
4 changes: 2 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -449,8 +449,8 @@ steps:
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
-# after torchao 0.12 release
-- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+# after torchao 0.12 release, and pin a working version of torchao nightly here
+- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

- label: LM Eval Small Models # 53min
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")

#
# Try to find python package with an executable that exactly matches
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<80.0.0",
"setuptools-scm>=8.0",
"torch == 2.7.1",
"torch == 2.8.0",
"wheel",
"jinja2",
]
3 changes: 2 additions & 1 deletion requirements/build.txt
@@ -4,7 +4,8 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
-torch==2.7.1
+torch==2.8.0
wheel
jinja2>=3.1.6
regex
+build
9 changes: 4 additions & 5 deletions requirements/cpu.txt
@@ -9,17 +9,16 @@ packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
-torch==2.7.0; platform_system == "Darwin"
-torch==2.7.0; platform_machine == "ppc64le"
-torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960
+torch==2.8.0; platform_system == "Darwin"
+torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchaudio==2.7.0; platform_machine == "ppc64le"
+torchaudio==2.8.0; platform_machine == "ppc64le"

# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchvision==0.22.0; platform_machine == "ppc64le"
+torchvision==0.23.0; platform_machine == "ppc64le"
datasets # for benchmark scripts

# Intel Extension for PyTorch, only for x86_64 CPUs
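Note on the merged ppc64le/aarch64 line above: requirement markers follow PEP 508 and may combine conditions with boolean operators, which is what lets the two pins collapse into one. A minimal sketch for checking how such a marker evaluates on the current host, using the packaging library already pinned in these requirements (the snippet is illustrative, not part of the diff):

from packaging.markers import Marker

# Evaluates against the running interpreter's environment by default,
# so this prints True only on ppc64le or aarch64 hosts.
marker = Marker('platform_machine == "ppc64le" or platform_machine == "aarch64"')
print(marker.evaluate())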
10 changes: 5 additions & 5 deletions requirements/cuda.txt
@@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9'

# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.7.1
-torchaudio==2.7.1
+torch==2.8.0
+torchaudio==2.8.0
# These must be updated alongside torch
-torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
-xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
+torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
+xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
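Since xformers wheels are built against a specific torch ABI (0.0.31 targeted torch 2.7, 0.0.32.post1 targets 2.8), a quick post-install sanity check can catch mismatches early; a minimal sketch, with the printed versions as examples only:

import torch
import xformers

# The two must move in lockstep: xformers 0.0.32.post1 requires PyTorch >= 2.8.
print(torch.__version__)     # e.g. 2.8.0+cu128
print(xformers.__version__)  # e.g. 0.0.32.post1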
8 changes: 4 additions & 4 deletions requirements/rocm-build.txt
@@ -1,10 +1,10 @@
# Common dependencies
-r common.txt

---extra-index-url https://download.pytorch.org/whl/rocm6.2.4
-torch==2.7.0
-torchvision==0.22.0
-torchaudio==2.7.0
+--extra-index-url https://download.pytorch.org/whl/rocm6.3
+torch==2.8.0
+torchvision==0.23.0
+torchaudio==2.8.0

triton==3.3.0
cmake>=3.26.1,<4
6 changes: 3 additions & 3 deletions requirements/test.in
@@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.7.1
-torchaudio==2.7.1
-torchvision==0.22.1
+torch==2.8.0
+torchaudio==2.8.0
+torchvision==0.23.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
36 changes: 18 additions & 18 deletions requirements/test.txt
@@ -541,42 +541,42 @@ numpy==1.26.4
# tritonclient
# vocos
# xarray
-nvidia-cublas-cu12==12.8.3.14
+nvidia-cublas-cu12==12.8.4.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
-nvidia-cuda-cupti-cu12==12.8.57
+nvidia-cuda-cupti-cu12==12.8.90
# via torch
-nvidia-cuda-nvrtc-cu12==12.8.61
+nvidia-cuda-nvrtc-cu12==12.8.93
# via torch
-nvidia-cuda-runtime-cu12==12.8.57
+nvidia-cuda-runtime-cu12==12.8.90
# via torch
-nvidia-cudnn-cu12==9.7.1.26
+nvidia-cudnn-cu12==9.10.2.21
# via torch
-nvidia-cufft-cu12==11.3.3.41
+nvidia-cufft-cu12==11.3.3.83
# via torch
-nvidia-cufile-cu12==1.13.0.11
+nvidia-cufile-cu12==1.13.1.3
# via torch
-nvidia-curand-cu12==10.3.9.55
+nvidia-curand-cu12==10.3.9.90
# via torch
-nvidia-cusolver-cu12==11.7.2.55
+nvidia-cusolver-cu12==11.7.3.90
# via torch
-nvidia-cusparse-cu12==12.5.7.53
+nvidia-cusparse-cu12==12.5.8.93
# via
# nvidia-cusolver-cu12
# torch
-nvidia-cusparselt-cu12==0.6.3
+nvidia-cusparselt-cu12==0.7.1
# via torch
-nvidia-nccl-cu12==2.26.2
+nvidia-nccl-cu12==2.27.3
# via torch
-nvidia-nvjitlink-cu12==12.8.61
+nvidia-nvjitlink-cu12==12.8.93
# via
# nvidia-cufft-cu12
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
-nvidia-nvtx-cu12==12.8.55
+nvidia-nvtx-cu12==12.8.90
# via torch
omegaconf==2.3.0
# via
@@ -1069,7 +1069,7 @@ tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
-torch==2.7.1+cu128
+torch==2.8.0+cu128
# via
# -r requirements/test.in
# accelerate
@@ -1098,7 +1098,7 @@ torch==2.7.1+cu128
# torchvision
# vector-quantize-pytorch
# vocos
-torchaudio==2.7.1+cu128
+torchaudio==2.8.0+cu128
# via
# -r requirements/test.in
# encodec
@@ -1111,7 +1111,7 @@ torchmetrics==1.7.4
# pytorch-lightning
# terratorch
# torchgeo
-torchvision==0.22.1+cu128
+torchvision==0.23.0+cu128
# via
# -r requirements/test.in
# lightly
@@ -1152,7 +1152,7 @@ transformers==4.55.2
# transformers-stream-generator
transformers-stream-generator==0.0.5
# via -r requirements/test.in
-triton==3.3.1
+triton==3.4.0
# via torch
tritonclient==2.51.0
# via
2 changes: 1 addition & 1 deletion tests/distributed/test_sequence_parallel.py
@@ -292,7 +292,7 @@ def _compare_sp(
# TODO support other models
# [LANGUAGE GENERATION]
"meta-llama/Llama-3.2-1B-Instruct",
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
]


6 changes: 5 additions & 1 deletion tests/lora/test_chatglm3_tp.py
@@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
+# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
+# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
+# more GPU memory causing vLLM to OOM
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
@@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
-enable_chunked_prefill=True)
+enable_chunked_prefill=True,
+gpu_memory_utilization=0.85)
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
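The NCCL version behind the OOM comment above comes from the torch wheels themselves (the cu128 lockfile in this PR pins nvidia-nccl-cu12==2.27.3, past the 2.26.3 threshold). A minimal sketch for confirming which NCCL a given torch build links against:

import torch

# NCCL >= 2.26.3 appears to reserve more GPU memory per the comment above,
# which is why the test now leaves extra headroom.
print(torch.cuda.nccl.version())  # e.g. (2, 27, 3) on the torch 2.8.0 cu128 wheels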
5 changes: 3 additions & 2 deletions vllm/v1/attention/backends/flex_attention.py
@@ -789,6 +789,7 @@ def get_kernel_options(query, block_m, block_n,
device_props = torch.cuda.get_device_properties()
max_shared_memory = device_props.shared_memory_per_block_optin
if max_shared_memory < 144 * 1024:
kernel_options["BLOCK_M"] = 32
kernel_options["BLOCK_N"] = 32
kernel_options["BLOCK_M"] = kernel_options["BLOCK_M"] // 2
kernel_options["BLOCK_N"] = kernel_options["BLOCK_N"] // 2

return kernel_options
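The change above halves whatever block sizes are already configured instead of pinning them to 32, so devices with under 144 KiB of opt-in shared memory scale down proportionally from any starting tile. A minimal sketch of the effect (the 64x64 starting tile is an illustrative assumption, not vLLM's actual default):

def shrink_for_small_smem(kernel_options: dict, max_shared_memory: int) -> dict:
    # Mirrors the updated branch: halve the configured tile sizes on
    # devices with less than 144 KiB of shared memory per block.
    if max_shared_memory < 144 * 1024:
        kernel_options["BLOCK_M"] = kernel_options["BLOCK_M"] // 2
        kernel_options["BLOCK_N"] = kernel_options["BLOCK_N"] // 2
    return kernel_options

print(shrink_for_small_smem({"BLOCK_M": 64, "BLOCK_N": 64}, 100 * 1024))
# -> {'BLOCK_M': 32, 'BLOCK_N': 32}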