From d6bf991e148a0cdcad926173af2cd891f627f679 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 2 Jul 2025 00:16:26 -0700 Subject: [PATCH 01/31] [WIP][RC] Update PyTorch to 2.8.0 Signed-off-by: Huy Do --- .pre-commit-config.yaml | 2 +- CMakeLists.txt | 4 ++-- docker/Dockerfile | 6 +++--- pyproject.toml | 2 +- requirements/build.txt | 3 ++- requirements/cpu.txt | 10 +++++----- requirements/cuda.txt | 8 ++++---- requirements/rocm-build.txt | 8 ++++---- requirements/test.in | 7 ++++--- requirements/test.txt | 40 +++++++++++++++++++------------------ 10 files changed, 47 insertions(+), 43 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 720c06acf144..d69895a2b43a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,7 +49,7 @@ repos: rev: 0.6.17 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --extra-index-url, https://download.pytorch.org/whl/test/cu128] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/CMakeLists.txt b/CMakeLists.txt index 0129f85123fb..8a8345f1a85b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0") # # Try to find python package with an executable that exactly matches diff --git a/docker/Dockerfile b/docker/Dockerfile index c49b5da2714c..f96c750ec50c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -50,7 +50,7 @@ ARG UV_INDEX_URL=${PIP_INDEX_URL} ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch provides its own indexes for standard and nightly builds -ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl +ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl/test ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly # PIP supports multiple authentication schemes, including keyring @@ -376,8 +376,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # $ # upload the wheel to a public location, e.g. 
https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl # Allow specifying a version, Git revision or local .whl file -ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer" -ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl" +ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/test/cu128/flashinfer" +ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.8-cp39-abi3-linux_x86_64.whl" ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" ARG FLASHINFER_GIT_REF="v0.2.6.post1" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' diff --git a/pyproject.toml b/pyproject.toml index 340abb385657..2831f1ac253a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging>=24.2", "setuptools>=77.0.3,<80.0.0", "setuptools-scm>=8.0", - "torch == 2.7.0", + "torch == 2.8.0", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index 528cd3b538ef..5f826a1afa14 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,7 +4,8 @@ ninja packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 -torch==2.7.0 +torch==2.8.0 wheel jinja2>=3.1.6 regex +build diff --git a/requirements/cpu.txt b/requirements/cpu.txt index df3a3393563a..a44f1051bf30 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -7,18 +7,18 @@ numba == 0.61.2; python_version > '3.9' # Dependencies for CPUs packaging>=24.2 setuptools>=77.0.3,<80.0.0 ---extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://download.pytorch.org/whl/test/cpu torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 -torch==2.7.0; platform_system == "Darwin" -torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.8.0; platform_system == "Darwin" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.7.0; platform_machine == "ppc64le" +torchaudio==2.8.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.22.0; platform_machine == "ppc64le" +torchvision==0.23.0; platform_machine == "ppc64le" datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs diff --git a/requirements/cuda.txt b/requirements/cuda.txt index a71d9728f38a..ef015081b9f4 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.7.0 -torchaudio==2.7.0 +torch==2.8.0 +torchaudio==2.8.0 # These must be updated alongside torch -torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +torchvision==0.23.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # https://github.com/facebookresearch/xformers/releases/tag/v0.0.30 -xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 +git+https://github.com/facebookresearch/xformers@v0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 94201543cd4f..f15efd2c91ac 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -1,10 +1,10 @@ # Common dependencies -r common.txt ---extra-index-url https://download.pytorch.org/whl/rocm6.2.4 -torch==2.7.0 -torchvision==0.22.0 -torchaudio==2.7.0 +--extra-index-url https://download.pytorch.org/whl/test/rocm6.3 +torch==2.8.0 +torchvision==0.23.0 +torchaudio==2.8.0 triton==3.2 cmake>=3.26.1,<4 diff --git a/requirements/test.in b/requirements/test.in index 907d90201a24..f2f179da2d14 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -22,9 +22,10 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm # required for internvl test -torch==2.7.0 -torchaudio==2.7.0 -torchvision==0.22.0 +--extra-index-url https://download.pytorch.org/whl/test/cu128 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 transformers_stream_generator # required for qwen-vl test mamba_ssm # required for plamo2 test matplotlib # required for qwen-vl test diff --git a/requirements/test.txt b/requirements/test.txt index 2f3ccc4f61df..3614f5c66451 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 +# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match absl-py==2.1.0 # via rouge-score accelerate==1.0.1 @@ -377,42 +377,44 @@ numpy==1.26.4 # transformers # tritonclient # vocos -nvidia-cublas-cu12==12.8.3.14 +nvidia-cublas-cu12==12.8.4.1 # via # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch -nvidia-cuda-cupti-cu12==12.8.57 +nvidia-cuda-cupti-cu12==12.8.90 # via torch -nvidia-cuda-nvrtc-cu12==12.8.61 +nvidia-cuda-nvrtc-cu12==12.8.93 # via torch -nvidia-cuda-runtime-cu12==12.8.57 +nvidia-cuda-runtime-cu12==12.8.90 # via torch -nvidia-cudnn-cu12==9.7.1.26 +nvidia-cudnn-cu12==9.10.2.21 # via torch -nvidia-cufft-cu12==11.3.3.41 +nvidia-cufft-cu12==11.3.3.83 # via torch -nvidia-cufile-cu12==1.13.0.11 +nvidia-cufile-cu12==1.13.1.3 # via torch -nvidia-curand-cu12==10.3.9.55 +nvidia-curand-cu12==10.3.9.90 # via torch -nvidia-cusolver-cu12==11.7.2.55 +nvidia-cusolver-cu12==11.7.3.90 # via torch -nvidia-cusparse-cu12==12.5.7.53 +nvidia-cusparse-cu12==12.5.8.93 # via # nvidia-cusolver-cu12 # torch -nvidia-cusparselt-cu12==0.6.3 +nvidia-cusparselt-cu12==0.7.1 # via torch -nvidia-nccl-cu12==2.26.2 +nvidia-nccl-cu12==2.27.3 # via torch -nvidia-nvjitlink-cu12==12.8.61 +nvidia-nvjitlink-cu12==12.8.93 # via # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvtx-cu12==12.8.55 +nvidia-nvshmem-cu12==3.2.5 + # via torch +nvidia-nvtx-cu12==12.8.90 # via torch opencensus==0.11.4 # via ray @@ -757,7 +759,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.7.0+cu128 +torch==2.8.0+cu128 # via # -r requirements/test.in # accelerate @@ -776,12 
+778,12 @@ torch==2.7.0+cu128 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.7.0+cu128 +torchaudio==2.8.0+cu128 # via # -r requirements/test.in # encodec # vocos -torchvision==0.22.0+cu128 +torchvision==0.23.0+cu128 # via # -r requirements/test.in # timm @@ -811,7 +813,7 @@ transformers==4.52.4 # transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.3.0 +triton==3.4.0 # via torch tritonclient==2.51.0 # via From 456985c34040ff97c045c869074ef9a99c0a89ae Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 2 Jul 2025 00:57:49 -0700 Subject: [PATCH 02/31] Handle xformers Signed-off-by: Huy Do --- docker/Dockerfile | 4 ++++ requirements/cuda.txt | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f96c750ec50c..290192792d27 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -363,6 +363,10 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist uv pip install --system dist/*.whl --verbose \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') +# TODO (huydhn): Remove this once xformers is released for 2.8.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" + # If we need to build FlashInfer wheel before its release: # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' diff --git a/requirements/cuda.txt b/requirements/cuda.txt index ef015081b9f4..528e3292c8ce 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -10,5 +10,6 @@ torch==2.8.0 torchaudio==2.8.0 # These must be updated alongside torch torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +# TODO (huydhn): Re-enable this once xformers is released for 2.8.0 # https://github.com/facebookresearch/xformers/releases/tag/v0.0.30 -git+https://github.com/facebookresearch/xformers@v0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 +# git+https://github.com/facebookresearch/xformers@v0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 From 4838d53ef28fcd00d0ebaf62f3989467ee0960cd Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 8 Jul 2025 16:07:55 -0700 Subject: [PATCH 03/31] Some more tweaks Signed-off-by: Huy Do --- docs/contributing/ci/update_pytorch_version.md | 11 +++++++++++ tests/standalone_tests/python_only_compile.sh | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 2327bc4b53ad..1d7c90a2afce 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -39,6 +39,17 @@ via `UV_INDEX_STRATEGY` env variable or via `--index-strategy unsafe-best-match` If failures are found in the pull request, raise them as issues on vLLM and cc the PyTorch release team to initiate discussion on how to address them. +### Update some tests to use PyTorch RC + +#### Python-only installation test + +Update tests/standalone_tests/python_only_compile.sh to + +``` +VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . 
\ + --extra-index-url https://download.pytorch.org/whl/test/cu128 +``` + ## Update CUDA version The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). Due to limitations, only the latest stable CUDA version (for example, diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index ec1bcbcc58a0..baae47f71606 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -18,7 +18,9 @@ apt autoremove -y echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py -VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . +# TESTING, TO BE REMOVED +VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . \ + --extra-index-url https://download.pytorch.org/whl/test/cu128 # Run the script python3 -c 'import vllm' From ca21216d4b9ecf6f285c7864d817bce440665153 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 8 Jul 2025 16:39:06 -0700 Subject: [PATCH 04/31] Attempt to fix xformers build Signed-off-by: Huy Do --- docker/Dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 290192792d27..35c312aca6db 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -364,8 +364,13 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # TODO (huydhn): Remove this once xformers is released for 2.8.0 -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' + . /etc/environment + export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0 12.0' uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" + # DEBUG + python -m xformers.info +BASH # If we need to build FlashInfer wheel before its release: # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ From 0c431749f10e0dafaaed004598db16096937bd6b Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 8 Jul 2025 18:23:31 -0700 Subject: [PATCH 05/31] Silly typo Signed-off-by: Huy Do --- docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 35c312aca6db..dca5201804c8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -364,12 +364,13 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # TODO (huydhn): Remove this once xformers is released for 2.8.0 +# https://pytorch.s3.us-east-1.amazonaws.com/whl/test/cu128/xformers/xformers-0.0.30%2B4cf69f09.d20250708-cp312-cp312-linux_x86_64.whl RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . 
/etc/environment export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0 12.0' uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" # DEBUG - python -m xformers.info + python3 -m xformers.info BASH # If we need to build FlashInfer wheel before its release: From 14c85d1a034359cd7042652077f95a68d6021d46 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 9 Jul 2025 23:45:33 -0700 Subject: [PATCH 06/31] Few more tweaks for a greener CI Signed-off-by: Huy Do --- tests/distributed/test_sequence_parallel.py | 14 +++++++++----- tests/lora/test_chatglm3_tp.py | 6 +++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index b2f6a8ab9dd3..e97104e1c875 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -13,6 +13,7 @@ from typing import Literal, NamedTuple, Optional import pytest +import torch from vllm.config import TaskOption from vllm.logger import init_logger @@ -288,12 +289,15 @@ def _compare_sp( "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(), } -SP_TEST_MODELS = [ +SP_TEST_MODELS = { # TODO support other models # [LANGUAGE GENERATION] - "meta-llama/Llama-3.2-1B-Instruct", - "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" -] + "meta-llama/Llama-3.2-1B-Instruct": + True, + # FP8 reduction requires sm90 or higher + "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": + torch.cuda.get_device_capability() >= (9, 0), +} @pytest.mark.parametrize( @@ -302,7 +306,7 @@ def _compare_sp( [ params for model_id, settings in SP_TEXT_GENERATION_MODELS.items() for params in settings.iter_params(model_id) - if model_id in SP_TEST_MODELS + if model_id in SP_TEST_MODELS and SP_TEST_MODELS[model_id] ], ) @create_new_process_for_each_test() diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 5481b413b8f5..92644f728965 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -48,7 +48,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: @create_new_process_for_each_test() def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, + max_model_len=512, enable_lora=True, max_loras=4, max_lora_rank=64, @@ -67,7 +67,7 @@ def test_chatglm3_lora(chatglm3_lora_files): @create_new_process_for_each_test() def test_chatglm3_lora_tp4(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, + max_model_len=512, enable_lora=True, max_loras=4, max_lora_rank=64, @@ -88,7 +88,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): @create_new_process_for_each_test() def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, + max_model_len=512, enable_lora=True, max_loras=4, max_lora_rank=64, From ad98d103c63ec5957f49505a0f30a2c206c6bbd3 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 11 Jul 2025 00:22:22 -0700 Subject: [PATCH 07/31] Attempt to offload to CPU to avoid OOM in CI Signed-off-by: Huy Do --- .../entrypoints/openai/test_translation_validation.py | 2 +- tests/lora/test_chatglm3_tp.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index 0c2cb367f330..d83aa26d4fb3 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ 
b/tests/entrypoints/openai/test_translation_validation.py @@ -169,4 +169,4 @@ async def test_long_audio_request(foscolo): temperature=0.0) out = json.loads(translation)['text'].strip().lower() # TODO investigate higher model uncertainty in for longer translations. - assert out.count("nor will i ever") == 2 + assert out.count("nor do i ever") == 2 diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 92644f728965..54aec7624d8b 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -48,7 +48,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: @create_new_process_for_each_test() def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, - max_model_len=512, + max_model_len=1024, enable_lora=True, max_loras=4, max_lora_rank=64, @@ -67,7 +67,7 @@ def test_chatglm3_lora(chatglm3_lora_files): @create_new_process_for_each_test() def test_chatglm3_lora_tp4(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, - max_model_len=512, + max_model_len=1024, enable_lora=True, max_loras=4, max_lora_rank=64, @@ -88,14 +88,16 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): @create_new_process_for_each_test() def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, - max_model_len=512, + max_model_len=1024, enable_lora=True, max_loras=4, max_lora_rank=64, tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=True, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + gpu_memory_utilization=0.85, + cpu_offload_gb=10) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] From 6a08113f4095ff9fbfa2e0a8dd61bb5133ac05d7 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 15 Jul 2025 10:17:47 -0700 Subject: [PATCH 08/31] Fix lint Signed-off-by: Huy Do --- requirements/test.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index 8a77d9cadaf1..5e4dd3959985 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -412,8 +412,6 @@ nvidia-nvjitlink-cu12==12.8.93 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvshmem-cu12==3.2.5 - # via torch nvidia-nvtx-cu12==12.8.90 # via torch opencensus==0.11.4 From 44f07c041b58a9a85c509d65eb25a3a6ca1ec6a0 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 16 Jul 2025 11:23:01 -0700 Subject: [PATCH 09/31] Run all test_sequence_parallel again Signed-off-by: Huy Do --- tests/distributed/test_sequence_parallel.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index e97104e1c875..c59dcb37568e 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -13,7 +13,6 @@ from typing import Literal, NamedTuple, Optional import pytest -import torch from vllm.config import TaskOption from vllm.logger import init_logger @@ -292,11 +291,8 @@ def _compare_sp( SP_TEST_MODELS = { # TODO support other models # [LANGUAGE GENERATION] - "meta-llama/Llama-3.2-1B-Instruct": - True, - # FP8 reduction requires sm90 or higher - "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": - torch.cuda.get_device_capability() >= (9, 0), + "meta-llama/Llama-3.2-1B-Instruct", + "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", } @@ -306,7 +302,7 @@ def _compare_sp( [ params for model_id, settings in SP_TEXT_GENERATION_MODELS.items() for params in 
settings.iter_params(model_id) - if model_id in SP_TEST_MODELS and SP_TEST_MODELS[model_id] + if model_id in SP_TEST_MODELS ], ) @create_new_process_for_each_test() From 29fb5a0362035d385640dba240514fc9fa849eab Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 16 Jul 2025 11:29:13 -0700 Subject: [PATCH 10/31] Typo Signed-off-by: Huy Do --- tests/distributed/test_sequence_parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index c59dcb37568e..f320fbad30a4 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -288,12 +288,12 @@ def _compare_sp( "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(), } -SP_TEST_MODELS = { +SP_TEST_MODELS = [ # TODO support other models # [LANGUAGE GENERATION] "meta-llama/Llama-3.2-1B-Instruct", "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", -} +] @pytest.mark.parametrize( From c5d89404a04a9d2e864c521888bb4b4e0d2267bc Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 17 Jul 2025 13:41:10 -0700 Subject: [PATCH 11/31] Try to reproduce OOM after recent rebase Signed-off-by: Huy Do --- tests/lora/test_chatglm3_tp.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 54aec7624d8b..5481b413b8f5 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -95,9 +95,7 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=True, - enable_chunked_prefill=True, - gpu_memory_utilization=0.85, - cpu_offload_gb=10) + enable_chunked_prefill=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] From f320d9d26661831a07b0e02a3cd394155b4413b6 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 17 Jul 2025 13:43:03 -0700 Subject: [PATCH 12/31] Match xformers version Signed-off-by: Huy Do --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 350483f652f0..67c7a23d731c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -381,7 +381,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0 12.0' - uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" + uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31" # DEBUG python3 -m xformers.info BASH From a5999e17b3edb5154f4f4326fbe440ac82bbaf34 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 17 Jul 2025 17:17:20 -0700 Subject: [PATCH 13/31] Not sure why building xformers 0.0.31 fails Signed-off-by: Huy Do --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 67c7a23d731c..350483f652f0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -381,7 +381,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . 
/etc/environment export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0 12.0' - uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31" + uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" # DEBUG python3 -m xformers.info BASH From d1dbb4eae1c3fd78ca35a2fea89e2a9ebdfe8072 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 18 Jul 2025 11:39:59 -0700 Subject: [PATCH 14/31] Remove some doc changes that are not needed Signed-off-by: Huy Do --- docs/contributing/ci/update_pytorch_version.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 9dd7a7dbd255..1fe18d5d8856 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -55,17 +55,6 @@ to point to the new releases for `torch`, `torchvision`, and `torchaudio`. If failures are found in the pull request, raise them as issues on vLLM and cc the PyTorch release team to initiate discussion on how to address them. -### Update some tests to use PyTorch RC - -#### Python-only installation test - -Update tests/standalone_tests/python_only_compile.sh to - -``` -VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . \ - --extra-index-url https://download.pytorch.org/whl/test/cu128 -``` - ## Update CUDA version The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). Due to limitations, only the latest stable CUDA version (for example, From 6f394f5c162bf988f231d81f9be1bd730124720c Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 18 Jul 2025 18:53:12 -0700 Subject: [PATCH 15/31] Tweak some tests Signed-off-by: Huy Do --- tests/entrypoints/openai/test_vision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index b6f1d64803e5..fd613842f986 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -36,11 +36,11 @@ ], [ "The image shows a Venn diagram with three over", - "The image shows a Venn diagram with three intersect", + "This image shows a Venn diagram with three over", ], [ "This image displays a gradient of colors ranging from", - "The image displays a gradient of colors ranging from", + "This image displays a gradient of colors transitioning from", ], ] From aa1d8c1f8d80a46c62f5db5e32c39e49013c0500 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 23 Jul 2025 01:36:59 -0700 Subject: [PATCH 16/31] Lower memory usage for test_chatglm3_lora_tp4_fully_sharded_loras Signed-off-by: Huy Do --- tests/lora/test_chatglm3_tp.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 5481b413b8f5..4b715b33b38a 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): + # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for + # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use + # more GPU memory causing vLLM to OOM llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True,
@@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=True, - enable_chunked_prefill=True) + enable_chunked_prefill=True, + gpu_memory_utilization=0.85) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] From 91ce20f087f14869ef86c506ae8f00ae5b6e54ba Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sat, 26 Jul 2025 13:25:53 -0700 Subject: [PATCH 17/31] Build mamba_ssm from source Signed-off-by: Huy Do --- docker/Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 787c7edf2522..1de59b840629 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -158,6 +158,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/cuda.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') +# Build from source to unblock PyTorch 2.8.0 update +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" + # cuda arch list used by torch # can be useful for both `dev` and `test` # explicitly set the list to avoid issues with torch 2.2 @@ -461,6 +465,10 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Build from source to unblock PyTorch 2.8.0 update +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" + # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ From 93eb498c0cff4e646193141ac529a6be2cf5a304 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 10:15:47 -0700 Subject: [PATCH 18/31] Ready 2.8.0 Signed-off-by: Huy Do --- .pre-commit-config.yaml | 2 +- docker/Dockerfile | 8 +------- requirements/rocm-build.txt | 2 +- requirements/test.in | 3 +-- requirements/test.txt | 2 +- tests/standalone_tests/python_only_compile.sh | 4 +--- 6 files changed, 6 insertions(+), 15 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 98dfa2d012df..612b290e88d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,7 +49,7 @@ repos: rev: 0.6.17 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --extra-index-url, https://download.pytorch.org/whl/test/cu128] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/docker/Dockerfile b/docker/Dockerfile index 6f8891f912cd..27778ca1d22d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -50,7 +50,7 @@ ARG UV_INDEX_URL=${PIP_INDEX_URL} ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch provides its own indexes for standard and nightly builds -ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl/test +ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly # PIP supports multiple authentication schemes, including keyring @@ -381,13 +381,10 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --extra-index-url 
${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # TODO (huydhn): Remove this once xformers is released for 2.8.0 -# https://pytorch.s3.us-east-1.amazonaws.com/whl/test/cu128/xformers/xformers-0.0.30%2B4cf69f09.d20250708-cp312-cp312-linux_x86_64.whl RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0 12.0' uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" - # DEBUG - python3 -m xformers.info BASH # If we need to build FlashInfer wheel before its release: @@ -402,9 +399,6 @@ BASH # -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl -# Allow specifying a version, Git revision or local .whl file -ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/test/cu128/flashinfer" -ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.8-cp39-abi3-linux_x86_64.whl" # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index f15efd2c91ac..ee2f1ed64b9f 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -1,7 +1,7 @@ # Common dependencies -r common.txt ---extra-index-url https://download.pytorch.org/whl/test/rocm6.3 +--extra-index-url https://download.pytorch.org/whl/rocm6.3 torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 diff --git a/requirements/test.in b/requirements/test.in index 4e3e39b16c3c..a8a9d36bab4d 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -22,7 +22,6 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm # required for internvl test ---extra-index-url https://download.pytorch.org/whl/test/cu128 torch==2.8.0 torchaudio==2.8.0 torchvision==0.23.0 @@ -55,4 +54,4 @@ runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 -terratorch==1.1rc2 # required for PrithviMAE test \ No newline at end of file +terratorch==1.1rc2 # required for PrithviMAE test diff --git a/requirements/test.txt b/requirements/test.txt index 69d9eff129d7..84bff4471ac7 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match +# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 absl-py==2.1.0 # via rouge-score accelerate==1.0.1 diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index baae47f71606..ec1bcbcc58a0 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -18,9 +18,7 @@ apt autoremove -y echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py -# TESTING, TO BE REMOVED -VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . 
\ - --extra-index-url https://download.pytorch.org/whl/test/cu128 +VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL=1 VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . # Run the script python3 -c 'import vllm' From c741b0e1a87f5562da7a9a7e079d46feed1f123f Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 6 Aug 2025 10:21:59 -0700 Subject: [PATCH 19/31] Update cpu.txt Signed-off-by: Huy Do --- requirements/cpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cpu.txt b/requirements/cpu.txt index b1e3aa35ccc2..4499ca66ce4a 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -7,7 +7,7 @@ numba == 0.61.2; python_version > '3.9' # Dependencies for CPUs packaging>=24.2 setuptools>=77.0.3,<80.0.0 ---extra-index-url https://download.pytorch.org/whl/test/cpu +--extra-index-url https://download.pytorch.org/whl/cpu torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 torch==2.8.0; platform_system == "Darwin" torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" From 1543b929c4f91573dcf4793747637f74de7ad0cf Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 12 Aug 2025 19:06:51 -0700 Subject: [PATCH 20/31] Resolve xformers and mamba_ssm Signed-off-by: Huy Do --- docker/Dockerfile | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 4e116f40b776..663f8c83e114 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -160,10 +160,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/cuda.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# Build from source to unblock PyTorch 2.8.0 update -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" - # cuda arch list used by torch # can be useful for both `dev` and `test` # explicitly set the list to avoid issues with torch 2.2 @@ -375,11 +371,12 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist uv pip install --system dist/*.whl --verbose \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# TODO (huydhn): Remove this once xformers is released for 2.8.0 +# TODO (huydhn): Remove this once xformers is released for 2.8.0, official xformers wheel only support these +# compute capability atm RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . 
/etc/environment - export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0 12.0' - uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" + export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a' + uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31" BASH # If we need to build FlashInfer wheel before its release: @@ -489,10 +486,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy -# Build from source to unblock PyTorch 2.8.0 update -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" - # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ From 8de151c2b8ce94f872cc3fbd9d25638bbd0a4dbf Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 12 Aug 2025 23:49:48 -0700 Subject: [PATCH 21/31] Ready to land Signed-off-by: Huy Do --- tests/entrypoints/openai/test_vision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 009093169ff8..8259a81d7b6a 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -36,11 +36,11 @@ ], [ "The image shows a Venn diagram with three over", - "This image shows a Venn diagram with three over", + "The image shows a Venn diagram with three intersect", ], [ "This image displays a gradient of colors ranging from", - "This image displays a gradient of colors transitioning from", + "The image displays a gradient of colors ranging from", ], ] From bbf1ce32241213f4547bc8e7635e0f5d85e9ff8b Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 14 Aug 2025 00:51:31 -0700 Subject: [PATCH 22/31] xformers v0.0.32 is almost here Signed-off-by: Huy Do --- docker/Dockerfile | 2 +- requirements/cuda.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 663f8c83e114..96a7d0219cbe 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -376,7 +376,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a' - uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31" + uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32" BASH # If we need to build FlashInfer wheel before its release: diff --git a/requirements/cuda.txt b/requirements/cuda.txt index b48d22abe519..91834e43ff01 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -11,5 +11,5 @@ torchaudio==2.8.0 # These must be updated alongside torch torchvision==0.23.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # TODO (huydhn): Re-enable this once xformers is released for 2.8.0 -# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31 -# xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 +# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32 +# xformers==0.0.32; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 From 653ccd1dbde786b5ad1034bad271b3c44d5627ab Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 14 Aug 2025 18:31:16 -0700 Subject: [PATCH 23/31] Use xformers 0.0.32 Signed-off-by: Huy Do --- docker/Dockerfile | 8 -------- requirements/cuda.txt | 5 ++--- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 96a7d0219cbe..b96d50f0a1c6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -371,14 +371,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist uv pip install --system dist/*.whl --verbose \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# TODO (huydhn): Remove this once xformers is released for 2.8.0, official xformers wheel only support these -# compute capability atm -RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' - . /etc/environment - export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a' - uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32" -BASH - # If we need to build FlashInfer wheel before its release: # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 91834e43ff01..3f8b8fca3209 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -10,6 +10,5 @@ torch==2.8.0 torchaudio==2.8.0 # These must be updated alongside torch torchvision==0.23.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -# TODO (huydhn): Re-enable this once xformers is released for 2.8.0 -# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32 -# xformers==0.0.32; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 +# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 +xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 From 684c24d84c72d1ad655463a8c2001cd51961518a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 15 Aug 2025 00:47:01 -0700 Subject: [PATCH 24/31] Install ao from cu128 Signed-off-by: Huy Do --- .buildkite/test-pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 942a8d3f9bfd..ef6d5cfe47d9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -470,8 +470,8 @@ steps: - tests/quantization commands: # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release - - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + # after torchao 0.12 release, and pin a working version of torchao nightly here + - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min From d8de1084f18e0f0d031e79fc10bbe6640f0a6cef Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 26 Aug 2025 15:20:14 -0700 Subject: [PATCH 25/31] Tweaking jason9693/Qwen2.5-1.5B-apeach memory usage Signed-off-by: Huy Do --- tests/models/language/pooling/test_classification.py | 7 ++++++- tests/models/test_transformers.py | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index c71fa9627533..71ed2a48c490 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -29,7 +29,12 @@ def test_models( # switch to use ROCm CK FA backend monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False") - with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model: + # Seem to use slightly more memory after torch.compile in 2.8 + # only for this model jason9693/Qwen2.5-1.5B-apeach + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + gpu_memory_utilization=0.85) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) with hf_runner(model, diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 66ff8f7a54d3..01d86673761d 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -181,10 +181,13 @@ def test_classify( import torch from transformers import AutoModelForSequenceClassification + # Seem to use slightly more memory after torch.compile in 2.8 + # only for this model jason9693/Qwen2.5-1.5B-apeach with vllm_runner(model, max_model_len=512, dtype=dtype, - model_impl="transformers") as vllm_model: + model_impl="transformers", + gpu_memory_utilization=0.85) as vllm_model: model_config = vllm_model.llm.llm_engine.model_config assert model_config.using_transformers_backend() From 047e2959de1a164bc3c31b8592bcfe828f6e2937 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 26 Aug 2025 15:26:03 -0700 Subject: 
[PATCH 26/31] Same model, different test Signed-off-by: Huy Do --- tests/entrypoints/openai/test_classification.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 30078fe90257..7cd1c887485a 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -10,6 +10,8 @@ from ...utils import RemoteOpenAIServer +# Seem to use slightly more memory after torch.compile in 2.8 +# only for this model jason9693/Qwen2.5-1.5B-apeach MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" DTYPE = "float32" # Use float32 to avoid NaN issue @@ -22,6 +24,8 @@ def server(): "512", "--dtype", DTYPE, + "--gpu_memory_utilization", + "0.85", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: From a409dc2d734c66fc2504819e2d2bb81150dea034 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 26 Aug 2025 20:26:56 -0700 Subject: [PATCH 27/31] Revert "Same model, different test" This reverts commit 047e2959de1a164bc3c31b8592bcfe828f6e2937. Signed-off-by: Huy Do --- tests/entrypoints/openai/test_classification.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 7cd1c887485a..30078fe90257 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -10,8 +10,6 @@ from ...utils import RemoteOpenAIServer -# Seem to use slightly more memory after torch.compile in 2.8 -# only for this model jason9693/Qwen2.5-1.5B-apeach MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" DTYPE = "float32" # Use float32 to avoid NaN issue @@ -24,8 +22,6 @@ def server(): "512", "--dtype", DTYPE, - "--gpu_memory_utilization", - "0.85", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: From b1c83ec381b0486929f4f92d0ade9d918b6261b4 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 26 Aug 2025 20:27:14 -0700 Subject: [PATCH 28/31] Revert "Tweaking jason9693/Qwen2.5-1.5B-apeach memory usage" This reverts commit d8de1084f18e0f0d031e79fc10bbe6640f0a6cef. 
Signed-off-by: Huy Do --- tests/models/language/pooling/test_classification.py | 7 +------ tests/models/test_transformers.py | 5 +---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index 71ed2a48c490..c71fa9627533 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -29,12 +29,7 @@ def test_models( # switch to use ROCm CK FA backend monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False") - # Seem to use slightly more memory after torch.compile in 2.8 - # only for this model jason9693/Qwen2.5-1.5B-apeach - with vllm_runner(model, - max_model_len=512, - dtype=dtype, - gpu_memory_utilization=0.85) as vllm_model: + with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) with hf_runner(model, diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 01d86673761d..66ff8f7a54d3 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -181,13 +181,10 @@ def test_classify( import torch from transformers import AutoModelForSequenceClassification - # Seem to use slightly more memory after torch.compile in 2.8 - # only for this model jason9693/Qwen2.5-1.5B-apeach with vllm_runner(model, max_model_len=512, dtype=dtype, - model_impl="transformers", - gpu_memory_utilization=0.85) as vllm_model: + model_impl="transformers") as vllm_model: model_config = vllm_model.llm.llm_engine.model_config assert model_config.using_transformers_backend() From 3526230d3c34fd6f37707baf697b4886474fa29b Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 28 Aug 2025 02:49:06 -0700 Subject: [PATCH 29/31] Switch to XFORMERS backend for some tests Signed-off-by: Huy Do --- .buildkite/test-pipeline.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 25921ea79c35..6f77bd1c9c11 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -127,7 +127,8 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py + # FlexAttention backend OOM on L4 runner after https://github.com/vllm-project/vllm/pull/21416 + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py - pytest -v -s entrypoints/test_chat_utils.py - label: Distributed Tests (4 GPUs) # 10min @@ -498,7 +499,8 @@ steps: - vllm/ - tests/models commands: - - pytest -v -s models/test_transformers.py + # FlexAttention backend OOM on L4 runner after https://github.com/vllm-project/vllm/pull/21416 + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py - pytest -v -s 
models/test_utils.py - pytest -v -s models/test_vision.py @@ -512,7 +514,8 @@ steps: - tests/models/language commands: - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m core_model + # FlexAttention backend OOM on L4 runner after https://github.com/vllm-project/vllm/pull/21416 + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s models/language -m core_model - label: Language Models Test (Hybrid) # 35 min mirror_hardwares: [amdexperimental] From 6bae6f2f247f961bae75f6bb40c42bfa6c9ff40a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 28 Aug 2025 12:15:18 -0700 Subject: [PATCH 30/31] Revert "Switch to XFORMERS backend for some tests" This reverts commit 3526230d3c34fd6f37707baf697b4886474fa29b. Signed-off-by: Huy Do --- .buildkite/test-pipeline.yaml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6f77bd1c9c11..25921ea79c35 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -127,8 +127,7 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - # FlexAttention backend OOM on L4 runner after https://github.com/vllm-project/vllm/pull/21416 - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py - pytest -v -s entrypoints/test_chat_utils.py - label: Distributed Tests (4 GPUs) # 10min @@ -499,8 +498,7 @@ steps: - vllm/ - tests/models commands: - # FlexAttention backend OOM on L4 runner after https://github.com/vllm-project/vllm/pull/21416 - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s models/test_transformers.py + - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py - pytest -v -s models/test_utils.py - pytest -v -s models/test_vision.py @@ -514,8 +512,7 @@ steps: - tests/models/language commands: - pip freeze | grep -E 'torch' - # FlexAttention backend OOM on L4 runner after https://github.com/vllm-project/vllm/pull/21416 - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s models/language -m core_model + - pytest -v -s models/language -m core_model - label: Language Models Test (Hybrid) # 35 min mirror_hardwares: [amdexperimental] From 102d0d7e0200f5358aeb08cf354dc26f76f0d87b Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 28 Aug 2025 12:17:23 -0700 Subject: [PATCH 31/31] Apply #23853 Signed-off-by: Huy Do --- vllm/v1/attention/backends/flex_attention.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index a596f6b2b32a..d5b1c15e68d0 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -789,6 +789,7 @@ def get_kernel_options(query, block_m, block_n, device_props = torch.cuda.get_device_properties() max_shared_memory = 
device_props.shared_memory_per_block_optin if max_shared_memory < 144 * 1024: - kernel_options["BLOCK_M"] = 32 - kernel_options["BLOCK_N"] = 32 + kernel_options["BLOCK_M"] = kernel_options["BLOCK_M"] // 2 + kernel_options["BLOCK_N"] = kernel_options["BLOCK_N"] // 2 + return kernel_options