[V1] port xformers backend to v1 #21342

Merged
merged 1 commit on Aug 5, 2025
Changes from all commits
2 changes: 2 additions & 0 deletions tests/v1/attention/utils.py
@@ -128,6 +128,8 @@ def get_attention_backend(backend_name: _Backend):
"vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
_Backend.TREE_ATTN:
"vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
_Backend.XFORMERS_VLLM_V1:
"vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
}

if backend_name not in backend_map:
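
For context, a minimal sketch (assuming vLLM with this change installed) of the lookup pattern the extended map supports: the `_Backend` member indexes a dotted class path, and unknown members are rejected. The helper below is illustrative, not the test utility's actual body.

```python
from vllm.platforms.interface import _Backend

# Illustrative mapping, mirroring the entry added above.
backend_map = {
    _Backend.XFORMERS_VLLM_V1:
        "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
}

def backend_class_path(backend_name: _Backend) -> str:
    # Matches the "if backend_name not in backend_map" guard shown above.
    if backend_name not in backend_map:
        raise ValueError(f"Unknown attention backend: {backend_name}")
    return backend_map[backend_name]

print(backend_class_path(_Backend.XFORMERS_VLLM_V1))
```
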
1 change: 1 addition & 0 deletions vllm/engine/arg_utils.py
@@ -1469,6 +1469,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
"TORCH_SDPA_VLLM_V1",
"FLEX_ATTENTION",
"TREE_ATTN",
"XFORMERS_VLLM_V1",
]
if (envs.is_set("VLLM_ATTENTION_BACKEND")
and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
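
A hedged usage sketch: with "XFORMERS_VLLM_V1" whitelisted above, the backend can be requested through the VLLM_ATTENTION_BACKEND environment variable before the engine is built. The model name and generation arguments below are placeholders, not part of this change.

```python
import os

# Must be set before the engine (and its platform checks) are constructed.
os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS_VLLM_V1"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # example model only
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)
```
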
4 changes: 4 additions & 0 deletions vllm/platforms/cuda.py
@@ -271,6 +271,7 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
TRITON_ATTN_VLLM_V1 = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501
FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501
TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501
XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501

if selected_backend == _Backend.FLASHINFER:
logger.info_once("Using FlashInfer backend on V1 engine.")
@@ -291,6 +292,9 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
elif selected_backend == _Backend.TREE_ATTN:
logger.info_once("Using Tree Attention backend on V1 engine.")
return TREE_ATTN_V1
elif selected_backend == _Backend.XFORMERS_VLLM_V1:
logger.info_once("Using XFormers backend on V1 engine.")
return XFORMERS_V1

from vllm.attention.selector import is_attn_backend_supported

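
The branch above returns a dotted class path (XFORMERS_V1) rather than a class object. A rough sketch of how such a path can be resolved, shown with importlib for illustration; vLLM uses its own resolver, so this is not the platform code's actual mechanism.

```python
import importlib

def resolve_backend_cls(qualname: str):
    # Split "package.module.ClassName" into module path and attribute name.
    module_name, _, class_name = qualname.rpartition(".")
    return getattr(importlib.import_module(module_name), class_name)

XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"
backend_cls = resolve_backend_cls(XFORMERS_V1)
print(backend_cls.__name__)  # -> XFormersAttentionBackend
```
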
1 change: 1 addition & 0 deletions vllm/platforms/interface.py
@@ -63,6 +63,7 @@ class _Backend(enum.Enum):
NO_ATTENTION = enum.auto()
FLEX_ATTENTION = enum.auto()
TREE_ATTN = enum.auto()
XFORMERS_VLLM_V1 = enum.auto()


class PlatformEnum(enum.Enum):
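
A small sanity-check sketch of the new enum member: the backend name string used in V1_BACKENDS corresponds to this member by name, which is plain Python enum behavior rather than anything specific to the selector.

```python
from vllm.platforms.interface import _Backend

selected = _Backend["XFORMERS_VLLM_V1"]  # name-based lookup
assert selected is _Backend.XFORMERS_VLLM_V1
print(selected)
```
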
7 changes: 1 addition & 6 deletions vllm/v1/attention/backends/tree_attn.py
@@ -4,7 +4,7 @@

import ast
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Optional

import torch

@@ -313,15 +313,10 @@ def __init__(
alibi_slopes: Optional[list[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None,
attn_type: AttentionType = AttentionType.DECODER,
kv_sharing_target_layer_name: Optional[str] = None,
use_irope: bool = False,
) -> None:
if blocksparse_params is not None:
raise ValueError(
"TreeAttention does not support block-sparse attention.")
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
Expand Down