Qw23 t722 #1938

Closed · wants to merge 2 commits

2 changes: 2 additions & 0 deletions .github/workflows/vllm_ascend_test.yaml
@@ -210,6 +210,8 @@ jobs:
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ_with_flashcomm_v1
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_with_flashcomm_v2
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py --ignore=tests/multicard/test_w4a8_deepseek.py
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_qwen_graph_mode.py::test_qwen25_graph_mode
+ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_qwen_graph_mode.py::test_qwen3_graph_mode
fi
- name: Run vllm-project/vllm-ascend test on V0 engine
59 changes: 59 additions & 0 deletions tests/multicard/test_qwen_graph_mode.py
@@ -0,0 +1,59 @@
import os
from unittest.mock import patch

from vllm import SamplingParams

from tests.conftest import VllmRunner


def test_qwen25_graph_mode():
    _qwen_graph_mode("Qwen/Qwen2.5-0.5B-Instruct")


def test_qwen3_graph_mode():
    # Assumption: a small Qwen3 checkpoint is needed here; running the
    # Qwen2.5 model would not exercise the Qwen3 graph-mode path.
    _qwen_graph_mode("Qwen/Qwen3-0.6B")


# The helper is not prefixed with "test_" so pytest does not collect it
# directly and fail on the missing "model" fixture.
@patch.dict(os.environ, {"VLLM_ENABLE_GRAPH_MODE": "1"})
def _qwen_graph_mode(model) -> None:
example_prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

sampling_params = SamplingParams(temperature=1.0,
top_p=1.0,
max_tokens=10,
top_k=-1,
min_p=0.0,
detokenize=True,
logprobs=1,
n=16)

with VllmRunner(
model,
dtype="half",
tensor_parallel_size=4,
distributed_executor_backend="mp",
enforce_eager=False,
enable_expert_parallel=True,
max_model_len=4096,
trust_remote_code=True,
load_format="dummy",
gpu_memory_utilization=0.5,
additional_config={
"torchair_graph_config": {
"enabled": True,
"use_cached_graph": False,
"graph_batch_sizes_init": False,
},
"ascend_scheduler_config": {
"enabled": True,
"chunked_prefill_enabled": True,
},
"refresh": True,
},
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
44 changes: 44 additions & 0 deletions tests/ut/attention/test_attention_v1.py
@@ -0,0 +1,44 @@
import unittest
from unittest.mock import patch

import torch

from vllm_ascend.attention.attention_v1 import AscendAttentionBackendImpl


class DummyNPU:
    """CPU stand-in for torch_npu.npu_scatter_nd_update_ so the test can run
    without NPU hardware. indices[i, 0] holds (block_id, block_offset) for the
    i-th token."""

    @staticmethod
    def npu_scatter_nd_update_(tensor, indices, updates):
        batch = indices.shape[0]
        for i in range(batch):
            b = indices[i, 0, 0].item()
            o = indices[i, 0, 1].item()
            tensor[b, o] = updates[i]


class TestUpdateKVCache(unittest.TestCase):

def test_basic_update(self):
block_num, block_size = 3, 2
num_heads, head_dim = 1, 1

key_cache = torch.zeros(block_num, block_size, num_heads, head_dim)
value_cache = torch.zeros_like(key_cache)

key = torch.tensor([[[1.0]], [[2.0]]])
value = torch.tensor([[[3.0]], [[4.0]]])

        # slot 1 -> block 0, offset 1; slot 3 -> block 1, offset 1 (block_size=2)
        slot_indices = torch.tensor([1, 3])

        # Assumption: update_kv_cache delegates to torch_npu.npu_scatter_nd_update_,
        # so stub it with DummyNPU to avoid requiring an NPU device.
        with patch("torch_npu.npu_scatter_nd_update_",
                   new=DummyNPU.npu_scatter_nd_update_):
            AscendAttentionBackendImpl.update_kv_cache(key, value, key_cache,
                                                       value_cache, slot_indices)

self.assertEqual(key_cache[0, 1, 0, 0].item(), 1.0)
self.assertEqual(value_cache[0, 1, 0, 0].item(), 3.0)

self.assertEqual(key_cache[1, 1, 0, 0].item(), 2.0)
self.assertEqual(value_cache[1, 1, 0, 0].item(), 4.0)


if __name__ == '__main__':
unittest.main()
80 changes: 80 additions & 0 deletions tests/ut/ops/test_rotary_embedding.py
@@ -0,0 +1,80 @@
import pytest
import torch
import torch_npu

from vllm_ascend.attention.attention_v1 import AttentionV1


class DummyAscendConfig:

class torchair_graph_config:
enabled = True


@pytest.fixture(autouse=True)
def patch_ascend_config(monkeypatch):
monkeypatch.setattr(
"my_module.attention.attention_v1.get_ascend_config",
lambda: DummyAscendConfig,
)
yield


@pytest.fixture(autouse=True)
def patch_npu_apply(monkeypatch):
# stub npu_apply_rotary_pos_emb: q->q+1, k->k+2
def fake_apply(q, k, cos, sin):
return q + 1.0, k + 2.0

monkeypatch.setattr(torch_npu, "npu_apply_rotary_pos_emb", fake_apply)
yield


@pytest.fixture(autouse=True)
def patch_set_cos_sin_cache(monkeypatch):
monkeypatch.setattr("my_module.attention.attention_v1.__set_cos_sin_cache",
lambda *args, **kwargs: None)
yield


def test_rope_forward_basic():
attn = AttentionV1.__new__(AttentionV1)
attn.max_position_embeddings = 1024
attn.head_size = 2
# pre-allocate cos/sin tensors
attn.cos = torch.randn(1, 3)
attn.sin = torch.randn(1, 3)
attn.cos_embed = None
attn.sin_embed = None

# input tensor: batch=1, seq_len=3, embed_dim=num_heads*head_size=2
batch, seq_len, embed_dim = 1, 3, 2
positions_ids = torch.randint(0, seq_len, (batch, seq_len))
query = torch.arange(batch * seq_len * embed_dim,
dtype=torch.float32).view(batch, seq_len, embed_dim)
key = torch.arange(batch * seq_len * embed_dim, dtype=torch.float32).view(
batch, seq_len, embed_dim) + 100.0

q_out, k_out = attn.rope_forward(
positions_ids=positions_ids,
query=query,
key=key,
offsets=None,
max_seq_len=None,
)

    # Rebuild the expected outputs: view query/key as
    # (batch, seq_len, num_heads, head_size), apply the stubbed rope
    # (q + 1, k + 2), then flatten the last two dims back together.
    q_view = query.view(batch, seq_len, 1, attn.head_size)  # num_heads = 1
    k_view = key.view(batch, seq_len, 1, attn.head_size)
q_unsq = q_view.unsqueeze(1)
k_unsq = k_view.unsqueeze(1)

expected_q = (q_unsq + 1.0).flatten(-2)
expected_k = (k_unsq + 2.0).flatten(-2)

assert q_out.shape == expected_q.shape
assert k_out.shape == expected_k.shape

assert torch.allclose(q_out, expected_q)
assert torch.allclose(k_out, expected_k)
6 changes: 3 additions & 3 deletions vllm_ascend/ascend_config.py
@@ -169,12 +169,12 @@ def check_ascend_config(vllm_config, enforce_eager):
"Torchair graph mode is still experimental and not supported for V1 without mla currently, "
"it has been disabled automatically.")
ascend_config.torchair_graph_config.enabled = False
# torchair_graph is supported for deepseek model only currently.
# torchair_graph is supported for deepseek or qwen currently.
if vllm_config.model_config:
model_type = vllm_config.model_config.hf_config.model_type
if "deepseek" not in model_type:
if "deepseek" not in model_type and "qwen" not in model_type:
raise NotImplementedError(
"Torchair graph mode only works with deepseek model."
"Torchair graph mode only works with deepseek or qwen model."
)
# aclgraph case
else:
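
For context, a minimal sketch of what the relaxed check now permits: requesting torchair graph mode for a Qwen model through additional_config, mirroring the settings used in tests/multicard/test_qwen_graph_mode.py above. The model name, prompt, and resource settings are illustrative assumptions rather than values from this PR, and the sketch presumes the installed vLLM/vllm-ascend build accepts additional_config on the LLM constructor, as the VllmRunner-based test does.

from vllm import LLM, SamplingParams

# Illustrative only: model name, parallelism, and prompt are assumptions,
# not values taken from this PR.
llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    tensor_parallel_size=4,
    enforce_eager=False,
    additional_config={
        "torchair_graph_config": {
            "enabled": True,
            "use_cached_graph": False,
            "graph_batch_sizes_init": False,
        },
        "ascend_scheduler_config": {
            "enabled": True,
            "chunked_prefill_enabled": True,
        },
        "refresh": True,
    },
)

outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=1.0, max_tokens=10))
print(outputs[0].outputs[0].text)

Before this change, check_ascend_config raised NotImplementedError for any model_type not containing "deepseek" when torchair graph mode was enabled; with the diff above, model types containing "qwen" pass the check as well.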