Commit 7572939

add qwq testcase (#3757)
### What this PR does / why we need it?
This PR adds a QwQ-32B case to the nightly tests on A3; we need to run it daily.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By running the test.

- vLLM version: v0.11.0rc3
- vLLM main: vllm-project/vllm@c9461e0

---------

Signed-off-by: ckhw <[email protected]>
1 parent e5676fc · commit 7572939

2 files changed (+123, −0)

.github/workflows/vllm_ascend_test_nightly_a3.yaml

Lines changed: 3 additions & 0 deletions
@@ -68,6 +68,9 @@ jobs:
       - name: deepseek-r1-0528-w8a8-prefix-cache
         os: linux-aarch64-a3-16
         tests: tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
+      - name: qwq-32b-a3
+        os: linux-aarch64-a3-4
+        tests: tests/e2e/nightly/models/test_qwq_32b.py
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
       vllm: v0.11.0
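
The new matrix entry hands the test file below to the reusable single-node nightly workflow on a 4-card A3 runner. As a hypothetical way to reproduce the case outside CI (assuming an Ascend A3 host with at least 4 NPUs and the repository's test dependencies installed):

# Hypothetical local reproduction of the qwq-32b-a3 nightly case; CI
# ultimately runs the same file through pytest.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(pytest.main(["-q", "tests/e2e/nightly/models/test_qwq_32b.py"]))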
tests/e2e/nightly/models/test_qwq_32b.py

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "Qwen/QwQ-32B",
]

# "aclgraph" exercises full-decode-only graph capture; "single" runs eager.
MODES = [
    "aclgraph",
    "single",
]

TENSOR_PARALLELS = [4]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

# Accuracy and performance gates evaluated via tools/aisbench.py.
aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 176,
    "max_out_len": 1500,
    "batch_size": 44,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("mode", MODES)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, mode: str, tp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "OMP_PROC_BIND": "false",
        "VLLM_USE_V1": "1",
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
        "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1",
        "VLLM_ASCEND_ENABLE_DEBSE_OPTIMIZE": "1",
        "VLLM_ASCEND_ENABLE_PREFETCH": "1"
    }
    server_args = [
        "--tensor-parallel-size",
        str(tp_size), "--port",
        str(port), "--max-model-len", "36864", "--max-num-batched-tokens",
        "36864", "--block-size", "128", "--trust-remote-code",
        "--gpu-memory-utilization", "0.9", "--compilation_config",
        '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}',
        "--reasoning-parser", "deepseek_r1", "--distributed_executor_backend",
        "mp"
    ]
    if mode == "single":
        # Eager mode: drop graph capture and enable the Ascend scheduler.
        server_args.remove("--compilation_config")
        server_args.remove(
            '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}'
        )
        server_args.append("--additional-config")
        server_args.append('{"ascend_scheduler_config":{"enabled":true}}')
        server_args.append("--enforce-eager")
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        # Smoke check: the server must return a non-empty completion.
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        if mode == "single":
            return
        # aisbench accuracy/performance test (aclgraph mode only)
        run_aisbench_cases(model, port, aisbench_cases)
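
The accuracy case above gates GSM8K score to within 5 points of a 95 baseline; the performance case gates throughput to at least 0.97 of a normalized baseline of 1, i.e. no more than a 3% regression. A minimal sketch of how such gates could be evaluated (the real logic lives in tools/aisbench.py and may differ; check_case is a hypothetical helper):

from typing import Any


def check_case(case: dict[str, Any], measured: float) -> bool:
    """Return True if a measured score passes the case's gate."""
    if case["case_type"] == "accuracy":
        # Accuracy may drop at most `threshold` points below `baseline`.
        return measured >= case["baseline"] - case["threshold"]
    # Performance: the measured/baseline ratio must stay above `threshold`.
    return measured / case["baseline"] >= case["threshold"]


# e.g. 92% GSM8K accuracy passes (floor is 95 - 5 = 90) ...
assert check_case({"case_type": "accuracy", "baseline": 95, "threshold": 5}, 92.0)
# ... while a 10% throughput regression fails the 0.97 ratio gate.
assert not check_case({"case_type": "performance", "baseline": 1, "threshold": 0.97}, 0.90)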
