25 changes: 25 additions & 0 deletions .github/workflows/_e2e_nightly_single_node.yaml
@@ -33,6 +33,9 @@ on:
      tests:
        required: true
        type: string
      name:
        required: false
        type: string

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -94,6 +97,28 @@ jobs:
          pip install -r requirements-dev.txt
          pip install -v -e .

      - name: Install custom-ops & MLAPO (for DeepSeek-V3.2-Exp)
        if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
        shell: bash -l {0}
        run: |
          wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
          chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
          ./CANN-custom_ops-sfa-linux.aarch64.run --quiet
          export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
          export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
          wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
          pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl

          wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-mlapo-linux.aarch64.run
          chmod +x ./CANN-custom_ops-mlapo-linux.aarch64.run
          ./CANN-custom_ops-mlapo-linux.aarch64.run --quiet --install-path=/vllm-workspace/CANN
          wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/torch_npu-2.7.1%2Bgitb7c90d0-cp311-cp311-linux_aarch64.whl
          pip install torch_npu-2.7.1+gitb7c90d0-cp311-cp311-linux_aarch64.whl
          wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/libopsproto_rt2.0.so
          cp libopsproto_rt2.0.so /usr/local/Ascend/ascend-toolkit/8.2.RC1/opp/built-in/op_proto/lib/linux/aarch64/libopsproto_rt2.0.so
          . /vllm-workspace/CANN/vendors/customize/bin/set_env.bash
          export LD_PRELOAD=/vllm-workspace/CANN/vendors/customize/op_proto/lib/linux/aarch64/libcust_opsproto_rt2.0.so:${LD_PRELOAD}

      - name: Checkout aisbench repo and Install aisbench
        run: |
          git clone https://gitee.com/aisbench/benchmark.git
100 changes: 55 additions & 45 deletions .github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -28,6 +28,9 @@ on:
  pull_request:
    branches:
      - 'main'
  push:
    branches:
      - 'main'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -43,71 +46,78 @@ concurrency:
jobs:
  single-node-tests:
    name: single-node
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    # if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    strategy:
      fail-fast: false
      matrix:
        test_config:
          - name: qwen3-32b-in8-a3
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
          - name: qwen3-32b-int8-a3-feature-stack3
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
          - name: qwen3-235b-a22b-w8a8-eplb
            os: linux-aarch64-a3-16
            tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
          - name: deepseek-r1-w8a8-eplb
            os: linux-aarch64-a3-16
            tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
          - name: qwen2-5-vl-7b
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
          - name: qwen2-5-vl-32b
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
          - name: qwen3-32b-int8-prefix-cache
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
          - name: deepseek-r1-0528-w8a8
          # - name: qwen3-32b-in8-a3
          #   os: linux-aarch64-a3-4
          #   tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
          # - name: qwen3-32b-int8-a3-feature-stack3
          #   os: linux-aarch64-a3-4
          #   tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
          # - name: qwen3-235b-a22b-w8a8-eplb
          #   os: linux-aarch64-a3-16
          #   tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
          # - name: deepseek-r1-w8a8-eplb
          #   os: linux-aarch64-a3-16
          #   tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
          # - name: qwen2-5-vl-7b
          #   os: linux-aarch64-a3-4
          #   tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
          # - name: qwen2-5-vl-32b
          #   os: linux-aarch64-a3-4
          #   tests: tests/e2e/nightly/models/test_qwen2_5_vl_32b.py
          # - name: qwen3-32b-int8-prefix-cache
          #   os: linux-aarch64-a3-4
          #   tests: tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
          # - name: deepseek-r1-0528-w8a8
          #   os: linux-aarch64-a3-16
          #   tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
          # - name: deepseek-r1-0528-w8a8-prefix-cache
          #   os: linux-aarch64-a3-16
          #   tests: tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
          # - name: qwq-32b-a3
          #   os: linux-aarch64-a3-4
          #   tests: tests/e2e/nightly/models/test_qwq_32b.py
          - name: deepseek3_2-exp-w8a8
            os: linux-aarch64-a3-16
            tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
          - name: deepseek-r1-0528-w8a8-prefix-cache
            os: linux-aarch64-a3-16
            tests: tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
          - name: qwq-32b-a3
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/models/test_qwq_32b.py
            tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
    with:
      vllm: v0.11.0
      runner: ${{ matrix.test_config.os }}
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
      tests: ${{ matrix.test_config.tests }}
      name: ${{ matrix.test_config.name }}

  multi-node-tests:
    name: multi-node
    needs: single-node-tests
    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
    # if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
        test_config:
          - name: multi-node-deepseek-pd
            config_file_path: DeepSeek-V3.yaml
            size: 2
          - name: multi-node-qwen3-dp
            config_file_path: Qwen3-235B-A3B.yaml
            size: 2
          - name: multi-node-dpsk-4node-pd
            config_file_path: DeepSeek-R1-W8A8.yaml
            size: 4
          - name: multi-node-qwenw8a8-2node
            config_file_path: Qwen3-235B-W8A8.yaml
            size: 2
          - name: multi-node-glm-2node
            config_file_path: GLM-4_5.yaml
          # - name: multi-node-deepseek-pd
          #   config_file_path: DeepSeek-V3.yaml
          #   size: 2
          # - name: multi-node-qwen3-dp
          #   config_file_path: Qwen3-235B-A3B.yaml
          #   size: 2
          # - name: multi-node-dpsk-4node-pd
          #   config_file_path: DeepSeek-R1-W8A8.yaml
          #   size: 4
          # - name: multi-node-qwenw8a8-2node
          #   config_file_path: Qwen3-235B-W8A8.yaml
          #   size: 2
          # - name: multi-node-glm-2node
          #   config_file_path: GLM-4_5.yaml
          #   size: 2
          - name: multi-node-dpsk3.2-exp-dp
            config_file_path: DeepSeek-V3_2-Exp-W8A8.yaml
            size: 2
    uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
    with:
105 changes: 105 additions & 0 deletions tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
@@ -0,0 +1,105 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "vllm-ascend/DeepSeek-V3.2-Exp-W8A8",
]

TENSOR_PARALLELS = [8]
DATA_PARALLELS = [2]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 32,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 80,
    "max_out_len": 1500,
    "batch_size": 20,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models(model: str, tp_size: int, dp_size: int) -> None:
    port = get_open_port()
    env_dict = {
        "TASK_QUEUE_ENABLE": "1",
        "OMP_PROC_BIND": "false",
        "HCCL_OP_EXPANSION_MODE": "AIV",
        "PAGED_ATTENTION_MASK_LEN": "5500",
        "DYNAMIC_EPLB": "true"
    }
    server_args = [
        "--no-enable-prefix-caching", "--enable-expert-parallel",
        "--tensor-parallel-size",
        str(tp_size), "--data-parallel-size",
        str(dp_size), "--port",
        str(port), "--max-model-len", "36864", "--max-num-batched-tokens",
        "36864", "--block-size", "128", "--trust-remote-code",
        "--quantization", "ascend", "--gpu-memory-utilization", "0.9",
        "--additional-config", '{"ascend_scheduler_config":{"enabled":true},'
        '"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
    ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
@@ -0,0 +1,60 @@
test_name: "test DeepSeek-V3.2-Exp-bf16 disaggregated_prefill"
model: "Yanguan/DeepSeek-V3.2-Exp-bf16"
num_nodes: 2
npu_per_node: 16
env_common:
  VLLM_USE_MODELSCOPE: true
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 100
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
disaggregated_prefill:
Collaborator: I think you want to add a multi-dp test across 2 nodes for deepseekv3.2, right? So we should not pass the disaggregated_prefill related config.
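For illustration, a minimal sketch of this config with the disaggregated_prefill block dropped (key names are taken from this file; whether the harness accepts omitting the block entirely is an assumption):

# hypothetical multi-dp variant of this file (sketch, not the actual config)
test_name: "test DeepSeek-V3.2-Exp-bf16 multi-dp"
model: "Yanguan/DeepSeek-V3.2-Exp-bf16"
num_nodes: 2
npu_per_node: 16
env_common:
  VLLM_USE_MODELSCOPE: true
  SERVER_PORT: 8080
# no disaggregated_prefill section; only the two data-parallel server_cmd
# entries under deployment: remain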

  enabled: true
  prefiller_host_index: [0]
  decoder_host_index: [1]

deployment:
  -
    server_cmd: >
      vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-address $LOCAL_IP
      --data-parallel-size 2
      --data-parallel-size-local 1
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 16
      --seed 1024
      --served-model-name deepseek_v3.2
Collaborator suggested change (remove this line):
      --served-model-name deepseek_v3.2

Collaborator: Since we use the model name instead of this short tag when checking if the service is running, passing this parameter will result in a timeout.

      --enable-expert-parallel
      --max-num-seqs 16
      --max-model-len 17450
      --max-num-batched-tokens 17450
      --trust-remote-code
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.9
      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'

  -
    server_cmd: >
      vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \
      --host 0.0.0.0
      --port $SERVER_PORT
      --headless
      --data-parallel-size 2
      --data-parallel-size-local 1
      --data-parallel-start-rank 1
      --data-parallel-address $MASTER_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 16
      --seed 1024
      --served-model-name deepseek_v3.2
Collaborator: same

      --max-num-seqs 16
      --max-model-len 17450
      --max-num-batched-tokens 17450
      --enable-expert-parallel
      --trust-remote-code
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.92
      --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]}}'
benchmarks: