CP kv cache management and input process #2247

Open · wants to merge 1 commit into base: main
3 changes: 0 additions & 3 deletions .github/actionlint.yaml
@@ -7,7 +7,4 @@ self-hosted-runner:
- linux-aarch64-a2-4
- linux-aarch64-a2-8
- linux-arm64-npu-static-8
- linux-aarch64-310p-1
- linux-aarch64-310p-2
- linux-aarch64-310p-4
- ubuntu-24.04-arm
180 changes: 132 additions & 48 deletions .github/workflows/accuracy_test.yaml
@@ -29,15 +29,35 @@ on:
types: [ labeled ]
workflow_dispatch:
inputs:
vllm-ascend-version:
description: 'vllm-ascend:'
vllm-version:
description: 'vllm version:'
required: true
type: choice
# Please also update this when bumping the matched version
# Current supported vLLM versions
options:
- latest
- main
default: main
- v0.10.0
- v0.9.1
- v0.7.3
vllm-ascend-version:
description: 'vllm-ascend version:'
required: true
type: choice
options:
- main
- v0.9.1-dev
- v0.7.3-dev
models:
description: 'model:'
required: true
type: choice
options:
- all
- Qwen/Qwen2.5-VL-7B-Instruct
- Qwen/Qwen3-8B-Base
- Qwen/Qwen3-30B-A3B
default: 'all'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -56,27 +76,58 @@ jobs:
# the test is triggered when the PR carries both a '*-accuracy-test' label and the 'ready-for-test' label, or by a workflow_dispatch / schedule event
if: >-
${{
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
(contains(github.event.pull_request.labels.*.name, 'accuracy-test') ||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') ||
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') ||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) &&
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
}}
runs-on: ${{ matrix.runner }}
runs-on: >-
${{
(matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') ||
'linux-aarch64-a2-1'
}}
strategy:
matrix:
include:
- model_name: Qwen3-8B-Base
runner: linux-aarch64-a2-1
- model_name: Qwen2.5-VL-7B-Instruct
runner: linux-aarch64-a2-1
- model_name: Qwen3-30B-A3B
runner: linux-aarch64-a2-2
fail-fast: false
# the accuracy test will run:
# 1. workflow_dispatch with models input
# - all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
# - a specific model: only the selected model
# 2. PR labeled with "*-accuracy-test"
# - accuracy-test: Qwen/Qwen3-8B-Base, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-30B-A3B
# - dense-accuracy-test: Qwen/Qwen3-8B-Base
# - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
# - moe-accuracy-test: Qwen/Qwen3-30B-A3B
model_name: ${{ fromJSON(
(github.event_name == 'schedule' &&
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
(github.event.inputs.models == 'all' &&
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
(github.event.inputs.models == 'Qwen/Qwen3-30B-A3B' &&
'["Qwen/Qwen3-30B-A3B"]') ||
(github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
'["Qwen/Qwen2.5-VL-7B-Instruct"]') ||
(github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
'["Qwen/Qwen3-8B-Base"]') ||
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
'["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"]' ||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
'["Qwen/Qwen3-8B-Base"]' ||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
'["Qwen/Qwen2.5-VL-7B-Instruct"]' ||
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') &&
'["Qwen/Qwen3-30B-A3B"]'
) }}

fail-fast: false
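
A note on the matrix expression above: GitHub Actions expressions have no if/else, so the chained `&&`/`||` operators select exactly one JSON array string, which `fromJSON` then parses into the matrix values. A minimal standalone sketch of the same idiom (hypothetical workflow and model names, not part of this diff):

```yaml
# Hypothetical sketch of a fromJSON-driven matrix; all names are illustrative.
name: dynamic-matrix-sketch
on:
  workflow_dispatch:
    inputs:
      models:
        type: choice
        options: [all, model-a]
jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # '&&' yields its right operand when the left side is truthy; '||' falls
        # through to the default, so exactly one JSON string reaches fromJSON.
        model_name: ${{ fromJSON(
          (github.event.inputs.models == 'all' && '["model-a","model-b"]') ||
          '["model-a"]'
          ) }}
    steps:
      - run: echo "Testing ${{ matrix.model_name }}"
```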
name: ${{ matrix.model_name }} accuracy
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
env:
DATASET_SOURCE: ModelScope
VLLM_USE_MODELSCOPE: True
USE_MODELSCOPE_HUB: 1
# 1. If a version is specified (workflow_dispatch), run the accuracy test on that branch
# 2. If no version is given (labeled PR), run the accuracy test on the default ref:
# The branch, tag or SHA to checkout. When checking out the repository that
@@ -88,10 +139,10 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set model name as output
id: set_output
- name: Check npu and CANN info
run: |
echo "model_name=${{ matrix.model_name }}" >> $GITHUB_OUTPUT
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

- name: Config mirrors
run: |
@@ -110,19 +161,19 @@
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: v0.10.0
path: ./vllm-empty
# Please also update this when bumping the matched version
ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }}

- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
run: VLLM_TARGET_DEVICE=empty pip install -e .

- name: Resolve vllm-ascend version
run: |
VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}"

if [[ "$VERSION_INPUT" == "latest" ]]; then
if [[ "$VERSION_INPUT" == "main" ]]; then
TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||')
LATEST_TAG=$(echo "$TAGS" | head -n1)
if [[ -z "$LATEST_TAG" ]]; then
@@ -148,8 +199,8 @@ jobs:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .

pip install -v -e .
- name: Get vLLM commit hash and URL
working-directory: ./vllm-empty
run: |
@@ -162,6 +213,15 @@
VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD)
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV

- name: Print resolved hashes
run: |
echo "vLLM : ${{ env.VLLM_COMMIT }}"
echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}"

- name: Install lm-eval
run: |
pip install lm-eval==0.4.8

- name: Collect version info
run: |
for dir in /usr/local/Ascend/ascend-toolkit/*; do
@@ -182,27 +242,37 @@
pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
} >> "$GITHUB_ENV"

- name: Print versions
run: |
echo "CANN: ${{ env.GHA_CANN_VERSION }}"
echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}"
echo "Torch: ${{ env.GHA_TORCH_VERSION }}"
echo "vLLM: ${{ env.GHA_VLLM_VERSION }}"
echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}"

- name: Run accuracy test
- name: Run Accuracy Test
id: report
working-directory: ./benchmarks
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
VLLM_COMMIT: ${{ env.VLLM_COMMIT }}
VLLM_ASCEND_VERSION: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}
VLLM_ASCEND_COMMIT: ${{ env.VLLM_ASCEND_COMMIT }}
CANN_VERSION: ${{ env.GHA_CANN_VERSION }}
TORCH_VERSION: ${{ env.GHA_TORCH_VERSION }}
TORCH_NPU_VERSION: ${{ env.GHA_TORCH_NPU_VERSION }}
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
run: |
model_base_name=$(basename ${{ matrix.model_name }})
markdown_name="${model_base_name}"
echo "markdown_name=$markdown_name"
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
mkdir -p ./benchmarks/accuracy
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config ./tests/e2e/singlecard/models/configs/${{ matrix.model_name }}.yaml \
--report_output ./benchmarks/accuracy/${model_base_name}.md
mkdir -p ./accuracy

python ./scripts/run_accuracy.py \
--model "${{ matrix.model_name }}" \
--output "./accuracy/${markdown_name}.md" \
--vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \
--cann_version "${{ env.GHA_CANN_VERSION }}" \
--torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \
--torch_version "${{ env.GHA_TORCH_VERSION }}" \
--vllm_version "${{ env.GHA_VLLM_VERSION }}" \
--vllm_commit "${{ env.VLLM_COMMIT }}" \
--vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \

- name: Generate step summary
if: ${{ always() }}
@@ -214,7 +284,19 @@ jobs:
SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}"
echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV"

- name: Check report for failure markers
id: check_report
run: |
REPORT_PATH="./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md"
echo "Scanning $REPORT_PATH for ❌ …"
if grep -q '❌' "$REPORT_PATH"; then
echo "contains_fail=true" >> $GITHUB_OUTPUT
else
echo "contains_fail=false" >> $GITHUB_OUTPUT
fi
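
This gate relies on grep's exit status: `grep -q` exits 0 on a match and prints nothing, so the `if` branches directly on whether a ❌ marker exists. The `Upload Report` step below then consumes `steps.check_report.outputs.contains_fail`, so failing reports are never published. The same check, runnable locally (the report path is illustrative):

```bash
#!/usr/bin/env bash
# Sketch of the report gate; outside Actions, fall back to stdout for the output file.
REPORT_PATH="./benchmarks/accuracy/Qwen3-8B-Base.md"   # illustrative report path

if grep -q '❌' "$REPORT_PATH"; then    # exit 0 => at least one failure marker found
  echo "contains_fail=true"  >> "${GITHUB_OUTPUT:-/dev/stdout}"
else
  echo "contains_fail=false" >> "${GITHUB_OUTPUT:-/dev/stdout}"
fi
```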

- name: Upload Report
if: ${{ github.event_name == 'workflow_dispatch' && steps.check_report.outputs.contains_fail == 'false' }}
uses: actions/upload-artifact@v4
with:
name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
@@ -223,24 +305,20 @@
retention-days: 90
overwrite: true

outputs:
model_name: ${{ steps.set_output.outputs.model_name }}

create_pr:
runs-on: ubuntu-latest
needs: accuracy_tests
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
if: ${{ github.event_name == 'workflow_dispatch' }}
env:
UPSTREAM_REPO: vllm-project/vllm-ascend

steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
repository: vllm-ascend-ci/vllm-ascend
token: ${{ secrets.PAT_TOKEN }}
ref: main

- name: Add upstream remote
run: |
git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
@@ -272,7 +350,7 @@ jobs:
find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete

- name: Update accuracy_report/index.md
run: |
REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
@@ -312,10 +390,16 @@
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
base: '${{ github.event.inputs.vllm-ascend-version }}',
title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
body: `The accuracy results running on NPU Atlas A2 have changed, updating reports for: All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)
body: `The accuracy results running on NPU Atlas A2 have changed, updating reports for:
${{
github.event.inputs.models == 'all'
&& 'All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)'
|| github.event.inputs.models
}}

- [Workflow run][1]

- [Workflow run][1]

[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
});
core.info(`Created PR #${pr.data.number}`);

3 changes: 1 addition & 2 deletions .github/workflows/release_whl.yml
@@ -90,8 +90,7 @@ jobs:
--exclude libc10.so \
--exclude libc_sec.so \
--exclude "libascend*.so" \
--exclude "libtorch*.so" \
--exclude "liberror_manager.so"
--exclude "libtorch*.so"
done
rm -f dist/*.whl
mv dist/repaired/*.whl dist/
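
For context on the flags above: `auditwheel repair` vendors every linked shared library into the wheel by default, and each `--exclude` keeps one out so it resolves against the host CANN/torch installation at runtime. A minimal sketch of the repair loop, using only the flags and paths visible in this diff:

```bash
#!/usr/bin/env bash
# Repair built wheels while leaving host-provided libraries (CANN, torch) unbundled.
set -euo pipefail

mkdir -p dist/repaired
for whl in dist/*.whl; do
  auditwheel repair "$whl" -w dist/repaired \
    --exclude libc10.so \
    --exclude libc_sec.so \
    --exclude "libascend*.so" \
    --exclude "libtorch*.so"
done

# Replace the originals with the repaired wheels, as the workflow does.
rm -f dist/*.whl
mv dist/repaired/*.whl dist/
```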
26 changes: 0 additions & 26 deletions .github/workflows/reminder_comment.yml

This file was deleted.

24 changes: 8 additions & 16 deletions .github/workflows/vllm_ascend_test.yaml
@@ -196,24 +196,20 @@ jobs:
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_embedding.py

# ------------------------------------ v1 spec decode test ------------------------------------ #
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py

# All other tests, ignore: 310p test, accuracy test.
pytest -sv tests/e2e/singlecard/ \
--ignore=tests/e2e/singlecard/test_offline_inference.py \
--ignore=tests/e2e/singlecard/test_ilama_lora.py \
--ignore=tests/e2e/singlecard/test_guided_decoding.py \
--ignore=tests/e2e/singlecard/test_camem.py \
--ignore=tests/e2e/singlecard/test_embedding.py \
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
--ignore=tests/e2e/singlecard/test_offline_inference_310p.py \
--ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py
e2e-2-cards:
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
# ------------------------------------ v1 spec decode test ------------------------------------ #
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
# TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py

e2e-4-cards:
needs: [e2e]
if: ${{ needs.e2e.result == 'success' }}
strategy:
@@ -281,11 +277,7 @@ jobs:
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
pytest -sv tests/e2e/multicard/test_data_parallel.py
pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
--ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
--ignore=tests/e2e/multicard/test_data_parallel.py \
--ignore=tests/e2e/multicard/test_offline_inference_310p.py
--ignore=tests/e2e/multicard/test_data_parallel.py