diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 312a7adb11..3b4d23f078 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -7,7 +7,4 @@ self-hosted-runner: - linux-aarch64-a2-4 - linux-aarch64-a2-8 - linux-arm64-npu-static-8 - - linux-aarch64-310p-1 - - linux-aarch64-310p-2 - - linux-aarch64-310p-4 - ubuntu-24.04-arm diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 7140f262f7..0a98feb186 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -29,15 +29,35 @@ on: types: [ labeled ] workflow_dispatch: inputs: - vllm-ascend-version: - description: 'vllm-ascend:' + vllm-version: + description: 'vllm version:' required: true type: choice + # Please also update this when bump matched version # Current supported vLLM versions options: - - latest - main - default: main + - v0.10.0 + - v0.9.1 + - v0.7.3 + vllm-ascend-version: + description: 'vllm-ascend version:' + required: true + type: choice + options: + - main + - v0.9.1-dev + - v0.7.3-dev + models: + description: 'model:' + required: true + type: choice + options: + - all + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen3-8B-Base + - Qwen/Qwen3-30B-A3B + default: 'all' # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. @@ -56,27 +76,58 @@ jobs: # test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job if: >- ${{ - contains(github.event.pull_request.labels.*.name, 'accuracy-test') && + (contains(github.event.pull_request.labels.*.name, 'accuracy-test') || + contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') || + contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') || + contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }} - runs-on: ${{ matrix.runner }} + runs-on: >- + ${{ + (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') || + 'linux-aarch64-a2-1' + }} strategy: matrix: - include: - - model_name: Qwen3-8B-Base - runner: linux-aarch64-a2-1 - - model_name: Qwen2.5-VL-7B-Instruct - runner: linux-aarch64-a2-1 - - model_name: Qwen3-30B-A3B - runner: linux-aarch64-a2-2 - fail-fast: false + # the accuracy test will run: + # 1. workflow_dispatch with models input + # - all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base + # - specified but not all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base + # 2. 
PR labeled with "*-accuracy-test" + # - accuracy-test: Qwen/Qwen3-8B-Base, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-30B-A3B + # - dense-accuracy-test: Qwen/Qwen3-8B-Base + # - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct + # - moe-accuracy-test: Qwen/Qwen3-30B-A3B + model_name: ${{ fromJSON( + (github.event_name == 'schedule' && + '["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || + (github.event.inputs.models == 'all' && + '["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || + (github.event.inputs.models == 'Qwen/Qwen3-30B-A3B' && + '["Qwen/Qwen3-30B-A3B"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && + '["Qwen/Qwen2.5-VL-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && + '["Qwen/Qwen3-8B-Base"]') || + contains(github.event.pull_request.labels.*.name, 'accuracy-test') && + '["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"]' || + contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') && + '["Qwen/Qwen3-8B-Base"]' || + contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') && + '["Qwen/Qwen2.5-VL-7B-Instruct"]' || + contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') && + '["Qwen/Qwen3-30B-A3B"]' + ) }} + fail-fast: false name: ${{ matrix.model_name }} accuracy container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 env: + DATASET_SOURCE: ModelScope VLLM_USE_MODELSCOPE: True + USE_MODELSCOPE_HUB: 1 # 1. If version specified (work_dispatch), do specified branch accuracy test # 2. If no version (labeled PR), do accuracy test by default ref: # The branch, tag or SHA to checkout. When checking out the repository that @@ -88,10 +139,10 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Set model name as output - id: set_output + - name: Check npu and CANN info run: | - echo "model_name=${{ matrix.model_name }}" >> $GITHUB_OUTPUT + npu-smi info + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info - name: Config mirrors run: | @@ -110,19 +161,19 @@ jobs: uses: actions/checkout@v4 with: repository: vllm-project/vllm - ref: v0.10.0 path: ./vllm-empty + # Please also update this when bump matched version + ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }} - name: Install vllm-project/vllm from source working-directory: ./vllm-empty - run: | - VLLM_TARGET_DEVICE=empty pip install -e . + run: VLLM_TARGET_DEVICE=empty pip install -e . - name: Resolve vllm-ascend version run: | VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}" - if [[ "$VERSION_INPUT" == "latest" ]]; then + if [[ "$VERSION_INPUT" == "main" ]]; then TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||') LATEST_TAG=$(echo "$TAGS" | head -n1) if [[ -z "$LATEST_TAG" ]]; then @@ -148,8 +199,8 @@ jobs: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -r requirements-dev.txt - pip install -v -e . - + pip install -v -e . 
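Stepping outside the patch for a moment: the `model_name` matrix expression near the top of this workflow turns the trigger (schedule, `workflow_dispatch` inputs, or PR labels) into a JSON array of models. Below is a minimal Python sketch of the equivalent selection logic, restating the mapping already documented in the workflow comments; the helper name and structure are illustrative only and are not part of the CI.

```python
# Illustrative only: mirrors how the `model_name` matrix expression above maps
# schedule runs, workflow_dispatch inputs, and PR labels to the list of models
# to test. This helper is not executed by the workflow itself.
ALL_MODELS = ["Qwen/Qwen3-30B-A3B", "Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-8B-Base"]

LABEL_TO_MODELS = {
    "accuracy-test": ["Qwen/Qwen3-8B-Base", "Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"],
    "dense-accuracy-test": ["Qwen/Qwen3-8B-Base"],
    "vl-accuracy-test": ["Qwen/Qwen2.5-VL-7B-Instruct"],
    "moe-accuracy-test": ["Qwen/Qwen3-30B-A3B"],
}


def resolve_models(event_name, models_input, labels):
    """Return the model list the matrix fans out over for a given trigger."""
    if event_name == "schedule" or models_input == "all":
        return ALL_MODELS
    if models_input:  # a single model picked via workflow_dispatch
        return [models_input]
    for label, models in LABEL_TO_MODELS.items():  # first matching label wins
        if label in labels:
            return models
    return []


print(resolve_models("pull_request", "", ["ready-for-test", "vl-accuracy-test"]))
# -> ['Qwen/Qwen2.5-VL-7B-Instruct']
```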
+ - name: Get vLLM commit hash and URL working-directory: ./vllm-empty run: | @@ -162,6 +213,15 @@ jobs: VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD) echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV + - name: Print resolved hashes + run: | + echo "vLLM : ${{ env.VLLM_COMMIT }}" + echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}" + + - name: Install lm-eval, ray, and datasets + run: | + pip install lm-eval==0.4.8 + - name: Collect version info run: | for dir in /usr/local/Ascend/ascend-toolkit/*; do @@ -182,27 +242,37 @@ jobs: pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}' pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//' } >> "$GITHUB_ENV" + + - name: Print versions + run: | + echo "CANN: ${{ env.GHA_CANN_VERSION }}" + echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}" + echo "Torch: ${{ env.GHA_TORCH_VERSION }}" + echo "vLLM: ${{ env.GHA_VLLM_VERSION }}" + echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}" - - name: Run accuracy test + - name: Run Accuracy Test id: report + working-directory: ./benchmarks env: - VLLM_WORKER_MULTIPROC_METHOD: spawn - VLLM_USE_MODELSCOPE: True - VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }} - VLLM_COMMIT: ${{ env.VLLM_COMMIT }} - VLLM_ASCEND_VERSION: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }} - VLLM_ASCEND_COMMIT: ${{ env.VLLM_ASCEND_COMMIT }} - CANN_VERSION: ${{ env.GHA_CANN_VERSION }} - TORCH_VERSION: ${{ env.GHA_TORCH_VERSION }} - TORCH_NPU_VERSION: ${{ env.GHA_TORCH_NPU_VERSION }} + PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 run: | model_base_name=$(basename ${{ matrix.model_name }}) markdown_name="${model_base_name}" + echo "markdown_name=$markdown_name" echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT - mkdir -p ./benchmarks/accuracy - pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \ - --config ./tests/e2e/singlecard/models/configs/${{ matrix.model_name }}.yaml \ - --report_output ./benchmarks/accuracy/${model_base_name}.md + mkdir -p ./accuracy + + python ./scripts/run_accuracy.py \ + --model "${{ matrix.model_name }}" \ + --output "./accuracy/${markdown_name}.md" \ + --vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \ + --cann_version "${{ env.GHA_CANN_VERSION }}" \ + --torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \ + --torch_version "${{ env.GHA_TORCH_VERSION }}" \ + --vllm_version "${{ env.GHA_VLLM_VERSION }}" \ + --vllm_commit "${{ env.VLLM_COMMIT }}" \ + --vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \ - name: Generate step summary if: ${{ always() }} @@ -214,7 +284,19 @@ jobs: SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}" echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV" + - name: Check report first line for failure + id: check_report + run: | + REPORT_PATH="./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md" + echo "Scanning $REPORT_PATH for ❌ …" + if grep -q '❌' "$REPORT_PATH"; then + echo "contains_fail=true" >> $GITHUB_OUTPUT + else + echo "contains_fail=false" >> $GITHUB_OUTPUT + fi + - name: Upload Report + if: ${{ github.event_name == 'workflow_dispatch' && steps.check_report.outputs.contains_fail == 'false' }} uses: actions/upload-artifact@v4 with: name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}" @@ -223,16 +305,12 @@ jobs: retention-days: 90 overwrite: true - outputs: - model_name: ${{ steps.set_output.outputs.model_name }} - create_pr: runs-on: ubuntu-latest needs: accuracy_tests 
- if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }} + if: ${{ github.event_name == 'workflow_dispatch' }} env: UPSTREAM_REPO: vllm-project/vllm-ascend - steps: - name: Checkout repository uses: actions/checkout@v4 @@ -240,7 +318,7 @@ jobs: repository: vllm-ascend-ci/vllm-ascend token: ${{ secrets.PAT_TOKEN }} ref: main - + - name: Add upstream remote run: | git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git @@ -272,7 +350,7 @@ jobs: find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \; find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete - + - name: Update accuracy_report/index.md run: | REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report" @@ -312,10 +390,16 @@ jobs: head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`, base: '${{ github.event.inputs.vllm-ascend-version }}', title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`, - body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base) + body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: + ${{ + github.event.inputs.models == 'all' + && 'All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' + || github.event.inputs.models + }} + + - [Workflow run][1] - - [Workflow run][1] - - [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}` + [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}` }); core.info(`Created PR #${pr.data.number}`); + diff --git a/.github/workflows/release_whl.yml b/.github/workflows/release_whl.yml index d780a5fbea..2e6a44bd17 100644 --- a/.github/workflows/release_whl.yml +++ b/.github/workflows/release_whl.yml @@ -90,8 +90,7 @@ jobs: --exclude libc10.so \ --exclude libc_sec.so \ --exclude "libascend*.so" \ - --exclude "libtorch*.so" \ - --exclude "liberror_manager.so" + --exclude "libtorch*.so" done rm -f dist/*.whl mv dist/repaired/*.whl dist/ diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index 64aa0f30ae..0000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: PR Reminder Comment Bot -permissions: - pull-requests: write -on: - pull_request_target: - types: [opened] -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - with: - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM Ascend project. 
The following points will speed up your PR merge:‌‌\n\n' + - '- A PR should do only one thing, smaller PRs enable faster reviews.\n' + - '- Every PR should include unit tests and end-to-end tests ‌to ensure it works and is not broken by other future PRs.\n' + - '- Write the commit message by fulfilling the PR description to help reviewer and future developers understand.\n\n' + - 'If CI fails, you can run linting and testing checks locally according [Contributing](https://vllm-ascend.readthedocs.io/zh-cn/latest/developer_guide/contribution/index.html) and [Testing](https://vllm-ascend.readthedocs.io/zh-cn/latest/developer_guide/contribution/testing.html).' - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 43b88b0807..580559c948 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -196,13 +196,6 @@ jobs: pytest -sv tests/e2e/singlecard/test_guided_decoding.py pytest -sv tests/e2e/singlecard/test_camem.py pytest -sv tests/e2e/singlecard/test_embedding.py - - # ------------------------------------ v1 spec decode test ------------------------------------ # - pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py - # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed - pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py - - # All other tests, ignore: 310p test, accuracy test. pytest -sv tests/e2e/singlecard/ \ --ignore=tests/e2e/singlecard/test_offline_inference.py \ --ignore=tests/e2e/singlecard/test_ilama_lora.py \ @@ -210,10 +203,13 @@ jobs: --ignore=tests/e2e/singlecard/test_camem.py \ --ignore=tests/e2e/singlecard/test_embedding.py \ --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \ - --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \ - --ignore=tests/e2e/singlecard/test_offline_inference_310p.py \ - --ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py - e2e-2-cards: + --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py + # ------------------------------------ v1 spec decode test ------------------------------------ # + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py + # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py + + e2e-4-cards: needs: [e2e] if: ${{ needs.e2e.result == 'success' }} strategy: @@ -281,11 +277,7 @@ jobs: pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC pytest -sv tests/e2e/multicard/test_data_parallel.py pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \ --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \ - --ignore=tests/e2e/multicard/test_data_parallel.py \ - 
--ignore=tests/e2e/multicard/test_offline_inference_310p.py + --ignore=tests/e2e/multicard/test_data_parallel.py diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/vllm_ascend_test_310p.yaml deleted file mode 100644 index 2bd1d2db87..0000000000 --- a/.github/workflows/vllm_ascend_test_310p.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -name: 'e2e test / 310p-test' - -on: - push: - tags: - - 'v*' - schedule: - # Runs every 6 hours - - cron: '0 */6 * * *' - pull_request: - types: [ labeled ] - -# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly -# declared as "shell: bash -el {0}" on steps that need to be properly activated. -# It's used to activate ascend-toolkit environment variables. -defaults: - run: - shell: bash -el {0} - -# only cancel in-progress runs of the same workflow -# and ignore the lint / 1 card / 4 cards test type -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e: - # e2e-310p-test will be triggered when tag 'e2e-310p-test' & 'ready-for-test' or schedule job - if: >- - ${{ - (contains(github.event.pull_request.labels.*.name, 'e2e-310p-test')) && - contains(github.event.pull_request.labels.*.name, 'ready-for-test') || - github.event_name == 'schedule' || github.event_name == 'push' - }} - strategy: - max-parallel: 2 - matrix: - os: [linux-aarch64-310p-1, linux-aarch64-310p-4] - vllm_version: [main, v0.10.0] - name: 310p e2e test - runs-on: ${{ matrix.os }} - container: - # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-310p-ubuntu22.04-py3.11 - env: - VLLM_LOGGING_LEVEL: ERROR - VLLM_USE_MODELSCOPE: True - steps: - - name: Check npu and CANN info - run: | - npu-smi info - cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info - - - name: Config mirrors - run: | - sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list - pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple - pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local - apt-get update -y - apt install git -y - - - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v4 - - - name: Install system dependencies - run: | - apt-get -y install `cat packages.txt` - apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - ref: ${{ matrix.vllm_version }} - path: ./vllm-empty - - - name: Install vllm-project/vllm from source - working-directory: ./vllm-empty - run: | - VLLM_TARGET_DEVICE=empty pip install -e . 
- - - name: Install vllm-project/vllm-ascend - run: | - export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib - export SOC_VERSION=ASCEND310P3 - pip install -r requirements-dev.txt - pip install -v -e . - - - name: Run e2e test - env: - VLLM_WORKER_MULTIPROC_METHOD: spawn - VLLM_USE_MODELSCOPE: True - PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 - run: | - if [[ "${{ matrix.os }}" == "linux-aarch64-310p-1" ]]; then - pytest -sv tests/e2e/singlecard/test_offline_inference_310p.py - else - pytest -sv tests/e2e/multicard/test_offline_inference_310p.py - fi \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index a12df1e0b7..983ea003ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -54,7 +54,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi python3 -m pip cache purge # Install modelscope (for fast download) and ray (for multinode) -RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ +RUN python3 -m pip install modelscope ray && \ python3 -m pip cache purge CMD ["/bin/bash"] diff --git a/Dockerfile.310p b/Dockerfile.310p index 299624c541..1064b73ed2 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -55,7 +55,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi python3 -m pip cache purge # Install modelscope (for fast download) and ray (for multinode) -RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ +RUN python3 -m pip install modelscope ray && \ python3 -m pip cache purge CMD ["/bin/bash"] diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index ff7ec05a43..c6d34984f1 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -52,7 +52,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi python3 -m pip cache purge # Install modelscope (for fast download) and ray (for multinode) -RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ +RUN python3 -m pip install modelscope ray && \ python3 -m pip cache purge CMD ["/bin/bash"] diff --git a/Dockerfile.a3 b/Dockerfile.a3 index da1efcc41b..00fbd6b4df 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -54,7 +54,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi python3 -m pip cache purge # Install modelscope (for fast download) and ray (for multinode) -RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ +RUN python3 -m pip install modelscope ray && \ python3 -m pip cache purge CMD ["/bin/bash"] \ No newline at end of file diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index b03851ca65..64184518ed 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -51,7 +51,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi python3 -m pip cache purge # Install modelscope (for fast download) and ray (for multinode) -RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ +RUN python3 -m pip install modelscope ray && \ python3 -m pip cache purge CMD ["/bin/bash"] \ No newline at end of file diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 1146d0a00a..c30c7df68b 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -51,7 +51,7 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi python3 -m pip cache purge # Install modelscope (for fast 
download) and ray (for multinode) -RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ +RUN python3 -m pip install modelscope ray && \ python3 -m pip cache purge CMD ["/bin/bash"] diff --git a/README.md b/README.md index 8fac45b0c9..e04a551a9b 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l - Software: * Python >= 3.9, < 3.12 * CANN >= 8.2.rc1 - * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250619 * vLLM (the same version as vllm-ascend) ## Getting Started @@ -52,7 +52,6 @@ Please use the following recommended versions to get started quickly: | Version | Release type | Doc | |------------|--------------|--------------------------------------| |v0.9.2rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details| -|v0.9.1rc2|Next stable release|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details| |v0.7.3.post1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/stable/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/stable/installation.html) for more details| ## Contributing diff --git a/README.zh.md b/README.zh.md index 21fbbf0579..746fed99be 100644 --- a/README.zh.md +++ b/README.zh.md @@ -42,7 +42,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP - 软件: * Python >= 3.9, < 3.12 * CANN >= 8.2.rc1 - * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250619 * vLLM (与vllm-ascend版本一致) ## 开始使用 @@ -52,7 +52,6 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP | Version | Release type | Doc | |------------|--------------|--------------------------------------| |v0.9.2rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多| -|v0.9.1rc2| 下一个正式/稳定版 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多| |v0.7.3.post1| 最新正式/稳定版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/stable/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/stable/installation.html)了解更多| ## 贡献 diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py new file mode 100644 index 0000000000..cc2f4e22da --- /dev/null +++ b/benchmarks/scripts/run_accuracy.py @@ -0,0 +1,313 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
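+#
+# Overview: this script evaluates one model with lm-eval, running each dataset
+# in a separate child process, compares every measured score against
+# EXPECTED_VALUE within RTOL, and writes a Markdown report (with per-task
+# accuracy flags) to the path given by --output. The "Run Accuracy Test" step
+# in accuracy_test.yaml above invokes it with --model plus the version and
+# commit flags collected earlier in the workflow.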
+# + +import argparse +import gc +import json +import multiprocessing +import sys +import time +from multiprocessing import Queue + +import lm_eval +import torch + +# URLs for version information in Markdown report +VLLM_URL = "https://github.com/vllm-project/vllm/commit/" +VLLM_ASCEND_URL = "https://github.com/vllm-project/vllm-ascend/commit/" + +# Model and task configurations +UNIMODAL_MODEL_NAME = ["Qwen/Qwen3-8B-Base", "Qwen/Qwen3-30B-A3B"] +UNIMODAL_TASK = ["ceval-valid", "gsm8k"] +MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] +MULTIMODAL_TASK = ["mmmu_val"] + +# Batch size configurations per task +BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1} + +# Model type mapping (vllm for text, vllm-vlm for vision-language) +MODEL_TYPE = { + "Qwen/Qwen3-8B-Base": "vllm", + "Qwen/Qwen3-30B-A3B": "vllm", + "Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm", +} + +# Command templates for running evaluations +MODEL_RUN_INFO = { + "Qwen/Qwen3-30B-A3B": ( + "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" + "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" + ), + "Qwen/Qwen3-8B-Base": ( + "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" + "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" + ), + "Qwen/Qwen2.5-VL-7B-Instruct": ( + "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n" + "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" + "--apply_chat_template --fewshot_as_multiturn --batch_size 1" + ), +} + +# Evaluation metric filters per task +FILTER = { + "gsm8k": "exact_match,flexible-extract", + "ceval-valid": "acc,none", + "mmmu_val": "acc,none", +} + +# Expected accuracy values for models +EXPECTED_VALUE = { + "Qwen/Qwen3-30B-A3B": {"ceval-valid": 0.83, "gsm8k": 0.85}, + "Qwen/Qwen3-8B-Base": {"ceval-valid": 0.82, "gsm8k": 0.83}, + "Qwen/Qwen2.5-VL-7B-Instruct": {"mmmu_val": 0.51}, +} +PARALLEL_MODE = { + "Qwen/Qwen3-8B-Base": "TP", + "Qwen/Qwen2.5-VL-7B-Instruct": "TP", + "Qwen/Qwen3-30B-A3B": "EP", +} + +# Execution backend configuration +EXECUTION_MODE = { + "Qwen/Qwen3-8B-Base": "ACLGraph", + "Qwen/Qwen2.5-VL-7B-Instruct": "ACLGraph", + "Qwen/Qwen3-30B-A3B": "ACLGraph", +} + +# Model arguments for evaluation +MODEL_ARGS = { + "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6", + "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2", + "Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True", +} + +# Whether to apply chat template formatting +APPLY_CHAT_TEMPLATE = { + "Qwen/Qwen3-8B-Base": True, + "Qwen/Qwen2.5-VL-7B-Instruct": True, + "Qwen/Qwen3-30B-A3B": False, +} +# Few-shot examples handling as multi-turn dialogues. 
+FEWSHOT_AS_MULTITURN = { + "Qwen/Qwen3-8B-Base": True, + "Qwen/Qwen2.5-VL-7B-Instruct": True, + "Qwen/Qwen3-30B-A3B": False, +} + +# Relative tolerance for accuracy checks +RTOL = 0.03 +ACCURACY_FLAG = {} + + +def run_accuracy_test(queue, model, dataset): + """Run accuracy evaluation for a model on a dataset in separate process""" + try: + eval_params = { + "model": MODEL_TYPE[model], + "model_args": MODEL_ARGS[model], + "tasks": dataset, + "apply_chat_template": APPLY_CHAT_TEMPLATE[model], + "fewshot_as_multiturn": FEWSHOT_AS_MULTITURN[model], + "batch_size": BATCH_SIZE[dataset], + } + + if MODEL_TYPE[model] == "vllm": + eval_params["num_fewshot"] = 5 + + results = lm_eval.simple_evaluate(**eval_params) + print(f"Success: {model} on {dataset} ") + measured_value = results["results"] + queue.put(measured_value) + except Exception as e: + print(f"Error in run_accuracy_test: {e}") + queue.put(e) + sys.exit(1) + finally: + if "results" in locals(): + del results + gc.collect() + torch.npu.empty_cache() + time.sleep(5) + + +def generate_md(model_name, tasks_list, args, datasets): + """Generate Markdown report with evaluation results""" + # Format the run command + run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets) + model = model_name.split("/")[1] + + # Version information section + version_info = ( + f"**vLLM Version**: vLLM: {args.vllm_version} " + f"([{args.vllm_commit}]({VLLM_URL + args.vllm_commit})), " + f"vLLM Ascend: {args.vllm_ascend_version} " + f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL + args.vllm_ascend_commit})) " + ) + + # Report header with system info + preamble = f"""# {model} +{version_info} +**Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version} +**Hardware Environment**: Atlas A2 Series +**Datasets**: {datasets} +**Parallel Mode**: {PARALLEL_MODE[model_name]} +**Execution Mode**: {EXECUTION_MODE[model_name]} +**Command**: +```bash +{run_cmd} +``` + """ + + header = ( + "| Task | Filter | n-shot | Metric | Value | Stderr |\n" + "|-----------------------|-------:|-------:|----------|--------:|-------:|" + ) + rows = [] + rows_sub = [] + # Process results for each task + for task_dict in tasks_list: + for key, stats in task_dict.items(): + alias = stats.get("alias", key) + task_name = alias.strip() + if "exact_match,flexible-extract" in stats: + metric_key = "exact_match,flexible-extract" + else: + metric_key = None + for k in stats: + if "," in k and not k.startswith("acc_stderr"): + metric_key = k + break + if metric_key is None: + continue + metric, flt = metric_key.split(",", 1) + + value = stats[metric_key] + stderr = stats.get(f"{metric}_stderr,{flt}", 0) + if model_name in UNIMODAL_MODEL_NAME: + n_shot = "5" + else: + n_shot = "0" + flag = ACCURACY_FLAG.get(task_name, "") + row = ( + f"| {task_name:<37} " + f"| {flt:<6} " + f"| {n_shot:6} " + f"| {metric:<6} " + f"| {flag}{value:>5.4f} " + f"| ± {stderr:>5.4f} |" + ) + if not task_name.startswith("-"): + rows.append(row) + rows_sub.append( + "
" + + "\n" + + "" + + task_name + + " details" + + "" + + "\n" * 2 + + header + ) + rows_sub.append(row) + rows_sub.append("
") + # Combine all Markdown sections + md = ( + preamble + + "\n" + + header + + "\n" + + "\n".join(rows) + + "\n" + + "\n".join(rows_sub) + + "\n" + ) + print(md) + return md + + +def safe_md(args, accuracy, datasets): + """ + Safely generate and save Markdown report from accuracy results. + """ + data = json.loads(json.dumps(accuracy)) + for model_key, tasks_list in data.items(): + md_content = generate_md(model_key, tasks_list, args, datasets) + with open(args.output, "w", encoding="utf-8") as f: + f.write(md_content) + print(f"create Markdown file:{args.output}") + + +def main(args): + """Main evaluation workflow""" + accuracy = {} + accuracy[args.model] = [] + result_queue: Queue[float] = multiprocessing.Queue() + if args.model in UNIMODAL_MODEL_NAME: + datasets = UNIMODAL_TASK + else: + datasets = MULTIMODAL_TASK + datasets_str = ",".join(datasets) + # Evaluate model on each dataset + for dataset in datasets: + accuracy_expected = EXPECTED_VALUE[args.model][dataset] + p = multiprocessing.Process( + target=run_accuracy_test, args=(result_queue, args.model, dataset) + ) + p.start() + p.join() + if p.is_alive(): + p.terminate() + p.join() + gc.collect() + torch.npu.empty_cache() + time.sleep(10) + result = result_queue.get() + print(result) + if ( + accuracy_expected - RTOL + < result[dataset][FILTER[dataset]] + < accuracy_expected + RTOL + ): + ACCURACY_FLAG[dataset] = "✅" + else: + ACCURACY_FLAG[dataset] = "❌" + accuracy[args.model].append(result) + print(accuracy) + safe_md(args, accuracy, datasets_str) + + +if __name__ == "__main__": + multiprocessing.set_start_method("spawn", force=True) + # Initialize argument parser + parser = argparse.ArgumentParser( + description="Run model accuracy evaluation and generate report" + ) + parser.add_argument("--output", type=str, required=True) + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--vllm_ascend_version", type=str, required=False) + parser.add_argument("--torch_version", type=str, required=False) + parser.add_argument("--torch_npu_version", type=str, required=False) + parser.add_argument("--vllm_version", type=str, required=False) + parser.add_argument("--cann_version", type=str, required=False) + parser.add_argument("--vllm_commit", type=str, required=False) + parser.add_argument("--vllm_ascend_commit", type=str, required=False) + args = parser.parse_args() + main(args) diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index f9c56d5876..2dd13e1451 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -23,7 +23,6 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo | |-------------|--------------|------------------|-------------|--------------------|--------------| | v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | | -| v0.9.1rc2 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1| | | v0.9.1rc1 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250528 | | | v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | @@ -38,7 +37,6 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | Date | Event | |------------|-------------------------------------------| -| 2025.08.04 | Release candidates, v0.9.1rc2 | | 2025.07.11 | Release candidates, v0.9.2rc1 | | 
2025.06.22 | Release candidates, v0.9.1rc1 | | 2025.06.10 | Release candidates, v0.9.0rc2 | diff --git a/docs/source/faqs.md b/docs/source/faqs.md index a565ab818c..4192a24700 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -3,7 +3,6 @@ ## Version Specific FAQs - [[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1007) -- [[v0.9.1rc2] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1487) - [[v0.9.2rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1742) ## General FAQs diff --git a/docs/source/installation.md b/docs/source/installation.md index e3bbfdfff1..20d4379d9b 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -12,8 +12,8 @@ This document describes how to install vllm-ascend manually. | Software | Supported version | Note | |---------------|----------------------------------|-------------------------------------------| | CANN | >= 8.2.RC1 | Required for vllm-ascend and torch-npu | - | torch-npu | >= 2.7.1.dev20250724 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps | - | torch | >= 2.7.1 | Required for torch-npu and vllm | + | torch-npu | >= 2.5.1.post1.dev20250619 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps | + | torch | >= 2.5.1 | Required for torch-npu and vllm | You have 2 way to install: - **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip. @@ -78,19 +78,20 @@ source vllm-ascend-env/bin/activate pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions # Download and install the CANN package. 
-wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run -chmod +x ./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run -./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run --full +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-toolkit_8.2.rc1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-toolkit_8.2.rc1_linux-"$(uname -i)".run +./Ascend-cann-toolkit_8.2.rc1_linux-"$(uname -i)".run --full # https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.2.rc1_linux-aarch64.run source /usr/local/Ascend/ascend-toolkit/set_env.sh -wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run -chmod +x ./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run -./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run --install -wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run -chmod +x ./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run -./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run --install +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.2.rc1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-kernels-910b_8.2.rc1_linux-"$(uname -i)".run +./Ascend-cann-kernels-910b_8.2.rc1_linux-"$(uname -i)".run --install + +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-nnal_8.2.rc1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-nnal_8.2.rc1_linux-"$(uname -i)".run +./Ascend-cann-nnal_8.2.rc1_linux-"$(uname -i)".run --install source /usr/local/Ascend/nnal/atb/set_env.sh ``` diff --git a/docs/source/tutorials/single_node_300i.md b/docs/source/tutorials/single_node_300i.md index 270d002ca8..7d45bfd6bc 100644 --- a/docs/source/tutorials/single_node_300i.md +++ b/docs/source/tutorials/single_node_300i.md @@ -1,8 +1,7 @@ # Single Node (Atlas 300I series) ```{note} -1. This Atlas 300I series is currently experimental. In future versions, there may be behavioral changes around model coverage, performance improvement. -2. Currently, the 310I series only supports eager mode and the data type is float16. +This Atlas 300I series is currently experimental. In future versions, there may be behavioral changes around model coverage, performance improvement. 
``` ## Run vLLM on Altlas 300I series @@ -84,7 +83,7 @@ curl http://localhost:8000/v1/completions \ :::: -::::{tab-item} Qwen2.5-7B-Instruct +::::{tab-item} Qwen/Qwen2.5-7B-Instruct :sync: qwen7b Run the following command to start the vLLM server: @@ -114,36 +113,6 @@ curl http://localhost:8000/v1/completions \ :::: -::::{tab-item} Qwen2.5-VL-3B-Instruct -:sync: qwen-vl-2.5-3b - -Run the following command to start the vLLM server: - -```{code-block} bash - :substitutions: -vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ - --tensor-parallel-size 1 \ - --enforce-eager \ - --dtype float16 \ - --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' -``` - -Once your server is started, you can query the model with input prompts - -```bash -curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "The future of AI is", - "max_tokens": 64, - "top_p": 0.95, - "top_k": 50, - "temperature": 0.6 - }' -``` - -:::: - ::::{tab-item} Pangu-Pro-MoE-72B :sync: pangu @@ -282,49 +251,6 @@ clean_up() :::: -::::{tab-item} Qwen2.5-VL-3B-Instruct -:sync: qwen-vl-2.5-3b - -```{code-block} python - :substitutions: -from vllm import LLM, SamplingParams -import gc -import torch -from vllm import LLM, SamplingParams -from vllm.distributed.parallel_state import (destroy_distributed_environment, - destroy_model_parallel) - -def clean_up(): - destroy_model_parallel() - destroy_distributed_environment() - gc.collect() - torch.npu.empty_cache() -prompts = [ - "Hello, my name is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(max_tokens=100, top_p=0.95, top_k=50, temperature=0.6) -# Create an LLM. -llm = LLM( - model="Qwen/Qwen2.5-VL-3B-Instruct", - tensor_parallel_size=1, - enforce_eager=True, # For 300I series, only eager mode is supported. - dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series - compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series -) -# Generate texts from the prompts. -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -del llm -clean_up() -``` - -:::: - ::::{tab-item} Pangu-Pro-MoE-72B :sync: pangu diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 7e77e0def0..7e17b7700b 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,211 +1,75 @@ # Release note -## v0.9.1rc2 - 2025.08.04 -This is the 2nd release candidate of v0.9.1 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/) to get started. 
- -### Highlights -* MOE and dense w4a8 quantization support now: [#1320](https://github.com/vllm-project/vllm-ascend/pull/1320) [#1910](https://github.com/vllm-project/vllm-ascend/pull/1910) [#1275](https://github.com/vllm-project/vllm-ascend/pull/1275) [#1480](https://github.com/vllm-project/vllm-ascend/pull/1480) -* Dynamic EPLB support in [#1943](https://github.com/vllm-project/vllm-ascend/pull/1943) -* Disaggregated Prefilling support for V1 Engine and improvement, continued development and stabilization of the disaggregated prefill feature, including performance enhancements and bug fixes for single-machine setups:[#1953](https://github.com/vllm-project/vllm-ascend/pull/1953) [#1612](https://github.com/vllm-project/vllm-ascend/pull/1612) [#1361](https://github.com/vllm-project/vllm-ascend/pull/1361) [#1746](https://github.com/vllm-project/vllm-ascend/pull/1746) [#1552](https://github.com/vllm-project/vllm-ascend/pull/1552) [#1801](https://github.com/vllm-project/vllm-ascend/pull/1801) [#2083](https://github.com/vllm-project/vllm-ascend/pull/2083) [#1989](https://github.com/vllm-project/vllm-ascend/pull/1989) - -### Models improvement: -* DeepSeek DeepSeek DBO support and improvement: [#1285](https://github.com/vllm-project/vllm-ascend/pull/1285) [#1291](https://github.com/vllm-project/vllm-ascend/pull/1291) [#1328](https://github.com/vllm-project/vllm-ascend/pull/1328) [#1420](https://github.com/vllm-project/vllm-ascend/pull/1420) [#1445](https://github.com/vllm-project/vllm-ascend/pull/1445) [#1589](https://github.com/vllm-project/vllm-ascend/pull/1589) [#1759](https://github.com/vllm-project/vllm-ascend/pull/1759) [#1827](https://github.com/vllm-project/vllm-ascend/pull/1827) [#2093](https://github.com/vllm-project/vllm-ascend/pull/2093) -* DeepSeek MTP improvement and bugfix: [#1214](https://github.com/vllm-project/vllm-ascend/pull/1214) [#943](https://github.com/vllm-project/vllm-ascend/pull/943) [#1584](https://github.com/vllm-project/vllm-ascend/pull/1584) [#1473](https://github.com/vllm-project/vllm-ascend/pull/1473) [#1294](https://github.com/vllm-project/vllm-ascend/pull/1294) [#1632](https://github.com/vllm-project/vllm-ascend/pull/1632) [#1694](https://github.com/vllm-project/vllm-ascend/pull/1694) [#1840](https://github.com/vllm-project/vllm-ascend/pull/1840) [#2076](https://github.com/vllm-project/vllm-ascend/pull/2076) [#1990](https://github.com/vllm-project/vllm-ascend/pull/1990) [#2019](https://github.com/vllm-project/vllm-ascend/pull/2019) -* Qwen3 MoE support improvement and bugfix around graph mode and DP: [#1940](https://github.com/vllm-project/vllm-ascend/pull/1940) [#2006](https://github.com/vllm-project/vllm-ascend/pull/2006) [#1832](https://github.com/vllm-project/vllm-ascend/pull/1832) -* Qwen3 performance improvement around rmsnorm/repo/mlp ops: [#1545](https://github.com/vllm-project/vllm-ascend/pull/1545) [#1719](https://github.com/vllm-project/vllm-ascend/pull/1719) [#1726](https://github.com/vllm-project/vllm-ascend/pull/1726) [#1782](https://github.com/vllm-project/vllm-ascend/pull/1782) [#1745](https://github.com/vllm-project/vllm-ascend/pull/1745) -* DeepSeek MLA chunked prefill/graph mode/multistream improvement and bugfix: [#1240](https://github.com/vllm-project/vllm-ascend/pull/1240) [#933](https://github.com/vllm-project/vllm-ascend/pull/933) [#1135](https://github.com/vllm-project/vllm-ascend/pull/1135) [#1311](https://github.com/vllm-project/vllm-ascend/pull/1311) [#1750](https://github.com/vllm-project/vllm-ascend/pull/1750) 
[#1872](https://github.com/vllm-project/vllm-ascend/pull/1872) [#2170](https://github.com/vllm-project/vllm-ascend/pull/2170) [#1551](https://github.com/vllm-project/vllm-ascend/pull/1551) -* Qwen2.5 VL improvement via mrope/padding mechanism improvement: [#1261](https://github.com/vllm-project/vllm-ascend/pull/1261) [#1705](https://github.com/vllm-project/vllm-ascend/pull/1705) [#1929](https://github.com/vllm-project/vllm-ascend/pull/1929) [#2007](https://github.com/vllm-project/vllm-ascend/pull/2007) -* Ray: Fix the device error when using ray and add initialize_cache and improve warning info: [#1234](https://github.com/vllm-project/vllm-ascend/pull/1234) [#1501](https://github.com/vllm-project/vllm-ascend/pull/1501) - -### Graph mode improvement: -* Fix DeepSeek with deepseek with mc2 in [#1269](https://github.com/vllm-project/vllm-ascend/pull/1269) -* Fix accuracy problem for deepseek V3/R1 models with torchair graph in long sequence predictions in [#1332](https://github.com/vllm-project/vllm-ascend/pull/1332) -* Fix torchair_graph_batch_sizes bug in [#1570](https://github.com/vllm-project/vllm-ascend/pull/1570) -* Enable the limit of tp <= 4 for torchair graph mode in [#1404](https://github.com/vllm-project/vllm-ascend/pull/1404) -* Fix rope accruracy bug [#1887](https://github.com/vllm-project/vllm-ascend/pull/1887) -* Support multistream of shared experts in FusedMoE [#997](https://github.com/vllm-project/vllm-ascend/pull/997) -* Enable kvcache_nz for the decode process in torchair graph mode[#1098](https://github.com/vllm-project/vllm-ascend/pull/1098) -* Fix chunked-prefill with torchair case to resolve UnboundLocalError: local variable 'decode_hs_or_q_c' issue in [#1378](https://github.com/vllm-project/vllm-ascend/pull/1378) -* Improve shared experts multi-stream perf for w8a8 dynamic. in [#1561](https://github.com/vllm-project/vllm-ascend/pull/1561) -* Repair moe error when set multistream. in [#1882](https://github.com/vllm-project/vllm-ascend/pull/1882) -* Round up graph batch size to tp size in EP case [#1610](https://github.com/vllm-project/vllm-ascend/pull/1610) -* Fix torchair bug when DP is enabled in [#1727](https://github.com/vllm-project/vllm-ascend/pull/1727) -* Add extra checking to torchair_graph_config. 
in [#1675](https://github.com/vllm-project/vllm-ascend/pull/1675) -* Fix rope bug in torchair+chunk-prefill scenario in [#1693](https://github.com/vllm-project/vllm-ascend/pull/1693) -* torchair_graph bugfix when chunked_prefill is true in [#1748](https://github.com/vllm-project/vllm-ascend/pull/1748) -* Improve prefill optimization to support torchair graph mode in [#2090](https://github.com/vllm-project/vllm-ascend/pull/2090) -* Fix rank set in DP scenario [#1247](https://github.com/vllm-project/vllm-ascend/pull/1247) -* Reset all unused positions to prevent out-of-bounds to resolve GatherV3 bug in [#1397](https://github.com/vllm-project/vllm-ascend/pull/1397) -* Remove duplicate multimodal codes in ModelRunner in [#1393](https://github.com/vllm-project/vllm-ascend/pull/1393) -* Fix block table shape to resolve accuracy issue in [#1297](https://github.com/vllm-project/vllm-ascend/pull/1297) -* Implement primal full graph with limited scenario in [#1503](https://github.com/vllm-project/vllm-ascend/pull/1503) -* Restore paged attention kernel in Full Graph for performance in [#1677](https://github.com/vllm-project/vllm-ascend/pull/1677) -* Fix DeepSeek OOM issue in extreme `--gpu-memory-utilization` scenario in [#1829](https://github.com/vllm-project/vllm-ascend/pull/1829) -* Turn off aclgraph when enabling TorchAir in [#2154](https://github.com/vllm-project/vllm-ascend/pull/2154) - -### Ops improvement: -* add custom ascendc kernel vocabparallelembedding [#796](https://github.com/vllm-project/vllm-ascend/pull/796) -* fix rope sin/cos cache bug in [#1267](https://github.com/vllm-project/vllm-ascend/pull/1267) -* Refactoring AscendFusedMoE (#1229) in [#1264](https://github.com/vllm-project/vllm-ascend/pull/1264) -* Use fused ops npu_top_k_top_p in sampler [#1920](https://github.com/vllm-project/vllm-ascend/pull/1920) - -### Core: -* Upgrade CANN to 8.2.rc1 in [#2036](https://github.com/vllm-project/vllm-ascend/pull/2036) -* Upgrade torch-npu to 2.5.1.post1 in [#2135](https://github.com/vllm-project/vllm-ascend/pull/2135) -* Upgrade python to 3.11 in [#2136](https://github.com/vllm-project/vllm-ascend/pull/2136) -* Disable quantization in mindie_turbo in [#1749](https://github.com/vllm-project/vllm-ascend/pull/1749) -* fix v0 spec decode in [#1323](https://github.com/vllm-project/vllm-ascend/pull/1323) -* Enable `ACL_OP_INIT_MODE=1` directly only when using V0 spec decode in [#1271](https://github.com/vllm-project/vllm-ascend/pull/1271) -* Refactoring forward_context and model_runner_v1 in [#1422](https://github.com/vllm-project/vllm-ascend/pull/1422) -* Fix sampling params in [#1423](https://github.com/vllm-project/vllm-ascend/pull/1423) -* add a switch for enabling NZ layout in weights and enable NZ for GMM. in [#1409](https://github.com/vllm-project/vllm-ascend/pull/1409) -* Resolved bug in ascend_forward_context in [#1449](https://github.com/vllm-project/vllm-ascend/pull/1449) [#1554](https://github.com/vllm-project/vllm-ascend/pull/1554) [#1598](https://github.com/vllm-project/vllm-ascend/pull/1598) -* Address PrefillCacheHit state to fix prefix cache accuracy bug in [#1492](https://github.com/vllm-project/vllm-ascend/pull/1492) -* Fix load weight error and add new e2e case in [#1651](https://github.com/vllm-project/vllm-ascend/pull/1651) -* Optimize the number of rope-related index selections in deepseek. 
in [#1614](https://github.com/vllm-project/vllm-ascend/pull/1614) -* add mc2 mask in [#1642](https://github.com/vllm-project/vllm-ascend/pull/1642) -* Fix static EPLB log2phy condition and improve unit test in [#1667](https://github.com/vllm-project/vllm-ascend/pull/1667) [#1896](https://github.com/vllm-project/vllm-ascend/pull/1896) [#2003](https://github.com/vllm-project/vllm-ascend/pull/2003) -* add chunk mc2 for prefill in [#1703](https://github.com/vllm-project/vllm-ascend/pull/1703) -* Fix mc2 op GroupCoordinator bug in [#1711](https://github.com/vllm-project/vllm-ascend/pull/1711) -* Fix the failure to recognize the actual type of quantization in [#1721](https://github.com/vllm-project/vllm-ascend/pull/1721) -* Fix deepseek bug when tp_size == 1 in [#1755](https://github.com/vllm-project/vllm-ascend/pull/1755) -* Added support for delay-free blocks in prefill nodes in [#1691](https://github.com/vllm-project/vllm-ascend/pull/1691) -* Moe alltoallv communication optimization for unquantized RL training & alltoallv support dpo in [#1547](https://github.com/vllm-project/vllm-ascend/pull/1547) -* Adapt dispatchV2 interface in [#1822](https://github.com/vllm-project/vllm-ascend/pull/1822) -* Fix disaggregate prefill hang issue in long output in [#1807](https://github.com/vllm-project/vllm-ascend/pull/1807) -* Fix flashcomm_v1 when engine v0 in [#1859](https://github.com/vllm-project/vllm-ascend/pull/1859) -* ep_group is not equal to word_size in some cases. in [#1862](https://github.com/vllm-project/vllm-ascend/pull/1862) -* Fix wheel glibc version incompatibility in [#1808](https://github.com/vllm-project/vllm-ascend/pull/1808) -* Fix mc2 process group to resolve self.cpu_group is None in [#1831](https://github.com/vllm-project/vllm-ascend/pull/1831) -* Pin vllm version to v0.9.1 to make mypy check passed in [#1904](https://github.com/vllm-project/vllm-ascend/pull/1904) -* Apply npu_moe_gating_top_k_softmax for moe to improve perf in [#1902](https://github.com/vllm-project/vllm-ascend/pull/1902) -* Fix bug in path_decorator when engine v0 in [#1919](https://github.com/vllm-project/vllm-ascend/pull/1919) -* Avoid performing cpu all_reduce in disaggregated-prefill scenario. in [#1644](https://github.com/vllm-project/vllm-ascend/pull/1644) -* add super kernel in decode moe in [#1916](https://github.com/vllm-project/vllm-ascend/pull/1916) -* [Prefill Perf] Parallel Strategy Optimizations (VRAM-for-Speed Tradeoff) in [#1802](https://github.com/vllm-project/vllm-ascend/pull/1802) -* Remove unnecessary reduce_results access in shared_experts.down_proj in [#2016](https://github.com/vllm-project/vllm-ascend/pull/2016) -* Optimize greedy reject sampler with vectorization. 
in [#2002](https://github.com/vllm-project/vllm-ascend/pull/2002) -* Make multiple Ps and Ds work on a single machine in [#1936](https://github.com/vllm-project/vllm-ascend/pull/1936) -* Fixes the shape conflicts between shared & routed experts for deepseek model when tp > 1 and multistream_moe enabled in [#2075](https://github.com/vllm-project/vllm-ascend/pull/2075) -* Add cpu binding support [#2031](https://github.com/vllm-project/vllm-ascend/pull/2031) -* Add with_prefill cpu allreduce to handle D-node recomputatio in [#2129](https://github.com/vllm-project/vllm-ascend/pull/2129) -* Add D2H & initRoutingQuantV2 to improve prefill perf in [#2038](https://github.com/vllm-project/vllm-ascend/pull/2038) - -### Docs: -* Provide an e2e guide for execute duration profiling [#1113](https://github.com/vllm-project/vllm-ascend/pull/1113) -* Add Referer header for CANN package download url. [#1192](https://github.com/vllm-project/vllm-ascend/pull/1192) -* Add reinstall instructions doc [#1370](https://github.com/vllm-project/vllm-ascend/pull/1370) -* Update Disaggregate prefill README [#1379](https://github.com/vllm-project/vllm-ascend/pull/1379) -* Disaggregate prefill for kv cache register style [#1296](https://github.com/vllm-project/vllm-ascend/pull/1296) -* Fix errors and non-standard parts in examples/disaggregate_prefill_v1/README.md in [#1965](https://github.com/vllm-project/vllm-ascend/pull/1965) - -### Known Issues -* Full graph mode support are not yet available for specific hardware types with full_cuda_graphenable. [#2182](https://github.com/vllm-project/vllm-ascend/issues/2182) -* Qwen3 MoE aclgraph mode with tp failed when enable ep due to bincount error [#2226](https://github.com/vllm-project/vllm-ascend/issues/2226) -* As mentioend in v0.9.1rc1 release note, Altlas 300I series support will NOT be included. - ## v0.9.2rc1 - 2025.07.11 This is the 1st release candidate of v0.9.2 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started. From this release, V1 engine will be enabled by default, there is no need to set `VLLM_USE_V1=1` any more. And this release is the last version to support V0 engine, V0 code will be clean up in the future. ### Highlights -* Pooling model works with V1 engine now. You can take a try with Qwen3 embedding model [#1359](https://github.com/vllm-project/vllm-ascend/pull/1359). -* The performance on Atlas 300I series has been improved. [#1591](https://github.com/vllm-project/vllm-ascend/pull/1591) -* aclgraph mode works with Moe models now. Currently, only Qwen3 Moe is well tested. [#1381](https://github.com/vllm-project/vllm-ascend/pull/1381) +- Pooling model works with V1 engine now. You can take a try with Qwen3 embedding model [#1359](https://github.com/vllm-project/vllm-ascend/pull/1359). +- The performance on Atlas 300I series has been improved. [#1591](https://github.com/vllm-project/vllm-ascend/pull/1591) +- aclgraph mode works with Moe models now. Currently, only Qwen3 Moe is well tested. [#1381](https://github.com/vllm-project/vllm-ascend/pull/1381) ### Core -* Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1.dev20250619`. Don’t forget to update it in your environment. [#1347](https://github.com/vllm-project/vllm-ascend/pull/1347) -* The GatherV3 error has been fixed with aclgraph mode. [#1416](https://github.com/vllm-project/vllm-ascend/pull/1416) -* W8A8 quantization works on Atlas 300I series now. 
[#1560](https://github.com/vllm-project/vllm-ascend/pull/1560) -* Fix the accuracy problem with deploy models with parallel parameters. [#1678](https://github.com/vllm-project/vllm-ascend/pull/1678) -* The pre-built wheel package now requires lower version of glibc. Users can use it by `pip install vllm-ascend` directly. [#1582](https://github.com/vllm-project/vllm-ascend/pull/1582) +- Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1.dev20250619`. Don’t forget to update it in your environment. [#1347](https://github.com/vllm-project/vllm-ascend/pull/1347) +- The **GatherV3** error has been fixed with **aclgraph** mode. [#1416](https://github.com/vllm-project/vllm-ascend/pull/1416) +- W8A8 quantization works on Atlas 300I series now. [#1560](https://github.com/vllm-project/vllm-ascend/pull/1560) +- Fix the accuracy problem with deploy models with parallel parameters. [#1678](https://github.com/vllm-project/vllm-ascend/pull/1678) +- The pre-built wheel package now requires lower version of glibc. Users can use it by `pip install vllm-ascend` directly. [#1582](https://github.com/vllm-project/vllm-ascend/pull/1582) ## Other -* Official doc has been updated for better read experience. For example, more deployment tutorials are added, user/developer docs are updated. More guide will coming soon. -* Fix accuracy problem for deepseek V3/R1 models with torchair graph in long sequence predictions. [#1331](https://github.com/vllm-project/vllm-ascend/pull/1331) -* A new env variable `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` has been added. It enables the fused allgather-experts kernel for Deepseek V3/R1 models. The default value is `0`. [#1335](https://github.com/vllm-project/vllm-ascend/pull/1335) -* A new env variable `VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION` has been added to improve the performance of topk-topp sampling. The default value is 0, we'll consider to enable it by default in the future[#1732](https://github.com/vllm-project/vllm-ascend/pull/1732) -* A batch of bugs have been fixed for Data Parallelism case [#1273](https://github.com/vllm-project/vllm-ascend/pull/1273) [#1322](https://github.com/vllm-project/vllm-ascend/pull/1322) [#1275](https://github.com/vllm-project/vllm-ascend/pull/1275) [#1478](https://github.com/vllm-project/vllm-ascend/pull/1478) -* The DeepSeek performance has been improved. [#1194](https://github.com/vllm-project/vllm-ascend/pull/1194) [#1395](https://github.com/vllm-project/vllm-ascend/pull/1395) [#1380](https://github.com/vllm-project/vllm-ascend/pull/1380) -* Ascend scheduler works with prefix cache now. [#1446](https://github.com/vllm-project/vllm-ascend/pull/1446) -* DeepSeek now works with prefix cache now. 
[#1498](https://github.com/vllm-project/vllm-ascend/pull/1498) -* Support prompt logprobs to recover ceval accuracy in V1 [#1483](https://github.com/vllm-project/vllm-ascend/pull/1483) - -## Knowissue - -* Pipeline parallel does not work with ray and graph mode: https://github.com/vllm-project/vllm-ascend/issues/1751 https://github.com/vllm-project/vllm-ascend/issues/1754 - -## New Contributors -* @xleoken made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1357 -* @lyj-jjj made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1335 -* @sharonyunyun made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1194 -* @Pr0Wh1teGivee made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1308 -* @leo-pony made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1374 -* @zeshengzong made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1452 -* @GDzhu01 made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1477 -* @Agonixiaoxiao made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1531 -* @zhanghw0354 made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1476 -* @farawayboat made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1591 -* @ZhengWG made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1196 -* @wm901115nwpu made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1654 - -**Full Changelog**: https://github.com/vllm-project/vllm-ascend/compare/v0.9.1rc1...v0.9.2rc1 +- Official doc has been updated for better read experience. For example, more deployment tutorials are added, user/developer docs are updated. More guide will coming soon. +- Fix accuracy problem for deepseek V3/R1 models with torchair graph in long sequence predictions. [#1331](https://github.com/vllm-project/vllm-ascend/pull/1331) +- A new env variable `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` has been added. It enables the fused allgather-experts kernel for Deepseek V3/R1 models. The default value is `0`. [#1335](https://github.com/vllm-project/vllm-ascend/pull/1335) +- A new env variable `VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION` has been added to improve the performance of topk-topp sampling. The default value is 0, we'll consider to enable it by default in the future[#1732](https://github.com/vllm-project/vllm-ascend/pull/1732) +- A batch of bugs have been fixed for Data Parallelism case [#1273](https://github.com/vllm-project/vllm-ascend/pull/1273) [#1322](https://github.com/vllm-project/vllm-ascend/pull/1322) [#1275](https://github.com/vllm-project/vllm-ascend/pull/1275) [#1478](https://github.com/vllm-project/vllm-ascend/pull/1478) +- The DeepSeek performance has been improved. [#1194](https://github.com/vllm-project/vllm-ascend/pull/1194) [#1395](https://github.com/vllm-project/vllm-ascend/pull/1395) [#1380](https://github.com/vllm-project/vllm-ascend/pull/1380) +- Ascend scheduler works with prefix cache now. [#1446](https://github.com/vllm-project/vllm-ascend/pull/1446) +- DeepSeek now works with prefix cache now. [#1498](https://github.com/vllm-project/vllm-ascend/pull/1498) +- Support prompt logprobs to recover ceval accuracy in V1 [#1483](https://github.com/vllm-project/vllm-ascend/pull/1483) ## v0.9.1rc1 - 2025.06.22 This is the 1st release candidate of v0.9.1 for vLLM Ascend. 
Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started. -### Experimental - -* Atlas 300I series is experimental supported in this release (Functional test passed with Qwen2.5-7b-instruct/Qwen2.5-0.5b/Qwen3-0.6B/Qwen3-4B/Qwen3-8B). [#1333](https://github.com/vllm-project/vllm-ascend/pull/1333) -* Support EAGLE-3 for speculative decoding. [#1032](https://github.com/vllm-project/vllm-ascend/pull/1032) +### Highlights -After careful consideration, above features **will NOT be included in v0.9.1-dev branch (v0.9.1 final release)** taking into account the v0.9.1 release quality and the feature rapid iteration. We will improve this from 0.9.2rc1 and later. +- Atlas 300I series is experimental supported in this release. [#1333](https://github.com/vllm-project/vllm-ascend/pull/1333) After careful consideration, this feature **will NOT be included in v0.9.1-dev branch** taking into account the v0.9.1 release quality and the feature rapid iteration to improve performance on Atlas 300I series. We will improve this from 0.9.2rc1 and later. +- Support EAGLE-3 for speculative decoding. [#1032](https://github.com/vllm-project/vllm-ascend/pull/1032) ### Core -* Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1.dev20250528`. Don’t forget to update it in your environment. [#1235](https://github.com/vllm-project/vllm-ascend/pull/1235) -* Support Atlas 300I series container image. You can get it from [quay.io](https://quay.io/repository/vllm/vllm-ascend) -* Fix token-wise padding mechanism to make multi-card graph mode work. [#1300](https://github.com/vllm-project/vllm-ascend/pull/1300) -* Upgrade vLLM to 0.9.1 [#1165]https://github.com/vllm-project/vllm-ascend/pull/1165 +- Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1.dev20250528`. Don’t forget to update it in your environment. [#1235](https://github.com/vllm-project/vllm-ascend/pull/1235) +- Support Atlas 300I series container image. You can get it from [quay.io](https://quay.io/repository/vllm/vllm-ascend) +- Fix token-wise padding mechanism to make multi-card graph mode work. [#1300](https://github.com/vllm-project/vllm-ascend/pull/1300) +- Upgrade vllm to 0.9.1 [#1165]https://github.com/vllm-project/vllm-ascend/pull/1165 ### Other Improvements -* Initial support Chunked Prefill for MLA. [#1172](https://github.com/vllm-project/vllm-ascend/pull/1172) -* An example of best practices to run DeepSeek with ETP has been added. [#1101](https://github.com/vllm-project/vllm-ascend/pull/1101) -* Performance improvements for DeepSeek using the TorchAir graph. [#1098](https://github.com/vllm-project/vllm-ascend/pull/1098), [#1131](https://github.com/vllm-project/vllm-ascend/pull/1131) -* Supports the speculative decoding feature with AscendScheduler. [#943](https://github.com/vllm-project/vllm-ascend/pull/943) -* Improve `VocabParallelEmbedding` custom op performance. It will be enabled in the next release. [#796](https://github.com/vllm-project/vllm-ascend/pull/796) -* Fixed a device discovery and setup bug when running vLLM Ascend on Ray [#884](https://github.com/vllm-project/vllm-ascend/pull/884) -* DeepSeek with [MC2](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/atlas_ascendc_best_practices_10_0043.html) (Merged Compute and Communication) now works properly. [#1268](https://github.com/vllm-project/vllm-ascend/pull/1268) -* Fixed log2phy NoneType bug with static EPLB feature. 
[#1186](https://github.com/vllm-project/vllm-ascend/pull/1186) -* Improved performance for DeepSeek with DBO enabled. [#997](https://github.com/vllm-project/vllm-ascend/pull/997), [#1135](https://github.com/vllm-project/vllm-ascend/pull/1135) -* Refactoring AscendFusedMoE [#1229](https://github.com/vllm-project/vllm-ascend/pull/1229) -* Add initial user stories page (include LLaMA-Factory/TRL/verl/MindIE Turbo/GPUStack) [#1224](https://github.com/vllm-project/vllm-ascend/pull/1224) -* Add unit test framework [#1201](https://github.com/vllm-project/vllm-ascend/pull/1201) +- Initial support Chunked Prefill for MLA. [#1172](https://github.com/vllm-project/vllm-ascend/pull/1172) +- An example of best practices to run DeepSeek with ETP has been added. [#1101](https://github.com/vllm-project/vllm-ascend/pull/1101) +- Performance improvements for DeepSeek using the TorchAir graph. [#1098](https://github.com/vllm-project/vllm-ascend/pull/1098), [#1131](https://github.com/vllm-project/vllm-ascend/pull/1131) +- Supports the speculative decoding feature with AscendScheduler. [#943](https://github.com/vllm-project/vllm-ascend/pull/943) +- Improve `VocabParallelEmbedding` custom op performance. It will be enabled in the next release. [#796](https://github.com/vllm-project/vllm-ascend/pull/796) +- Fixed a device discovery and setup bug when running vLLM Ascend on Ray [#884](https://github.com/vllm-project/vllm-ascend/pull/884) +- DeepSeek with [MC2](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/atlas_ascendc_best_practices_10_0043.html) (Merged Compute and Communication) now works properly. [#1268](https://github.com/vllm-project/vllm-ascend/pull/1268) +- Fixed log2phy NoneType bug with static EPLB feature. [#1186](https://github.com/vllm-project/vllm-ascend/pull/1186) +- Improved performance for DeepSeek with DBO enabled. [#997](https://github.com/vllm-project/vllm-ascend/pull/997), [#1135](https://github.com/vllm-project/vllm-ascend/pull/1135) +- Refactoring AscendFusedMoE [#1229](https://github.com/vllm-project/vllm-ascend/pull/1229) +- Add initial user stories page (include LLaMA-Factory/TRL/verl/MindIE Turbo/GPUStack) [#1224](https://github.com/vllm-project/vllm-ascend/pull/1224) +- Add unit test framework [#1201](https://github.com/vllm-project/vllm-ascend/pull/1201) ### Known Issues -* In some cases, the vLLM process may crash with a **GatherV3** error when **aclgraph** is enabled. We are working on this issue and will fix it in the next release. [#1038](https://github.com/vllm-project/vllm-ascend/issues/1038) -* Prefix cache feature does not work with the Ascend Scheduler but without chunked prefill enabled. This will be fixed in the next release. [#1350](https://github.com/vllm-project/vllm-ascend/issues/1350) +- In some cases, the vLLM process may crash with a **GatherV3** error when **aclgraph** is enabled. We are working on this issue and will fix it in the next release. [#1038](https://github.com/vllm-project/vllm-ascend/issues/1038) +- Prefix cache feature does not work with the Ascend Scheduler but without chunked prefill enabled. This will be fixed in the next release. 
[#1350](https://github.com/vllm-project/vllm-ascend/issues/1350) ### Full Changelog https://github.com/vllm-project/vllm-ascend/compare/v0.9.0rc2...v0.9.1rc1 -## New Contributors -* @farawayboat made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1333 -* @yzim made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1159 -* @chenwaner made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1098 -* @wangyanhui-cmss made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1184 -* @songshanhu07 made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1186 -* @yuancaoyaoHW made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1032 - -**Full Changelog**: https://github.com/vllm-project/vllm-ascend/compare/v0.9.0rc2...v0.9.1rc1 - ## v0.9.0rc2 - 2025.06.10 This release contains some quick fixes for v0.9.0rc1. Please use this release instead of v0.9.0rc1. ### Highlights -* Fix the import error when vllm-ascend is installed without editable way. [#1152](https://github.com/vllm-project/vllm-ascend/pull/1152) +- Fix the import error when vllm-ascend is installed without editable way. [#1152](https://github.com/vllm-project/vllm-ascend/pull/1152) ## v0.9.0rc1 - 2025.06.09 @@ -213,37 +77,37 @@ This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow the [ ### Highlights -* DeepSeek works with graph mode now. Follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html) to take a try. [#789](https://github.com/vllm-project/vllm-ascend/pull/789) -* Qwen series models works with graph mode now. It works by default with V1 Engine. Please note that in this release, only Qwen series models are well tested with graph mode. We'll make it stable and generalize in the next release. If you hit any issues, please feel free to open an issue on GitHub and fallback to eager mode temporarily by set `enforce_eager=True` when initializing the model. +- DeepSeek works with graph mode now. Follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html) to take a try. [#789](https://github.com/vllm-project/vllm-ascend/pull/789) +- Qwen series models works with graph mode now. It works by default with V1 Engine. Please note that in this release, only Qwen series models are well tested with graph mode. We'll make it stable and generalize in the next release. If you hit any issues, please feel free to open an issue on GitHub and fallback to eager mode temporarily by set `enforce_eager=True` when initializing the model. ### Core -* The performance of multi-step scheduler has been improved. Thanks for the contribution from China Merchants Bank. [#814](https://github.com/vllm-project/vllm-ascend/pull/814) -* LoRA、Multi-LoRA And Dynamic Serving is supported for V1 Engine now. Thanks for the contribution from China Merchants Bank. [#893](https://github.com/vllm-project/vllm-ascend/pull/893) -* Prefix cache and chunked prefill feature works now [#782](https://github.com/vllm-project/vllm-ascend/pull/782) [#844](https://github.com/vllm-project/vllm-ascend/pull/844) -* Spec decode and MTP features work with V1 Engine now. [#874](https://github.com/vllm-project/vllm-ascend/pull/874) [#890](https://github.com/vllm-project/vllm-ascend/pull/890) -* DP feature works with DeepSeek now. 
[#1012](https://github.com/vllm-project/vllm-ascend/pull/1012) -* Input embedding feature works with V0 Engine now. [#916](https://github.com/vllm-project/vllm-ascend/pull/916) -* Sleep mode feature works with V1 Engine now. [#1084](https://github.com/vllm-project/vllm-ascend/pull/1084) +- The performance of multi-step scheduler has been improved. Thanks for the contribution from China Merchants Bank. [#814](https://github.com/vllm-project/vllm-ascend/pull/814) +- LoRA、Multi-LoRA And Dynamic Serving is supported for V1 Engine now. Thanks for the contribution from China Merchants Bank. [#893](https://github.com/vllm-project/vllm-ascend/pull/893) +- Prefix cache and chunked prefill feature works now [#782](https://github.com/vllm-project/vllm-ascend/pull/782) [#844](https://github.com/vllm-project/vllm-ascend/pull/844) +- Spec decode and MTP features work with V1 Engine now. [#874](https://github.com/vllm-project/vllm-ascend/pull/874) [#890](https://github.com/vllm-project/vllm-ascend/pull/890) +- DP feature works with DeepSeek now. [#1012](https://github.com/vllm-project/vllm-ascend/pull/1012) +- Input embedding feature works with V0 Engine now. [#916](https://github.com/vllm-project/vllm-ascend/pull/916) +- Sleep mode feature works with V1 Engine now. [#1084](https://github.com/vllm-project/vllm-ascend/pull/1084) ### Model -* Qwen2.5 VL works with V1 Engine now. [#736](https://github.com/vllm-project/vllm-ascend/pull/736) -* LLama4 works now. [#740](https://github.com/vllm-project/vllm-ascend/pull/740) -* A new kind of DeepSeek model called dual-batch overlap(DBO) is added. Please set `VLLM_ASCEND_ENABLE_DBO=1` to use it. [#941](https://github.com/vllm-project/vllm-ascend/pull/941) +- Qwen2.5 VL works with V1 Engine now. [#736](https://github.com/vllm-project/vllm-ascend/pull/736) +- LLama4 works now. [#740](https://github.com/vllm-project/vllm-ascend/pull/740) +- A new kind of DeepSeek model called dual-batch overlap(DBO) is added. Please set `VLLM_ASCEND_ENABLE_DBO=1` to use it. [#941](https://github.com/vllm-project/vllm-ascend/pull/941) ### Other -* online serve with ascend quantization works now. [#877](https://github.com/vllm-project/vllm-ascend/pull/877) -* A batch of bugs for graph mode and moe model have been fixed. [#773](https://github.com/vllm-project/vllm-ascend/pull/773) [#771](https://github.com/vllm-project/vllm-ascend/pull/771) [#774](https://github.com/vllm-project/vllm-ascend/pull/774) [#816](https://github.com/vllm-project/vllm-ascend/pull/816) [#817](https://github.com/vllm-project/vllm-ascend/pull/817) [#819](https://github.com/vllm-project/vllm-ascend/pull/819) [#912](https://github.com/vllm-project/vllm-ascend/pull/912) [#897](https://github.com/vllm-project/vllm-ascend/pull/897) [#961](https://github.com/vllm-project/vllm-ascend/pull/961) [#958](https://github.com/vllm-project/vllm-ascend/pull/958) [#913](https://github.com/vllm-project/vllm-ascend/pull/913) [#905](https://github.com/vllm-project/vllm-ascend/pull/905) -* A batch of performance improvement PRs have been merged. 
[#784](https://github.com/vllm-project/vllm-ascend/pull/784) [#803](https://github.com/vllm-project/vllm-ascend/pull/803) [#966](https://github.com/vllm-project/vllm-ascend/pull/966) [#839](https://github.com/vllm-project/vllm-ascend/pull/839) [#970](https://github.com/vllm-project/vllm-ascend/pull/970) [#947](https://github.com/vllm-project/vllm-ascend/pull/947) [#987](https://github.com/vllm-project/vllm-ascend/pull/987) [#1085](https://github.com/vllm-project/vllm-ascend/pull/1085) -* From this release, binary wheel package will be released as well. [#775](https://github.com/vllm-project/vllm-ascend/pull/775) -* The contributor doc site is [added](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) +- online serve with ascend quantization works now. [#877](https://github.com/vllm-project/vllm-ascend/pull/877) +- A batch of bugs for graph mode and moe model have been fixed. [#773](https://github.com/vllm-project/vllm-ascend/pull/773) [#771](https://github.com/vllm-project/vllm-ascend/pull/771) [#774](https://github.com/vllm-project/vllm-ascend/pull/774) [#816](https://github.com/vllm-project/vllm-ascend/pull/816) [#817](https://github.com/vllm-project/vllm-ascend/pull/817) [#819](https://github.com/vllm-project/vllm-ascend/pull/819) [#912](https://github.com/vllm-project/vllm-ascend/pull/912) [#897](https://github.com/vllm-project/vllm-ascend/pull/897) [#961](https://github.com/vllm-project/vllm-ascend/pull/961) [#958](https://github.com/vllm-project/vllm-ascend/pull/958) [#913](https://github.com/vllm-project/vllm-ascend/pull/913) [#905](https://github.com/vllm-project/vllm-ascend/pull/905) +- A batch of performance improvement PRs have been merged. [#784](https://github.com/vllm-project/vllm-ascend/pull/784) [#803](https://github.com/vllm-project/vllm-ascend/pull/803) [#966](https://github.com/vllm-project/vllm-ascend/pull/966) [#839](https://github.com/vllm-project/vllm-ascend/pull/839) [#970](https://github.com/vllm-project/vllm-ascend/pull/970) [#947](https://github.com/vllm-project/vllm-ascend/pull/947) [#987](https://github.com/vllm-project/vllm-ascend/pull/987) [#1085](https://github.com/vllm-project/vllm-ascend/pull/1085) +- From this release, binary wheel package will be released as well. [#775](https://github.com/vllm-project/vllm-ascend/pull/775) +- The contributor doc site is [added](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) ### Known Issue -* In some case, vLLM process may be crashed with aclgraph enabled. We're working this issue and it'll be fixed in the next release. -* Multi node data-parallel doesn't work with this release. This is a known issue in vllm and has been fixed on main branch. [#18981](https://github.com/vllm-project/vllm/pull/18981) +- In some case, vLLM process may be crashed with aclgraph enabled. We're working this issue and it'll be fixed in the next release. +- Multi node data-parallel doesn't work with this release. This is a known issue in vllm and has been fixed on main branch. [#18981](https://github.com/vllm-project/vllm/pull/18981) ## v0.7.3.post1 - 2025.05.29 @@ -251,21 +115,21 @@ This is the first post release of 0.7.3. Please follow the [official doc](https: ### Highlights -* Qwen3 and Qwen3MOE is supported now. The performance and accuracy of Qwen3 is well tested. You can try it now. Mindie Turbo is recomanded to improve the performance of Qwen3. 
[#903](https://github.com/vllm-project/vllm-ascend/pull/903) [#915](https://github.com/vllm-project/vllm-ascend/pull/915) -* Added a new performance guide. The guide aims to help users to improve vllm-ascend performance on system level. It includes OS configuration, library optimization, deploy guide and so on. [#878](https://github.com/vllm-project/vllm-ascend/pull/878) [Doc Link](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/developer_guide/performance/optimization_and_tuning.html) +- Qwen3 and Qwen3MOE is supported now. The performance and accuracy of Qwen3 is well tested. You can try it now. Mindie Turbo is recomanded to improve the performance of Qwen3. [#903](https://github.com/vllm-project/vllm-ascend/pull/903) [#915](https://github.com/vllm-project/vllm-ascend/pull/915) +- Added a new performance guide. The guide aims to help users to improve vllm-ascend performance on system level. It includes OS configuration, library optimization, deploy guide and so on. [#878](https://github.com/vllm-project/vllm-ascend/pull/878) [Doc Link](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/developer_guide/performance/optimization_and_tuning.html) ### Bug Fix -* Qwen2.5-VL works for RLHF scenarios now. [#928](https://github.com/vllm-project/vllm-ascend/pull/928) -* Users can launch the model from online weights now. e.g. from huggingface or modelscope directly [#858](https://github.com/vllm-project/vllm-ascend/pull/858) [#918](https://github.com/vllm-project/vllm-ascend/pull/918) -* The meaningless log info `UserWorkspaceSize0` has been cleaned. [#911](https://github.com/vllm-project/vllm-ascend/pull/911) -* The log level for `Failed to import vllm_ascend_C` has been changed to `warning` instead of `error`. [#956](https://github.com/vllm-project/vllm-ascend/pull/956) -* DeepSeek MLA now works with chunked prefill in V1 Engine. Please note that V1 engine in 0.7.3 is just expermential and only for test usage. [#849](https://github.com/vllm-project/vllm-ascend/pull/849) [#936](https://github.com/vllm-project/vllm-ascend/pull/936) +- Qwen2.5-VL works for RLHF scenarios now. [#928](https://github.com/vllm-project/vllm-ascend/pull/928) +- Users can launch the model from online weights now. e.g. from huggingface or modelscope directly [#858](https://github.com/vllm-project/vllm-ascend/pull/858) [#918](https://github.com/vllm-project/vllm-ascend/pull/918) +- The meaningless log info `UserWorkspaceSize0` has been cleaned. [#911](https://github.com/vllm-project/vllm-ascend/pull/911) +- The log level for `Failed to import vllm_ascend_C` has been changed to `warning` instead of `error`. [#956](https://github.com/vllm-project/vllm-ascend/pull/956) +- DeepSeek MLA now works with chunked prefill in V1 Engine. Please note that V1 engine in 0.7.3 is just expermential and only for test usage. [#849](https://github.com/vllm-project/vllm-ascend/pull/849) [#936](https://github.com/vllm-project/vllm-ascend/pull/936) ### Docs -* The benchmark doc is updated for Qwen2.5 and Qwen2.5-VL [#792](https://github.com/vllm-project/vllm-ascend/pull/792) -* Add the note to clear that only "modelscope<1.23.0" works with 0.7.3. [#954](https://github.com/vllm-project/vllm-ascend/pull/954) +- The benchmark doc is updated for Qwen2.5 and Qwen2.5-VL [#792](https://github.com/vllm-project/vllm-ascend/pull/792) +- Add the note to clear that only "modelscope<1.23.0" works with 0.7.3. [#954](https://github.com/vllm-project/vllm-ascend/pull/954) ## v0.7.3 - 2025.05.08 @@ -274,66 +138,66 @@ This is the first post release of 0.7.3. 
Please follow the [official doc](https: We are excited to announce the release of 0.7.3 for vllm-ascend. This is the first official release. The functionality, performance, and stability of this release are fully tested and verified. We encourage you to try it out and provide feedback. We'll post bug fix versions in the future if needed. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey. ### Highlights -* This release includes all features landed in the previous release candidates ([v0.7.1rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.1rc1), [v0.7.3rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc1), [v0.7.3rc2](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc2)). And all the features are fully tested and verified. Visit the official doc the get the detail [feature](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/suppoted_features.html) and [model](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/supported_models.html) support matrix. -* Upgrade CANN to 8.1.RC1 to enable chunked prefill and automatic prefix caching features. You can now enable them now. -* Upgrade PyTorch to 2.5.1. vLLM Ascend no longer relies on the dev version of torch-npu now. Now users don't need to install the torch-npu by hand. The 2.5.1 version of torch-npu will be installed automatically. [#662](https://github.com/vllm-project/vllm-ascend/pull/662) -* Integrate MindIE Turbo into vLLM Ascend to improve DeepSeek V3/R1, Qwen 2 series performance. [#708](https://github.com/vllm-project/vllm-ascend/pull/708) +- This release includes all features landed in the previous release candidates ([v0.7.1rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.1rc1), [v0.7.3rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc1), [v0.7.3rc2](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc2)). And all the features are fully tested and verified. Visit the official doc the get the detail [feature](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/suppoted_features.html) and [model](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/supported_models.html) support matrix. +- Upgrade CANN to 8.1.RC1 to enable chunked prefill and automatic prefix caching features. You can now enable them now. +- Upgrade PyTorch to 2.5.1. vLLM Ascend no longer relies on the dev version of torch-npu now. Now users don't need to install the torch-npu by hand. The 2.5.1 version of torch-npu will be installed automatically. [#662](https://github.com/vllm-project/vllm-ascend/pull/662) +- Integrate MindIE Turbo into vLLM Ascend to improve DeepSeek V3/R1, Qwen 2 series performance. [#708](https://github.com/vllm-project/vllm-ascend/pull/708) ### Core -* LoRA、Multi-LoRA And Dynamic Serving is supported now. The performance will be improved in the next release. Please follow the official doc for more usage information. Thanks for the contribution from China Merchants Bank. [#700](https://github.com/vllm-project/vllm-ascend/pull/700) +- LoRA、Multi-LoRA And Dynamic Serving is supported now. The performance will be improved in the next release. Please follow the official doc for more usage information. Thanks for the contribution from China Merchants Bank. [#700](https://github.com/vllm-project/vllm-ascend/pull/700) ### Model -* The performance of Qwen2 vl and Qwen2.5 vl is improved. 
[#702](https://github.com/vllm-project/vllm-ascend/pull/702) -* The performance of `apply_penalties` and `topKtopP` ops are improved. [#525](https://github.com/vllm-project/vllm-ascend/pull/525) +- The performance of Qwen2 vl and Qwen2.5 vl is improved. [#702](https://github.com/vllm-project/vllm-ascend/pull/702) +- The performance of `apply_penalties` and `topKtopP` ops are improved. [#525](https://github.com/vllm-project/vllm-ascend/pull/525) ### Other -* Fixed a issue that may lead CPU memory leak. [#691](https://github.com/vllm-project/vllm-ascend/pull/691) [#712](https://github.com/vllm-project/vllm-ascend/pull/712) -* A new environment `SOC_VERSION` is added. If you hit any soc detection error when building with custom ops enabled, please set `SOC_VERSION` to a suitable value. [#606](https://github.com/vllm-project/vllm-ascend/pull/606) -* openEuler container image supported with v0.7.3-openeuler tag. [#665](https://github.com/vllm-project/vllm-ascend/pull/665) -* Prefix cache feature works on V1 engine now. [#559](https://github.com/vllm-project/vllm-ascend/pull/559) +- Fixed a issue that may lead CPU memory leak. [#691](https://github.com/vllm-project/vllm-ascend/pull/691) [#712](https://github.com/vllm-project/vllm-ascend/pull/712) +- A new environment `SOC_VERSION` is added. If you hit any soc detection error when building with custom ops enabled, please set `SOC_VERSION` to a suitable value. [#606](https://github.com/vllm-project/vllm-ascend/pull/606) +- openEuler container image supported with v0.7.3-openeuler tag. [#665](https://github.com/vllm-project/vllm-ascend/pull/665) +- Prefix cache feature works on V1 engine now. [#559](https://github.com/vllm-project/vllm-ascend/pull/559) ## v0.8.5rc1 - 2025.05.06 This is the 1st release candidate of v0.8.5 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. Now you can enable V1 egnine by setting the environment variable `VLLM_USE_V1=1`, see the feature support status of vLLM Ascend in [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html). ### Highlights -* Upgrade CANN version to 8.1.RC1 to support chunked prefill and automatic prefix caching (`--enable_prefix_caching`) when V1 is enabled [#747](https://github.com/vllm-project/vllm-ascend/pull/747) -* Optimize Qwen2 VL and Qwen 2.5 VL [#701](https://github.com/vllm-project/vllm-ascend/pull/701) -* Improve Deepseek V3 eager mode and graph mode performance, now you can use --additional_config={'enable_graph_mode': True} to enable graph mode. [#598](https://github.com/vllm-project/vllm-ascend/pull/598) [#719](https://github.com/vllm-project/vllm-ascend/pull/719) +- Upgrade CANN version to 8.1.RC1 to support chunked prefill and automatic prefix caching (`--enable_prefix_caching`) when V1 is enabled [#747](https://github.com/vllm-project/vllm-ascend/pull/747) +- Optimize Qwen2 VL and Qwen 2.5 VL [#701](https://github.com/vllm-project/vllm-ascend/pull/701) +- Improve Deepseek V3 eager mode and graph mode performance, now you can use --additional_config={'enable_graph_mode': True} to enable graph mode. 
[#598](https://github.com/vllm-project/vllm-ascend/pull/598) [#719](https://github.com/vllm-project/vllm-ascend/pull/719) ### Core -* Upgrade vLLM to 0.8.5.post1 [#715](https://github.com/vllm-project/vllm-ascend/pull/715) -* Fix early return in CustomDeepseekV2MoE.forward during profile_run [#682](https://github.com/vllm-project/vllm-ascend/pull/682) -* Adapts for new quant model generated by modelslim [#719](https://github.com/vllm-project/vllm-ascend/pull/719) -* Initial support on P2P Disaggregated Prefill based on llm_datadist [#694](https://github.com/vllm-project/vllm-ascend/pull/694) -* Use `/vllm-workspace` as code path and include `.git` in container image to fix issue when start vllm under `/workspace` [#726](https://github.com/vllm-project/vllm-ascend/pull/726) -* Optimize NPU memory usage to make DeepSeek R1 W8A8 32K model len work. [#728](https://github.com/vllm-project/vllm-ascend/pull/728) -* Fix `PYTHON_INCLUDE_PATH` typo in setup.py [#762](https://github.com/vllm-project/vllm-ascend/pull/762) +- Upgrade vLLM to 0.8.5.post1 [#715](https://github.com/vllm-project/vllm-ascend/pull/715) +- Fix early return in CustomDeepseekV2MoE.forward during profile_run [#682](https://github.com/vllm-project/vllm-ascend/pull/682) +- Adapts for new quant model generated by modelslim [#719](https://github.com/vllm-project/vllm-ascend/pull/719) +- Initial support on P2P Disaggregated Prefill based on llm_datadist [#694](https://github.com/vllm-project/vllm-ascend/pull/694) +- Use `/vllm-workspace` as code path and include `.git` in container image to fix issue when start vllm under `/workspace` [#726](https://github.com/vllm-project/vllm-ascend/pull/726) +- Optimize NPU memory usage to make DeepSeek R1 W8A8 32K model len work. [#728](https://github.com/vllm-project/vllm-ascend/pull/728) +- Fix `PYTHON_INCLUDE_PATH` typo in setup.py [#762](https://github.com/vllm-project/vllm-ascend/pull/762) ### Other -* Add Qwen3-0.6B test [#717](https://github.com/vllm-project/vllm-ascend/pull/717) -* Add nightly CI [#668](https://github.com/vllm-project/vllm-ascend/pull/668) -* Add accuracy test report [#542](https://github.com/vllm-project/vllm-ascend/pull/542) +- Add Qwen3-0.6B test [#717](https://github.com/vllm-project/vllm-ascend/pull/717) +- Add nightly CI [#668](https://github.com/vllm-project/vllm-ascend/pull/668) +- Add accuracy test report [#542](https://github.com/vllm-project/vllm-ascend/pull/542) ## v0.8.4rc2 - 2025.04.29 This is the second release candidate of v0.8.4 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. Some experimental features are included in this version, such as W8A8 quantization and EP/DP support. We'll make them stable enough in the next release. ### Highlights -* Qwen3 and Qwen3MOE is supported now. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/tutorials/single_npu.html) to run the quick demo. [#709](https://github.com/vllm-project/vllm-ascend/pull/709) -* Ascend W8A8 quantization method is supported now. Please take the [official doc](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_quantization.html) for example. Any [feedback](https://github.com/vllm-project/vllm-ascend/issues/619) is welcome. [#580](https://github.com/vllm-project/vllm-ascend/pull/580) -* DeepSeek V3/R1 works with DP, TP and MTP now. Please note that it's still in experimental status. Let us know if you hit any problem. 
[#429](https://github.com/vllm-project/vllm-ascend/pull/429) [#585](https://github.com/vllm-project/vllm-ascend/pull/585) [#626](https://github.com/vllm-project/vllm-ascend/pull/626) [#636](https://github.com/vllm-project/vllm-ascend/pull/636) [#671](https://github.com/vllm-project/vllm-ascend/pull/671) +- Qwen3 and Qwen3MOE is supported now. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/tutorials/single_npu.html) to run the quick demo. [#709](https://github.com/vllm-project/vllm-ascend/pull/709) +- Ascend W8A8 quantization method is supported now. Please take the [official doc](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_quantization.html) for example. Any [feedback](https://github.com/vllm-project/vllm-ascend/issues/619) is welcome. [#580](https://github.com/vllm-project/vllm-ascend/pull/580) +- DeepSeek V3/R1 works with DP, TP and MTP now. Please note that it's still in experimental status. Let us know if you hit any problem. [#429](https://github.com/vllm-project/vllm-ascend/pull/429) [#585](https://github.com/vllm-project/vllm-ascend/pull/585) [#626](https://github.com/vllm-project/vllm-ascend/pull/626) [#636](https://github.com/vllm-project/vllm-ascend/pull/636) [#671](https://github.com/vllm-project/vllm-ascend/pull/671) ### Core -* ACLGraph feature is supported with V1 engine now. It's disabled by default because this feature rely on CANN 8.1 release. We'll make it available by default in the next release [#426](https://github.com/vllm-project/vllm-ascend/pull/426) -* Upgrade PyTorch to 2.5.1. vLLM Ascend no longer relies on the dev version of torch-npu now. Now users don't need to install the torch-npu by hand. The 2.5.1 version of torch-npu will be installed automatically. [#661](https://github.com/vllm-project/vllm-ascend/pull/661) +- ACLGraph feature is supported with V1 engine now. It's disabled by default because this feature rely on CANN 8.1 release. We'll make it available by default in the next release [#426](https://github.com/vllm-project/vllm-ascend/pull/426) +- Upgrade PyTorch to 2.5.1. vLLM Ascend no longer relies on the dev version of torch-npu now. Now users don't need to install the torch-npu by hand. The 2.5.1 version of torch-npu will be installed automatically. [#661](https://github.com/vllm-project/vllm-ascend/pull/661) ### Other -* MiniCPM model works now. [#645](https://github.com/vllm-project/vllm-ascend/pull/645) -* openEuler container image supported with `v0.8.4-openeuler` tag and customs Ops build is enabled by default for openEuler OS. [#689](https://github.com/vllm-project/vllm-ascend/pull/689) -* Fix ModuleNotFoundError bug to make Lora work [#600](https://github.com/vllm-project/vllm-ascend/pull/600) -* Add "Using EvalScope evaluation" doc [#611](https://github.com/vllm-project/vllm-ascend/pull/611) -* Add a `VLLM_VERSION` environment to make vLLM version configurable to help developer set correct vLLM version if the code of vLLM is changed by hand locally. [#651](https://github.com/vllm-project/vllm-ascend/pull/651) +- MiniCPM model works now. [#645](https://github.com/vllm-project/vllm-ascend/pull/645) +- openEuler container image supported with `v0.8.4-openeuler` tag and customs Ops build is enabled by default for openEuler OS. 
[#689](https://github.com/vllm-project/vllm-ascend/pull/689) +- Fix ModuleNotFoundError bug to make Lora work [#600](https://github.com/vllm-project/vllm-ascend/pull/600) +- Add "Using EvalScope evaluation" doc [#611](https://github.com/vllm-project/vllm-ascend/pull/611) +- Add a `VLLM_VERSION` environment to make vLLM version configurable to help developer set correct vLLM version if the code of vLLM is changed by hand locally. [#651](https://github.com/vllm-project/vllm-ascend/pull/651) ## v0.8.4rc1 - 2025.04.18 @@ -341,72 +205,72 @@ This is the first release candidate of v0.8.4 for vllm-ascend. Please follow the ### Highlights -* vLLM V1 engine experimental support is included in this version. You can visit [official guide](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html) to get more detail. By default, vLLM will fallback to V0 if V1 doesn't work, please set `VLLM_USE_V1=1` environment if you want to use V1 forcely. -* LoRA、Multi-LoRA And Dynamic Serving is supported now. The performance will be improved in the next release. Please follow the [official doc](https://docs.vllm.ai/en/latest/features/lora.html) for more usage information. Thanks for the contribution from China Merchants Bank. [#521](https://github.com/vllm-project/vllm-ascend/pull/521). -* Sleep Mode feature is supported. Currently it's only work on V0 engine. V1 engine support will come soon. [#513](https://github.com/vllm-project/vllm-ascend/pull/513) +- vLLM V1 engine experimental support is included in this version. You can visit [official guide](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html) to get more detail. By default, vLLM will fallback to V0 if V1 doesn't work, please set `VLLM_USE_V1=1` environment if you want to use V1 forcely. +- LoRA、Multi-LoRA And Dynamic Serving is supported now. The performance will be improved in the next release. Please follow the [official doc](https://docs.vllm.ai/en/latest/features/lora.html) for more usage information. Thanks for the contribution from China Merchants Bank. [#521](https://github.com/vllm-project/vllm-ascend/pull/521). +- Sleep Mode feature is supported. Currently it's only work on V0 engine. V1 engine support will come soon. [#513](https://github.com/vllm-project/vllm-ascend/pull/513) ### Core -* The Ascend scheduler is added for V1 engine. This scheduler is more affinity with Ascend hardware. More scheduler policy will be added in the future. [#543](https://github.com/vllm-project/vllm-ascend/pull/543) -* Disaggregated Prefill feature is supported. Currently only 1P1D works. NPND is under design by vllm team. vllm-ascend will support it once it's ready from vLLM. Follow the [official guide](https://docs.vllm.ai/en/latest/features/disagg_prefill.html) to use. [#432](https://github.com/vllm-project/vllm-ascend/pull/432) -* Spec decode feature works now. Currently it's only work on V0 engine. V1 engine support will come soon. [#500](https://github.com/vllm-project/vllm-ascend/pull/500) -* Structured output feature works now on V1 Engine. Currently it only supports xgrammar backend while using guidance backend may get some errors. [#555](https://github.com/vllm-project/vllm-ascend/pull/555) +- The Ascend scheduler is added for V1 engine. This scheduler is more affinity with Ascend hardware. More scheduler policy will be added in the future. [#543](https://github.com/vllm-project/vllm-ascend/pull/543) +- Disaggregated Prefill feature is supported. Currently only 1P1D works. NPND is under design by vllm team. 
vllm-ascend will support it once it's ready from vLLM. Follow the [official guide](https://docs.vllm.ai/en/latest/features/disagg_prefill.html) to use it. [#432](https://github.com/vllm-project/vllm-ascend/pull/432)
+- Spec decode feature works now. Currently it only works on the V0 engine. V1 engine support will come soon. [#500](https://github.com/vllm-project/vllm-ascend/pull/500)
+- Structured output feature works now on V1 Engine. Currently it only supports the xgrammar backend; using the guidance backend may produce errors. [#555](https://github.com/vllm-project/vllm-ascend/pull/555)
### Other
-* A new communicator `pyhccl` is added. It's used for call CANN HCCL library directly instead of using `torch.distribute`. More usage of it will be added in the next release [#503](https://github.com/vllm-project/vllm-ascend/pull/503)
-* The custom ops build is enabled by default. You should install the packages like `gcc`, `cmake` first to build `vllm-ascend` from source. Set `COMPILE_CUSTOM_KERNELS=0` environment to disable the compilation if you don't need it. [#466](https://github.com/vllm-project/vllm-ascend/pull/466)
-* The custom op `rotay embedding` is enabled by default now to improve the performance. [#555](https://github.com/vllm-project/vllm-ascend/pull/555)
+- A new communicator `pyhccl` is added. It's used to call the CANN HCCL library directly instead of using `torch.distributed`. More usage of it will be added in the next release. [#503](https://github.com/vllm-project/vllm-ascend/pull/503)
+- The custom ops build is enabled by default. You should install packages like `gcc` and `cmake` first to build `vllm-ascend` from source. Set the `COMPILE_CUSTOM_KERNELS=0` environment variable to disable the compilation if you don't need it. [#466](https://github.com/vllm-project/vllm-ascend/pull/466)
+- The custom op `rotary embedding` is enabled by default now to improve performance. [#555](https://github.com/vllm-project/vllm-ascend/pull/555)
## v0.7.3rc2 - 2025.03.29
This is 2nd release candidate of v0.7.3 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey.
-* Quickstart with container: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/quick_start.html
-* Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html
+- Quickstart with container: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/quick_start.html
+- Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html
### Highlights
-* Add Ascend Custom Ops framewrok. Developers now can write customs ops using AscendC. An example ops `rotary_embedding` is added. More tutorials will come soon.
The Custom Ops compilation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371) +- V1 engine is basic supported in this release. The full support will be done in 0.8.X release. If you hit any issue or have any requirement of V1 engine. Please tell us [here](https://github.com/vllm-project/vllm-ascend/issues/414). [#376](https://github.com/vllm-project/vllm-ascend/pull/376) +- Prefix cache feature works now. You can set `enable_prefix_caching=True` to enable it. [#282](https://github.com/vllm-project/vllm-ascend/pull/282) ### Core -* Bump torch_npu version to dev20250320.3 to improve accuracy to fix `!!!` output problem. [#406](https://github.com/vllm-project/vllm-ascend/pull/406) +- Bump torch_npu version to dev20250320.3 to improve accuracy to fix `!!!` output problem. [#406](https://github.com/vllm-project/vllm-ascend/pull/406) ### Model -* The performance of Qwen2-vl is improved by optimizing patch embedding (Conv3D). [#398](https://github.com/vllm-project/vllm-ascend/pull/398) +- The performance of Qwen2-vl is improved by optimizing patch embedding (Conv3D). [#398](https://github.com/vllm-project/vllm-ascend/pull/398) ### Other -* Fixed a bug to make sure multi step scheduler feature work. [#349](https://github.com/vllm-project/vllm-ascend/pull/349) -* Fixed a bug to make prefix cache feature works with correct accuracy. [#424](https://github.com/vllm-project/vllm-ascend/pull/424) +- Fixed a bug to make sure multi step scheduler feature work. [#349](https://github.com/vllm-project/vllm-ascend/pull/349) +- Fixed a bug to make prefix cache feature works with correct accuracy. [#424](https://github.com/vllm-project/vllm-ascend/pull/424) ## v0.7.3rc1 - 2025.03.14 🎉 Hello, World! This is the first release candidate of v0.7.3 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey. -* Quickstart with container: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/quick_start.html -* Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html +- Quickstart with container: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/quick_start.html +- Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html ### Highlights -* DeepSeek V3/R1 works well now. Read the [official guide](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/tutorials/multi_node.html) to start! [#242](https://github.com/vllm-project/vllm-ascend/pull/242) -* Speculative decoding feature is supported. [#252](https://github.com/vllm-project/vllm-ascend/pull/252) -* Multi step scheduler feature is supported. [#300](https://github.com/vllm-project/vllm-ascend/pull/300) +- DeepSeek V3/R1 works well now. Read the [official guide](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/tutorials/multi_node.html) to start! [#242](https://github.com/vllm-project/vllm-ascend/pull/242) +- Speculative decoding feature is supported. [#252](https://github.com/vllm-project/vllm-ascend/pull/252) +- Multi step scheduler feature is supported. [#300](https://github.com/vllm-project/vllm-ascend/pull/300) ### Core -* Bump torch_npu version to dev20250308.3 to improve `_exponential` accuracy -* Added initial support for pooling models. Bert based model, such as `BAAI/bge-base-en-v1.5` and `BAAI/bge-reranker-v2-m3` works now. 
[#229](https://github.com/vllm-project/vllm-ascend/pull/229) +- Bump torch_npu version to dev20250308.3 to improve `_exponential` accuracy +- Added initial support for pooling models. Bert based model, such as `BAAI/bge-base-en-v1.5` and `BAAI/bge-reranker-v2-m3` works now. [#229](https://github.com/vllm-project/vllm-ascend/pull/229) ### Model -* The performance of Qwen2-VL is improved. [#241](https://github.com/vllm-project/vllm-ascend/pull/241) -* MiniCPM is now supported [#164](https://github.com/vllm-project/vllm-ascend/pull/164) +- The performance of Qwen2-VL is improved. [#241](https://github.com/vllm-project/vllm-ascend/pull/241) +- MiniCPM is now supported [#164](https://github.com/vllm-project/vllm-ascend/pull/164) ### Other -* Support MTP(Multi-Token Prediction) for DeepSeek V3/R1 [#236](https://github.com/vllm-project/vllm-ascend/pull/236) -* [Docs] Added more model tutorials, include DeepSeek, QwQ, Qwen and Qwen 2.5VL. See the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/tutorials/index.html) for detail -* Pin modelscope<1.23.0 on vLLM v0.7.3 to resolve: https://github.com/vllm-project/vllm/pull/13807 +- Support MTP(Multi-Token Prediction) for DeepSeek V3/R1 [#236](https://github.com/vllm-project/vllm-ascend/pull/236) +- [Docs] Added more model tutorials, include DeepSeek, QwQ, Qwen and Qwen 2.5VL. See the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/tutorials/index.html) for detail +- Pin modelscope<1.23.0 on vLLM v0.7.3 to resolve: https://github.com/vllm-project/vllm/pull/13807 ### Known issues -* In [some cases](https://github.com/vllm-project/vllm-ascend/issues/324), especially when the input/output is very long, the accuracy of output may be incorrect. We are working on it. It'll be fixed in the next release. -* Improved and reduced the garbled code in model output. But if you still hit the issue, try to change the generation config value, such as `temperature`, and try again. There is also a knonwn issue shown below. Any [feedback](https://github.com/vllm-project/vllm-ascend/issues/267) is welcome. [#277](https://github.com/vllm-project/vllm-ascend/pull/277) +- In [some cases](https://github.com/vllm-project/vllm-ascend/issues/324), especially when the input/output is very long, the accuracy of output may be incorrect. We are working on it. It'll be fixed in the next release. +- Improved and reduced the garbled code in model output. But if you still hit the issue, try to change the generation config value, such as `temperature`, and try again. There is also a knonwn issue shown below. Any [feedback](https://github.com/vllm-project/vllm-ascend/issues/267) is welcome. [#277](https://github.com/vllm-project/vllm-ascend/pull/277) ## v0.7.1rc1 - 2025.02.19 @@ -420,23 +284,23 @@ Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.1-de ### Highlights -* Initial supports for Ascend NPU on vLLM. [#3](https://github.com/vllm-project/vllm-ascend/pull/3) -* DeepSeek is now supported. [#88](https://github.com/vllm-project/vllm-ascend/pull/88) [#68](https://github.com/vllm-project/vllm-ascend/pull/68) -* Qwen, Llama series and other popular models are also supported, you can see more details in [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/supported_models.html). +- Initial supports for Ascend NPU on vLLM. [#3](https://github.com/vllm-project/vllm-ascend/pull/3) +- DeepSeek is now supported. 
[#88](https://github.com/vllm-project/vllm-ascend/pull/88) [#68](https://github.com/vllm-project/vllm-ascend/pull/68) +- Qwen, Llama series and other popular models are also supported, you can see more details in [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/supported_models.html). ### Core -* Added the Ascend quantization config option, the implementation will coming soon. [#7](https://github.com/vllm-project/vllm-ascend/pull/7) [#73](https://github.com/vllm-project/vllm-ascend/pull/73) -* Add silu_and_mul and rope ops and add mix ops into attention layer. [#18](https://github.com/vllm-project/vllm-ascend/pull/18) +- Added the Ascend quantization config option, the implementation will coming soon. [#7](https://github.com/vllm-project/vllm-ascend/pull/7) [#73](https://github.com/vllm-project/vllm-ascend/pull/73) +- Add silu_and_mul and rope ops and add mix ops into attention layer. [#18](https://github.com/vllm-project/vllm-ascend/pull/18) ### Other -* [CI] Enable Ascend CI to actively monitor and improve quality for vLLM on Ascend. [#3](https://github.com/vllm-project/vllm-ascend/pull/3) -* [Docker] Add vllm-ascend container image [#64](https://github.com/vllm-project/vllm-ascend/pull/64) -* [Docs] Add a [live doc](https://vllm-ascend.readthedocs.org) [#55](https://github.com/vllm-project/vllm-ascend/pull/55) +- [CI] Enable Ascend CI to actively monitor and improve quality for vLLM on Ascend. [#3](https://github.com/vllm-project/vllm-ascend/pull/3) +- [Docker] Add vllm-ascend container image [#64](https://github.com/vllm-project/vllm-ascend/pull/64) +- [Docs] Add a [live doc](https://vllm-ascend.readthedocs.org) [#55](https://github.com/vllm-project/vllm-ascend/pull/55) ### Known issues -* This release relies on an unreleased torch_npu version. It has been installed within official container image already. Please [install](https://vllm-ascend.readthedocs.io/en/v0.7.1rc1/installation.html) it manually if you are using non-container environment. -* There are logs like `No platform detected, vLLM is running on UnspecifiedPlatform` or `Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'")` shown when running vllm-ascend. It actually doesn't affect any functionality and performance. You can just ignore it. And it has been fixed in this [PR](https://github.com/vllm-project/vllm/pull/12432) which will be included in v0.7.3 soon. -* There are logs like `# CPU blocks: 35064, # CPU blocks: 2730` shown when running vllm-ascend which should be `# NPU blocks:` . It actually doesn't affect any functionality and performance. You can just ignore it. And it has been fixed in this [PR](https://github.com/vllm-project/vllm/pull/13378) which will be included in v0.7.3 soon. +- This release relies on an unreleased torch_npu version. It has been installed within official container image already. Please [install](https://vllm-ascend.readthedocs.io/en/v0.7.1rc1/installation.html) it manually if you are using non-container environment. +- There are logs like `No platform detected, vLLM is running on UnspecifiedPlatform` or `Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'")` shown when running vllm-ascend. It actually doesn't affect any functionality and performance. You can just ignore it. And it has been fixed in this [PR](https://github.com/vllm-project/vllm/pull/12432) which will be included in v0.7.3 soon. 
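For the manual torch_npu installation mentioned in the first known issue above, a minimal sketch could look like the following; the wheel file name is a placeholder (the exact dev build depends on your Python and CANN versions and is not specified here).

```shell
# Sketch only: install the torch_npu dev wheel by hand in a non-container environment.
# Replace the placeholder wheel name with the build matching your Python and CANN versions.
pip install ./torch_npu-<dev-version>-cp310-cp310-linux_aarch64.whl

# Quick sanity check that the adapter imports cleanly.
python -c "import torch_npu; print(torch_npu.__version__)"
```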
+- There are logs like `# CPU blocks: 35064, # CPU blocks: 2730` shown when running vllm-ascend which should be `# NPU blocks:` . It actually doesn't affect any functionality and performance. You can just ignore it. And it has been fixed in this [PR](https://github.com/vllm-project/vllm/pull/13378) which will be included in v0.7.3 soon. diff --git a/examples/disaggregated_prefill_v1/README.md b/examples/disaggregated_prefill_v1/README.md index a77e3e2373..14def1dae5 100644 --- a/examples/disaggregated_prefill_v1/README.md +++ b/examples/disaggregated_prefill_v1/README.md @@ -6,7 +6,7 @@ This demo document provides instructions for running a disaggregated vLLM-ascend ## Prerequisites - Ascend NPU environment with vLLM 0.9.1 installed - Network interfaces configured for distributed communication (eg: eth0) -- Model weights located at `/models/deepseek_r1_w8a8` +- Model weights located at `/data01/deepseek_r1_w8a8_zhw` ## Rank table generation The rank table is a JSON file that specifies the mapping of Ascend NPU ranks to nodes. The following command generates a rank table for all nodes with 16 cards prefill and 16 cards decode: @@ -15,15 +15,11 @@ Run the following command on every node to generate the rank table: ```shell cd vllm-ascend/examples/disaggregate_prefill_v1/ bash gen_ranktable.sh --ips 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36 \ - --npus-per-node 8 --network-card-name eth0 --prefill-device-cnt 16 --decode-device-cnt 16 + --npus-per-node 8 --network-card-name enp189s0f0 --prefill-device-cnt 16 --decode-device-cnt 16 ``` Rank table will generated at `/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json` -## Start disaggregated vLLM-ascend service -For demonstration purposes, we will utilize the quantized version of Deepseek-R1. 
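Before starting any server, it can help to confirm that every node produced an identical rank table. The snippet below is a small sketch, not part of the original example: it only loads the generated JSON and pretty-prints it so the output can be diffed across nodes. The path is the one reported above; the key layout is simply whatever `gen_ranktable.py` emits.

```python
import json

# Location reported above; adjust if your workspace layout differs.
RANKTABLE_PATH = "/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json"

with open(RANKTABLE_PATH, encoding="utf-8") as f:
    ranktable = json.load(f)

# Pretty-print with sorted keys so the output from different nodes can be
# compared with a plain `diff`.
print(json.dumps(ranktable, indent=2, sort_keys=True))
```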
Recommended Parallelization Strategies: -- P-node: DP2-TP8-EP16 (Data Parallelism 2, Tensor Parallelism 8, Expert Parallelism 16) -- D-node: DP4-TP4-EP16 (Data Parallelism 4, Tensor Parallelism 4, Expert Parallelism 16) - +## Start disaggregated vLLM-ascend service Execution Sequence - 4 configured node ip are: 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36 - Start Prefill on Node 1 (P1) @@ -32,7 +28,7 @@ Execution Sequence - Start Decode on Node 2 (D2) - Start proxy server on Node1 -Run prefill server P1 on first node: +* Run prefill server P1 on first node ```shell export HCCL_IF_IP=172.19.32.175 # node ip export GLOO_SOCKET_IFNAME="eth0" # network card name @@ -42,9 +38,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 export VLLM_USE_V1=1 -export VLLM_LLMDD_RPC_PORT=5559 - -vllm serve /models/deepseek_r1_w8a8 \ +vllm serve /data01/deepseek_r1_w8a8_zhw \ --host 0.0.0.0 \ --port 20002 \ --data-parallel-size 2 \ @@ -53,12 +47,11 @@ vllm serve /models/deepseek_r1_w8a8 \ --data-parallel-address 172.19.32.175 \ --data-parallel-rpc-port 13356 \ --tensor-parallel-size 8 \ - --enable-expert-parallel \ + --no-enable-prefix-caching \ --seed 1024 \ --served-model-name deepseek \ - --max-model-len 32768 \ - --max-num-batched-tokens 32768 \ - --max-num-seqs 256 \ + --max-model-len 6144 \ + --max-num-batched-tokens 6144 \ --trust-remote-code \ --enforce-eager \ --gpu-memory-utilization 0.9 \ @@ -72,10 +65,10 @@ vllm serve /models/deepseek_r1_w8a8 \ "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" }' \ --additional-config \ - '{"chunked_prefill_for_mla":true}' + '{"torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}, "ascend_scheduler_config":{"enabled":false}}' ``` -Run prefill server P2 on second node: +* Run prefill server P2 on second node ```shell export HCCL_IF_IP=172.19.241.49 export GLOO_SOCKET_IFNAME="eth0" @@ -85,9 +78,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 export VLLM_USE_V1=1 -export VLLM_LLMDD_RPC_PORT=5659 - -vllm serve /models/deepseek_r1_w8a8 \ +vllm serve /data01/deepseek_r1_w8a8_zhw \ --host 0.0.0.0 \ --port 20002 \ --headless \ @@ -97,12 +88,11 @@ vllm serve /models/deepseek_r1_w8a8 \ --data-parallel-address 172.19.32.175 \ --data-parallel-rpc-port 13356 \ --tensor-parallel-size 8 \ - --enable-expert-parallel \ + --no-enable-prefix-caching \ --seed 1024 \ --served-model-name deepseek \ - --max-model-len 32768 \ - --max-num-batched-tokens 32768 \ - --max-num-seqs 256 \ + --max-model-len 6144 \ + --max-num-batched-tokens 6144 \ --trust-remote-code \ --enforce-eager \ --gpu-memory-utilization 0.9 \ @@ -116,12 +106,10 @@ vllm serve /models/deepseek_r1_w8a8 \ "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" }' \ --additional-config \ - '{"chunked_prefill_for_mla":true}' + '{"torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}, "ascend_scheduler_config":{"enabled":false}}' ``` -Run decode server d1 on third node: - -* In the D node, the `max-num-batched-tokens` parameter can be set to a smaller value since the D node processes at most `max-num-seqs` batches concurrently. As the `profile_run` only needs to handle `max-num-seqs` sequences at a time, we can safely set `max-num-batched-tokens` equal to `max-num-seqs`. 
This optimization will help reduce activation memory consumption. +* Run decode server d1 on third node ```shell export HCCL_IF_IP=172.19.123.51 export GLOO_SOCKET_IFNAME="eth0" @@ -131,24 +119,22 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 export VLLM_USE_V1=1 -export VLLM_LLMDD_RPC_PORT=5759 - -vllm serve /models/deepseek_r1_w8a8 \ +vllm serve /data01/deepseek_r1_w8a8_zhw \ --host 0.0.0.0 \ --port 20002 \ - --data-parallel-size 4 \ - --data-parallel-size-local 2 \ + --data-parallel-size 2 \ + --data-parallel-size-local 1 \ --api-server-count 2 \ --data-parallel-address 172.19.123.51 \ --data-parallel-rpc-port 13356 \ - --tensor-parallel-size 4 \ - --enable-expert-parallel \ + --tensor-parallel-size 8 \ + --no-enable-prefix-caching \ --seed 1024 \ --served-model-name deepseek \ - --max-model-len 32768 \ - --max-num-batched-tokens 256 \ - --max-num-seqs 256 \ + --max-model-len 6144 \ + --max-num-batched-tokens 6144 \ --trust-remote-code \ + --enforce-eager \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ '{"kv_connector": "LLMDataDistCMgrConnector", @@ -160,10 +146,10 @@ vllm serve /models/deepseek_r1_w8a8 \ "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" }' \ --additional-config \ - '{"torchair_graph_config": {"enabled":true}}' + '{"torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}, "ascend_scheduler_config":{"enabled":false}}' ``` -Run decode server d2 on last node: +* Run decode server d2 on last node ```shell export HCCL_IF_IP=172.19.190.36 export GLOO_SOCKET_IFNAME="eth0" @@ -173,25 +159,23 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 export VLLM_USE_V1=1 -export VLLM_LLMDD_RPC_PORT=5859 - -vllm serve /models/deepseek_r1_w8a8 \ +vllm serve /data01/deepseek_r1_w8a8_zhw \ --host 0.0.0.0 \ --port 20002 \ --headless \ - --data-parallel-size 4 \ - --data-parallel-start-rank 2 \ - --data-parallel-size-local 2 \ + --data-parallel-size 2 \ + --data-parallel-start-rank 1 \ + --data-parallel-size-local 1 \ --data-parallel-address 172.19.123.51 \ --data-parallel-rpc-port 13356 \ - --tensor-parallel-size 4 \ - --enable-expert-parallel \ + --tensor-parallel-size 8 \ + --no-enable-prefix-caching \ --seed 1024 \ --served-model-name deepseek \ - --max-model-len 32768 \ - --max-num-batched-tokens 256 \ - --max-num-seqs 256 \ + --max-model-len 6144 \ + --max-num-batched-tokens 6144 \ --trust-remote-code \ + --enforce-eager \ --gpu-memory-utilization 0.9 \ --kv-transfer-config \ '{"kv_connector": "LLMDataDistCMgrConnector", @@ -203,16 +187,16 @@ vllm serve /models/deepseek_r1_w8a8 \ "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" }' \ --additional-config \ - '{"torchair_graph_config": {"enabled":true}}' + '{"torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}, "ascend_scheduler_config":{"enabled":false}}' ``` -Run proxy server on the first node: +* Run proxy server on the first node ```shell cd /vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1 python toy_proxy_server.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.241.49 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002 ``` -Verification +* Verification Check service health using the proxy server endpoint: ```shell curl http://localhost:1025/v1/completions \ @@ -225,8 +209,8 @@ 
curl http://localhost:1025/v1/completions \ }' ``` -Performance -Test performance with vllm benchmark: +* Performance +Test performance with vllm benchmark ```shell cd /vllm-workspace/vllm/benchmarks python3 benchmark_serving.py \ @@ -237,9 +221,9 @@ python3 benchmark_serving.py \ --num-prompts 256 \ --ignore-eos \ --model deepseek \ - --tokenizer /models/deepseek_r1_w8a8 \ + --tokenizer /data01/deepseek_r1_w8a8_zhw \ --host localhost \ - --port 1025 \ + --port 8000 \ --endpoint /v1/completions \ --max-concurrency 4 \ --request-rate 4 diff --git a/examples/disaggregated_prefill_v1/gen_ranktable.py b/examples/disaggregated_prefill_v1/gen_ranktable.py index 52db3ee8e1..d170f3ba06 100644 --- a/examples/disaggregated_prefill_v1/gen_ranktable.py +++ b/examples/disaggregated_prefill_v1/gen_ranktable.py @@ -4,7 +4,7 @@ import torch.distributed as dist -from vllm_ascend.utils import AscendSocVersion, init_ascend_soc_version, get_ascend_soc_version +from vllm_ascend.soc_info import NPUSocInfo parser = argparse.ArgumentParser( description="Arguments of rank table generator", ) @@ -33,9 +33,7 @@ # This variable is set by torchrun, # and is different from WORLD_SIZE in gen_rank_table.sh. world_size = os.environ.get("WORLD_SIZE") - -init_ascend_soc_version() -soc_info = get_ascend_soc_version() +soc_info = NPUSocInfo() def get_cmd_stdout(cmd): @@ -61,7 +59,7 @@ def get_cmd_stdout(cmd): for card_id in range(num_cards): for chip_id in range(chips_per_card): device_id = card_id * chips_per_card + chip_id - if soc_info == AscendSocVersion.A3: + if soc_info.is_a3: device_ip = get_cmd_stdout( f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr" ).split(":")[1].strip() @@ -81,7 +79,7 @@ def get_cmd_stdout(cmd): "device_id": str(device_id), "device_ip": str(device_ip), } - if soc_info == AscendSocVersion.A3: + if soc_info.is_a3: device_info.update({ "super_pod_id": str(super_pod_id), "super_device_id": str(super_device_id) diff --git a/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py b/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py deleted file mode 100644 index 3af8fbe02d..0000000000 --- a/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py +++ /dev/null @@ -1,518 +0,0 @@ -# Adapted from https://github.com/vllm-project/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py - -# SPDX-License-Identifier: Apache-2.0 -# -# Tutorial: Using the Load Balance Proxy Server Example -# -# This proxy server is designed to distribute requests between multiple -# "prefiller" and "decoder" backend servers for large language model inference. -# It is useful for scaling out inference workloads and balancing load across -# multiple backend instances. -# -# Features: -# - Load balances requests to multiple prefiller and decoder servers. -# - Supports OpenAI-compatible /v1/completions and /v1/chat/completions endpoints. -# - Streams responses from backend servers to clients. -# -# Prerequisites: -# - Python 3.8+ -# - Install dependencies: -# pip install fastapi httpx uvicorn vllm -# -# Step 1: Start Your Backend Servers -# ---------------------------------- -# You need to have at least one prefiller and one decoder backend running. -# These can be mock servers or actual vLLM servers. -# -# For testing, you can use the provided mock server: -# -# vllm serve --host 0.0.0.0 --port 8100 ... # Prefiller 1 -# vllm serve --host 0.0.0.0 --port 8101 ... # Prefiller 2 -# vllm serve --host 0.0.0.0 --port 8200 ... 
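# The verification step above can also be scripted instead of using curl. This
# is a minimal sketch, not part of the original examples: it assumes the
# `requests` package is installed, the proxy from this README is listening on
# localhost:1025, and the backends were started with
# `--served-model-name deepseek`.
import requests

PROXY = "http://localhost:1025"

# The toy proxy exposes a /healthcheck endpoint that reports how many prefill
# and decode backends it is connected to.
print(requests.get(f"{PROXY}/healthcheck", timeout=5).json())

# A tiny completion request routed through the prefill and decode nodes.
resp = requests.post(
    f"{PROXY}/v1/completions",
    json={"model": "deepseek", "prompt": "Hello, my name is", "max_tokens": 16},
    timeout=120,
)
resp.raise_for_status()
print(resp.json())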
# Decoder 1 -# vllm serve --host 0.0.0.0 --port 8201 ... # Decoder 2 -# -# Step 2: Start the Proxy Server -# ------------------------------ -# Run the proxy server, specifying the host/port for each prefiller and decoder: -# -# python load_balance_proxy_server_example.py \ -# --host 0.0.0.0 --port 9000 \ -# --prefiller-hosts 127.0.0.1 127.0.0.1 \ -# --prefiller-ports 8100 8101 \ -# --decoder-hosts 127.0.0.1 127.0.0.1 \ -# --decoder-ports 8200 8201 -# -# This will start the proxy on port 9000, load balancing between two prefiller -# and two decoder servers. -# -# Step 3: Send a Request to the Proxy -# ----------------------------------- -# You can now send OpenAI-compatible requests to the proxy. For example: -# -# curl -X POST http://localhost:9000/v1/completions \ -# -H "Content-Type: application/json" \ -# -d '{ -# "model": "your-model", -# "prompt": "The quick brown fox jumps over the lazy dog", -# "max_tokens": 16 -# }' -# -# Or for chat completions: -# -# curl -X POST http://localhost:9000/v1/chat/completions \ -# -H "Content-Type: application/json" \ -# -d '{ -# "model": "your-model", -# "messages": [{"role": "user", "content": "Hello!"}], -# "max_tokens": 16 -# }' -# -# Step 4: Health Check -# -------------------- -# To check if the proxy is running and see how many backend instances are -# connected, use: -# -# curl http://localhost:9000/healthcheck -# -# This will return a JSON object with the status and the number of prefiller -# and decoder instances. -# -# Notes: -# - You can scale the number of prefiller and decoder servers as needed. -# - The proxy will round-robin requests to balance load. -# - For production, ensure your backend servers are robust and secure. -# -# For more details, see the code and comments in this file. - - -import argparse -import asyncio -import heapq -import os -import sys -from contextlib import asynccontextmanager -from typing import List - -import httpx -from fastapi import FastAPI, Request -from fastapi.responses import StreamingResponse -from vllm.logger import init_logger - -logger = init_logger(__name__) - -# Add uvloop for faster event loop if available -try: - import uvloop - asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) -except ImportError: - pass - - -class ServerState: - - def __init__(self, host, port): - self.host = host - self.port = port - self.url = f'http://{host}:{port}/v1' - self.client = httpx.AsyncClient(timeout=None, - base_url=self.url, - limits=httpx.Limits( - max_connections=100000, - max_keepalive_connections=100000)) - self.active_tokens = 0 - self.active_kv_cache = 0 # Only for prefiller - self.active_requests = 0 # Number of active requests - self.aborted_requests = set() # Track aborted requests - # Removed individual server lock - will use global locks instead - - -class ProxyState: - - def __init__(self, prefiller_instances, decoder_instances): - self.prefillers: List[ServerState] = [ - ServerState(h, p) for h, p in prefiller_instances - ] - self.decoders: List[ServerState] = [ - ServerState(h, p) for h, p in decoder_instances - ] - self.req_to_prefiller = {} - self.req_id_lock = asyncio.Lock() - self.req_id_counter = 0 - # Removed selection locks - no longer needed for synchronous methods - - # Initialize priority queues for efficient server selection - # Each entry is (priority_score, server_index, server_reference) - # Lower priority score = higher priority (less loaded) - self.prefiller_heap = [(0, i, server) - for i, server in enumerate(self.prefillers)] - self.decoder_heap = [(0, i, server) - for i, 
server in enumerate(self.decoders)] - heapq.heapify(self.prefiller_heap) - heapq.heapify(self.decoder_heap) - - def _update_prefiller_priority(self, server_idx: int): - """Update the priority of a prefiller server in the heap.""" - server = self.prefillers[server_idx] - # Priority based on active_tokens and active_kv_cache - priority = server.active_tokens + server.active_kv_cache * 0.3 - # Remove old entry and add new one - self.prefiller_heap = [(p, i, s) for p, i, s in self.prefiller_heap - if i != server_idx] - heapq.heappush(self.prefiller_heap, - (priority, server_idx, server)) # type: ignore - - def _update_decoder_priority(self, server_idx: int): - """Update the priority of a decoder server in the heap.""" - server = self.decoders[server_idx] - priority = server.active_tokens - # Remove old entry and add new one - self.decoder_heap = [(p, i, s) for p, i, s in self.decoder_heap - if i != server_idx] - heapq.heappush(self.decoder_heap, - (priority, server_idx, server)) # type: ignore - - def abort_prefiller_request(self, server_idx: int, - request_id): # Changed to synchronous - """ - Mark a request as aborted. This will helps to release kv cache in - prefiller node. - """ - # No lock needed - atomic operation - self.prefillers[server_idx].aborted_requests.add(request_id) - - def aquire_aborted_prefiller_requests( - self, server_idx: int): # Changed to synchronous - """ - Get the set of aborted requests and clear it. - This is used to release kv cache in prefiller node. - """ - # No lock needed - atomic operation - aborted_requests = self.prefillers[server_idx].aborted_requests.copy() - self.prefillers[server_idx].aborted_requests.clear() - return aborted_requests - - async def next_req_id(self): - async with self.req_id_lock: - self.req_id_counter += 1 - return str(self.req_id_counter) - - def select_prefiller(self, token_count): # Changed to synchronous - # No lock needed - entire function is atomic - if not self.prefiller_heap: - raise RuntimeError("No prefiller servers available") - - priority, chosen, server = heapq.heappop(self.prefiller_heap) - - # Update the chosen server atomically - self.prefillers[chosen].active_tokens += token_count - self.prefillers[chosen].active_kv_cache += token_count - - # Update priority and re-add to heap - self._update_prefiller_priority(chosen) - - return chosen - - def release_prefiller(self, idx, token_count): # Changed to synchronous - # No lock needed - atomic operation - self.prefillers[idx].active_tokens -= token_count - # Update priority queue after releasing - self._update_prefiller_priority(idx) - - def release_prefiller_kv(self, idx, token_count): # Changed to synchronous - # No lock needed - atomic operation - if self.prefillers[idx].active_kv_cache > 0: - self.prefillers[idx].active_kv_cache -= token_count - # Update priority queue after releasing - self._update_prefiller_priority(idx) - - def select_decoder(self, token_count): # Changed to synchronous - # No lock needed - entire function is atomic - if not self.decoder_heap: - raise RuntimeError("No decoder servers available") - - priority, chosen, server = heapq.heappop(self.decoder_heap) - - # Update the chosen server atomically - self.decoders[chosen].active_tokens += token_count - - # Update priority and re-add to heap - self._update_decoder_priority(chosen) - - return chosen - - def release_decoder(self, idx, token_count): # Changed to synchronous - # No lock needed - atomic operation - self.decoders[idx].active_tokens -= token_count - # Update priority queue after releasing - 
self._update_decoder_priority(idx) - - # Omni_infer's calculate_input_scores function - def calculate_prefill_scores(self, request_length: int) -> float: - length_score = request_length / 4.0 - input_score = length_score * 0.0345 + 120.0745 - return input_score - - def calculate_decode_scores(self, request_length: int) -> float: - return request_length - - -proxy_state = None - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--prefiller-hosts", - type=str, - nargs="+", - default=["localhost"]) - parser.add_argument("--prefiller-ports", - type=int, - nargs="+", - default=[8001]) - parser.add_argument("--decoder-hosts", - type=str, - nargs="+", - default=["localhost"]) - parser.add_argument("--decoder-ports", type=int, nargs="+", default=[8002]) - parser.add_argument("--max-retries", - type=int, - default=3, - help="Maximum number of retries for HTTP requests") - parser.add_argument( - "--retry-delay", - type=float, - default=0.001, - help="Base delay (seconds) for exponential backoff retries") - args = parser.parse_args() - if len(args.prefiller_hosts) != len(args.prefiller_ports): - raise ValueError( - "Number of prefiller hosts must match number of prefiller ports") - if len(args.decoder_hosts) != len(args.decoder_ports): - raise ValueError( - "Number of decoder hosts must match number of decoder ports") - args.prefiller_instances = list( - zip(args.prefiller_hosts, args.prefiller_ports)) - args.decoder_instances = list(zip(args.decoder_hosts, args.decoder_ports)) - return args - - -@asynccontextmanager -async def lifespan(app: FastAPI): - global proxy_state - proxy_state = ProxyState(global_args.prefiller_instances, - global_args.decoder_instances) - print( - f"Initialized {len(proxy_state.prefillers)} prefill clients and {len(proxy_state.decoders)} decode clients." 
- ) - yield - for p in proxy_state.prefillers: - await p.client.aclose() - for d in proxy_state.decoders: - await d.client.aclose() - - -app = FastAPI(lifespan=lifespan) - - -async def send_request_to_service(client: httpx.AsyncClient, - prefiller_id: int, - endpoint: str, - req_data: dict, - request_id: str, - max_retries: int = 3, - base_delay: float = 0.2): - aborted_requests = proxy_state.aquire_aborted_prefiller_requests( - prefiller_id) - req_data = req_data.copy() - req_data['kv_transfer_params'] = { - "do_remote_decode": True, - "do_remote_prefill": False, - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": None, - "remote_port": None, - "aborted_request": list(aborted_requests), - } - req_data["stream"] = False - req_data["max_tokens"] = 1 - if "stream_options" in req_data: - del req_data["stream_options"] - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id - } - last_exc = None - for attempt in range(1, max_retries + 1): - try: - response = await client.post(endpoint, - json=req_data, - headers=headers) - response.raise_for_status() - return response - except (httpx.RequestError, httpx.HTTPStatusError) as e: - logger.warning( - f"Attempt {attempt} failed for {endpoint}: {str(e)}") - last_exc = e - if attempt < max_retries: - await asyncio.sleep(base_delay * (2**(attempt - 1))) - else: - logger.error( - f"All {max_retries} attempts failed for {endpoint}.") - raise last_exc - - -async def stream_service_response_with_retry(client: httpx.AsyncClient, - endpoint: str, - req_data: dict, - request_id: str, - max_retries: int = 3, - base_delay: float = 0.2): - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id - } - for attempt in range(1, max_retries + 1): - try: - async with client.stream("POST", - endpoint, - json=req_data, - headers=headers) as response: - response.raise_for_status() - first_chunk_sent = False - async for chunk in response.aiter_bytes(): - first_chunk_sent = True - yield chunk - return # Success, exit after streaming - except (httpx.RequestError, httpx.HTTPStatusError) as e: - if attempt < max_retries: - logger.warning( - f"Attempt {attempt} failed for streaming {endpoint}: {str(e)}" - ) - await asyncio.sleep(base_delay * (2**(attempt - 1))) - else: - logger.error( - f"All {max_retries} attempts failed for streaming {endpoint}." - ) - raise e - except Exception as e: - # If any chunk has been sent, do not retry, just log and drop - if 'first_chunk_sent' in locals() and first_chunk_sent: - logger.error( - f"Streaming to client interrupted after response started: {str(e)}" - ) - return - else: - if attempt < max_retries: - logger.warning( - f"Attempt {attempt} failed for streaming {endpoint}: {str(e)}" - ) - await asyncio.sleep(base_delay * (2**(attempt - 1))) - else: - logger.error( - f"All {max_retries} attempts failed for streaming {endpoint}." 
- ) - raise e - - -async def _handle_completions(api: str, request: Request): - try: - req_data = await request.json() - req_body = await request.body() - request_length = len(req_body) - prefiller_score = proxy_state.calculate_prefill_scores(request_length) - logger.debug( - f"Request length: {request_length}, Prefiller score: {prefiller_score}" - ) - request_id = await proxy_state.next_req_id() - # Select prefiller - prefiller_idx = proxy_state.select_prefiller(prefiller_score) - prefiller = proxy_state.prefillers[prefiller_idx] - # Send request to prefiller - response = await send_request_to_service( - prefiller.client, - prefiller_idx, - api, - req_data, - request_id, - max_retries=global_args.max_retries, - base_delay=global_args.retry_delay) - proxy_state.release_prefiller(prefiller_idx, prefiller_score) - response_json = response.json() - kv_transfer_params = response_json.get('kv_transfer_params', {}) - if kv_transfer_params: - req_data["kv_transfer_params"] = kv_transfer_params - # Select decoder - decoder_score = proxy_state.calculate_decode_scores(request_length) - logger.debug("Decoder score: %f", decoder_score) - # Use the prefiller's kv_transfer_params to select decoder - decoder_idx = proxy_state.select_decoder(decoder_score) - decoder = proxy_state.decoders[decoder_idx] - logger.debug("Using %s %s", prefiller.url, decoder.url) - # Stream response from decoder - released_kv = False - - async def generate_stream(): - nonlocal released_kv - # Only one await per chunk, minimal logic in loop - try: - async for chunk in stream_service_response_with_retry( - decoder.client, - api, - req_data, - request_id=request_id, - max_retries=global_args.max_retries, - base_delay=global_args.retry_delay): - if not released_kv and chunk: - proxy_state.release_prefiller_kv( - prefiller_idx, prefiller_score) - released_kv = True - yield chunk - except Exception as e: - logger.error( - f"Error during streaming from decoder {decoder.url}: {str(e)} the aborted request {request_id} will be routing to the target prefiller when new request is ready to dispatch to it" - ) - proxy_state.abort_prefiller_request(prefiller_idx, request_id) - proxy_state.release_prefiller_kv(prefiller_idx, - prefiller_score) - - # After streaming done, release tokens - proxy_state.release_decoder(decoder_idx, decoder_score) - - return StreamingResponse(generate_stream(), - media_type="application/json") - except Exception as e: - import traceback - exc_info = sys.exc_info() - print("Error occurred in disagg prefill proxy server" - f" - {api} endpoint") - print(e) - print("".join(traceback.format_exception(*exc_info))) - raise - - -@app.post("/v1/completions") -async def handle_completions(request: Request): - return await _handle_completions("/completions", request) - - -@app.post("/v1/chat/completions") -async def handle_chat_completions(request: Request): - return await _handle_completions("/chat/completions", request) - - -@app.get("/healthcheck") -async def healthcheck(): - return { - "status": "ok", - "prefill_instances": len(proxy_state.prefillers), - "decode_instances": len(proxy_state.decoders) - } - - -if __name__ == '__main__': - global global_args - global_args = parse_args() - import uvicorn - uvicorn.run(app, host=global_args.host, port=global_args.port) diff --git a/examples/disaggregated_prefill_v1/toy_proxy_server.py b/examples/disaggregated_prefill_v1/toy_proxy_server.py new file mode 100644 index 0000000000..2e26d0aee2 --- /dev/null +++ b/examples/disaggregated_prefill_v1/toy_proxy_server.py @@ -0,0 +1,275 
@@ +# Adapted from https://github.com/vllm-project/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py + +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import itertools +import os +import uuid +from contextlib import asynccontextmanager + +import httpx +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + Lifespan context manager to handle startup and shutdown events. + """ + # Startup: Initialize client pools for prefiller and decoder services + app.state.prefill_clients = [] + app.state.decode_clients = [] + limit = httpx.Limits(max_connections=100000, + max_keepalive_connections=100000) + + # Create prefill clients + for i, (host, port) in enumerate(global_args.prefiller_instances): + prefiller_base_url = f'http://{host}:{port}/v1' + app.state.prefill_clients.append({ + 'client': + httpx.AsyncClient(timeout=None, + base_url=prefiller_base_url, + limits=limit), + 'host': + host, + 'port': + port, + 'id': + i + }) + + # Create decode clients + for i, (host, port) in enumerate(global_args.decoder_instances): + decoder_base_url = f'http://{host}:{port}/v1' + app.state.decode_clients.append({ + 'client': + httpx.AsyncClient(timeout=None, + base_url=decoder_base_url, + limits=limit), + 'host': + host, + 'port': + port, + 'id': + i + }) + + # Initialize round-robin iterators + app.state.prefill_iterator = itertools.cycle( + range(len(app.state.prefill_clients))) + app.state.decode_iterator = itertools.cycle( + range(len(app.state.decode_clients))) + + print(f"Initialized {len(app.state.prefill_clients)} prefill clients " + f"and {len(app.state.decode_clients)} decode clients.") + + yield + + # Shutdown: Close all clients + for client_info in app.state.prefill_clients: + await client_info['client'].aclose() + + for client_info in app.state.decode_clients: + await client_info['client'].aclose() + + +# Update FastAPI app initialization to use lifespan +app = FastAPI(lifespan=lifespan) + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--host", type=str, default="localhost") + + # For prefiller instances + parser.add_argument("--prefiller-hosts", + "--prefiller-host", + type=str, + nargs="+", + default=["localhost"]) + parser.add_argument("--prefiller-ports", + "--prefiller-port", + type=int, + nargs="+", + default=[8100]) + + # For decoder instances + parser.add_argument("--decoder-hosts", + "--decoder-host", + type=str, + nargs="+", + default=["localhost"]) + parser.add_argument("--decoder-ports", + "--decoder-port", + type=int, + nargs="+", + default=[8200]) + + args = parser.parse_args() + + # Validate and pair hosts with ports + if len(args.prefiller_hosts) != len(args.prefiller_ports): + raise ValueError( + "Number of prefiller hosts must match number of prefiller ports") + + if len(args.decoder_hosts) != len(args.decoder_ports): + raise ValueError( + "Number of decoder hosts must match number of decoder ports") + + # Create tuples of (host, port) for each service type + args.prefiller_instances = list( + zip(args.prefiller_hosts, args.prefiller_ports)) + args.decoder_instances = list(zip(args.decoder_hosts, args.decoder_ports)) + + return args + + +def get_next_client(app, service_type: str): + """ + Get the next client in round-robin fashion. 
+ + Args: + app: The FastAPI app instance + service_type: Either 'prefill' or 'decode' + + Returns: + The next client to use + """ + if service_type == 'prefill': + client_idx = next(app.state.prefill_iterator) + return app.state.prefill_clients[client_idx] + elif service_type == 'decode': + client_idx = next(app.state.decode_iterator) + return app.state.decode_clients[client_idx] + else: + raise ValueError(f"Unknown service type: {service_type}") + + +async def send_request_to_service(client_info: dict, endpoint: str, + req_data: dict, request_id: str): + """ + Send a request to a service using a client from the pool. + """ + req_data = req_data.copy() + req_data['kv_transfer_params'] = { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": None, + "remote_port": None + } + req_data["stream"] = False + req_data["max_tokens"] = 1 + if "stream_options" in req_data: + del req_data["stream_options"] + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id + } + + response = await client_info['client'].post(endpoint, + json=req_data, + headers=headers) + response.raise_for_status() + + return response + + +async def stream_service_response(client_info: dict, endpoint: str, + req_data: dict, request_id: str): + """ + Asynchronously stream response from a service using a client from the pool. + """ + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id + } + + async with client_info['client'].stream("POST", + endpoint, + json=req_data, + headers=headers) as response: + response.raise_for_status() + async for chunk in response.aiter_bytes(): + yield chunk + + +async def _handle_completions(api: str, request: Request): + try: + req_data = await request.json() + request_id = str(uuid.uuid4()) + + # Get the next prefill client in round-robin fashion + prefill_client_info = get_next_client(request.app, 'prefill') + + # Send request to prefill service + response = await send_request_to_service(prefill_client_info, api, + req_data, request_id) + + # Extract the needed fields + response_json = response.json() + kv_transfer_params = response_json.get('kv_transfer_params', {}) + if kv_transfer_params: + req_data["kv_transfer_params"] = kv_transfer_params + + # Get the next decode client in round-robin fashion + decode_client_info = get_next_client(request.app, 'decode') + + logger.debug("Using %s %s", prefill_client_info, decode_client_info) + + # Stream response from decode service + async def generate_stream(): + async for chunk in stream_service_response(decode_client_info, + api, + req_data, + request_id=request_id): + yield chunk + + return StreamingResponse(generate_stream(), + media_type="application/json") + + except Exception as e: + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server" + f" - {api} endpoint") + print(e) + print("".join(traceback.format_exception(*exc_info))) + raise + + +@app.post("/v1/completions") +async def handle_completions(request: Request): + return await _handle_completions("/completions", request) + + +@app.post("/v1/chat/completions") +async def handle_chat_completions(request: Request): + return await _handle_completions("/chat/completions", request) + + +@app.get("/healthcheck") +async def healthcheck(): + """Simple endpoint to check if the server is running.""" + return { + "status": "ok", + "prefill_instances": 
len(app.state.prefill_clients), + "decode_instances": len(app.state.decode_clients) + } + + +if __name__ == '__main__': + global global_args + global_args = parse_args() + + import uvicorn + uvicorn.run(app, host=global_args.host, port=global_args.port) \ No newline at end of file diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py index c5d0b3e4ee..e497a13ec2 100644 --- a/examples/offline_data_parallel.py +++ b/examples/offline_data_parallel.py @@ -54,16 +54,17 @@ --master-port=13345 """ -import contextlib -import gc import os from time import sleep +import contextlib +import gc import torch + from vllm import LLM, SamplingParams +from vllm.utils import get_open_port from vllm.distributed.parallel_state import ( # noqa E402 destroy_distributed_environment, destroy_model_parallel) -from vllm.utils import get_open_port os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/examples/offline_disaggregated_prefill_npu.py b/examples/offline_disaggregated_prefill_npu.py index f37b5087a3..84fa3fe65f 100644 --- a/examples/offline_disaggregated_prefill_npu.py +++ b/examples/offline_disaggregated_prefill_npu.py @@ -37,10 +37,7 @@ def clean_up(): def run_prefill(prefill_done, process_close): - # ranktable.json needs be generated using gen_ranktable.sh - # from the examples/disaggregated_prefill_v1 in the main branch. - os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json" - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig @@ -51,15 +48,16 @@ def run_prefill(prefill_done, process_close): ] sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - ktc = KVTransferConfig(kv_connector="LLMDataDistCMgrConnector", kv_buffer_device="npu", kv_role="kv_producer", - kv_parallel_size=1, - kv_connector_module_path="vllm_ascend.distributed.llmdatadist_c_mgr_connector") + ktc = KVTransferConfig.from_cli( + '{"kv_connector":"AscendHcclConnector","kv_buffer_device":"npu","kv_role":"kv_producer", "kv_parallel_size":2}' + ) + # Set NPU memory utilization to 0.8 llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", kv_transfer_config=ktc, max_model_len=2000, gpu_memory_utilization=0.8, - tensor_parallel_size=1) + tensor_parallel_size=2) llm.generate(prompts, sampling_params) print("Prefill node is finished.") @@ -79,11 +77,7 @@ def run_prefill(prefill_done, process_close): def run_decode(prefill_done): - os.environ['VLLM_LLMDD_RPC_PORT'] = '6634' - # ranktable.json needs be generated using gen_ranktable.sh - # from the examples/disaggregated_prefill_v1 module in the main branch. 
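# A side note, not part of the original example: the JSON strings handed to
# KVTransferConfig.from_cli above can be built with json.dumps so that the
# prefill (kv_producer) and decode (kv_consumer) processes stay in sync and no
# quoting is hand-edited. The field names are the ones used in this example;
# anything beyond them is an assumption.
import json

from vllm.config import KVTransferConfig


def make_kv_transfer_config(role: str) -> KVTransferConfig:
    cfg = {
        "kv_connector": "AscendHcclConnector",
        "kv_buffer_device": "npu",
        "kv_role": role,  # "kv_producer" for prefill, "kv_consumer" for decode
        "kv_parallel_size": 2,
    }
    return KVTransferConfig.from_cli(json.dumps(cfg))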
- os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json" - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "1" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "2,3" from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig @@ -94,14 +88,15 @@ def run_decode(prefill_done): ] sampling_params = SamplingParams(temperature=0, top_p=0.95) - ktc = KVTransferConfig(kv_connector="LLMDataDistCMgrConnector", kv_buffer_device="npu", kv_role="kv_consumer", - kv_parallel_size=1, kv_connector_module_path="vllm_ascend.distributed.llmdatadist_c_mgr_connector") + ktc = KVTransferConfig.from_cli( + '{"kv_connector":"AscendHcclConnector","kv_buffer_device":"npu","kv_role":"kv_consumer","kv_parallel_size":2}' + ) llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", kv_transfer_config=ktc, max_model_len=2000, gpu_memory_utilization=0.8, - tensor_parallel_size=1) + tensor_parallel_size=2) # Wait for the producer to start the consumer print("Waiting for prefill node to finish...") diff --git a/examples/offline_external_launcher.py b/examples/offline_external_launcher.py deleted file mode 100644 index 4566fdcfa2..0000000000 --- a/examples/offline_external_launcher.py +++ /dev/null @@ -1,287 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/examples/offline_inference/data_parallel.py - -# Note: This script is designed to run with e2e test, -# please be careful to modify it. 
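# For context, a rough sketch of how a driver can wire run_prefill and
# run_decode together. This is an illustration only, not the example's actual
# __main__ (which is outside this diff); it assumes just the function
# signatures shown above and standard multiprocessing primitives.
from multiprocessing import Event, Process

if __name__ == "__main__":
    prefill_done = Event()   # signalled once the prefill side has produced KV
    process_close = Event()  # tells the prefill process it may shut down

    prefill_proc = Process(target=run_prefill, args=(prefill_done, process_close))
    decode_proc = Process(target=run_decode, args=(prefill_done, ))

    prefill_proc.start()
    decode_proc.start()

    decode_proc.join()    # wait until the decode side has finished generating
    process_close.set()   # then release the prefill process
    prefill_proc.join()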
-""" -Usage: -Single node: - Dense models: - python examples/offline_external_launcher.py \ - --model="Qwen/Qwen2.5-0.5B-Instruct" \ - --tp-size=1 \ - --proc-per-node=2 - MOE models: - python examples/offline_external_launcher.py \ - --model="Qwen/Qwen3-30B-A3B" \ - --tp-size=2 \ - --proc-per-node=2 \ - --enable-expert-parallel - -Multi-node: - Node 0 (assume the node has ip of 10.99.48.128): - python examples/offline_external_launcher.py \ - --model="Qwen/Qwen3-30B-A3B" \ - --tp-size=2 \ - --node-size=2 \ - --node-rank=0 \ - --proc-per-node=2 \ - --enable-expert-parallel \ - --master-addr=10.99.48.128 \ - --master-port=13345 - Node 1: - python examples/offline_external_launcher.py \ - --model="Qwen/Qwen3-30B-A3B" \ - --tp-size=2 \ - --node-size=2 \ - --node-rank=1 \ - --enable-expert-parallel \ - --master-addr=10.99.48.128 \ - --master-port=13345 -""" - -import argparse -import contextlib -import gc -import os -from multiprocessing import Process -from time import sleep - -import torch -from vllm import LLM, SamplingParams -from vllm.distributed.parallel_state import ( # noqa E402 - destroy_distributed_environment, destroy_model_parallel, get_tp_group) -from vllm.utils import get_open_port, GiB_bytes - -os.environ["VLLM_USE_MODELSCOPE"] = "True" -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - - -def parse_args(): - - parser = argparse.ArgumentParser(description="External launcher Inference") - parser.add_argument( - "--model", - type=str, - default="Qwen/Qwen3-0.6B", - help="Model name or path", - ) - parser.add_argument("--tp-size", - type=int, - default=1, - help="Tensor parallel size") - parser.add_argument("--node-size", - type=int, - default=1, - help="Total number of nodes") - parser.add_argument("--node-rank", - type=int, - default=0, - help="Rank of the current node") - parser.add_argument("--proc-per-node", - type=int, - default=1, - help="Number of processes per node") - parser.add_argument("--master-addr", - type=str, - default="", - help="Master node IP address") - parser.add_argument("--master-port", - type=int, - default=0, - help="Master node port") - parser.add_argument("--enforce-eager", - action="store_true", - help="Enforce eager mode execution.") - parser.add_argument("--trust-remote-code", - action="store_true", - help="Trust remote code.") - parser.add_argument("--enable-expert-parallel", - action="store_true", - help="Enable expert parallel, used in MOE models.") - parser.add_argument("--enable-sleep-mode", - action="store_true", - help="Enable sleep mode for the engine.") - parser.add_argument("--temperature", - type=float, - default=0.8, - help="Float that controls the randomness of the sampling.") - parser.add_argument("--model-weight-gib", - type=float, - default=None, - help="Model weight memory usage in GiB (e.g., 1.0 for 0.5B model).") - - args = parser.parse_args() - if args.enable_sleep_mode: - if args.model_weight_gib is None or args.temperature != 0: - parser.error("model-weight-gib must be provided, and temperature must be zero when enable-sleep-mode is set.") - if args.model_weight_gib <= 0: - parser.error("model-weight-gib must be greater than 0 when enable-sleep-mode is set.") - if args.model == parser.get_default("model") and args.model_weight_gib is None: - parser.error("model-weight-gib must be provided for default model when enable-sleep-mode is set.") - - return args - - -def main( - local_rank: int, - rank: int, - master_addr: str, - master_port: int, - model_weight_gib: float, - model: str = "Qwen/Qwen3-0.6B", - world_size: int = 4, - 
tensor_parallel_size: int = 2, - enable_expert_parallel: bool = False, - enforce_eager: bool = False, - trust_remote_code: bool = True, - enable_sleep_mode: bool = False, - temperature: float = 0.8, -): - os.environ["MASTER_ADDR"] = master_addr - os.environ["MASTER_PORT"] = str(master_port) - os.environ["RANK"] = str(rank) - os.environ["LOCAL_RANK"] = str(local_rank) - os.environ["WORLD_SIZE"] = str(world_size) - if not torch.distributed.is_initialized(): - torch.distributed.init_process_group( - backend="cpu:gloo,npu:hccl", - world_size=world_size, - rank=rank, - ) - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] * 10 - sampling_params = SamplingParams( - temperature=temperature, - top_p=0.95, - max_tokens=10, - ) - llm = LLM( - model=model, - tensor_parallel_size=tensor_parallel_size, - enable_expert_parallel=enable_expert_parallel, - enforce_eager=enforce_eager, - trust_remote_code=trust_remote_code, - distributed_executor_backend="external_launcher", - seed=0, - enable_sleep_mode=enable_sleep_mode, - ) - tp_ranks = get_tp_group().ranks - print(f'TP RANKS: {tp_ranks}') - - outputs = llm.generate(prompts, sampling_params) - - if enable_sleep_mode: - if rank == 0: - free_bytes_before_sleep, total = torch.npu.mem_get_info() - llm.sleep(level=1) - if rank == 0: - free_bytes_after_sleep, total = torch.npu.mem_get_info() - freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep - print(f"Freed memory: {freed_bytes / 1024 ** 3:.2f} GiB") - # now the freed memory should be larger than the model weights - assert freed_bytes >= model_weight_gib / tensor_parallel_size * GiB_bytes - - llm.wake_up() - outputs_after_wakeup = llm.generate(prompts, sampling_params) - if rank == 0: - # cmp output - assert outputs[0].outputs[0].text == outputs_after_wakeup[0].outputs[0].text - print("Sleep and wake up successfully!!") - - for i, output in enumerate(outputs): - if i >= 5: - # print only 5 outputs - break - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Global rank: {rank}, Prompt: {prompt!r}, " - f"Generated text: {generated_text!r}") - - # Give engines time to pause their processing loops before exiting. - sleep(5) - del llm - cleanup_env_and_memory() - - -def cleanup_env_and_memory(): - destroy_model_parallel() - destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - gc.collect() - torch.npu.empty_cache() - torch.npu.reset_peak_memory_stats() - - -if __name__ == "__main__": - args = parse_args() - - tp_size = args.tp_size - node_size = args.node_size - proc_per_node = args.proc_per_node - node_rank = args.node_rank - - if node_size == 1: - master_addr = "127.0.0.1" - master_port = get_open_port() - else: - master_addr = args.master_addr - master_port = args.master_port - - world_size = node_size * proc_per_node - - procs = [] - for local_rank, rank in enumerate( - range(proc_per_node * node_rank, proc_per_node * (node_rank + 1))): - proc = Process(target=main, - args=( - local_rank, - rank, - master_addr, - master_port, - args.model_weight_gib, - args.model, - world_size, - tp_size, - args.enable_expert_parallel, - args.enforce_eager, - args.trust_remote_code, - args.enable_sleep_mode, - args.temperature, - )) - - proc.start() - procs.append(proc) - exit_code = 0 - for proc in procs: - proc.join(timeout=600) - if proc.exitcode is None: - print( - f"Killing process {proc.pid} that didn't stop within 30 minutes." 
- ) - proc.kill() - exit_code = 1 - elif proc.exitcode: - exit_code = proc.exitcode - - exit(exit_code) diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 99a565bde7..03bb1cb527 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -25,32 +25,21 @@ """ import os -import argparse - -from vllm.assets.audio import AudioAsset -try: - import librosa -except ImportError: - raise Exception("Can't import librosa, please ensure it's installed") from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +question_per_audio_count = { + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" +} -def prepare_inputs(audio_count: int, audio_path1: str, audio_path2: str): - use_vllm_audio_assert = True if audio_path1 == "mary_had_lamb" and audio_path2 == "winning_call" else False - if use_vllm_audio_assert: - audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] - else: - audio_assets = [librosa.load(audio_path1, sr=None), librosa.load(audio_path2, sr=None)] - - question_per_audio_count = { - 1: "What is recited in the audio?", - 2: "What sport and what nursery rhyme are referenced?" - } +def prepare_inputs(audio_count: int): audio_in_prompt = "".join([ f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) @@ -63,7 +52,7 @@ def prepare_inputs(audio_count: int, audio_path1: str, audio_path2: str): mm_data = { "audio": - audio_assets if not use_vllm_audio_assert else [asset.audio_and_sample_rate for asset in audio_assets[:audio_count]] + [asset.audio_and_sample_rate for asset in audio_assets[:audio_count]] } # Merge text prompt and audio data into inputs @@ -71,7 +60,7 @@ def prepare_inputs(audio_count: int, audio_path1: str, audio_path2: str): return inputs -def main(audio_count: int, audio_path1: str, audio_path2: str): +def main(audio_count: int): # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. 
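# For reference, a rough sketch (not the example's exact code) of the structure
# prepare_inputs() builds for a single clip: one audio placeholder per clip in
# the text prompt, plus the decoded audio as multi-modal data. The
# "multi_modal_data" key is the standard vLLM name and is assumed here; the
# real example additionally wraps the text in the model's chat template.
from vllm.assets.audio import AudioAsset

asset = AudioAsset("mary_had_lamb")
prompt = "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat is recited in the audio?"
inputs = {
    "prompt": prompt,
    "multi_modal_data": {
        "audio": [asset.audio_and_sample_rate],
    },
}
# `inputs` can then be passed to llm.generate(inputs, sampling_params) as in main().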
@@ -82,7 +71,7 @@ def main(audio_count: int, audio_path1: str, audio_path2: str): limit_mm_per_prompt={"audio": audio_count}, enforce_eager=True) - inputs = prepare_inputs(audio_count, audio_path1, audio_path2) + inputs = prepare_inputs(audio_count) sampling_params = SamplingParams(temperature=0.2, max_tokens=64, @@ -92,14 +81,9 @@ def main(audio_count: int, audio_path1: str, audio_path2: str): for o in outputs: generated_text = o.outputs[0].text - print("generated_text:", generated_text) + print(generated_text) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Arguments of rank table generator", ) - parser.add_argument("--audio-path1", type=str, default="mary_had_lamb") - parser.add_argument("--audio-path2", type=str, default="winning_call") - args = parser.parse_args() - audio_count = 2 - main(audio_count, args.audio_path1, args.audio_path2) + main(audio_count) diff --git a/examples/run_dp_server.sh b/examples/run_dp_server.sh index 1866fb060c..eb3cfbf510 100644 --- a/examples/run_dp_server.sh +++ b/examples/run_dp_server.sh @@ -1,32 +1,33 @@ +rm -rf ./.torchair_cache/ +rm -rf ./dynamo_* +rm -rf /root/ascend/log/debug/plog/* export HCCL_IF_IP=2.0.0.0 -export GLOO_SOCKET_IFNAME="eth0" -export TP_SOCKET_IFNAME="eth0" -export HCCL_SOCKET_IFNAME="eth0" +export GLOO_SOCKET_IFNAME="enp189s0f0" +export TP_SOCKET_IFNAME="enp189s0f0" +export HCCL_SOCKET_IFNAME="enp189s0f0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 export VLLM_USE_V1=1 -export VLLM_USE_MODELSCOPE=true - export ASCEND_LAUNCH_BLOCKING=0 -vllm serve Qwen/Qwen1.5-MoE-A2.7B \ - --host 0.0.0.0 \ - --port 20002 \ - --served-model-name Qwen \ - --data-parallel-size 2 \ - --data-parallel-size-local 2 \ - --data-parallel-address 2.0.0.0 \ - --data-parallel-rpc-port 13389 \ - --tensor-parallel-size 4 \ - --enable-expert-parallel \ - --no-enable-prefix-caching \ - --max-num-seqs 16 \ - --max-model-len 4096 \ - --max-num-batched-tokens 4096 \ - --gpu-memory-utilization 0.9 \ - --trust-remote-code \ - --enforce-eager \ - --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}' +vllm serve /data/weights/Qwen2.5-0.5B-Instruct \ + --host 0.0.0.0 \ + --port 20002 \ + --served-model-name Qwen \ + --data-parallel-size 4 \ + --data-parallel-size-local 4 \ + --data-parallel-address 2.0.0.0 \ + --data-parallel-rpc-port 13389 \ + --tensor-parallel-size 4 \ + --enable-expert-parallel \ + --no-enable-prefix-caching \ + --max-num-seqs 16 \ + --max-model-len 4096 \ + --max-num-batched-tokens 4096 \ + --gpu-memory-utilization 0.9 \ + --trust-remote-code \ + --enforce-eager \ + --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}' diff --git a/pyproject.toml b/pyproject.toml index e394895dec..390d8c4dfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,9 +12,9 @@ requires = [ "scipy", "setuptools>=64", "setuptools-scm>=8", - "torch-npu==2.7.1.dev20250724", - "torch>=2.7.1", - "torchvision", + "torch-npu==2.5.1.post1.dev20250619", + "torch>=2.5.1", + "torchvision<0.21.0", "wheel", "msgpack", "quart", diff --git a/requirements-dev.txt b/requirements-dev.txt index 9be7f39135..4f36cd70d9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,7 +5,7 @@ openai pytest >= 6.0 pytest-asyncio pytest-mock -lm-eval==0.4.8 +lm-eval types-jsonschema xgrammar zmq @@ -14,7 +14,6 @@ pytest-cov regex 
sentence_transformers ray>=2.47.1 -protobuf>3.20.0 +protobuf==4.25.6 librosa soundfile -pytest_mock \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 6384149ac0..c2b2a3175e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,8 +10,8 @@ pyyaml scipy setuptools>=64 setuptools-scm>=8 -torch>=2.7.1 -torchvision +torch>=2.5.1 +torchvision<0.21.0 wheel # Remove after https://github.com/vllm-project/vllm-ascend/issues/2034 transformers<4.54.0 @@ -26,4 +26,4 @@ numba # Install torch_npu --pre --extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi -torch-npu==2.7.1.dev20250724 +torch-npu==2.5.1.post1.dev20250619 diff --git a/tests/e2e/long_term/accuracy/accuracy_multicard.py b/tests/e2e/long_term/accuracy/accuracy_multicard.py index 4479c4bf99..2bfb389e4b 100644 --- a/tests/e2e/long_term/accuracy/accuracy_multicard.py +++ b/tests/e2e/long_term/accuracy/accuracy_multicard.py @@ -18,11 +18,15 @@ # import gc import multiprocessing +import signal +import subprocess import sys +import time from multiprocessing import Queue import lm_eval import pytest +import requests import torch SERVER_HOST = "127.0.0.1" @@ -32,7 +36,7 @@ # pre-trained model path on Hugging Face. # Qwen/Qwen2.5-0.5B-Instruct: accuracy test for DP. -# Qwen/Qwen3-30B-A3B: accuracy test for EP and DP. +# Qwen/Qwen3-30B-A3B: accuracy test for EP. # deepseek-ai/DeepSeek-V2-Lite: accuracy test for TP. MODEL_NAME = ["Qwen/Qwen3-30B-A3B", "deepseek-ai/DeepSeek-V2-Lite"] @@ -141,27 +145,58 @@ def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch, model): f"Expected: {EXPECTED_VALUE[model]}±{RTOL} | Measured: {result}" -DP_DENSCE_MODEL = ["Qwen/Qwen2.5-0.5B-Instruct"] -DP_MOE_MOEDL = ["Qwen/Qwen3-30B-A3B"] +@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize("model", ["Qwen/Qwen2.5-0.5B-Instruct"]) +def test_lm_eval_accuracy_dp(model, max_tokens): + log_file = open("accuracy_pd.log", "a+") + cmd = [ + "vllm", "serve", model, "--max_model_len", "4096", + "--tensor_parallel_size", "2", "--data_parallel_size", "2" + ] + server_proc = subprocess.Popen(cmd, + stdout=log_file, + stderr=subprocess.DEVNULL) -DP_MORE_ARGS = { - "Qwen/Qwen2.5-0.5B-Instruct": - "tensor_parallel_size=2,data_parallel_size=2", - "Qwen/Qwen3-30B-A3B": - "tensor_parallel_size=2,data_parallel_size=2,enable_expert_parallel=True,max_model_len=1024,enforce_eager=True", -} + try: + for _ in range(300): + try: + r = requests.get(HEALTH_URL, timeout=1) + if r.status_code == 200: + break + except requests.exceptions.RequestException: + pass + time.sleep(1) + else: + log_file.flush() + log_file.seek(0) + log_content = log_file.read() + pytest.fail( + f"vLLM serve did not become healthy after 300s: {HEALTH_URL}\n" + f"==== vLLM Serve Log Start ===\n{log_content}\n==== vLLM Serve Log End ===" + ) + + prompt = "bejing is a" + payload = { + "prompt": prompt, + "max_tokens": max_tokens, + "sampling_params": { + "temperature": 0.0, + "top_p": 1.0, + "seed": 123 + } + } + resp = requests.post(COMPLETIONS_URL, json=payload, timeout=30) + resp.raise_for_status() + data = resp.json() + generated = data["choices"][0]["text"].strip() + expected = "city in north china, it has many famous attractions" + assert generated == expected, f"Expected `{expected}`, got `{generated}`" -@pytest.mark.parametrize("model", DP_DENSCE_MODEL) -def test_lm_eval_accuracy_dp(model): - result_queue: Queue[float] = multiprocessing.Queue() - p = multiprocessing.Process(target=run_test, - args=(result_queue, model, - MAX_MODEL_LEN[model], 
MODEL_TYPE[model], - DP_MORE_ARGS[model])) - p.start() - p.join() - result = result_queue.get() - print(result) - assert (EXPECTED_VALUE[model] - RTOL < result < EXPECTED_VALUE[model] + RTOL), \ - f"Expected: {EXPECTED_VALUE[model]}±{RTOL} | Measured: {result}" + finally: + server_proc.send_signal(signal.SIGINT) + try: + server_proc.wait(timeout=10) + except subprocess.TimeoutExpired: + server_proc.kill() + server_proc.wait() diff --git a/tests/e2e/multicard/test_data_parallel.py b/tests/e2e/multicard/test_data_parallel.py index b9654c62c9..0b945e0ce5 100644 --- a/tests/e2e/multicard/test_data_parallel.py +++ b/tests/e2e/multicard/test_data_parallel.py @@ -27,7 +27,7 @@ import pytest -MODELS = ["Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen3-30B-A3B"] +MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] @pytest.mark.parametrize("model", MODELS) @@ -54,8 +54,6 @@ def test_data_parallel_inference(model, max_tokens): "--trust-remote-code", "--enforce-eager", ] - if model == "Qwen/Qwen3-30B-A3B": - cmd.append("--enable-expert-parallel") print(f"Running subprocess: {' '.join(cmd)}") proc = subprocess.run(cmd, diff --git a/tests/e2e/multicard/test_external_launcher.py b/tests/e2e/multicard/test_external_launcher.py deleted file mode 100644 index c5eecab81c..0000000000 --- a/tests/e2e/multicard/test_external_launcher.py +++ /dev/null @@ -1,149 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -""" -Compare the outputs of vLLM with and without aclgraph. - -Run `pytest tests/multicard/test_external_launcher.py`. 
-""" - -import os -import subprocess -import sys -from pathlib import Path - -import pytest - -MODELS = ["Qwen/Qwen3-0.6B"] -MOE_MODELS = ["Qwen/Qwen3-30B-A3B"] - - -@pytest.mark.parametrize("model", MODELS) -def test_external_launcher(model): - script = Path( - __file__ - ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py" - env = os.environ.copy() - # TODO: Change to 2 when ci machine has 4 cards - cmd = [ - sys.executable, - str(script), - "--model", - model, - "--tp-size", - "1", - "--node-size", - "1", - "--node-rank", - "0", - "--proc-per-node", - "2", - "--trust-remote-code", - ] - - print(f"Running subprocess: {' '.join(cmd)}") - proc = subprocess.run( - cmd, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - timeout=600, - ) - output = proc.stdout.decode() - - print(output) - - assert "TP RANKS: [0]" in output - assert "TP RANKS: [1]" in output - assert "Generated text:" in output - assert proc.returncode == 0 - - -@pytest.mark.parametrize("model", MOE_MODELS) -def test_moe_external_launcher(model): - script = Path( - __file__ - ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py" - env = os.environ.copy() - # TODO: Change to 2 when ci machine has 4 cards - cmd = [ - sys.executable, - str(script), "--model", model, "--tp-size", "2", "--node-size", "1", - "--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code", - "--enable-expert-parallel" - ] - - print(f"Running subprocess: {' '.join(cmd)}") - proc = subprocess.run( - cmd, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - timeout=600, - ) - output = proc.stdout.decode() - - print(output) - - assert "TP RANKS: [0, 1]" in output - assert "Generated text:" in output - assert proc.returncode == 0 - - -def test_external_launcher_and_sleepmode(): - script = Path( - __file__ - ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py" - env = os.environ.copy() - # TODO: Change to 2 when ci machine has 4 cards - cmd = [ - sys.executable, - str(script), - "--model", - "Qwen/Qwen3-8B", - "--tp-size", - "1", - "--node-size", - "1", - "--node-rank", - "0", - "--proc-per-node", - "2", - "--trust-remote-code", - "--enable-sleep-mode", - "--temperature", - "0", - "--model-weight-gib", - "16", - ] - - print(f"Running subprocess: {' '.join(cmd)}") - proc = subprocess.run( - cmd, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - timeout=300, - ) - output = proc.stdout.decode() - - print(output) - - assert "TP RANKS: [0]" in output - assert "TP RANKS: [1]" in output - assert "Generated text:" in output - assert "Sleep and wake up successfully!!" in output - assert proc.returncode == 0 diff --git a/tests/e2e/multicard/test_offline_inference_310p.py b/tests/e2e/multicard/test_offline_inference_310p.py deleted file mode 100644 index 6bf335686d..0000000000 --- a/tests/e2e/multicard/test_offline_inference_310p.py +++ /dev/null @@ -1,62 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -import pytest -import vllm # noqa: F401 - -import vllm_ascend # noqa: F401 -from tests.e2e.conftest import VllmRunner - -# Pangu local model path -MODELS = [ - "IntervitensInc/pangu-pro-moe-model", -] -# set additional config for ascend scheduler and torchair graph -ADDITIONAL_CONFIG = [{ - "additional_config": { - "torchair_graph_config": { - "enabled": True - }, - "ascend_scheduler_config": { - "enabled": True, - } - } -}] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float16"]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enfore_eager", [True, False]) -@pytest.mark.parametrize("additional_config", ADDITIONAL_CONFIG) -def test_pangu_model(model: str, dtype: str, max_tokens: int, - enfore_eager: bool, additional_config: dict) -> None: - if enfore_eager: - additional_config = {} - example_prompts = [ - "Hello, my name is", - "The future of AI is", - ] - - with VllmRunner(model, - tensor_parallel_size=4, - dtype=dtype, - max_model_len=1024, - enforce_eager=True, - enable_expert_parallel=True, - additional_config=additional_config, - distributed_executor_backend="mp") as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index f4d879a972..224bf45d85 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -157,28 +157,6 @@ def test_models_distributed_topk() -> None: vllm_model.generate(example_prompts, sampling_params) -@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"}) -def test_models_distributed_alltoallv() -> None: - example_prompts = [ - "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", - "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", - "Compare and contrast artificial intelligence with human intelligence in terms of processing information.", - ] - dtype = "half" - sampling_params = SamplingParams(max_tokens=5, - temperature=0.0, - top_k=50, - top_p=0.9) - - with VllmRunner( - "deepseek-ai/DeepSeek-V2-Lite", - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend="mp", - ) as vllm_model: - vllm_model.generate(example_prompts, sampling_params) - - def test_models_distributed_Qwen3_W8A8(): example_prompts = [ "Hello, my name is", @@ -188,49 +166,9 @@ def test_models_distributed_Qwen3_W8A8(): with VllmRunner( snapshot_download("vllm-ascend/Qwen3-8B-W8A8"), max_model_len=8192, + enforce_eager=True, dtype="auto", tensor_parallel_size=2, quantization="ascend", ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) - - -def test_models_distributed_Qwen3_W4A8DYNAMIC(): - example_prompts = [ - "Hello, my name is", - ] - max_tokens = 5 - - with VllmRunner( - snapshot_download("vllm-ascend/Qwen3-8B-W4A8"), - max_model_len=8192, - dtype="auto", - tensor_parallel_size=2, - quantization="ascend", - ) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - -@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"}) -def test_models_distributed_DeepSeek_W4A8DYNAMIC(): - prompts = [ - "Hello, my name is", - ] - max_tokens = 5 - with VllmRunner( - 
snapshot_download("vllm-ascend/DeepSeek-R1-w4a8-pruning"), - dtype="auto", - tensor_parallel_size=2, - quantization="ascend", - enforce_eager=True, - enable_expert_parallel=True, - additional_config={ - "torchair_graph_config": { - "enabled": False, - }, - "ascend_scheduler_config": { - "enabled": True, - } - }, - ) as vllm_model: - vllm_model.generate_greedy(prompts, max_tokens) diff --git a/tests/e2e/multicard/test_pyhccl_distributed.py b/tests/e2e/multicard/test_pyhccl_distributed.py index 2300e0a225..e3d9aedf15 100644 --- a/tests/e2e/multicard/test_pyhccl_distributed.py +++ b/tests/e2e/multicard/test_pyhccl_distributed.py @@ -89,7 +89,7 @@ def worker_fn(): def test_pyhccl(): - distributed_run(worker_fn, 2) + distributed_run(worker_fn, 4) def broadcast_worker_fn(): @@ -118,4 +118,4 @@ def broadcast_worker_fn(): def test_pyhccl_broadcast(): - distributed_run(broadcast_worker_fn, 2) + distributed_run(broadcast_worker_fn, 4) diff --git a/tests/e2e/multicard/test_qwen3_moe.py b/tests/e2e/multicard/test_qwen3_moe.py deleted file mode 100644 index dcac7a80bd..0000000000 --- a/tests/e2e/multicard/test_qwen3_moe.py +++ /dev/null @@ -1,74 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py -# -"""Compare the short outputs of HF and vLLM when using greedy sampling. - -Run `pytest tests/e2e/multicard/test_qwen3_moe.py`. 
-""" - -from modelscope import snapshot_download # type: ignore - -from tests.e2e.conftest import VllmRunner - - -def test_models_distributed_Qwen3_MOE_TP2(): - example_prompts = [ - "Hello, my name is", - ] - dtype = "half" - max_tokens = 5 - with VllmRunner( - "Qwen/Qwen3-30B-A3B", - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend="mp", - ) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - -def test_models_distributed_Qwen3_MOE_TP2_WITH_EP(): - example_prompts = [ - "Hello, my name is", - ] - dtype = "half" - max_tokens = 5 - with VllmRunner( - "Qwen/Qwen3-30B-A3B", - dtype=dtype, - tensor_parallel_size=2, - enable_expert_parallel=True, - distributed_executor_backend="mp", - ) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - -def test_models_distributed_Qwen3_MOE_W8A8(): - example_prompts = [ - "Hello, my name is", - ] - dtype = "auto" - max_tokens = 5 - with VllmRunner( - snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"), - max_model_len=8192, - dtype=dtype, - tensor_parallel_size=2, - quantization="ascend", - enforce_eager=False, - ) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py index 71d33f0c82..9ad336c19c 100644 --- a/tests/e2e/multicard/test_torchair_graph_mode.py +++ b/tests/e2e/multicard/test_torchair_graph_mode.py @@ -31,7 +31,6 @@ def _deepseek_torchair_test_fixture( additional_config: Dict, *, tensor_parallel_size=2, - use_v1_schduler=False, ): example_prompts = [ "Hello, my name is", @@ -39,14 +38,14 @@ def _deepseek_torchair_test_fixture( "The capital of France is", "The future of AI is", ] - kwargs = {} - if not use_v1_schduler: - kwargs = { - "ascend_scheduler_config": { - "enabled": True, - }, - "refresh": True, - } + + # torchair is only work without chunked-prefill now + kwargs = { + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + } additional_config.update(**kwargs) with VllmRunner( @@ -96,15 +95,6 @@ def test_e2e_deepseekv3_with_torchair_ms_mla(): _deepseek_torchair_test_fixture(additional_config) -def test_e2e_deepseekv3_with_torchair_v1scheduler(): - additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - } - _deepseek_torchair_test_fixture(additional_config, use_v1_schduler=True) - - def _pangu_torchair_test_fixture( additional_config: Dict, *, diff --git a/tests/e2e/singlecard/models/configs/Qwen2.5-VL-7B-Instruct.yaml b/tests/e2e/singlecard/models/configs/Qwen2.5-VL-7B-Instruct.yaml deleted file mode 100644 index eb7196a65c..0000000000 --- a/tests/e2e/singlecard/models/configs/Qwen2.5-VL-7B-Instruct.yaml +++ /dev/null @@ -1,8 +0,0 @@ -model_name: "Qwen/Qwen2.5-VL-7B-Instruct" -model: "vllm-vlm" -tasks: -- name: "mmmu_val" - metrics: - - name: "acc,none" - value: 0.51 -max_model_len: 8192 \ No newline at end of file diff --git a/tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml b/tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml deleted file mode 100644 index be1bbb0d2c..0000000000 --- a/tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml +++ /dev/null @@ -1,18 +0,0 @@ -model_name: "Qwen/Qwen3-30B-A3B" -tasks: -- name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.89 - - name: "exact_match,flexible-extract" - value: 0.85 -- name: "ceval-valid" - metrics: - - name: "acc,none" - value: 0.84 -num_fewshot: 5 -gpu_memory_utilization: 0.6 -enable_expert_parallel: True 
-tensor_parallel_size: 2 -apply_chat_template: False -fewshot_as_multiturn: False \ No newline at end of file diff --git a/tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml b/tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml deleted file mode 100644 index e60cc9a721..0000000000 --- a/tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml +++ /dev/null @@ -1,13 +0,0 @@ -model_name: "Qwen/Qwen3-8B-Base" -tasks: -- name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.82 - - name: "exact_match,flexible-extract" - value: 0.83 -- name: "ceval-valid" - metrics: - - name: "acc,none" - value: 0.82 -num_fewshot: 5 diff --git a/tests/e2e/singlecard/models/configs/accuracy.txt b/tests/e2e/singlecard/models/configs/accuracy.txt deleted file mode 100644 index e29ff1a509..0000000000 --- a/tests/e2e/singlecard/models/configs/accuracy.txt +++ /dev/null @@ -1,3 +0,0 @@ -Qwen3-8B-Base.yaml -Qwen2.5-VL-7B-Instruct.yaml -Qwen3-30B-A3B.yaml \ No newline at end of file diff --git a/tests/e2e/singlecard/models/conftest.py b/tests/e2e/singlecard/models/conftest.py deleted file mode 100644 index 2b25c1a929..0000000000 --- a/tests/e2e/singlecard/models/conftest.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from pathlib import Path - -import pytest - - -def pytest_addoption(parser): - parser.addoption( - "--config-list-file", - action="store", - default=None, - help="Path to the file listing model config YAMLs (one per line)", - ) - parser.addoption( - "--tp-size", - action="store", - default="1", - help="Tensor parallel size to use for evaluation", - ) - parser.addoption( - "--config", - action="store", - default="./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml", - help="Path to the model config YAML file", - ) - parser.addoption( - "--report_output", - action="store", - default="./benchmarks/accuracy/Qwen3-8B-Base.md", - help="Path to the report output file", - ) - - -@pytest.fixture(scope="session") -def config_list_file(pytestconfig, config_dir): - rel_path = pytestconfig.getoption("--config-list-file") - return config_dir / rel_path - - -@pytest.fixture(scope="session") -def tp_size(pytestconfig): - return pytestconfig.getoption("--tp-size") - - -@pytest.fixture(scope="session") -def config(pytestconfig): - return pytestconfig.getoption("--config") - - -@pytest.fixture(scope="session") -def report_output(pytestconfig): - return pytestconfig.getoption("--report_output") - - -def pytest_generate_tests(metafunc): - if "config_filename" in metafunc.fixturenames: - # If config specified, use the --config directly - single_config = metafunc.config.getoption("--config") - if single_config: - metafunc.parametrize("config_filename", - [Path(single_config).resolve()]) - return - # Otherwise, check --config-list-file - rel_path = metafunc.config.getoption("--config-list-file") - config_list_file = Path(rel_path).resolve() - config_dir = config_list_file.parent - with open(config_list_file, encoding="utf-8") as f: - configs = [ - config_dir / line.strip() for line in f - if line.strip() and not line.startswith("#") - ] - metafunc.parametrize("config_filename", configs) diff --git a/tests/e2e/singlecard/models/report_template.md b/tests/e2e/singlecard/models/report_template.md deleted file mode 100644 index ddaa9c7d94..0000000000 --- a/tests/e2e/singlecard/models/report_template.md +++ /dev/null @@ -1,24 +0,0 @@ -# {{ model_name }} - -**vLLM Version**: vLLM: {{ vllm_version }} ([{{ 
vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), -**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }})) -**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }} -**Hardware Environment**: Atlas A2 Series -**Datasets**: {{ datasets }} -**Parallel Mode**: TP -**Execution Mode**: ACLGraph - -**Command**: - -```bash -export MODEL_ARGS={{ model_args }} -lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \ ---apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \ ---limit {{ limit }} --batch_size {{ batch_size}} -``` - -| Task | Metric | Value | Stderr | -|-----------------------|-------------|----------:|-------:| -{% for row in rows -%} -| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} | -{% endfor %} diff --git a/tests/e2e/singlecard/models/test_lm_eval_correctness.py b/tests/e2e/singlecard/models/test_lm_eval_correctness.py deleted file mode 100644 index 3453a05712..0000000000 --- a/tests/e2e/singlecard/models/test_lm_eval_correctness.py +++ /dev/null @@ -1,148 +0,0 @@ -import os -from dataclasses import dataclass - -import lm_eval -import numpy as np -import pytest -import yaml -from jinja2 import Environment, FileSystemLoader - -RTOL = 0.03 -TEST_DIR = os.path.dirname(__file__) - - -@dataclass -class EnvConfig: - vllm_version: str - vllm_commit: str - vllm_ascend_version: str - vllm_ascend_commit: str - cann_version: str - torch_version: str - torch_npu_version: str - - -@pytest.fixture -def env_config() -> EnvConfig: - return EnvConfig(vllm_version=os.getenv('VLLM_VERSION', 'unknown'), - vllm_commit=os.getenv('VLLM_COMMIT', 'unknown'), - vllm_ascend_version=os.getenv('VLLM_ASCEND_VERSION', - 'unknown'), - vllm_ascend_commit=os.getenv('VLLM_ASCEND_COMMIT', - 'unknown'), - cann_version=os.getenv('CANN_VERSION', 'unknown'), - torch_version=os.getenv('TORCH_VERSION', 'unknown'), - torch_npu_version=os.getenv('TORCH_NPU_VERSION', - 'unknown')) - - -def build_model_args(eval_config, tp_size): - trust_remote_code = eval_config.get("trust_remote_code", False) - max_model_len = eval_config.get("max_model_len", 4096) - model_args = { - "pretrained": eval_config["model_name"], - "tensor_parallel_size": tp_size, - "dtype": "auto", - "trust_remote_code": trust_remote_code, - "max_model_len": max_model_len, - } - for s in [ - "max_images", "gpu_memory_utilization", "enable_expert_parallel", - "tensor_parallel_size" - ]: - val = eval_config.get(s, None) - if val is not None: - model_args[s] = val - - print("Model Parameters:") - print(model_args) - - return model_args - - -def generate_report(tp_size, eval_config, report_data, report_output, - env_config): - env = Environment(loader=FileSystemLoader(TEST_DIR)) - template = env.get_template("report_template.md") - model_args = build_model_args(eval_config, tp_size) - - report_content = template.render( - vllm_version=env_config.vllm_version, - vllm_commit=env_config.vllm_commit, - vllm_ascend_version=env_config.vllm_ascend_version, - vllm_ascend_commit=env_config.vllm_ascend_commit, - cann_version=env_config.cann_version, - torch_version=env_config.torch_version, - torch_npu_version=env_config.torch_npu_version, - 
model_name=eval_config["model_name"], - model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'", - model_type=eval_config.get("model", "vllm"), - datasets=",".join([task["name"] for task in eval_config["tasks"]]), - apply_chat_template=eval_config.get("apply_chat_template", True), - fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True), - limit=eval_config.get("limit", None), - batch_size="auto", - num_fewshot=eval_config.get("num_fewshot", "N/A"), - rows=report_data["rows"]) - - os.makedirs(os.path.dirname(report_output), exist_ok=True) - with open(report_output, 'w', encoding='utf-8') as f: - f.write(report_content) - - -def test_lm_eval_correctness_param(config_filename, tp_size, report_output, - env_config): - eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) - model_args = build_model_args(eval_config, tp_size) - success = True - report_data: dict[str, list[dict]] = {"rows": []} - - eval_params = { - "model": eval_config.get("model", "vllm"), - "model_args": model_args, - "tasks": [task["name"] for task in eval_config["tasks"]], - "apply_chat_template": eval_config.get("apply_chat_template", True), - "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True), - "limit": eval_config.get("limit", None), - "batch_size": "auto", - } - for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]: - val = eval_config.get(s, None) - if val is not None: - eval_params[s] = val - - print("Eval Parameters:") - print(eval_params) - - results = lm_eval.simple_evaluate(**eval_params) - - for task in eval_config["tasks"]: - task_name = task["name"] - task_result = results["results"][task_name] - for metric in task["metrics"]: - metric_name = metric["name"] - ground_truth = metric["value"] - measured_value = task_result[metric_name] - task_success = bool( - np.isclose(ground_truth, measured_value, rtol=RTOL)) - success = success and task_success - - print(f"{task_name} | {metric_name}: " - f"ground_truth={ground_truth} | measured={measured_value} | " - f"success={'✅' if task_success else '❌'}") - - report_data["rows"].append({ - "task": - task_name, - "metric": - metric_name, - "value": - f"✅{measured_value}" if success else f"❌{measured_value}", - "stderr": - task_result[ - metric_name.replace(',', '_stderr,') if metric_name == - "acc,none" else metric_name.replace(',', '_stderr,')] - }) - generate_report(tp_size, eval_config, report_data, report_output, - env_config) - assert success diff --git a/tests/e2e/singlecard/ops/test_fused_moe.py b/tests/e2e/singlecard/ops/test_fused_moe.py index d04f3a62f9..78c0d88684 100644 --- a/tests/e2e/singlecard/ops/test_fused_moe.py +++ b/tests/e2e/singlecard/ops/test_fused_moe.py @@ -19,14 +19,15 @@ Run `pytest tests/ops/test_fused_moe.py`. """ - -from unittest.mock import MagicMock, patch +# fused moe ops test will hit the infer_schema error, we need add the patch +# here to make the test pass. 
+import vllm_ascend.patch.worker.patch_common.patch_utils # type: ignore[import] # isort: skip # noqa import pytest import torch from vllm.model_executor.layers.activation import SiluAndMul -from vllm_ascend.ops.fused_moe import fused_experts, select_experts +from vllm_ascend.ops.fused_moe import fused_experts NUM_EXPERTS = [8, 64] EP_SIZE = [1, 4] @@ -97,97 +98,3 @@ def test_fused_experts( # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1) torch.npu.empty_cache() - - -@pytest.mark.parametrize("m", [1, 33, 64]) -@pytest.mark.parametrize("n", [128, 1024, 2048]) -@pytest.mark.parametrize("e", NUM_EXPERTS) -@pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"]) -@pytest.mark.parametrize("use_grouped_topk", [True, False]) -@pytest.mark.parametrize("renormalize", [True, False]) -@pytest.mark.parametrize("with_e_correction", [True, False]) -@pytest.mark.parametrize("custom_routing", [True, False]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("device", DEVICE) -def test_select_experts( - m: int, - n: int, - e: int, - topk: int, - scoring_func: str, - use_grouped_topk: bool, - renormalize: bool, - with_e_correction: bool, - custom_routing: bool, - dtype: torch.dtype, - device: str, -): - topk_group = 4 if use_grouped_topk else None - num_expert_group = e // 4 if use_grouped_topk else None - - hidden_states = torch.randn(m, n, device=device, dtype=dtype) - router_logits = torch.randn(m, e, device=device, dtype=dtype) - - e_score_correction_bias = (torch.randn(e, device=device, dtype=dtype) - if with_e_correction else None) - - custom_routing_function = None - if custom_routing: - custom_routing_function = MagicMock() - mock_weights = torch.randn(m, topk, device=device, dtype=dtype) - mock_ids = torch.randint(0, - e, (m, topk), - device=device, - dtype=torch.int32) - custom_routing_function.return_value = (mock_weights, mock_ids) - - with patch("vllm_ascend.ops.fused_moe.native_grouped_topk" - ) as mock_native_grouped_topk: - mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like( - x) - - topk_weights, topk_ids = select_experts( - hidden_states=hidden_states, - router_logits=router_logits, - top_k=topk, - use_grouped_topk=use_grouped_topk, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias, - ) - - if use_grouped_topk: - mock_native_grouped_topk.assert_called_once() - else: - mock_native_grouped_topk.assert_not_called() - - assert topk_weights.shape == (m, topk) - assert topk_ids.shape == (m, topk) - assert topk_ids.dtype == torch.int32 - - -@pytest.mark.parametrize("device", DEVICE) -def test_select_experts_invalid_scoring_func(device: str): - with pytest.raises(ValueError, - match="Unsupported scoring function: invalid"): - select_experts(hidden_states=torch.randn(1, 128, device=device), - router_logits=torch.randn(1, 8, device=device), - top_k=2, - use_grouped_topk=False, - renormalize=False, - scoring_func="invalid") - - -@pytest.mark.parametrize("device", DEVICE) -def test_select_experts_missing_group_params(device: str): - with pytest.raises(AssertionError): - select_experts(hidden_states=torch.randn(1, 128, device=device), - router_logits=torch.randn(1, 64, device=device), - top_k=2, - 
use_grouped_topk=True, - renormalize=False, - scoring_func="softmax") diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py index 5b150e76f2..2a0374469d 100644 --- a/tests/e2e/singlecard/test_aclgraph.py +++ b/tests/e2e/singlecard/test_aclgraph.py @@ -84,11 +84,3 @@ def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None: max_model_len=1024, enforce_eager=False) assert "ACL Graph does not support deepseek" in str(excinfo.value) - - -@pytest.mark.parametrize("model", MODELS) -def test_ray_backend_sets_no_compilation(model: str) -> None: - runner = VllmRunner(model, - enforce_eager=False, - distributed_executor_backend="ray") - assert runner.model.llm_engine.vllm_config.compilation_config.level == 0 diff --git a/tests/e2e/singlecard/test_offline_inference_310p.py b/tests/e2e/singlecard/test_offline_inference_310p.py deleted file mode 100644 index d507f69f85..0000000000 --- a/tests/e2e/singlecard/test_offline_inference_310p.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -import pytest -import vllm # noqa: F401 -from vllm import SamplingParams - -import vllm_ascend # noqa: F401 -from tests.e2e.conftest import VllmRunner - -MODELS = ["Qwen/Qwen3-0.6B-Base", "Qwen/Qwen2.5-7B-Instruct"] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float16"]) -@pytest.mark.parametrize("max_tokens", [5]) -def test_models(model: str, dtype: str, max_tokens: int) -> None: - example_prompts = [ - "Hello, my name is", - "The future of AI is", - ] - - with VllmRunner(model, - tensor_parallel_size=1, - dtype=dtype, - max_model_len=2048, - enforce_eager=True, - compilation_config={ - "custom_ops": - ["none", "+rms_norm", "+rotary_embedding"] - }) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - -VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float16"]) -def test_vl_model_with_samples(model: str, dtype: str) -> None: - example_prompts = [ - "Hello, my name is", - "The future of AI is", - ] - - with VllmRunner(model, - tensor_parallel_size=1, - dtype=dtype, - max_model_len=2048, - enforce_eager=True, - compilation_config={ - "custom_ops": - ["none", "+rms_norm", "+rotary_embedding"] - }) as vllm_model: - sampling_params = SamplingParams(max_tokens=100, - top_p=0.95, - top_k=50, - temperature=0.6) - vllm_model.generate(example_prompts, sampling_params) diff --git a/tests/ut/device_allocator/test_camem.py b/tests/ut/device_allocator/test_camem.py deleted file mode 100644 index ec500e73fb..0000000000 --- a/tests/ut/device_allocator/test_camem.py +++ /dev/null @@ -1,188 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -from unittest.mock import MagicMock, patch - -import pytest -import torch - -from tests.ut.base import PytestBase -from vllm_ascend.device_allocator.camem import (AllocationData, CaMemAllocator, - create_and_map, - find_loaded_library, - get_pluggable_allocator, - unmap_and_release) - - -def dummy_malloc(args): - pass - - -def dummy_free(ptr): - return (0, 0, 0, 0) - - -class TestCaMem(PytestBase): - - def test_find_loaded_library_success_and_not_found(self): - path = find_loaded_library("libc") - assert path is not None, "Expected to find libc library" - assert path.endswith(".so.6") or ".so" in path - assert "libc" in path - - path = find_loaded_library("non_existent_library") - assert path is None, "Expected to not find non-existent library" - - @pytest.mark.parametrize("handle", [ - (1, 2, 3), - ("device", 99), - (None, ), - ]) - def test_create_and_map_calls_python_create_and_map(self, handle): - with patch("vllm_ascend.device_allocator.camem.python_create_and_map" - ) as mock_create: - create_and_map(handle) - mock_create.assert_called_once_with(*handle) - - @pytest.mark.parametrize("handle", [ - (42, "bar"), - ("foo", ), - ]) - def test_unmap_and_release_calls_python_unmap_and_release(self, handle): - with patch( - "vllm_ascend.device_allocator.camem.python_unmap_and_release" - ) as mock_release: - unmap_and_release(handle) - mock_release.assert_called_once_with(*handle) - - @patch("vllm_ascend.device_allocator.camem.init_module") - @patch( - "vllm_ascend.device_allocator.camem.torch.npu.memory.NPUPluggableAllocator" - ) - def test_get_pluggable_allocator(self, mock_allocator_class, - mock_init_module): - mock_allocator_instance = MagicMock() - mock_allocator_class.return_value = mock_allocator_instance - - def side_effect_malloc_and_free(malloc_fn, free_fn): - malloc_fn((1, 2, 3)) - free_fn(123) - - mock_init_module.side_effect = side_effect_malloc_and_free - - allocator = get_pluggable_allocator(dummy_malloc, dummy_free) - mock_init_module.assert_called_once_with(dummy_malloc, dummy_free) - assert allocator == mock_allocator_instance - - def test_singleton_behavior(self): - instance1 = CaMemAllocator.get_instance() - instance2 = CaMemAllocator.get_instance() - assert instance1 is instance2 - - def test_python_malloc_and_free_callback(self): - allocator = CaMemAllocator.get_instance() - - # mock allocation_handle - handle = (1, 100, 1234, 0) - allocator.current_tag = "test_tag" - - allocator.python_malloc_callback(handle) - # check pointer_to_data store data - ptr = handle[2] - assert ptr in allocator.pointer_to_data - data = allocator.pointer_to_data[ptr] - assert data.handle == handle - assert data.tag == "test_tag" - - # check free callback with cpu_backup_tensor - data.cpu_backup_tensor = torch.zeros(1) - result_handle = allocator.python_free_callback(ptr) - assert result_handle == handle - assert ptr not in allocator.pointer_to_data - assert data.cpu_backup_tensor is None - - @patch("vllm_ascend.device_allocator.camem.unmap_and_release") - @patch("vllm_ascend.device_allocator.camem.memcpy") - def 
test_sleep_offload_and_discard(self, mock_memcpy, mock_unmap): - allocator = CaMemAllocator.get_instance() - - # prepare allocation, one tag match,one not match - handle1 = (1, 10, 1000, 0) - data1 = AllocationData(handle1, "tag1") - handle2 = (2, 20, 2000, 0) - data2 = AllocationData(handle2, "tag2") - allocator.pointer_to_data = { - 1000: data1, - 2000: data2, - } - - # mock is_pin_memory_available, return False as some machine only has cpu - with patch( - "vllm_ascend.device_allocator.camem.NPUPlatform.is_pin_memory_available", - return_value=False): - allocator.sleep(offload_tags="tag1") - - # only offload tag1, other tag2 call unmap_and_release - assert data1.cpu_backup_tensor is not None - assert data2.cpu_backup_tensor is None - mock_unmap.assert_any_call(handle1) - mock_unmap.assert_any_call(handle2) - assert mock_unmap.call_count == 2 - assert mock_memcpy.called - - @patch("vllm_ascend.device_allocator.camem.create_and_map") - @patch("vllm_ascend.device_allocator.camem.memcpy") - def test_wake_up_loads_and_clears_cpu_backup(self, mock_memcpy, - mock_create_and_map): - allocator = CaMemAllocator.get_instance() - - handle = (1, 10, 1000, 0) - tensor = torch.zeros(5, dtype=torch.uint8) - data = AllocationData(handle, "tag1", cpu_backup_tensor=tensor) - allocator.pointer_to_data = {1000: data} - - allocator.wake_up(tags=["tag1"]) - - mock_create_and_map.assert_called_once_with(handle) - assert data.cpu_backup_tensor is None - assert mock_memcpy.called - - def test_use_memory_pool_context_manager(self): - allocator = CaMemAllocator.get_instance() - old_tag = allocator.current_tag - - # mock use_memory_pool_with_allocator - mock_ctx = MagicMock() - mock_ctx.__enter__.return_value = "data" - mock_ctx.__exit__.return_value = None - - with patch( - "vllm_ascend.device_allocator.camem.use_memory_pool_with_allocator", - return_value=mock_ctx): - with allocator.use_memory_pool(tag="my_tag"): - assert allocator.current_tag == "my_tag" - # restore old tag after context manager exits - assert allocator.current_tag == old_tag - - def test_get_current_usage(self): - allocator = CaMemAllocator.get_instance() - - allocator.pointer_to_data = { - 1: AllocationData((0, 100, 1, 0), "tag"), - 2: AllocationData((0, 200, 2, 0), "tag"), - } - - usage = allocator.get_current_usage() - assert usage == 300 diff --git a/tests/ut/distributed/test_distributed_tensor_parallel.py b/tests/ut/distributed/test_distributed_tensor_parallel.py deleted file mode 100644 index 48a88fa1f6..0000000000 --- a/tests/ut/distributed/test_distributed_tensor_parallel.py +++ /dev/null @@ -1,139 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
- -import importlib - -import pytest -import torch -from pytest_mock import MockerFixture - -from tests.ut.base import PytestBase -from vllm_ascend.distributed.tensor_parallel import ( - _gather_along_first_dim, _gather_along_last_dim, - _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, - all_to_all_hp2sp, all_to_all_sp2hp) - - -class TestDistributedCommunication(PytestBase): - - @pytest.fixture(autouse=True) - def context(self, mocker: MockerFixture): - mocker.patch("torch.npu.current_device", return_value="cpu") - mocker.patch("torch.distributed.get_world_size", return_value=4) - - mocker.patch("torch.distributed.get_rank", return_value=0) - - @pytest.mark.parametrize("world_size, test_tensor, expected", - [(1, torch.randn(8, 16), (8, 16)), - (4, torch.randn(8, 16), (32, 16))]) - def test_gather_along_first_dim(self, test_tensor, expected, world_size, - mocker: MockerFixture): - """test _gather_along_first_dim""" - mocker.patch("torch.distributed.get_world_size", - return_value=world_size) - - result = _gather_along_first_dim(test_tensor, mocker.MagicMock()) - - assert result.shape == expected - - @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [ - (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)), - ]) - def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, - output_split_sizes, - mocker: MockerFixture): - """test _gather_along_first_dim""" - - result = _gather_along_first_dim(test_tensor, mocker.MagicMock(), - output_split_sizes) - - assert result.shape == expected - - @pytest.mark.parametrize("world_size, test_tensor, expected", - [(1, torch.randn(8, 16, 32), (8, 16, 32)), - (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))]) - def test_gather_along_last_dim(self, test_tensor, expected, world_size, - mocker: MockerFixture): - """test _gather_along_last_dim""" - mocker.patch("torch.distributed.get_world_size", - return_value=world_size) - - result = _gather_along_last_dim(test_tensor, mocker.MagicMock()) - - assert result.shape == expected - - @pytest.mark.parametrize("input_shape,expected_shape", [ - ((32, 16), (8, 16)), - ((40, 10), (10, 10)), - ]) - def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape, - mocker: MockerFixture): - input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_first_dim(input_tensor, - mocker.MagicMock()) - assert result.shape == expected_shape - - @pytest.mark.parametrize("input_shape,expected_shape", [ - ((8, 16, 32), (8, 16, 8)), - ]) - def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape, - mocker: MockerFixture): - input_tensor = torch.randn(*input_shape) - result = _reduce_scatter_along_last_dim(input_tensor, - mocker.MagicMock()) - assert result.shape == expected_shape - - @pytest.mark.parametrize("func,input_shape,expected_shape", [ - ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), - (8, 16, 128)), - ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), - ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), - (8, 16, 8)), - ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), - ]) - def test_wrapper_functions(self, func, input_shape, expected_shape, - mocker: MockerFixture): - """test wrapper funcs""" - mod = importlib.import_module( - 'vllm_ascend.distributed.tensor_parallel') - globals = mod.__dict__ - test_func = globals[func] - input_tensor = torch.randn(*input_shape) - result = test_func(input_tensor, mocker.MagicMock()) - assert result.shape == expected_shape - - 
@pytest.mark.parametrize( - "input_shape,output_shape", - [ - ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] - ]) - def test_all_to_all_sp2hp(self, input_shape, output_shape, - mocker: MockerFixture): - input_tensor = torch.randn(*input_shape) - result = all_to_all_sp2hp(input_tensor, mocker.MagicMock()) - assert result.shape == output_shape - - @pytest.mark.parametrize( - "input_shape,output_shape", - [ - ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] - ]) - def test_all_to_all_hp2sp(self, input_shape, output_shape, - mocker: MockerFixture): - input_tensor = torch.randn(*input_shape) - result = all_to_all_hp2sp(input_tensor, mocker.MagicMock()) - assert result.shape == output_shape diff --git a/tests/ut/kv_connector/test_llmdatadist_connector.py b/tests/ut/kv_connector/test_llmdatadist_connector.py index b70482f9e1..94650f43e9 100644 --- a/tests/ut/kv_connector/test_llmdatadist_connector.py +++ b/tests/ut/kv_connector/test_llmdatadist_connector.py @@ -2,13 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -import os -import types - from tests.ut.kv_connector.utils import (create_request, create_scheduler, create_vllm_config) -from vllm_ascend.distributed.llmdatadist_c_mgr_connector import ( - LLMDataDistCMgrConnectorMetadata, LLMDataDistCMgrConnectorWorker, LLMRole) +from vllm_ascend.distributed.llmdatadist_c_mgr_connector import \ + LLMDataDistCMgrConnectorMetadata def test_basic_inferface(): @@ -43,54 +40,3 @@ def test_basic_inferface(): req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator. single_type_managers[0].req_to_blocks[request_id]): assert block_id == block.block_id - - -def test_read_agent_metadata(): - rank_table = { - "version": - "1.2", - "server_count": - "2", - "prefill_device_list": [{ - "server_id": "192.168.1.1", - "device_id": "0", - "device_ip": "10.30.0.1", - "cluster_id": "0", - }, { - "server_id": "192.168.1.1", - "device_id": "1", - "device_ip": "10.30.0.2", - "cluster_id": "1", - }, { - "server_id": "192.168.1.2", - "device_id": "0", - "device_ip": "10.30.0.3", - "cluster_id": "2", - }, { - "server_id": "192.168.1.2", - "device_id": "1", - "device_ip": "10.30.0.4", - "cluster_id": "3", - }] - } - - def get_device_ip(worker_local_ip, worker_tp_rank, worker_visible_devices): - old_visible_devices = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "") - worker = types.SimpleNamespace() - worker.local_ip = worker_local_ip - worker.tp_rank = worker_tp_rank - worker.llm_datadist_role = LLMRole.PROMPT - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = worker_visible_devices - agent_metadata = LLMDataDistCMgrConnectorWorker.read_agent_metadata( - worker, rank_table) - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = old_visible_devices - return agent_metadata.device_ip - - assert get_device_ip("192.168.1.1", 0, "0") == "10.30.0.1" - assert get_device_ip("192.168.1.1", 0, "1") == "10.30.0.2" - assert get_device_ip("192.168.1.2", 0, "0") == "10.30.0.3" - assert get_device_ip("192.168.1.2", 0, "1") == "10.30.0.4" - assert get_device_ip("192.168.1.1", 0, "0,1") == "10.30.0.1" - assert get_device_ip("192.168.1.1", 1, "0,1") == "10.30.0.2" - assert get_device_ip("192.168.1.1", 0, "") == "10.30.0.1" - assert get_device_ip("192.168.1.1", 1, "") == "10.30.0.2" diff --git a/tests/ut/kv_connector/test_remote_decode_lifecycle.py b/tests/ut/kv_connector/test_remote_decode_lifecycle.py index 0a337437d0..2f241f1c32 100644 --- 
a/tests/ut/kv_connector/test_remote_decode_lifecycle.py +++ b/tests/ut/kv_connector/test_remote_decode_lifecycle.py @@ -25,7 +25,6 @@ create_model_runner_output, create_request, create_scheduler, create_vllm_config) -from vllm_ascend.utils import vllm_version_is def test_basic_lifecycle(): @@ -103,13 +102,7 @@ def test_basic_lifecycle(): # (3b): execute_model() model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - if vllm_version_is("0.10.0"): - model_runner_output.finished_sending = [request_id] - else: - from vllm.v1.worker.kv_connector_model_runner_mixin import \ - KVConnectorOutput # type: ignore # noqa - model_runner_output.kv_connector_output = KVConnectorOutput( - finished_sending=[request_id]) + model_runner_output.finished_sending = [request_id] # (3c): update_from_output() scheduler.update_from_output(scheduler_output, model_runner_output) @@ -164,13 +157,7 @@ def test_prefix_cache_lifecycle(): scheduler_output = scheduler.schedule() scheduler.schedule() model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - if vllm_version_is("0.10.0"): - model_runner_output.finished_sending = [request_remote.request_id] - else: - from vllm.v1.worker.kv_connector_model_runner_mixin import \ - KVConnectorOutput # noqa - model_runner_output.kv_connector_output = KVConnectorOutput( - finished_sending=[request_remote.request_id]) + model_runner_output.finished_sending = [request_remote.request_id] scheduler.update_from_output(scheduler_output, model_runner_output) _ = scheduler.schedule() assert_scheduler_empty(scheduler) diff --git a/tests/ut/kv_connector/test_remote_prefill_lifecycle.py b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py index cb070ad74d..516d6c6fcf 100644 --- a/tests/ut/kv_connector/test_remote_prefill_lifecycle.py +++ b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py @@ -19,7 +19,7 @@ import copy from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT -from vllm.v1.request import RequestStatus +from vllm.v1.request import FinishReason, RequestStatus from tests.ut.kv_connector.utils import (assert_scheduler_empty, create_model_runner_output, @@ -55,7 +55,10 @@ def test_basic_lifecycle(): # Nothing running and empty scheduler output. assert len(scheduler.running) == 0 assert len(scheduler_output.scheduled_new_reqs) == 0 - assert scheduler_output.scheduled_cached_reqs.num_reqs == 0 + if vllm_version_is("0.9.1"): + assert len(scheduler_output.scheduled_cached_reqs) == 0 + else: + assert scheduler_output.scheduled_cached_reqs.num_reqs == 0 assert len(scheduler_output.num_scheduled_tokens) == 0 assert scheduler_output.total_num_scheduled_tokens == 0 @@ -91,13 +94,7 @@ def test_basic_lifecycle(): # (2b): forward(): request finishes recv. 
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - if vllm_version_is("0.10.0"): - model_runner_output.finished_recving = [request_id] - else: - from vllm.v1.worker.kv_connector_model_runner_mixin import \ - KVConnectorOutput # type: ignore # noqa - model_runner_output.kv_connector_output = KVConnectorOutput( - finished_recving=[request_id]) + model_runner_output.finished_recving = [request_id] # (2c): update_from_output(): engine_core_outputs = scheduler.update_from_output(scheduler_output, @@ -138,6 +135,11 @@ def test_basic_lifecycle(): model_runner_output) scheduler.schedule() + if vllm_version_is("0.9.1"): + outputs = engine_core_outputs[0].outputs + assert len(outputs) == 1 + output = outputs[0] + assert output.finish_reason == FinishReason.STOP assert_scheduler_empty(scheduler) @@ -211,13 +213,7 @@ def test_full_block_prompt(): # # STEP (2): Recv. scheduler_output = scheduler.schedule() model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - if vllm_version_is("0.10.0"): - model_runner_output.finished_recving = [request_id] - else: - from vllm.v1.worker.kv_connector_model_runner_mixin import \ - KVConnectorOutput # type: ignore # noqa - model_runner_output.kv_connector_output = KVConnectorOutput( - finished_recving=[request_id]) + model_runner_output.finished_recving = [request_id] scheduler.update_from_output(scheduler_output, model_runner_output) assert len(scheduler.waiting) == 1 assert (request_id in scheduler.finished_recving_kv_req_ids) @@ -240,6 +236,13 @@ def test_full_block_prompt(): # # Step (4): Hit EOS. scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output([request], use_eos=True) + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) scheduler.schedule() + if vllm_version_is("0.9.1"): + outputs = engine_core_outputs[0].outputs + assert len(outputs) == 1 + output = outputs[0] + assert output.finish_reason == FinishReason.STOP assert_scheduler_empty(scheduler) diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py index 2c540b30f0..450d62e036 100644 --- a/tests/ut/kv_connector/utils.py +++ b/tests/ut/kv_connector/utils.py @@ -186,20 +186,6 @@ def create_model_runner_output( sampled_token_ids = [[sampled_token] for _ in req_ids] # Make output data structure. 
- extra_args = {} - if not vllm_version_is("0.10.0"): - from vllm.v1.worker.kv_connector_model_runner_mixin import \ - KVConnectorOutput # type: ignore # noqa - kv_connector_output = KVConnectorOutput( - finished_sending=finished_sending, - finished_recving=finished_recving) - extra_args = {"kv_connector_output": kv_connector_output} - else: - extra_args = { - "finished_sending": finished_sending, - "finished_recving": finished_recving, - } - return ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_id_to_index, @@ -207,6 +193,9 @@ def create_model_runner_output( spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - pooler_output=[], - **extra_args, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {}), + finished_sending=finished_sending, + finished_recving=finished_recving, ) diff --git a/tests/ut/models/test_deepseek_mtp.py b/tests/ut/models/test_deepseek_mtp.py deleted file mode 100644 index 6704fc647f..0000000000 --- a/tests/ut/models/test_deepseek_mtp.py +++ /dev/null @@ -1,175 +0,0 @@ -import pytest -import torch -from pytest_mock import MockerFixture -from transformers import PretrainedConfig -from vllm.config import CacheConfig, ModelConfig, VllmConfig - -from tests.ut.base import PytestBase -from vllm_ascend.models.deepseek_mtp import ( - CustomDeepSeekMTP, CustomDeepSeekMultiTokenPredictor, - CustomDeepSeekMultiTokenPredictorLayer) - - -class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase): - - @pytest.fixture - def setup_mtp_layer(self, mocker: MockerFixture): - config = PretrainedConfig(vocab_size=1000, - hidden_size=768, - rms_norm_eps=1e-5) - mocker.patch( - "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__", - return_value=None) - mocker.patch("vllm.model_executor.layers.layernorm.RMSNorm.__init__", - return_value=None) - mocker.patch( - "vllm.model_executor.models.deepseek_mtp.SharedHead.__init__", - return_value=None) - mocker.patch( - "vllm_ascend.models.deepseek_mtp.CustomDeepSeekShareHead.__init__", - return_value=None) - mocker_deepseek_v2_decode_layer = mocker.patch( - "vllm_ascend.models.deepseek_v2.CustomDeepseekV2DecoderLayer.__init__", - return_value=None) - - mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "", None) - mocker_deepseek_v2_decode_layer.assert_called_once() - return mtp_layer - - def test_init(self, mocker: MockerFixture, setup_mtp_layer): - mtp_layer = setup_mtp_layer - assert isinstance(mtp_layer, CustomDeepSeekMultiTokenPredictorLayer) - - def test_forward(self, mocker: MockerFixture, setup_mtp_layer): - mtp_layer = setup_mtp_layer - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker.patch.object(mtp_layer, - 'eh_proj', - return_value=torch.randn(2, 3, 768)) - mocker.patch("torch.cat", return_value=torch.randn(2, 3, 768)) - mtp_layer.mtp_block.return_value = (torch.randn(2, 3, 768), - torch.randn(2, 3, 768)) - - input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]) - positions = torch.tensor([[0, 1, 2], [0, 1, 2]]) - kv_cache = torch.randn(2, 3, 768) - previous_hidden_states = torch.randn(2, 3, 768) - inputs_embeds = torch.tensor([[1.0, 2.0, 3.0]]) - - output = mtp_layer(input_ids, positions, kv_cache, None, - previous_hidden_states, inputs_embeds, 0) - assert output.shape == (2, 3, 768) - - -class TestCustomDeepSeekMultiTokenPredictor(PytestBase): - - @pytest.fixture - def setup_predictor(self, mocker: MockerFixture): - mock_vllm_config = mocker.MagicMock(spec=VllmConfig) 
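# --- Illustrative sketch (not part of this patch): the change to
# create_model_runner_output above keeps one call site compatible with both
# vLLM 0.9.1 and newer releases by expanding an empty dict when a keyword
# argument does not exist yet. The idiom in isolation, with hypothetical
# names, behaves like this:
def make_output(is_v091: bool, **common):
    extra = {} if is_v091 else {"pooler_output": []}
    return dict(**common, **extra)


# vLLM 0.9.1 has no pooler_output field; newer releases expect an empty list.
assert "pooler_output" not in make_output(True, req_ids=[])
assert make_output(False, req_ids=[])["pooler_output"] == []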
- mock_model_config = mocker.MagicMock(spec=ModelConfig) - mock_hf_config = mocker.MagicMock() - mock_hf_config.num_hidden_layers = 12 - mock_hf_config.num_nextn_predict_layers = 3 - mock_hf_config.vocab_size = 30000 - mock_model_config.hf_config = mock_hf_config - mock_vllm_config.model_config = mock_model_config - mock_vllm_config.cache_config = CacheConfig() - mock_vllm_config.quant_config = mocker.MagicMock() - mocker.patch( - "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__init__", - return_value=None) - - predictor = CustomDeepSeekMultiTokenPredictor( - vllm_config=mock_vllm_config) - return predictor - - def test_init(self, mocker: MockerFixture, setup_predictor): - predictor = setup_predictor - assert predictor.num_mtp_layers == 3 - assert isinstance(predictor, CustomDeepSeekMultiTokenPredictor) - - @pytest.mark.parametrize('kv_caches, inputs_embeds', [ - (torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]])), - (None, None), - ]) - def test_forward(self, mocker: MockerFixture, setup_predictor, kv_caches, - inputs_embeds): - predictor = setup_predictor - mock_layer = mocker.MagicMock() - mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0]) - predictor.layers_list = [mock_layer] - - # todo: need or not? - # predictor.num_mtp_layers = 1 - input_ids = torch.tensor([[1, 2, 3]]) - positions = torch.tensor([[0, 1, 2]]) - mocker.patch( - "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__", - return_value=torch.tensor([[1.0, 2.0, 3.0]])) - output = predictor.forward(input_ids, positions, kv_caches, None, None, - inputs_embeds, 0) - mock_layer.assert_called_once() - assert torch.allclose(output, torch.tensor([1.0, 2.0, 3.0])) - - def test_compute_logits(self, mocker: MockerFixture, setup_predictor): - hidden_states = torch.tensor([[1, 2, 3], [4, 5, 6]]) - predictor = setup_predictor - - mock_layer = mocker.MagicMock() - mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0]) - predictor.layers_list = [mock_layer] - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker.patch( - "vllm.model_executor.layers.logits_processor.LogitsProcessor.__init__", - return_value=None) - predictor.logits_processor.return_value = torch.tensor([1.0, 2.0, 3.0]) - - result_logits = predictor.compute_logits(hidden_states=hidden_states, - sampling_metadata=None) - predictor.logits_processor.assert_called_once() - assert torch.allclose(result_logits, torch.tensor([1.0, 2.0, 3.0])) - - -class TestCustomDeepSeekMTP(PytestBase): - - @pytest.fixture - def setup_mtp(self, mocker: MockerFixture): - vllm_config = mocker.MagicMock() - vllm_config.model_config.hf_config.num_hidden_layers = 12 - vllm_config.model_config.hf_config.num_nextn_predict_layers = 3 - vllm_config.cache_config = mocker.MagicMock() - vllm_config.quant_config = mocker.MagicMock() - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker.patch( - "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__", - return_value=None) - mocker.patch("vllm.model_executor.layers.sampler.get_sampler", - return_value=None) - - mtp = CustomDeepSeekMTP(vllm_config=vllm_config) - return mtp - - def test_init(self, mocker: MockerFixture, setup_mtp): - mtp = setup_mtp - assert isinstance(mtp, CustomDeepSeekMTP) - - def test_forward(self, mocker: MockerFixture, setup_mtp): - input_ids = 
torch.tensor([[1, 2, 3]]) - positions = torch.tensor([[0, 1, 2]]) - kv_caches = [torch.tensor([[0.1, 0.2, 0.3]])] - previous_hidden_states = torch.tensor([[0.1, 0.2, 0.3]]) - inputs_embeds = torch.tensor([[0.1, 0.2, 0.3]]) - spec_step_idx = 0 - setup_mtp.model.return_value = torch.tensor([[1.0, 2.0, 3.0]]) - - output = setup_mtp.forward(input_ids, positions, kv_caches, None, - previous_hidden_states, inputs_embeds, - spec_step_idx) - assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]])) diff --git a/tests/ut/models/test_qwen2_5_vl.py b/tests/ut/models/test_qwen2_5_vl.py deleted file mode 100644 index 15367ebe7b..0000000000 --- a/tests/ut/models/test_qwen2_5_vl.py +++ /dev/null @@ -1,424 +0,0 @@ -import pytest -import torch -import torch.nn.functional as F -from pytest_mock import MockerFixture - -from tests.ut.base import PytestBase -from vllm_ascend.models.qwen2_5_vl import ( - AscendQwen2_5_VisionAttention, AscendQwen2_5_VisionBlock, - AscendQwen2_5_VisionPatchEmbed, AscendQwen2_5_VisionRotaryEmbedding, - AscendQwen2_5_VisionTransformer, AscendQwen2_5_VLForConditionalGeneration) - - -class TestAscendQwen2_5_VisionAttention(PytestBase): - - def init_attention( - self, - mocker, - embed_dim=1000, - num_heads=10, - projection_size=100, - quant_config=None, - prefix="", - ): - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionAttention.__init__") - - attention = AscendQwen2_5_VisionAttention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - args, kwargs = mocker_attn.call_args - assert args == (embed_dim, num_heads, projection_size, None, "") - assert not kwargs - attention.num_attention_heads_per_partition = num_heads - return attention - - def test_attn_init_should_normal(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 10 - projection_size = 100 - quant_config = None - prefix = "" - vit = self.init_attention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - mocker=mocker, - ) - assert vit.embed_dim == 1000 - assert vit.hidden_size_per_attention_head == 10 - - def test_attn_init_should_raise_error(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 7 - projection_size = 100 - quant_config = None - prefix = "" - with pytest.raises(AssertionError): - # projection_size should divided by num heads - self.init_attention( - mocker=mocker, - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - - def test_split_qkv(self, mocker: MockerFixture): - attention = self.init_attention(mocker=mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - q, k, v = attention.split_qkv(torch.rand((100, 10, 300))) - assert q.shape == (100, 10, 10, 10) - assert k.shape == (100, 10, 10, 10) - assert v.shape == (100, 10, 10, 10) - - def test_attn_forward(self, mocker: MockerFixture): - attention = self.init_attention(mocker=mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - - qkv = lambda x: (x, 0) # noqa - split_qkv = lambda x: [ #noqa - torch.rand((100, 3, 
10, 128)) for i in range(3) - ] # noqa - npu_rotary_mul = lambda q, cos, sin: q # noqa - _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa - proj = lambda x: (x, 0) # noqa - - mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv) - mocker_split_qkv = mocker.patch.object( - attention, - "split_qkv", - side_effect=split_qkv, - ) - mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul", - side_effect=npu_rotary_mul) - mocker_npu_flash_attention_unpad = mocker.patch( - "torch_npu._npu_flash_attention_unpad", - side_effect=_npu_flash_attention_unpad, - ) - mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) - attention.__dict__["qkv"] = mocker_qkv - attention.__dict__["split_qkv"] = mocker_split_qkv - attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul - attention.__dict__["_npu_flash_attention_unpad"] = ( - mocker_npu_flash_attention_unpad) - attention.__dict__["proj"] = mocker_proj - - output = attention.forward( - x=x, - cu_seqlens=cu_seqlens, - cos=cos, - sin=sin, - ) - qkv_args, qkv_kwargs = mocker_qkv.call_args - assert qkv_args == (x, ) - assert not qkv_kwargs - - split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args - assert split_qkv_args == (x, ) - assert not split_qkv_kwargs - - npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args - assert npu_rotary_mul_args[1:] == (cos, sin) - assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) - assert not npu_rotary_mul_kwargs - - assert output.shape == torch.Size([100, 3, 1280]) - - -class TestAscendQwen2_5_VisionBlock(PytestBase): - - def init_vision_block( - self, - mocker, - dim=100, - num_heads=10, - mlp_hidden_dim=100, - ): - mocker_vit = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__", - return_value=None, - ) - - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionAttention.__init__", - return_value=None, - ) - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_block = AscendQwen2_5_VisionBlock( - dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - ) - args, kwargs = mocker_vit.call_args - assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "") - assert not kwargs - - args1, kwargs1 = mocker_attn.call_args - assert not args1 - assert kwargs1 == { - "embed_dim": dim, - "num_heads": num_heads, - "projection_size": dim, - "quant_config": None, - "prefix": ".attn", - } - return vision_block - - def test_init_vision_block_should_normal( - self, - mocker: MockerFixture, - ): - vision_block = self.init_vision_block(mocker) - assert isinstance(vision_block, AscendQwen2_5_VisionBlock) - - def test_vision_block_forward(self, mocker: MockerFixture): - x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - vision_block = self.init_vision_block(mocker) - mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) - mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) - vision_block.__dict__["attn"] = mocker_attn - vision_block.__dict__["mlp"] = mocker_mlp - - output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) - - _, attn_kwargs = mocker_attn.call_args - assert attn_kwargs == { - "cu_seqlens": cu_seqlens, - "cos": cos, - "sin": sin, - } - - assert torch.all(x * 3 == output) - - -class 
TestAscendQwen2_5_VisionPatchEmbed(PytestBase): - - def test_forward(self): - patch_embed = AscendQwen2_5_VisionPatchEmbed() - - ret = patch_embed(torch.rand((120, 1176))) - assert ret.shape == (120, 1152) - - -class TestAscendQwen2_5_VisionRotaryEmbedding(PytestBase): - - def init_rotary_embedding( - self, - mocker, - dim=128, - ): - mocker_ebed = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding.__init__", - return_value=None, - ) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - rotary_embedding = AscendQwen2_5_VisionRotaryEmbedding(dim=dim, ) - args, kwargs = mocker_ebed.call_args - assert args == (dim, 10000.0) - assert not kwargs - return rotary_embedding - - def test_init_rotary_embedding_should_normal(self, mocker: MockerFixture): - rotary_embedding = self.init_rotary_embedding(mocker) - assert isinstance(rotary_embedding, - AscendQwen2_5_VisionRotaryEmbedding) - - -class TestAscendQwen2_5_VisionTransformer(PytestBase): - - input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]]) - - def init_vision_transformer( - self, - mocker, - ): - norm_eps = 1e-6 - vision_config = mocker.MagicMock() - vision_config.patch_size = 16 - vision_config.temporal_patch_size = 2 - vision_config.in_channels = 3 - vision_config.hidden_act = "gelu" - vision_config.depth = 0 - vision_config.num_heads = 10 - vision_config.hidden_size = 300 - - mocker.patch( - "vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_rank", - return_value=0, - ) - mocker.patch("vllm.distributed.utils.divide", return_value=100) - mocker.patch( - "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", - return_value=2, - ) - mocker.patch( - "vllm.model_executor.layers.linear.divide", - return_value=2, - ) - mocker.patch( - "vllm.model_executor.layers.linear.get_tensor_model_parallel_rank", - return_value=0) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_world_size", - return_value=2, - ) - - vision_transformer = AscendQwen2_5_VisionTransformer( - vision_config, - norm_eps, - ) - - assert not vision_transformer.interleaved - return vision_transformer - - def test_init_vision_transformer(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - assert isinstance(vision_transformer, AscendQwen2_5_VisionTransformer) - - @pytest.mark.parametrize( - "interleaved, expected", - [ - ( - False, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - ]), - ), - ( - True, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 1].cos(), - ]), - ), - ], - ) - def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_transformer.__dict__["interleaved"] = interleaved - vision_transformer.__dict__["hidden_size_per_attention_head"] = 2 - vision_transformer.hidden_size_per_attention_head = 4 - cos_new, _ = vision_transformer.cal_cos_sin(self.input_data) - assert cos_new.shape == (1, 
32, 1, 2) - - def test_forward(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - x = torch.randn(1, 3, 224, 224) - grid_thw = torch.tensor([[1, 4, 4]]) - mocker_patch_embed = mocker.patch.object( - vision_transformer, - "patch_embed", - side_effect=lambda _: torch.randn(16, 512), # noqa - ) - mocker_rot_pos_emb = mocker.patch.object( - vision_transformer, - "rot_pos_emb", - side_effect=lambda _: torch.randn(16, 64), # noqa - ) - mocker_get_window_index = mocker.patch.object( - vision_transformer, - "get_window_index", - side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa - ) - mocker_cal_cos_sin = mocker.patch.object( - vision_transformer, - "cal_cos_sin", - side_effect=lambda _: - (torch.randn(16, 32), torch.randn(16, 32)), # noqa - ) - mocker_merger = mocker.patch.object( - vision_transformer, - "merger", - side_effect=lambda _: torch.randn(16, 256), # noqa - ) - vision_transformer.__dict__["vision_blocks"] = [ - lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa - ] - vision_transformer.__dict__["patch_embed"] = mocker_patch_embed - vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb - vision_transformer.__dict__[ - "get_window_index"] = mocker_get_window_index - vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin - vision_transformer.__dict__["merger"] = mocker_merger - vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2] - vision_transformer.__dict__["spatial_merge_unit"] = 2 - ret = vision_transformer.forward(x, grid_thw) - assert ret.shape == (8, 256) - mocker_patch_embed.assert_called_with(x) - mocker_rot_pos_emb.assert_called_with(grid_thw) - mocker_get_window_index.assert_called_with(grid_thw) - mocker_cal_cos_sin.assert_called_once() - mocker_merger.assert_called_once() - - -class TestAscendQwen2_5_VLForConditionalGeneration(PytestBase): - - def test_init_vl_for_conditional_generation(self, mocker: MockerFixture): - vllm_config = mocker.MagicMock() - vllm_config.vision_config = "vision_config" - vllm_config.rms_norm_eps = 1e-5 - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker_vl = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__", - return_value=None, - ) - mocker_vit = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionTransformer.__init__", - return_value=None, - ) - - vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration( - vllm_config=vllm_config) - args, kwargs = mocker_vl.call_args - assert not args - assert kwargs == {"vllm_config": vllm_config, "prefix": ""} - mocker_vit.assert_called_once() - assert isinstance( - vl_for_conditional_generation, - AscendQwen2_5_VLForConditionalGeneration, - ) diff --git a/tests/ut/models/test_qwen2_5_vl_without_padding.py b/tests/ut/models/test_qwen2_5_vl_without_padding.py index d6c99540b5..0ae1afa390 100644 --- a/tests/ut/models/test_qwen2_5_vl_without_padding.py +++ b/tests/ut/models/test_qwen2_5_vl_without_padding.py @@ -231,8 +231,6 @@ def init_vision_transformer( vision_config.in_channels = 3 vision_config.hidden_act = "gelu" vision_config.depth = 0 - vision_config.hidden_size = 1280 - vision_config.num_heads = 16 mocker.patch("torch.nn.Module.__setattr__") mocker.patch("torch.nn.Module.__getattr__") @@ -241,10 +239,6 @@ def 
init_vision_transformer( "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionTransformer.__init__", return_value=None, ) - mocker_vision_rotary_embedding = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionRotaryEmbedding.__init__", - return_value=None, - ) mocker.patch( "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionBlock_Without_Padding.__init__", return_value=None, @@ -270,7 +264,7 @@ def init_vision_transformer( args, kwargs = mocker_vit.call_args assert args == (vision_config, norm_eps, None, "") assert not kwargs - mocker_vision_rotary_embedding.assert_called_once() + return vision_transformer def test_init_vision_transformer(self, mocker: MockerFixture): diff --git a/tests/ut/models/test_qwen2_vl.py b/tests/ut/models/test_qwen2_vl.py deleted file mode 100644 index d62b8594ba..0000000000 --- a/tests/ut/models/test_qwen2_vl.py +++ /dev/null @@ -1,200 +0,0 @@ -import pytest -import torch -from pytest_mock import MockerFixture -from vllm.model_executor.layers.activation import QuickGELU - -from tests.ut.base import PytestBase -from vllm_ascend.models.qwen2_vl import (AscendQwen2VisionAttention, - AscendQwen2VisionBlock) - - -class TestAscendQwen2VisionAttention(PytestBase): - - def init_attention( - self, - mocker, - embed_dim=1000, - num_heads=10, - projection_size=100, - quant_config=None, - prefix="", - ): - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_vl.Qwen2VisionAttention.__init__") - - attention = AscendQwen2VisionAttention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - args, kwargs = mocker_attn.call_args - assert args == (embed_dim, num_heads, projection_size, None, "") - assert not kwargs - attention.num_attention_heads_per_partition = num_heads - return attention - - def test_attn_init_should_normal(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 10 - projection_size = 100 - quant_config = None - prefix = "" - vit = self.init_attention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - mocker=mocker, - ) - assert vit.hidden_size_per_attention_head == 10 - - def test_attn_init_should_raise_error(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 7 - projection_size = 100 - quant_config = None - prefix = "" - with pytest.raises(AssertionError): - # projection_size should divided by num heads - self.init_attention( - mocker=mocker, - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - - def test_attn_forward(self, mocker: MockerFixture): - attention = self.init_attention(mocker=mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - - qkv = lambda x: (x, 0) # noqa - split_qkv = lambda x: [ #noqa - torch.rand((100, 3, 10, 128)) for i in range(3) - ] # noqa - npu_rotary_mul = lambda q, cos, sin: q # noqa - _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa - proj = lambda x: (x, 0) # noqa - - mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv) - mocker_split_qkv = mocker.patch.object( - attention, - "split_qkv", - side_effect=split_qkv, - ) - mocker_npu_rotary_mul = 
mocker.patch("torch_npu.npu_rotary_mul", - side_effect=npu_rotary_mul) - mocker_npu_flash_attention_unpad = mocker.patch( - "torch_npu._npu_flash_attention_unpad", - side_effect=_npu_flash_attention_unpad, - ) - mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) - attention.__dict__["qkv"] = mocker_qkv - attention.__dict__["split_qkv"] = mocker_split_qkv - attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul - attention.__dict__["_npu_flash_attention_unpad"] = ( - mocker_npu_flash_attention_unpad) - attention.__dict__["proj"] = mocker_proj - - output = attention.forward( - x=x, - cu_seqlens=cu_seqlens, - cos=cos, - sin=sin, - ) - qkv_args, qkv_kwargs = mocker_qkv.call_args - assert qkv_args == (x, ) - assert not qkv_kwargs - - split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args - assert split_qkv_args == (x, ) - assert not split_qkv_kwargs - - npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args - assert npu_rotary_mul_args[1:] == (cos, sin) - assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) - assert not npu_rotary_mul_kwargs - - assert output.shape == torch.Size([100, 3, 1280]) - - -class TestAscendQwen2VisionBlock(PytestBase): - - def init_vision_block( - self, - mocker, - dim=100, - num_heads=10, - mlp_ratio=0.5, - ): - mocker_vit = mocker.patch( - "vllm.model_executor.models.qwen2_vl.Qwen2VisionBlock.__init__", - return_value=None, - ) - - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_vl.AscendQwen2VisionAttention.__init__", - return_value=None, - ) - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_block = AscendQwen2VisionBlock( - dim=dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - ) - args, kwargs = mocker_vit.call_args - assert args == (dim, num_heads, mlp_ratio, QuickGELU, None, None, "") - assert not kwargs - - args1, kwargs1 = mocker_attn.call_args - assert not args1 - assert kwargs1 == { - "embed_dim": dim, - "num_heads": num_heads, - "projection_size": dim, - "quant_config": None, - "prefix": ".attn", - } - return vision_block - - def test_init_vision_block_should_normal( - self, - mocker: MockerFixture, - ): - vision_block = self.init_vision_block(mocker) - assert isinstance(vision_block, AscendQwen2VisionBlock) - - def test_vision_block_forward(self, mocker: MockerFixture): - x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - vision_block = self.init_vision_block(mocker) - mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) - mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) - vision_block.__dict__["attn"] = mocker_attn - vision_block.__dict__["mlp"] = mocker_mlp - - output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) - - _, attn_kwargs = mocker_attn.call_args - assert attn_kwargs == { - "cu_seqlens": cu_seqlens, - "cos": cos, - "sin": sin, - } - - assert torch.all(x * 3 == output) diff --git a/tests/ut/multistream/test_decorator.py b/tests/ut/multistream/test_decorator.py deleted file mode 100644 index bd3da9402e..0000000000 --- a/tests/ut/multistream/test_decorator.py +++ /dev/null @@ -1,47 +0,0 @@ -import pytest -from pytest_mock import MockFixture - -from tests.ut.base import PytestBase -from vllm_ascend.multistream.decorator import set_multistream_support - - -class Context: - - def __init__(self, 
attn_metadata=None): - self.attn_metadata = attn_metadata - - -class TestDecorator(PytestBase): - - @pytest.mark.parametrize( - 'layer_context, microbatch_context, expected_metadata', [ - ((-1, None, None), -1, { - "original": True - }), - ((-1, None, None), 0, { - "original": True - }), - ((0, None, None), -1, { - "original": True - }), - ((0, None, [{ - "new": True - }]), 0, { - "new": True - }), - ]) - def test_decorator(self, mocker: MockFixture, layer_context, - microbatch_context, expected_metadata): - - def context_func(): - return Context(attn_metadata={"original": True}) - - mocker.patch( - 'vllm_ascend.multistream.decorator.get_multistream_layer_context', - return_value=layer_context) - mocker.patch( - 'vllm_ascend.multistream.decorator.get_multistream_microbatch_context', - return_value=microbatch_context) - - context = set_multistream_support()(context_func)() - assert context.attn_metadata == expected_metadata diff --git a/tests/ut/multistream/test_layers.py b/tests/ut/multistream/test_layers.py deleted file mode 100644 index cf34c6a09b..0000000000 --- a/tests/ut/multistream/test_layers.py +++ /dev/null @@ -1,198 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -from unittest.mock import MagicMock, patch - -import pytest -import torch - -from tests.ut.base import PytestBase -from vllm_ascend.multistream.base import MSEventKey -from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer, - MultiStreamPreTransformerLayer) -from vllm_ascend.multistream.metadata import MultiStreamMetadata - - -# === fixture: mock tensor input === -@pytest.fixture -def input_tensors(): - return [torch.randn(2, 128), torch.randn(2, 128)] - - -# === mock get_forward_context === -class DummyContext: - - def __init__(self, attn_metadata): - self.attn_metadata = attn_metadata - - -class TestMultiStreamPreTransformerLayer(PytestBase): - - # === test when multistream_metadata is None === - @patch("vllm_ascend.multistream.layers.get_forward_context") - @patch("vllm_ascend.multistream.layers.set_multistream_layer_context") - def test_forward_no_multistream_metadata(self, mock_set_ctx, mock_get_ctx, - input_tensors): - mock_get_ctx.return_value = DummyContext(attn_metadata="dummy_meta") - layer = MultiStreamPreTransformerLayer(multistream_metadata=None) - attn_out, input_out = layer.forward(input_tensors) - - assert attn_out == "dummy_meta" - assert input_out == input_tensors - mock_set_ctx.assert_called_once_with(-1, None, None) - - # === test when attn_metadata is None === - @patch("vllm_ascend.multistream.layers.get_forward_context") - @patch("vllm_ascend.multistream.layers.set_multistream_layer_context") - def test_forward_no_attn_metadata(self, mock_set_ctx, mock_get_ctx, - input_tensors): - mock_get_ctx.return_value = DummyContext(attn_metadata=None) - dummy_metadata = MagicMock(spec=MultiStreamMetadata) - layer = MultiStreamPreTransformerLayer( - multistream_metadata=dummy_metadata) - - attn_out, input_out = 
layer.forward(input_tensors) - - assert attn_out is None - assert input_out == input_tensors - mock_set_ctx.assert_called_once_with(-1, None, None) - - # === test when do_ms=False (no split needed) === - @patch("vllm_ascend.multistream.layers.get_forward_context") - @patch("vllm_ascend.multistream.layers.set_multistream_layer_context") - def test_forward_no_split(self, mock_set_ctx, mock_get_ctx, input_tensors): - dummy_attn = "original_attn" - mock_get_ctx.return_value = DummyContext(attn_metadata=dummy_attn) - - dummy_metadata = MagicMock(spec=MultiStreamMetadata) - dummy_metadata.split_micro_batch.return_value = (False, "same_attn", - input_tensors, None) - - layer = MultiStreamPreTransformerLayer( - multistream_metadata=dummy_metadata) - - attn_out, input_out = layer.forward(input_tensors) - - assert attn_out == "same_attn" - assert input_out == input_tensors - mock_set_ctx.assert_called_once_with(-1, None, None) - - # === test when do_ms=True (split occurred) === - @patch("vllm_ascend.multistream.layers.get_forward_context") - @patch("vllm_ascend.multistream.layers.set_multistream_layer_context") - def test_forward_split(self, mock_set_ctx, mock_get_ctx, input_tensors): - dummy_attn = "original_attn" - mock_get_ctx.return_value = DummyContext(attn_metadata=dummy_attn) - - split_inputs = [[t[:1], t[1:]] for t in input_tensors] - - dummy_metadata = MagicMock(spec=MultiStreamMetadata) - dummy_metadata.start_layer = 2 - dummy_metadata.split_micro_batch.return_value = (True, - ["attn1", "attn2"], - split_inputs, None) - - layer = MultiStreamPreTransformerLayer( - multistream_metadata=dummy_metadata) - - attn_out, input_out = layer.forward(input_tensors) - - assert attn_out == ["attn1", "attn2"] - assert input_out == split_inputs - mock_set_ctx.assert_called_once_with(2, dummy_metadata, - ["attn1", "attn2"]) - - -class TestMultiStreamPostTransformerLayer(PytestBase): - - def test_post_forward_metadata_none(self, input_tensors): - layer = MultiStreamPostTransformerLayer(multistream_metadata=None) - output = layer.forward(input_tensors) - assert output == input_tensors - - dummy_metadata = MagicMock(spec=MultiStreamMetadata) - dummy_metadata.ms_config = None - layer = MultiStreamPostTransformerLayer( - multistream_metadata=dummy_metadata) - output = layer.forward(input_tensors) - assert output == input_tensors - - @patch("vllm_ascend.multistream.layers.get_multistream_layer_context") - @patch("vllm_ascend.multistream.layers.reset_multistream_layer_context") - def test_post_forward_normal_flow(self, mock_reset_ctx, mock_get_ctx, - input_tensors): - A_instance_of_MultiStreamMetadata = MultiStreamMetadata( - calculate_stream=MagicMock(), - communicate_stream=MagicMock(), - start_layer=0, - end_layer=1, - event_keys=[], - multistream_config=None, - ) - dummy_metadata = MagicMock(spec=A_instance_of_MultiStreamMetadata) - dummy_metadata.ms_config.num_micro_batches = 4 - dummy_metadata.end_layer = 10 - - mock_get_ctx.return_value = ( - 5, # layer_index - dummy_metadata, # ms_metadata - "dummy_attn_metadata" # ms_attn_metadata - ) - - dummy_metadata.merge_micro_batches.return_value = "merged_result" - - layer = MultiStreamPostTransformerLayer( - multistream_metadata=dummy_metadata) - output = layer.forward(input_tensors) - - # check wait_event - dummy_metadata.try_wait_event.assert_called_once_with( - 9, # end_layer - 1 - 3, # num_micro_batches - 1 - MSEventKey.FFN_AR_FINISH) - mock_reset_ctx.assert_called_once() - assert output == "merged_result" - - 
@patch("vllm_ascend.multistream.layers.get_multistream_layer_context") - @patch("vllm_ascend.multistream.layers.reset_multistream_layer_context") - def test_post_forward_with_custom_wait_layer(self, mock_reset_ctx, - mock_get_ctx, input_tensors): - A_instance_of_MultiStreamMetadata = MultiStreamMetadata( - calculate_stream=MagicMock(), - communicate_stream=MagicMock(), - start_layer=0, - end_layer=1, - event_keys=[], - multistream_config=None, - ) - dummy_metadata = MagicMock(spec=A_instance_of_MultiStreamMetadata) - dummy_metadata.ms_config.num_micro_batches = 4 - dummy_metadata.end_layer = 10 - - mock_get_ctx.return_value = ( - 3, # layer_index - dummy_metadata, - "dummy_attn_metadata") - - dummy_metadata.merge_micro_batches.return_value = "merged_result" - - layer = MultiStreamPostTransformerLayer( - multistream_metadata=dummy_metadata) - output = layer.forward(input_tensors, wait_layer_index=7) - - dummy_metadata.try_wait_event.assert_called_once_with( - 7, 3, MSEventKey.FFN_AR_FINISH) - mock_reset_ctx.assert_called_once() - assert output == "merged_result" diff --git a/tests/ut/ops/test_fused_ops.py b/tests/ut/ops/test_fused_ops.py index 6c89f6fc1d..2b6f1aa2dc 100644 --- a/tests/ut/ops/test_fused_ops.py +++ b/tests/ut/ops/test_fused_ops.py @@ -21,7 +21,7 @@ import torch_npu from pytest_mock import MockerFixture -from vllm_ascend.ascend_forward_context import _get_fused_moe_state +from vllm_ascend.ascend_forward_context import get_fused_moe_state from vllm_ascend.ops.fused_moe import (AscendFusedMoE, AscendUnquantizedFusedMoEMethod) from vllm_ascend.utils import AscendSocVersion, adapt_patch # noqa E402 @@ -297,8 +297,9 @@ def test_process_weights_after_loading(self, moe_method, mock_dist_env): assert not layer.w13_weight.requires_grad assert not layer.w2_weight.requires_grad - @pytest.mark.parametrize("others_param", - [[256, 4], [128, 1], [128, 1], [128, 4]]) + @pytest.mark.parametrize( + "others_param", + [[256, 4, False], [128, 1, False], [128, 1, True], [128, 4, False]]) def test_apply_without_expert_map(self, moe_method, mock_dist_env, mock_moe_env, others_param): """ @@ -307,13 +308,15 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env, 3 test use select_gating_topk_softmax_experts and fused_experts 4 test use select_experts and fused_experts_with_all2all_buffer """ - global_num_experts, ep_size = others_param + global_num_experts, ep_size, select_softmax = others_param is_prefill = False is_deepseek_v3_r1 = global_num_experts == 256 - forward_context = MagicMock(fused_moe_state=_get_fused_moe_state( + forward_context = MagicMock(fused_moe_state=get_fused_moe_state( ep_size, is_prefill, is_deepseek_v3_r1)) - with patch("vllm_ascend.ops.fused_moe.get_forward_context", - return_value=forward_context): + with patch( + "vllm_ascend.ops.fused_moe.SELECT_GATING_TOPK_SOTFMAX_EXPERTS", + select_softmax), \ + patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context): moe_method.ep_size = ep_size x = torch.randn(8, 2, 2) router_logits = torch.randn(8, 8) @@ -346,7 +349,7 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env, ep_size, alltoall_buffer = others_param is_prefill = False forward_context = MagicMock( - fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True)) + fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True)) with patch("vllm_ascend.ops.fused_moe.MOE_ALL2ALL_BUFFER", alltoall_buffer), \ patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \ diff --git 
a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py deleted file mode 100644 index 3a42b93c42..0000000000 --- a/tests/ut/ops/test_token_dispatcher.py +++ /dev/null @@ -1,65 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. - -import pytest -from pytest_mock import MockerFixture - -from tests.ut.base import PytestBase -from vllm_ascend.ops.moe_dispatcher.token_dispatcher import ( - MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig) -from vllm_ascend.utils import adapt_patch # noqa E402 - - -class TestMoEAlltoAllSeqOverLapDispatcher(PytestBase): - - @pytest.fixture - def config(self): - config = MoEDispatcherConfig() - config.set_num_local_experts(2) - config.set_num_moe_experts(4) - config.set_moe_pad_expert_input_to_capacity(False) - config.set_moe_expert_capacity_factor(None) - config.set_moe_router_topk(2) - config.set_moe_grouped_gemm(False) - config.set_group_topk(0) - config.set_num_groups(1) - config.set_is_fused(False) - return config.build() - - def mock_ep_group(self, mocker): - mock_group = mocker.MagicMock() - mock_group.rank_in_group = 0 - mock_group.world_size = 2 - mock_group.device_group = "mock_group" - return mock_group - - @pytest.fixture - def dispatcher(self, config, mocker: MockerFixture): - mocker.patch( - "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group", - return_value=self.mock_ep_group(mocker)) - mocker.patch("torch.npu.current_device", return_value="cpu") - mocker.patch("torch.npu.Stream", return_value=mocker.MagicMock) - return MoEAlltoAllSeqOverLapDispatcher(config) - - def test_initialization(self, dispatcher, config): - assert dispatcher.num_local_experts == config.num_local_experts - assert dispatcher.num_experts == config.num_moe_experts - assert dispatcher.local_expert_indices == [0, 1] - assert dispatcher.ep_rank == 0 - assert dispatcher.ep_size == 2 - assert dispatcher.overlap_stream is not None diff --git a/tests/ut/ops/test_vocab_parallel_embedding.py b/tests/ut/ops/test_vocab_parallel_embedding.py deleted file mode 100644 index ff7d060aa0..0000000000 --- a/tests/ut/ops/test_vocab_parallel_embedding.py +++ /dev/null @@ -1,299 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# Adapted from vllm/tests/lora/test_layers.py - -from unittest.mock import MagicMock, patch - -import torch -from vllm.model_executor.layers.vocab_parallel_embedding import \ - VocabParallelEmbedding - -from tests.ut.base import TestBase -from vllm_ascend.ops.vocab_parallel_embedding import ( - get_masked_input_and_mask, vocab_parallel_embedding_forward) - -VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128 - - -class TestGetMaskedInputAndMask(TestBase): - - def setUp(self): - self.input_ = torch.arange(12) - - def test_get_masked_input_and_mask(self): - # tp 1 no padding - input_modified, _ = get_masked_input_and_mask( - self.input_, - org_vocab_start_index=0, - org_vocab_end_index=8, - added_vocab_start_index=8, - added_vocab_end_index=12, - num_org_vocab_padding=0) - assert torch.equal(self.input_, input_modified) - - # tp 2 no padding - input_rank_0, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=0, - org_vocab_end_index=4, - added_vocab_start_index=8, - added_vocab_end_index=10, - num_org_vocab_padding=0) - - input_rank_1, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=4, - org_vocab_end_index=8, - added_vocab_start_index=10, - added_vocab_end_index=12, - num_org_vocab_padding=0) - - assert torch.equal(input_rank_0, - torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0])) - assert torch.equal(input_rank_1, - torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5])) - - # tp 4 no padding - input_rank_0, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=0, - org_vocab_end_index=2, - added_vocab_start_index=8, - added_vocab_end_index=9, - num_org_vocab_padding=0) - - input_rank_1, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=2, - org_vocab_end_index=4, - added_vocab_start_index=9, - added_vocab_end_index=10, - num_org_vocab_padding=0) - - input_rank_2, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=4, - org_vocab_end_index=6, - added_vocab_start_index=10, - added_vocab_end_index=11, - num_org_vocab_padding=0) - - input_rank_3, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=6, - org_vocab_end_index=8, - added_vocab_start_index=11, - added_vocab_end_index=12, - num_org_vocab_padding=0) - assert torch.equal(input_rank_0, - torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0])) - assert torch.equal(input_rank_1, - torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0])) - assert torch.equal(input_rank_2, - torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0])) - assert torch.equal(input_rank_3, - torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2])) - - # tp 1 with padding - input_modified, _ = get_masked_input_and_mask( - self.input_, - org_vocab_start_index=0, - org_vocab_end_index=8, - added_vocab_start_index=8, - added_vocab_end_index=12, - num_org_vocab_padding=2) - assert torch.equal( - input_modified, - torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13])) - - # tp 2 with padding - input_rank_0, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=0, - org_vocab_end_index=4, - added_vocab_start_index=8, - added_vocab_end_index=10, - num_org_vocab_padding=2) - - input_rank_1, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=4, - org_vocab_end_index=8, - added_vocab_start_index=10, - added_vocab_end_index=12, - num_org_vocab_padding=2) - assert torch.equal(input_rank_0, - torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0])) - assert torch.equal(input_rank_1, - torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7])) - - # tp 4 with padding - input_rank_0, _ = 
get_masked_input_and_mask(self.input_, - org_vocab_start_index=0, - org_vocab_end_index=2, - added_vocab_start_index=8, - added_vocab_end_index=9, - num_org_vocab_padding=2) - - input_rank_1, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=2, - org_vocab_end_index=4, - added_vocab_start_index=9, - added_vocab_end_index=10, - num_org_vocab_padding=2) - - input_rank_2, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=4, - org_vocab_end_index=6, - added_vocab_start_index=10, - added_vocab_end_index=11, - num_org_vocab_padding=2) - - input_rank_3, _ = get_masked_input_and_mask(self.input_, - org_vocab_start_index=6, - org_vocab_end_index=8, - added_vocab_start_index=11, - added_vocab_end_index=12, - num_org_vocab_padding=2) - assert torch.equal(input_rank_0, - torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0])) - assert torch.equal(input_rank_1, - torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0])) - assert torch.equal(input_rank_2, - torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0])) - assert torch.equal(input_rank_3, - torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4])) - - -class TestVocabParallelEmbedding(TestBase): - - def setUp(self): - # Create a mock VocabParallelEmbedding instance - self.mock_embedding = MagicMock(spec=VocabParallelEmbedding) - self.mock_embedding.tp_size = 2 # Test with tensor parallelism - self.mock_embedding.shard_indices = MagicMock() - self.mock_embedding.shard_indices.org_vocab_start_index = 10 - self.mock_embedding.shard_indices.org_vocab_end_index = 20 - self.mock_embedding.shard_indices.num_org_vocab_padding = 5 - self.mock_embedding.shard_indices.added_vocab_start_index = 30 - self.mock_embedding.shard_indices.added_vocab_end_index = 40 - self.mock_embedding.quant_method = MagicMock() - - # Set consistent embedding dimension for all tests - self.embedding_dim = 10 - # Mock embedding returns tensor with shape (input_length, embedding_dim) - self.mock_embedding.quant_method.embedding = MagicMock( - side_effect=lambda _, x: torch.randn(x.shape[0], self.embedding_dim - )) - - def test_get_masked_input_and_mask(self): - """Test the mask and offset calculation helper function.""" - input_ = torch.tensor([5, 15, 25, 35, 45]) # includes all cases - - masked_input, mask = get_masked_input_and_mask( - input_, - org_vocab_start_index=10, - org_vocab_end_index=20, - num_org_vocab_padding=5, - added_vocab_start_index=30, - added_vocab_end_index=40) - - # The mask should be True for INVALID tokens (ones we want to mask out) - expected_mask = torch.tensor([True, False, True, False, True]) - self.assertTrue( - torch.equal(mask, expected_mask), - f"Mask mismatch. Expected {expected_mask}, got {mask}") - - # Check masked input values - expected_masked = torch.tensor([0, 5, 0, 20, 0]) - self.assertTrue( - torch.equal(masked_input, expected_masked), - f"Masked input mismatch. 
Expected {expected_masked}, got {masked_input}" - ) - - def test_forward_with_tp_size_1(self): - """Test forward pass without tensor parallelism.""" - # Create a fresh mock embedding with tp_size=1 - mock_embedding = MagicMock(spec=VocabParallelEmbedding) - mock_embedding.tp_size = 1 - mock_embedding.quant_method = MagicMock() - mock_embedding.quant_method.embedding = MagicMock( - return_value=torch.randn(3, self.embedding_dim)) - - input_ = torch.tensor([1, 2, 3]) - - with patch( - "vllm_ascend.ops.vocab_parallel_embedding.tensor_model_parallel_all_reduce", - side_effect=lambda x: x) as mock_reduce_tp1: - output = vocab_parallel_embedding_forward(mock_embedding, input_) - - # Should just pass through without masking - mock_embedding.quant_method.embedding.assert_called_once_with( - mock_embedding, input_.long()) - self.assertEqual(output.shape, (3, self.embedding_dim)) - - # Verify all_reduce was called once - mock_reduce_tp1.assert_called_once() - - def test_forward_with_tp(self): - """Test forward pass with tensor parallelism.""" - input_ = torch.tensor([15, 35]) # one org vocab, one added vocab - with patch( - "vllm_ascend.ops.vocab_parallel_embedding.tensor_model_parallel_all_reduce", - side_effect=lambda x: x) as mock_reduce_tp: - output = vocab_parallel_embedding_forward(self.mock_embedding, - input_) - - # Check that masking was applied correctly - self.mock_embedding.quant_method.embedding.assert_called_once() - called_input = self.mock_embedding.quant_method.embedding.call_args[0][ - 1] - expected_input = torch.tensor([5, 20]) # after offset calculation - self.assertTrue(torch.all(called_input == expected_input)) - - # Check that all reduce was called - # self.dist_mock.tensor_model_parallel_all_reduce.assert_called_once() - mock_reduce_tp.assert_called_once() - self.assertEqual(output.shape, (2, self.embedding_dim)) - - def test_forward_with_invalid_vocab(self): - """Test that invalid vocab indices are properly masked out.""" - input_ = torch.tensor([5, 15, 25, 35, 45]) # includes invalid cases - - # Create predictable mock output - mock_output = torch.randn(5, self.embedding_dim) - self.mock_embedding.quant_method.embedding = MagicMock( - return_value=mock_output.clone()) - with patch( - "vllm_ascend.ops.vocab_parallel_embedding.tensor_model_parallel_all_reduce", - side_effect=lambda x: x): - output = vocab_parallel_embedding_forward(self.mock_embedding, - input_) - - # Check that invalid positions (0, 2, 4) were zeroed out - self.assertTrue(torch.all(output[0] == 0)) - self.assertTrue(torch.all(output[2] == 0)) - self.assertTrue(torch.all(output[4] == 0)) - self.assertTrue(torch.all(output[1] == mock_output[1])) - self.assertTrue(torch.all(output[3] == mock_output[3])) - self.assertEqual(output.shape, (5, self.embedding_dim)) - - def test_output_shape(self): - """Test that output shape is correct.""" - test_cases = [ - (torch.tensor([15]), (1, self.embedding_dim)), - (torch.tensor([15, 35]), (2, self.embedding_dim)), - (torch.tensor([15, 35, 16, 36]), (4, self.embedding_dim)), - ] - - for input_, expected_shape in test_cases: - with self.subTest(input=input_): - with patch( - "vllm_ascend.ops.vocab_parallel_embedding.tensor_model_parallel_all_reduce", - side_effect=lambda x: x): - output = vocab_parallel_embedding_forward( - self.mock_embedding, input_) - self.assertEqual(output.shape, expected_shape) diff --git a/tests/ut/patch/worker/patch_common/test_patch_utils.py b/tests/ut/patch/worker/patch_common/test_patch_utils.py new file mode 100644 index 0000000000..e746550541 --- 
/dev/null +++ b/tests/ut/patch/worker/patch_common/test_patch_utils.py @@ -0,0 +1,104 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from typing import List, Optional +from unittest.mock import MagicMock, patch + +import torch +from torch.library import Library + +from tests.ut.base import TestBase +from vllm_ascend.patch.worker.patch_common.patch_utils import \ + ascend_direct_register_custom_op + + +class TestPatchUtils(TestBase): + + def setUp(self): + super().setUp() + + self.mock_op_func = MagicMock() + self.mock_op_func.__annotations__ = { + 'param1': list[int], + 'param2': Optional[list[int]], + 'param3': str + } + + self.mock_fake_impl = MagicMock() + self.mock_lib = MagicMock(spec=Library) + + self.op_name = "test_op" + self.mutates_args = ["arg1"] + self.dispatch_key = "NPU" + self.tags = (torch.Tag.pt2_compliant_tag, ) + + self.patch_infer_schema = patch( + 'vllm_ascend.patch.worker.patch_common.patch_utils.torch.library.infer_schema' + ) + self.patch_vllm_lib = patch( + 'vllm_ascend.patch.worker.patch_common.patch_utils.vllm_lib') + + self.mock_infer_schema = self.patch_infer_schema.start() + self.mock_vllm_lib = self.patch_vllm_lib.start() + + self.addCleanup(self.patch_infer_schema.stop) + self.addCleanup(self.patch_vllm_lib.stop) + + def test_utils_patched(self): + from vllm import utils + + self.assertIs(utils.direct_register_custom_op, + ascend_direct_register_custom_op) + + def test_register_with_default_lib(self): + self.mock_infer_schema.return_value = "(Tensor self) -> Tensor" + + ascend_direct_register_custom_op(op_name=self.op_name, + op_func=self.mock_op_func, + mutates_args=self.mutates_args, + fake_impl=self.mock_fake_impl, + dispatch_key=self.dispatch_key, + tags=self.tags) + + self.assertEqual(self.mock_op_func.__annotations__['param1'], + List[int]) + self.assertEqual(self.mock_op_func.__annotations__['param2'], + Optional[List[int]]) + self.assertEqual(self.mock_op_func.__annotations__['param3'], str) + + self.mock_infer_schema.assert_called_once_with( + self.mock_op_func, mutates_args=self.mutates_args) + + self.mock_vllm_lib.define.assert_called_once_with( + f"{self.op_name}(Tensor self) -> Tensor", tags=self.tags) + self.mock_vllm_lib.impl.assert_called_once_with( + self.op_name, self.mock_op_func, dispatch_key=self.dispatch_key) + self.mock_vllm_lib._register_fake.assert_called_once_with( + self.op_name, self.mock_fake_impl) + + def test_register_with_custom_lib(self): + self.mock_infer_schema.return_value = "(Tensor a, Tensor b) -> Tensor" + + ascend_direct_register_custom_op(op_name=self.op_name, + op_func=self.mock_op_func, + mutates_args=self.mutates_args, + target_lib=self.mock_lib) + + self.mock_lib.define.assert_called_once_with( + f"{self.op_name}(Tensor a, Tensor b) -> Tensor", tags=()) + self.mock_lib.impl.assert_called_once_with(self.op_name, + self.mock_op_func, + dispatch_key="CUDA") + self.mock_lib._register_fake.assert_not_called() diff --git a/tests/ut/quantization/test_quantizer.py 
b/tests/ut/quantization/test_quantizer.py index a51faeeb30..559cf19379 100644 --- a/tests/ut/quantization/test_quantizer.py +++ b/tests/ut/quantization/test_quantizer.py @@ -3,7 +3,6 @@ from tests.ut.base import TestBase from vllm_ascend.quantization.quant_config import AscendQuantConfig from vllm_ascend.quantization.quantizer import (VLLMAscendQuantizer, - W4A8DYNAMICQuantizer, W8A8Quantizer) SUPPORT_ASCEND_QUANTIZER_TYPE = {"test": "1"} @@ -121,25 +120,3 @@ def test_build_attention_method(self): result = self.quantizer.build_attention_method() mock_linear.assert_called_once_with() self.assertIsInstance(result, MagicMock) - - -class TestW4A8DYNAMICQuantizer(TestBase): - - def setUp(self): - self.quantizer = W4A8DYNAMICQuantizer(quant_description={}) - - def test_build_linear_method(self): - with patch( - 'vllm_ascend.quantization.quantizer.AscendW4A8DynamicLinearMethod', - return_value=MagicMock()) as mock_linear: - result = self.quantizer.build_linear_method() - mock_linear.assert_called_once_with() - self.assertIsInstance(result, MagicMock) - - def test_build_moe_method(self): - with patch( - 'vllm_ascend.quantization.quantizer.AscendW4A8DynamicFusedMoEMethod', - return_value=MagicMock()) as mock_fused_moe: - result = self.quantizer.build_moe_method() - mock_fused_moe.assert_called_once_with() - self.assertIsInstance(result, MagicMock) diff --git a/tests/ut/quantization/test_w4a8_dynamic.py b/tests/ut/quantization/test_w4a8_dynamic.py deleted file mode 100644 index 8c52e3252f..0000000000 --- a/tests/ut/quantization/test_w4a8_dynamic.py +++ /dev/null @@ -1,109 +0,0 @@ -from unittest.mock import Mock, patch - -import torch - -from tests.ut.base import TestBase -from vllm_ascend.quantization.w4a8_dynamic import ( - AscendW4A8DynamicFusedMoEMethod, AscendW4A8DynamicLinearMethod) - - -class TestAscendW4A8DynamicLinearMethod(TestBase): - - def setUp(self): - self.method = AscendW4A8DynamicLinearMethod() - self.method.group_size = 8 - - def test_get_weight(self): - weight = self.method.get_weight(8, 32, torch.bfloat16) - self.assertEqual(weight["weight"].dtype, torch.int8) - self.assertEqual(weight["weight"].shape, (32, 8)) - - def test_get_pergroup_param(self): - params = self.method.get_pergroup_param(8, 32, torch.bfloat16) - self.assertEqual(params["weight_scale"].dtype, torch.bfloat16) - self.assertEqual(params["weight_scale"].shape, (32, 1)) - self.assertEqual(params["weight_offset"].dtype, torch.bfloat16) - self.assertEqual(params["weight_offset"].shape, (32, 1)) - self.assertEqual(params["weight_scale_second"].dtype, torch.bfloat16) - self.assertEqual(params["weight_scale_second"].shape, (32, 1)) - self.assertEqual(params["weight_offset_second"].dtype, torch.bfloat16) - self.assertEqual(params["weight_offset_second"].shape, (32, 1)) - - -class TestAscendW4A8DynamicFusedMoEMethod(TestBase): - - @patch('vllm_ascend.quantization.w4a8_dynamic.get_ep_group') - @patch("vllm_ascend.ascend_config.get_ascend_config") - @patch('vllm_ascend.quantization.w4a8_dynamic.get_mc2_group') - @patch('torch.distributed.get_rank', return_value=0) - def setUp(self, mock_get_rank, mock_get_mc2_group, mock_get_ascend_config, - mock_get_ep_group): - mock_ascend_config = Mock() - mock_ascend_config.torchair_graph_config = Mock(enabled=False) - mock_get_ascend_config.return_value = mock_ascend_config - self.quant_method = AscendW4A8DynamicFusedMoEMethod() - - def test_get_weight(self): - param_dict = self.quant_method.get_weight(8, 4, 14, torch.bfloat16) - self.assertEqual(param_dict["w13_weight"].dtype, 
torch.int8) - self.assertEqual(param_dict["w13_weight"].shape, (8, 8, 14)) - - @patch('vllm_ascend.quantization.w4a8_dynamic.get_current_vllm_config') - def test_get_dynamic_quant_param(self, mock_get_current_vllm_config): - mock_vllm_config = Mock() - mock_vllm_config.quant_config = Mock( - quant_description={"group_size": 2}) - mock_get_current_vllm_config.return_value = mock_vllm_config - param_dict = self.quant_method.get_dynamic_quant_param( - 8, 4, 14, torch.bfloat16) - self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16) - self.assertEqual(param_dict["w13_weight_scale"].shape, (8, 8, 1)) - self.assertEqual(param_dict["w13_weight_scale_second"].dtype, - torch.bfloat16) - self.assertEqual(param_dict["w13_weight_scale_second"].shape, - (8, 8, 7)) - self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.bfloat16) - self.assertEqual(param_dict["w2_weight_scale"].shape, (8, 14, 1)) - self.assertEqual(param_dict["w2_weight_scale_second"].dtype, - torch.bfloat16) - self.assertEqual(param_dict["w2_weight_scale_second"].shape, - (8, 14, 2)) - - @patch('torch_npu.npu_quantize') - @patch('torch.Tensor.npu') - def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize): - layer = torch.nn.Module() - layer.w13_weight = torch.nn.Parameter(torch.zeros((8, 8, 14), - dtype=torch.int8), - requires_grad=False) - layer.w2_weight = torch.nn.Parameter(torch.zeros((8, 14, 4), - dtype=torch.int8), - requires_grad=False) - layer.w13_weight_scale = torch.nn.Parameter(torch.ones( - (8, 8, 1), dtype=torch.bfloat16), - requires_grad=False) - layer.w13_weight_offset = torch.nn.Parameter(torch.zeros( - (8, 8, 1), dtype=torch.bfloat16), - requires_grad=False) - layer.w13_weight_scale_second = torch.nn.Parameter(torch.ones( - (8, 8, 7), dtype=torch.bfloat16), - requires_grad=False) - layer.w2_weight_scale = torch.nn.Parameter(torch.ones( - (8, 14, 1), dtype=torch.bfloat16), - requires_grad=False) - layer.w2_weight_offset = torch.nn.Parameter(torch.zeros( - (8, 14, 1), dtype=torch.bfloat16), - requires_grad=False) - layer.w2_weight_scale_second = torch.nn.Parameter(torch.ones( - (8, 14, 2), dtype=torch.bfloat16), - requires_grad=False) - - mock_npu.return_value = torch.Tensor() - mock_npu_quantize.return_value = torch.Tensor() - self.quant_method.process_weights_after_loading(layer) - self.assertTrue(hasattr(layer, "w13_scale_bias")) - self.assertEqual(layer.w13_scale_bias.data.shape, (8, 8)) - self.assertEqual(layer.w13_scale_bias.data.dtype, torch.float32) - self.assertTrue(hasattr(layer, "w2_scale_bias")) - self.assertEqual(layer.w2_scale_bias.data.shape, (8, 14)) - self.assertEqual(layer.w2_scale_bias.data.dtype, torch.float32) diff --git a/tests/ut/quantization/test_w8a8_dynamic.py b/tests/ut/quantization/test_w8a8_dynamic.py deleted file mode 100644 index 59ab60487d..0000000000 --- a/tests/ut/quantization/test_w8a8_dynamic.py +++ /dev/null @@ -1,75 +0,0 @@ -from unittest.mock import MagicMock, patch - -import torch - -from tests.ut.base import TestBase -from vllm_ascend.quantization.w8a8_dynamic import fused_experts_with_all2all - - -class TestAscendW8A8FusedMoEMethod(TestBase): - - def setUp(self): - self.hidden_size = 128 - self.num_tokens = 128 - self.placeholder = torch.randn(self.num_tokens, - self.hidden_size, - dtype=torch.bfloat16) - - @patch("torch.distributed.all_to_all_single") - @patch("torch_npu.npu_moe_re_routing") - @patch("torch_npu.npu_grouped_matmul") - @patch("torch_npu.npu_swiglu") - @patch("torch_npu.npu_dynamic_quant") - 
@patch("torch_npu.npu_moe_finalize_routing") - @patch("torch_npu.npu_moe_init_routing") - def test_fused_experts_with_all2all(self, mock_moe_init_routing, - mock_moe_finalize_routing, - mock_dynamic_quant, mock_swiglu, - mock_grouped_matmul, - mock_moe_re_routing, - mock_all_to_all_single): - expert_map = MagicMock() - ep_group = MagicMock() - placeholder_int8 = torch.randint(0, - 100, - (self.num_tokens, self.hidden_size), - dtype=torch.int8) - placeholder_ones = torch.ones(self.num_tokens, dtype=torch.int32) - mock_all_to_all_single.side_effect = lambda output, input, *args, **kwargs: output.copy_( - input) - mock_moe_init_routing.return_value = ( - placeholder_int8, - placeholder_ones, - placeholder_ones, - ) - mock_moe_re_routing.return_value = (placeholder_int8, self.placeholder, - torch.randint(0, - 100, - (self.num_tokens, ), - dtype=torch.int32), - self.placeholder) - mock_grouped_matmul.return_value = self.placeholder - mock_swiglu.return_value = self.placeholder - mock_dynamic_quant.return_value = ( - placeholder_int8, - torch.randn(self.num_tokens), - ) - mock_moe_finalize_routing.return_value = self.placeholder - - result = fused_experts_with_all2all( - hidden_states=self.placeholder, - w1=self.placeholder, - w1_scale=self.placeholder, - w2=self.placeholder, - w2_scale=self.placeholder, - topk_weights=self.placeholder, - topk_ids=self.placeholder, - top_k=8, - expert_map=expert_map, - ep_group=ep_group, - log2phy=None, - global_redundant_expert_num=256, - ) - self.assertIsNotNone(result) - self.assertEqual(result.dtype, torch.bfloat16) - self.assertEqual(result.shape, (128, 128)) diff --git a/tests/ut/sample/test_rejection_sampler.py b/tests/ut/sample/test_rejection_sampler.py deleted file mode 100644 index b6aaf868c5..0000000000 --- a/tests/ut/sample/test_rejection_sampler.py +++ /dev/null @@ -1,201 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -from unittest.mock import patch - -import torch - -from tests.ut.base import TestBase -from vllm_ascend.sample.rejection_sampler import ( - expand_batch_to_tokens, expand_pytorch, rejection_greedy_sample_pytorch, - rejection_random_sample_pytorch, sample_recovered_tokens_pytorch) - -# Global constants -PLACEHOLDER_TOKEN_ID = -1 -GREEDY_TEMPERATURE = 0.0 -MAX_SPEC_LEN = 8 # Used as MAX_NUM_TOKENS in expand_batch_to_tokens - - -class TestAscendRejectionSampler(TestBase): - - def test_rejection_greedy_sample_pytorch(self): - """Test greedy rejection sampling: stop when draft doesn't match, otherwise append bonus token""" - batch_size = 2 - max_spec_len = 3 - output_token_ids = torch.full((batch_size, max_spec_len + 1), - PLACEHOLDER_TOKEN_ID) - - cu_num_draft_tokens = torch.tensor([2, 4]) - draft_token_ids = torch.tensor([10, 11, 20, 21]) - target_argmax = torch.tensor([10, 99, 20, 22]) - bonus_token_ids = torch.tensor([[100], [200]]) - - is_greedy = torch.tensor([True, True]) - - rejection_greedy_sample_pytorch( - output_token_ids, - cu_num_draft_tokens, - draft_token_ids, - target_argmax, - bonus_token_ids, - is_greedy, - max_spec_len, - ) - - assert output_token_ids[0, 0].item() == 10 - assert output_token_ids[0, 1].item() == 99 - assert output_token_ids[1, 0].item() == 20 - assert output_token_ids[1, 2].item() == PLACEHOLDER_TOKEN_ID - - def test_rejection_random_sample_pytorch(self): - """Test random rejection sampling: accept based on uniform probability""" - batch_size = 2 - max_spec_len = 3 - output_token_ids = torch.full((batch_size, max_spec_len + 1), - PLACEHOLDER_TOKEN_ID) - - cu_num_draft_tokens = torch.tensor([2, 1]) - draft_token_ids = torch.tensor([1, 0, 2]) - draft_probs = torch.tensor([ - [0.0, 0.6, 0.0, 0.4], # vocab_size=4 - [0.1, 0.2, 0.3, 0.4], - [0.5, 0.5, 0.0, 0.0], - ]) - target_probs = torch.tensor([ - [0.0, 0.8, 0.0, 0.2], - [0.2, 0.1, 0.3, 0.4], - [0.9, 0.1, 0.0, 0.0], - ]) - bonus_token_ids = torch.tensor([[100], [200]]) - recovered_token_ids = torch.tensor([1, 2, 3]) - uniform_probs = torch.tensor([0.7, 0.6, 0.5]) - is_greedy = torch.tensor([False, False]) - vocab_size = 4 - - rejection_random_sample_pytorch( - output_token_ids, - cu_num_draft_tokens, - draft_token_ids, - draft_probs, - target_probs, - bonus_token_ids, - recovered_token_ids, - uniform_probs, - is_greedy, - max_spec_len, - vocab_size, - IS_NGRAM=False, - ) - - assert output_token_ids[0, 0].item() == 1 - assert output_token_ids[0, 1].item() == 0 - assert output_token_ids[0, 2].item() == 100 - - def test_expand_pytorch(self): - """Test expand_pytorch functionality""" - input_ptr = torch.tensor([10, 20, 30], dtype=torch.int32) - cu_num_tokens_ptr = torch.tensor([2, 5, 7]) - output_ptr = torch.empty(7, dtype=torch.int32) - - expand_pytorch( - output_ptr, - input_ptr, - cu_num_tokens_ptr, - replace_from=0, - replace_to=0, - MAX_NUM_TOKENS=MAX_SPEC_LEN, - ) - - expected = torch.tensor([10, 10, 20, 20, 20, 30, 30]) - assert torch.equal(output_ptr, expected) - - def test_expand_batch_to_tokens(self): - """Test expand_batch_to_tokens wrapper""" - x = torch.tensor([10, 20, 30]) - cu_num_tokens = torch.tensor([2, 5, 7]) - num_tokens = 7 - - with patch("vllm_ascend.sample.rejection_sampler.expand_pytorch" - ) as mock_kernel: - expand_batch_to_tokens(x, cu_num_tokens, num_tokens) - mock_kernel.assert_called_once() - args = mock_kernel.call_args[0] - assert (args[1] == x).all() - assert (args[2] == cu_num_tokens).all() - - # Run actual function - result = expand_batch_to_tokens(x, cu_num_tokens, num_tokens) 
- expected = torch.tensor([10, 10, 20, 20, 20, 30, 30]) - assert torch.equal(result, expected) - - def test_sample_recovered_tokens_pytorch_ngram(self): - """Test recovered token sampling under n-gram mode""" - output_token_ids = torch.empty(2, dtype=torch.int32) - cu_num_draft_tokens = torch.tensor([1, 2]) - draft_token_ids = torch.tensor([1, 2]) - draft_probs = None - target_probs = torch.tensor([ - [0.1, 0.2, 0.7], - [0.3, 0.3, 0.4], - ]) - q = torch.tensor([ - [0.1, 0.2, 0.7], - [0.5, 0.4, 0.1], - ]) - vocab_size = 3 - - sample_recovered_tokens_pytorch( - output_token_ids, - cu_num_draft_tokens, - draft_token_ids, - draft_probs, - target_probs, - q, - vocab_size, - IS_NGRAM=True, - ) - - assert output_token_ids[0].item() == 0 - assert output_token_ids[1].item() == 1 - - def test_sample_recovered_tokens_pytorch_autoregressive(self): - """Test recovered token sampling for autoregressive models""" - output_token_ids = torch.empty(2, dtype=torch.int32) - cu_num_draft_tokens = torch.tensor([1, 1]) - draft_token_ids = torch.tensor([0, 1]) - draft_probs = torch.tensor([ - [0.6, 0.1, 0.3], - [0.2, 0.7, 0.1], - ]) - target_probs = torch.tensor([ - [0.8, 0.1, 0.1], - [0.3, 0.6, 0.1], - ]) - q = torch.tensor([ - [0.5, 0.3, 0.2], - [0.1, 0.8, 0.1], - ]) - vocab_size = 3 - - sample_recovered_tokens_pytorch( - output_token_ids, - cu_num_draft_tokens, - draft_token_ids, - draft_probs, - target_probs, - q, - vocab_size, - IS_NGRAM=False, - ) - assert output_token_ids[0].item() == 0 diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py index ec00c0d965..34a5cca3f8 100644 --- a/tests/ut/test_ascend_config.py +++ b/tests/ut/test_ascend_config.py @@ -236,71 +236,3 @@ def test_check_torchair_supported(self): for model_type, expected_output in test_cases: self.assertEqual(_check_torchair_supported(model_type), expected_output) - - @_clean_up_ascend_config - def test_ascend_config_load_error(self): - test_vllm_config = VllmConfig() - # graph_batch_sizes should be list. 
- with self.assertRaises(TypeError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "graph_batch_sizes": "fake_size", - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - - # use_cached_graph should not be enabled without torchair graph mode - with self.assertRaises(RuntimeError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - "use_cached_graph": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - - # graph_batch_sizes_init should not be enabled without torchair graph mode - with self.assertRaises(RuntimeError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - "graph_batch_sizes_init": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - - # enable_multistream_mla should not be enabled without torchair graph mode - with self.assertRaises(RuntimeError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - "enable_multistream_mla": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - - # enable_multistream_moe should not be enabled without torchair graph mode - with self.assertRaises(RuntimeError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - "enable_multistream_moe": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - - # enable_kv_nz should not be enabled without torchair graph mode - with self.assertRaises(RuntimeError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - "enable_kv_nz": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) diff --git a/tests/ut/test_envs.py b/tests/ut/test_envs.py deleted file mode 100644 index 6cf5f81d24..0000000000 --- a/tests/ut/test_envs.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
- -import inspect -import os - -from tests.ut.base import TestBase -from vllm_ascend import envs - - -class TestEnvVariables(TestBase): - - def setUp(self): - self.env_vars = list(envs.env_variables.keys()) - - def test_env_vars_behavior(self): - for var_name in self.env_vars: - with self.subTest(var=var_name): - original_val = os.environ.get(var_name) - var_handler = envs.env_variables[var_name] - - try: - if var_name in os.environ: - del os.environ[var_name] - self.assertEqual(getattr(envs, var_name), var_handler()) - - handler_source = inspect.getsource(var_handler) - if 'int(' in handler_source: - test_vals = ["123", "456"] - elif 'bool(int(' in handler_source: - test_vals = ["0", "1"] - else: - test_vals = [f"test_{var_name}", f"custom_{var_name}"] - - for test_val in test_vals: - os.environ[var_name] = test_val - self.assertEqual(getattr(envs, var_name), - var_handler()) - - finally: - if original_val is None: - os.environ.pop(var_name, None) - else: - os.environ[var_name] = original_val - - def test_dir_and_getattr(self): - self.assertEqual(sorted(envs.__dir__()), sorted(self.env_vars)) - for var_name in self.env_vars: - with self.subTest(var=var_name): - getattr(envs, var_name) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index c22db8b2c5..89441f8a45 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -543,9 +543,13 @@ def test_get_piecewise_backend_cls_returns_correct_value(self): @patch("torch.distributed.is_hccl_available", return_value=True) @patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL") + @patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options") @patch("torch.distributed.ProcessGroup") - def test_successful_initialization(self, mock_pg, mock_pg_hccl, _): + def test_successful_initialization(self, mock_pg, mock_options_cls, + mock_pg_hccl, _): mock_prefix = MagicMock(spec=PrefixStore) + mock_options = MagicMock(spec=ProcessGroup.Options) + mock_options_cls.return_value = mock_options mock_backend = MagicMock() mock_pg_hccl.return_value = mock_backend group_rank = 0 @@ -570,7 +574,8 @@ def test_successful_initialization(self, mock_pg, mock_pg_hccl, _): timeout=timedelta(seconds=30), ) - mock_pg.assert_called_once_with(mock_prefix, group_rank, group_size) + mock_pg.assert_called_once_with(mock_prefix, group_rank, group_size, + unittest.mock.ANY) mock_pg_hccl.assert_called_once_with(mock_prefix, group_rank, group_size, unittest.mock.ANY) mock_backend._set_sequence_number_for_group.assert_called_once() diff --git a/vllm_ascend/__init__.py b/vllm_ascend/__init__.py index 7588e70ed9..c8f33313ba 100644 --- a/vllm_ascend/__init__.py +++ b/vllm_ascend/__init__.py @@ -23,5 +23,9 @@ def register(): def register_model(): + # fix pytorch schema check error, remove this line after pytorch + # is upgraded to 2.7.0 + import vllm_ascend.patch.worker.patch_common.patch_utils # noqa: F401 + from .models import register_model register_model() diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index b8fd24e7d1..4bc6e88839 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -76,31 +76,6 @@ def __init__(self, torchair_graph_config): raise ValueError( "graph_batch_sizes_init is only valid when graph_batch_sizes is empty" ) - if not self.enabled: - if self.use_cached_graph: - raise RuntimeError( - "use_cached_graph is valid only when Torchair graph mode is enabled" - ) - if self.graph_batch_sizes: - raise RuntimeError( - "graph_batch_sizes is valid only when Torchair graph mode is enabled" - 
) - if self.graph_batch_sizes_init: - raise RuntimeError( - "graph_batch_sizes_init is valid only when Torchair graph mode is enabled" - ) - if self.enable_multistream_mla: - raise RuntimeError( - "enable_multistream_mla is valid only when Torchair graph mode is enabled" - ) - if self.enable_multistream_moe: - raise RuntimeError( - "enable_multistream_moe is valid only when Torchair graph mode is enabled" - ) - if self.enable_kv_nz: - raise RuntimeError( - "enable_kv_nz is valid only when Torchair graph mode is enabled" - ) class AscendSchedulerConfig: diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index c86253472f..83e4ee8fea 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -7,9 +7,9 @@ from vllm.config import VllmConfig from vllm.distributed import get_dp_group, get_ep_group, get_tp_group from vllm.forward_context import get_forward_context, set_forward_context +from vllm.platforms import current_platform import vllm_ascend.envs as envs -from vllm_ascend.platform import NPUPlatform class FusedMoEState(Enum): @@ -18,12 +18,11 @@ class FusedMoEState(Enum): MC2 = 2 AllGatherEP = 3 NaiveMulticast = 4 - All2AllSeq = 5 # TODO(zzzzwwjj): add soc_version to choose branch -def _get_fused_moe_state(ep_size: int, with_prefill: bool, - is_deepseek_v3_r1: bool): +def get_fused_moe_state(ep_size: int, with_prefill: bool, + is_deepseek_v3_r1: bool): # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep # only supports deepseek v3/r1 if (envs.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1 @@ -34,10 +33,6 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool, return FusedMoEState.NaiveMulticast else: return FusedMoEState.AllGather - elif envs.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ: - # MC2 Dispatch/Combine performs better than alltoall_seq in decoding stage. - return (FusedMoEState.All2AllSeq if - (ep_size < 16 or with_prefill) else FusedMoEState.MC2) # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph. 
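# The hunk above trims the MoE routing-state selection down to four outcomes by removing the
# All2AllSeq branch. The sketch below is a self-contained restatement of that selection logic
# for readability; it is not the vllm-ascend implementation. The guard on the
# NaiveMulticast/AllGather branch is not visible in this hunk, so the `ep_size == 1` test here
# is an assumption, and the environment flag is passed in as a plain bool instead of being read
# from vllm_ascend.envs.
from enum import Enum


class MoEStateSketch(Enum):
    AllGather = 0
    All2All = 1
    MC2 = 2
    AllGatherEP = 3
    NaiveMulticast = 4


def pick_fused_moe_state(ep_size: int, with_prefill: bool, is_deepseek_v3_r1: bool,
                         allgather_ep_enabled: bool) -> MoEStateSketch:
    # npu_grouped_matmul_finalize_routing (allgather ep) only supports DeepSeek v3/r1.
    if allgather_ep_enabled and ep_size > 1 and is_deepseek_v3_r1:
        return MoEStateSketch.AllGatherEP
    if ep_size == 1:  # assumed guard, see note above
        return MoEStateSketch.NaiveMulticast if with_prefill else MoEStateSketch.AllGather
    # MC2 needs ep_size >= 16, and all2all cannot be used in the torchair graph.
    if ep_size < 16 or with_prefill:
        return MoEStateSketch.All2All
    return MoEStateSketch.MC2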
elif ep_size < 16 or with_prefill: return FusedMoEState.All2All @@ -73,9 +68,11 @@ def set_ascend_forward_context( is_deepseek_v3_r1 = hasattr( vllm_config.model_config.hf_config, 'n_routed_experts' ) and vllm_config.model_config.hf_config.n_routed_experts == 256 - fused_moe_state = _get_fused_moe_state(ep_size, with_prefill, - is_deepseek_v3_r1) + fused_moe_state = get_fused_moe_state(ep_size, with_prefill, + is_deepseek_v3_r1) + forward_context.fused_moe_state = fused_moe_state + forward_context.in_profile_run = in_profile_run # NOTE: This cannot be set using set_forward_context @@ -83,7 +80,15 @@ def set_ascend_forward_context( forward_context.capturing = False if num_tokens is None and attn_metadata is not None: - num_tokens = attn_metadata.num_actual_tokens + if hasattr(attn_metadata, 'num_actual_tokens'): + # for v1 engine + num_tokens = attn_metadata.num_actual_tokens + else: + # for v0 engine + num_tokens = attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens + + if num_actual_tokens is None: + num_actual_tokens = num_tokens dp_world_size = get_dp_group().world_size if dp_world_size > 1 and forward_context.dp_metadata is not None: @@ -95,8 +100,6 @@ def set_ascend_forward_context( forward_context.max_tokens_across_dp = max_tokens_across_dp if num_tokens is not None: - if num_actual_tokens is None: - num_actual_tokens = num_tokens tp_world_size = get_tp_group().world_size # NOTE: token num which need to pad to when mc2 forward_context.padded_num_tokens = math.ceil( @@ -104,7 +107,7 @@ def set_ascend_forward_context( mc2_mask = torch.zeros(forward_context.padded_num_tokens, dtype=torch.bool, - device=NPUPlatform.device_type) + device=current_platform.device_type) mc2_mask[:num_actual_tokens] = True forward_context.mc2_mask = mc2_mask diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 6f7473f614..668c802c40 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -150,8 +150,6 @@ class AscendMetadata: # (num_tokens,) slot_mapping: torch.Tensor = None - enable_dbo_across_dp: bool = False - class AscendAttentionMetadataBuilder: @@ -162,11 +160,7 @@ def reorder_batch(self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput") -> bool: return False - def build(self, - num_reqs, - num_actual_tokens, - max_query_len, - enable_dbo_across_dp: bool = False): + def build(self, num_reqs, num_actual_tokens, max_query_len): block_table = self.runner.input_batch.block_table[0].get_device_tensor( ) @@ -193,17 +187,15 @@ def build(self, attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), ACL_FORMAT_FRACTAL_NZ) - attn_metadata = AscendMetadata( - num_actual_tokens=num_actual_tokens, - block_tables=block_table, - query_start_loc=query_start_loc, - query_lens=query_lens, - seq_lens=seq_lens, - max_query_len=max_query_len, - slot_mapping=slot_mapping, - attn_mask=attn_mask, - attn_state=attn_state, - enable_dbo_across_dp=enable_dbo_across_dp) + attn_metadata = AscendMetadata(num_actual_tokens=num_actual_tokens, + block_tables=block_table, + query_start_loc=query_start_loc, + query_lens=query_lens, + seq_lens=seq_lens, + max_query_len=max_query_len, + slot_mapping=slot_mapping, + attn_mask=attn_mask, + attn_state=attn_state) return attn_metadata diff --git a/vllm_ascend/attention/attention_v1_torchair.py b/vllm_ascend/attention/attention_v1_torchair.py index fe7eb9dde6..48437b4025 100644 --- a/vllm_ascend/attention/attention_v1_torchair.py +++ 
b/vllm_ascend/attention/attention_v1_torchair.py @@ -140,8 +140,6 @@ class AscendTorchairMetadata: decode: Optional[AscendDecodeMetadata] = None - enable_dbo_across_dp: bool = False - class AscendAttentionTorchairMetadataBuilder: @@ -222,8 +220,7 @@ def build(self, num_reqs, num_actual_tokens, max_query_len, - graph_pad_size: int = -1, - enable_dbo_across_dp: bool = False): + graph_pad_size: int = -1): device = self.runner.device @@ -301,8 +298,7 @@ def build(self, max_query_len=max_query_len, slot_mapping=slot_mapping, attn_mask=attn_mask, - attn_state=attn_state, - enable_dbo_across_dp=enable_dbo_across_dp) + attn_state=attn_state) return attn_metadata diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index b2b3ad0e59..4e247562cf 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -137,7 +137,6 @@ class AscendMLAMetadata: decode: Optional[AscendMLADecodeMetadata] = None prefill: Optional[AscendMLAPrefillMetadata] = None - enable_dbo_across_dp: bool = False def __post_init__(self): pass @@ -371,7 +370,6 @@ def build( max_query_len: int, graph_pad_size: int = -1, query_start_loc: torch.Tensor = None, - enable_dbo_across_dp: bool = False, ) -> AscendMLAMetadata: assert self._num_decodes + self._num_prefills == num_reqs @@ -538,7 +536,6 @@ def build( query_start_loc=query_start_loc, block_tables=block_table, seq_lens=seq_lens, - enable_dbo_across_dp=enable_dbo_across_dp, ) @@ -1082,10 +1079,11 @@ def forward( ] num_actual_toks = attn_metadata.num_actual_tokens if k_pe is None and not self.running_in_graph: - kv_c, k_pe = self.kv_a_proj_with_mqa( - hidden_states_or_kv_c_normed)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + if not self.torchair_graph_enabled: + kv_c, k_pe = self.kv_a_proj_with_mqa( + hidden_states_or_kv_c_normed)[0].split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) else: kv_c_normed = hidden_states_or_kv_c_normed assert attn_metadata.num_decodes is not None and \ @@ -1104,13 +1102,12 @@ def forward( if not self.running_in_graph: hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...] prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:] - decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] - prefill_hs = hidden_states_or_kv_c_normed[num_decode_tokens:] - # if not self.torchair_graph_enabled: - k_pe = k_pe[:num_actual_toks, ...] - k_pe = k_pe.unsqueeze(1) - decode_k_pe = k_pe[:num_decode_tokens] - prefill_k_pe = k_pe[num_decode_tokens:] + if not self.torchair_graph_enabled: + decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] + k_pe = k_pe[:num_actual_toks, ...] + k_pe = k_pe.unsqueeze(1) + decode_k_pe = k_pe[:num_decode_tokens] + prefill_k_pe = k_pe[num_decode_tokens:] else: decode_hs_or_q_c = hidden_states_or_q_c if has_decode: @@ -1170,11 +1167,11 @@ def forward( prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) prefill_k_pe, prefill_k_nope = self.exec_kv_prefill( - prefill_hs, cos, sin, kv_cache, - attn_metadata.slot_mapping[num_decode_tokens:]) + hidden_states_or_kv_c_normed, cos, sin, kv_cache, + attn_metadata.slot_mapping) kv_c_normed = prefill_k_nope[:num_actual_toks, ...] 
-            prefill_k_c_normed = prefill_k_nope
+            prefill_k_c_normed = prefill_k_nope[num_decode_tokens:]
             prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads,
                                              -1)
             prefill_q = torch.cat([prefill_q_nope, prefill_q_pe], dim=-1)
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
index dfdc9aa863..3ff286124c 100644
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -17,6 +17,7 @@
 import time
 from collections import deque
 from typing import Iterable, Union
+import os
 
 from vllm.config import VllmConfig
 from vllm.distributed.kv_events import KVEventBatch
@@ -50,9 +51,13 @@ def __init__(
                          include_finished_set, log_stats)
         self.scheduled_req_ids: set[str] = set()
         self.running: list[Request] = []
+        self.cp_size = int(os.getenv("VLLM_CP_SIZE", '1'))
+        if self.cp_size > 1:
+            assert not self.cache_config.enable_prefix_caching  # prefix caching is not supported yet
 
     def schedule(self) -> SchedulerOutput:
         if self.scheduler_config.chunked_prefill_enabled:
+            assert self.cp_size == 1
             return super().schedule()
         scheduled_new_reqs: list[Request] = []
         scheduled_resumed_reqs: list[Request] = []
@@ -129,6 +134,26 @@ def skip_cur_request():
                 num_new_local_computed_tokens = 0
                 num_computed_tokens = request.num_computed_tokens
 
+            # Split the sequence only after prefix matching is done.
+            # The split is only used for block allocation; the full sequence length is still passed to the worker, which does its own split, so the request state is left mostly untouched.
+            if self.cp_size > 1:
+                # block_size align
+                num_total_blocks = cdiv((request.num_tokens - num_computed_tokens), self.block_size)
+
+                request.num_blocks_cp = [num_total_blocks // self.cp_size] * self.cp_size
+                remain_blocks = num_total_blocks % self.cp_size
+                for i in range(remain_blocks):
+                    request.num_blocks_cp[i] += 1
+
+                # Blocks are split in natural order, so KV is stored in order and block-aligned.
+                start_id = 0
+                request.token_ids_cp = [0] * self.cp_size
+                request.num_computed_tokens_cp = [0] * self.cp_size
+                for i in range(self.cp_size):
+                    request.token_ids_cp[i] = request.all_token_ids[start_id:start_id+request.num_blocks_cp[i]*self.block_size]
+                    request.num_computed_tokens_cp[i] = len(request.token_ids_cp[i]) + num_computed_tokens  # number of tokens actually stored in the KV cache (excluding padding); used for prefill slot computation and for decode block allocation and slot computation
+                    start_id += request.num_blocks_cp[i] * self.block_size
+
             # P/D: loading remote KV, do not allocate for new work.
             if load_kv_async:
                 assert num_external_computed_tokens > 0
@@ -140,7 +165,10 @@ def skip_cur_request():
             # We use `request.num_tokens` instead of
             # `request.num_prompt_tokens` to consider the resumed
             # requests, which have output tokens.
-            num_new_tokens = request.num_tokens - num_computed_tokens
+            if self.cp_size > 1:
+                num_new_tokens = len(request.token_ids_cp[0])  # checks and block allocation counts are all divided by cp_size; the block-aligned length is used here, though the actual compute length would be more appropriate
+            else:
+                num_new_tokens = request.num_tokens - num_computed_tokens
             max_tokens_in_kvcache = (self.kv_cache_config.num_blocks *
                                      self.block_size)
             prompt_limit = min(prompt_limit, max_tokens_in_kvcache)
@@ -220,8 +248,10 @@ def skip_cur_request():
             req_to_new_block_ids[request.request_id] = (
                 self.kv_cache_manager.get_block_ids(request.request_id))
             # Update request info.
+            token_budget -= num_new_tokens  # token_budget is only reduced by the split sequence length; subtracting the non-block-aligned split length would match the design intent better, but subtract the block-aligned length for now
+            if self.cp_size > 1:
+                num_new_tokens = request.num_tokens - num_computed_tokens  # restore the full sequence length before passing it to the worker
             num_scheduled_tokens[request.request_id] = num_new_tokens
-            token_budget -= num_new_tokens
             request.status = RequestStatus.RUNNING
             request.num_computed_tokens = num_computed_tokens
             # Count the number of prefix cached tokens.
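# The scheduler hunks above split each request's prompt into block-aligned chunks, one per
# context-parallel rank, and the hunk that follows assigns decode tokens round-robin. The
# helpers below are a standalone sketch of that arithmetic, not the scheduler code itself;
# names are local to the sketch, and cdiv mirrors the ceiling-division helper the scheduler uses.
from typing import List


def cdiv(a: int, b: int) -> int:
    return -(-a // b)


def split_blocks_across_cp(num_uncomputed_tokens: int, block_size: int,
                           cp_size: int) -> List[int]:
    # Evenly distribute the blocks still needed, giving leftover blocks to the lowest ranks.
    num_total_blocks = cdiv(num_uncomputed_tokens, block_size)
    num_blocks_cp = [num_total_blocks // cp_size] * cp_size
    for i in range(num_total_blocks % cp_size):
        num_blocks_cp[i] += 1
    return num_blocks_cp


def slice_tokens_across_cp(all_token_ids: List[int], num_blocks_cp: List[int],
                           block_size: int) -> List[List[int]]:
    # Slice token ids in natural order so each rank stores its KV in order and block-aligned.
    token_ids_cp, start = [], 0
    for n in num_blocks_cp:
        token_ids_cp.append(all_token_ids[start:start + n * block_size])
        start += n * block_size
    return token_ids_cp


def decode_kv_rank(num_output_tokens: int, cp_size: int) -> int:
    # Equivalent to the modulo arithmetic in the next hunk: the n-th generated token
    # (1-based) lands on rank (n - 1) % cp_size.
    return (num_output_tokens - 1) % cp_size


# Example: with block_size=4 and cp_size=2, a 10-token prompt needs 3 blocks, split as [2, 1],
# so rank 0 gets 8 token slots and rank 1 gets 2.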
@@ -243,6 +273,15 @@ def skip_cur_request():
                 req_index += 1
                 continue
 
+            if self.cp_size > 1:
+                # The padding logic is complicated under block alignment, so start with plain round-robin.
+                kv_rank = len(request.output_token_ids) % self.cp_size - 1
+                if kv_rank == -1:
+                    kv_rank = self.cp_size - 1
+
+                request.token_ids_cp[kv_rank].append(request.output_token_ids[-1])  # keep this up to date; not used yet
+                request.kv_rank = kv_rank
+
             num_new_tokens = (request.num_tokens_with_spec -
                               request.num_computed_tokens)
             assert (request.num_tokens - request.num_computed_tokens) == 1
@@ -310,6 +349,8 @@ def skip_cur_request():
             req_to_new_block_ids[request.request_id] = (
                 new_blocks.get_block_ids())
             num_scheduled_tokens[request.request_id] = num_new_tokens
+            if self.cp_size > 1:
+                request.num_computed_tokens_cp[kv_rank] += num_new_tokens
             token_budget -= num_new_tokens
             req_index += 1
 
@@ -330,7 +371,7 @@ def skip_cur_request():
 
         # Check if the scheduling constraints are satisfied.
         total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
-        assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
+        assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens * self.cp_size
         assert token_budget >= 0
         assert len(self.running) <= self.max_num_running_reqs
         assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len(
diff --git a/vllm_ascend/distributed/communicator.py b/vllm_ascend/distributed/communicator.py
index 79adc89c79..7c14befa80 100644
--- a/vllm_ascend/distributed/communicator.py
+++ b/vllm_ascend/distributed/communicator.py
@@ -20,7 +20,6 @@
 import torch.distributed as dist
 from vllm.distributed.device_communicators.base_device_communicator import \
     DeviceCommunicatorBase
-from vllm.utils import logger
 
 
 class NPUCommunicator(DeviceCommunicatorBase):
@@ -35,12 +34,6 @@ def __init__(self,
         # init device according to rank
         self.device = torch.npu.current_device()
 
-        if self.use_all2all:
-            from vllm.distributed.device_communicators.all2all import \
-                NaiveAll2AllManager
-            self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
-            logger.info("Using naive all2all manager.")
-
     def all_to_all(self,
                    input_: torch.Tensor,
                    scatter_dim: int = 0,
@@ -80,17 +73,3 @@ def all_to_all,
         dist.all_to_all(output_list, input_list, group=self.device_group)
         output_tensor = torch.cat(output_list, dim=gather_dim).contiguous()
         return output_tensor
-
-    # TODO: Add ut for dispatch and combine
-    def dispatch(
-            self, hidden_states: torch.Tensor,
-            router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        assert self.all2all_manager is not None
-        hidden_states, router_logits = self.all2all_manager.dispatch(
-            hidden_states, router_logits)
-        return hidden_states, router_logits
-
-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        assert self.all2all_manager is not None
-        hidden_states = self.all2all_manager.combine(hidden_states)
-        return hidden_states
diff --git a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py
index 7631a09167..66fc313a27 100644
--- a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py
+++ b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py
@@ -9,7 +9,7 @@
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Callable, Optional, Tuple
+from typing import Any, Optional, Tuple
 
 import llm_datadist  # type: ignore
 import msgspec
@@ -28,7 +28,7 @@
 from vllm.v1.request import Request, RequestStatus
 
 from vllm_ascend import envs
-from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version
+from 
vllm_ascend.soc_info import NPUSocInfo TORCH_DTYPE_TO_NPU_DTYPE = { torch.half: llm_datadist.DataType.DT_FLOAT16, @@ -331,12 +331,14 @@ def __init__(self, vllm_config: VllmConfig): self.prefill_device_list: list[tuple[int, int]] = [] self.decode_device_list: list[tuple[int, int]] = [] global_rank_table = self.read_offline_rank_table() - self.local_agent_metadata = self.read_agent_metadata(global_rank_table) + self.local_agent_metadata = self.read_agent_metadata( + global_rank_table, self.local_ip, self.local_rank_on_node, + self.llm_datadist_role) self.llm_datadist = LLMDataDist(self.llm_datadist_role, self.local_agent_metadata.cluster_id) self.init_llm_datadist() self.finished_reqs: set[str] = set() - self.soc_info = get_ascend_soc_version() + self.soc_info = NPUSocInfo() # Set hccl deterministic for model execute os.environ["HCCL_DETERMINISTIC"] = "true" self.done_receiving_counts: defaultdict[str, @@ -446,20 +448,8 @@ def read_offline_rank_table(self): # global_rank_table = json.dumps(global_rank_table) return global_rank_table - @staticmethod - def _get_visible_devices() -> Callable[[str], bool]: - """ - Return a test function that check if the given device ID is visible. - i.e. ASCEND_RT_VISIBLE_DEVICES is not set or contains the device_id. - """ - visible_devices = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "") - if not visible_devices: - return lambda device_id: True - visible_device_list = visible_devices.split(",") - return lambda device_id: device_id in visible_device_list - - def read_agent_metadata(self, global_rank_table): - device_filter = LLMDataDistCMgrConnectorWorker._get_visible_devices() + def read_agent_metadata(self, global_rank_table, server_id, device_rank, + agent_role): devices_type_list = [] agent_metadata = None if self.llm_datadist_role == LLMRole.PROMPT: @@ -472,12 +462,11 @@ def read_agent_metadata(self, global_rank_table): for device_type in devices_type_list: device_list = global_rank_table[device_type] device_list = [ - d for d in device_list if d.get("server_id") == self.local_ip - and device_filter(d.get("device_id", "")) + d for d in device_list if d.get("server_id") == server_id ] - if len(device_list) <= self.tp_rank: + if len(device_list) <= device_rank: continue - device_info = device_list[self.tp_rank] + device_info = device_list[device_rank] super_pod_id_ = device_info.get("super_pod_id", None) server_id_ = device_info["server_id"] device_id_ = device_info["device_id"] @@ -492,7 +481,7 @@ def read_agent_metadata(self, global_rank_table): super_device_id=super_device_id_, cluster_id=cluster_id_, ) - assert agent_metadata is not None, f"Can't read the target server_id {self.local_ip} and device_rank {self.rank} from rank table" + assert agent_metadata is not None, f"Can't read the target server_id {server_id} and device_rank {device_rank} from rank table" return agent_metadata def register_kv_caches(self, kv_caches: dict[str, Tuple[torch.Tensor]]): @@ -681,7 +670,7 @@ def add_remote_agent(self, metadata: LLMDataDistCMgrAgentMetadata) -> int: rank_table["server_list"].append( # type: ignore[attr-defined] decode_server_device_info) - if self.soc_info == AscendSocVersion.A3: + if self.soc_info.is_a3: # generate super_pod_list for rank table super_pod_list = [] prefill_super_pod_info = { diff --git a/vllm_ascend/distributed/tensor_parallel.py b/vllm_ascend/distributed/tensor_parallel.py deleted file mode 100644 index 3fff0a7243..0000000000 --- a/vllm_ascend/distributed/tensor_parallel.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) 2024; NVIDIA 
CORPORATION. All rights reserved. -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapts from: Megatron/megatron/core/tensor_parallel/mappings.py. -# This file is a part of the vllm-ascend project. -import torch - - -def _gather_along_first_dim(input_, group, output_split_sizes=None): - """Gather tensors and concatenate along the first dimension. - - Args: - input_tensor (torch.Tensor): - A tensor to be gathered. - output_split_sizes (List[int], optional): - A list specifying the sizes of the output splits along the first dimension. - If None, equal splitting is assumed. Default: None. - - Returns: - torch.Tensor: Gathered tensor. - """ - world_size = torch.distributed.get_world_size(group) - # Bypass the function if we are using only 1 GPU. - if world_size == 1: - return input_ - - dim_size = list(input_.size()) - if output_split_sizes is None: - dim_size[0] = dim_size[0] * world_size - - output = torch.empty(dim_size, - dtype=input_.dtype, - device=torch.npu.current_device()) - torch.distributed.all_gather_into_tensor(output, - input_.contiguous(), - group=group) - else: - dim_size[0] = sum(output_split_sizes) - output = torch.empty(dim_size, - dtype=input_.dtype, - device=torch.npu.current_device()) - output_tensor_list = list( - torch.split(output, output_split_sizes, dim=0)) - torch.distributed.all_gather(output_tensor_list, input_, group=group) - - return output - - -def _gather_along_last_dim(input_, group): - """Gather tensors and concatenate along the last dimension.""" - - world_size = torch.distributed.get_world_size(group) - # Bypass the function if we are using only 1 GPU. - if world_size == 1: - return input_ - - dim_size = list(input_.size()) - dim_size[0] = dim_size[0] * world_size - - output = torch.empty(dim_size, - dtype=input_.dtype, - device=torch.npu.current_device()) - torch.distributed.all_gather_into_tensor(output, - input_.contiguous(), - group=group) - tensor_list = output.chunk(world_size, dim=0) - output = torch.cat(tensor_list, dim=-1).contiguous() - - return output - - -def _reduce_scatter_along_first_dim(input_, - group, - input_split_sizes=None, - use_global_buffer=False): - """Reduce-scatter the input tensor across model parallel group. - - Args: - input_ (torch.Tensor): The input tensor to be reduce-scattered. - input_split_sizes (List[int], optional): A list specifying the sizes of - the input splits along the first dimension for each rank. If None, - equal splitting is assumed. Default: None. - """ - world_size = torch.distributed.get_world_size(group) - # Bypass the function if we are using only 1 GPU. 
- if world_size == 1: - return input_ - - if input_split_sizes is None: - dim_size = list(input_.size()) - assert ( - dim_size[0] % world_size == 0 - ), "First dimension of the tensor should be divisible by tensor parallel size" - - dim_size[0] = dim_size[0] // world_size - - output = torch.empty(dim_size, - dtype=input_.dtype, - device=torch.npu.current_device()) - torch.distributed.reduce_scatter_tensor(output, - input_.contiguous(), - group=group) - else: - rank = torch.distributed.get_rank(group) - input_tensor_list = list(torch.split(input_, input_split_sizes, dim=0)) - - output = torch.empty_like(input_tensor_list[rank]) - torch.distributed.reduce_scatter(output, - input_tensor_list, - group=group) - return output - - -def _reduce_scatter_along_last_dim(input_, group): - """Reduce-scatter tensors on the last dimension.""" - world_size = torch.distributed.get_world_size(group) - target_shape = list(input_.size()) - target_shape[-1] = target_shape[-1] // world_size - input_ = input_.reshape(-1, input_.shape[-1]) - split_tensors = torch.split(input_, - split_size_or_sections=input_.shape[-1] // - world_size, - dim=1) - concat_tensor = torch.cat(split_tensors, dim=0) - output = _reduce_scatter_along_first_dim(concat_tensor, - group).reshape(target_shape) - return output - - -def all_gather_last_dim_from_tensor_parallel_region(input_, group): - """Wrapper for autograd function: forward: AG, backward RS """ - return _gather_along_last_dim(input_, group) - - -def reduce_scatter_to_sequence_parallel_region(input_, - group, - input_split_sizes=None): - """Wrapper for autograd function: forward: RS, backward AG """ - return _reduce_scatter_along_first_dim(input_, group, input_split_sizes) - - -def reduce_scatter_last_dim_to_tensor_parallel_region(input_, group): - """Wrapper for autograd function: forward: RS, backward AG: AG """ - return _reduce_scatter_along_last_dim(input_, group) - - -def gather_from_sequence_parallel_region( - input_, - group, - output_split_sizes=None, -): - """Wrapper for autograd function: forward: AG, backward: RS """ - return _gather_along_first_dim(input_, group, output_split_sizes) - - -def all_to_all(group, input, output_split_sizes=None, input_split_sizes=None): - world_size = torch.distributed.get_world_size(group=group) - # Bypass the function if we are using only 1 GPU. - if world_size == 1: - return input - - input = input.contiguous() - if output_split_sizes is None: - # Equal split (all2all) - output = torch.empty_like(input) - else: - # Unequal split (all2all-v) - output = input.new_empty( - size=[sum(output_split_sizes)] + list(input.size()[1:]), - dtype=input.dtype, - device=torch.npu.current_device(), - ) - torch.distributed.all_to_all_single( - output, - input, - output_split_sizes=output_split_sizes, - input_split_sizes=input_split_sizes, - group=group, - ) - return output - - -def all_to_all_sp2hp(input_, group): - """ - Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape - [num_tokens/TP, H] to [num_tokens, H/TP]. - - Args: - input_ (torch.Tensor): - The input tensor which has been distributed along the sequence - dimension. - - Returns: - torch.Tensor: The output tensor with shape [num_tokens, H/TP]. 
- - """ - if group is None: - return input_ - world_size = torch.distributed.get_world_size(group=group) - tp_group = group - input_ = input_.reshape(-1, input_.shape[-1]) - split_tensors = torch.split(input_, - split_size_or_sections=input_.shape[-1] // - world_size, - dim=1) - concat_tensor = torch.cat(split_tensors, dim=0) - output = all_to_all(tp_group, concat_tensor) - return output - - -def all_to_all_hp2sp(input_, group): - """ - Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape - [num_tokens, H/TP] to [num_tokens/TP, H]. - - Args: - input_ (torch.Tensor): - The input tensor which has been distributed along the hidden - dimension. - - Returns: - torch.Tensor: The output tensor with shape [num_tokens/TP, H]. - """ - if group is None: - return input_ - world_size = torch.distributed.get_world_size(group=group) - input_ = input_.reshape(-1, input_.shape[-1]) - tp_group = group - input_exchanged = all_to_all(tp_group, input_) - input_reshaped = input_exchanged.reshape(-1, input_exchanged.shape[-1]) - split_tensors = torch.split( - input_reshaped, - split_size_or_sections=input_reshaped.shape[0] // world_size, - dim=0) - output = torch.cat(split_tensors, dim=-1) - return output diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index dee6f5a542..a7bb9fad95 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -117,6 +117,11 @@ # value to False to disable the optimized model. "USE_OPTIMIZED_MODEL": lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))), + # SELECT_GATING_TOPK_SOTFMAX_EXPERTS is the equivalent of select_experts in non-quantized scenarios. + # In theory, it should have better performance than select_experts. + # Subsequent versions will remove the SELECT_GATING_TOPK_SOTFMAX_EXPERTS tag and use it as the default mode. + "SELECT_GATING_TOPK_SOTFMAX_EXPERTS": + lambda: bool(int(os.getenv("SELECT_GATING_TOPK_SOTFMAX_EXPERTS", '0'))), # The tolerance of the kv cache size, if the difference between the # actual kv cache size and the cached kv cache size is less than this value, # then the cached kv cache size will be used. @@ -154,11 +159,6 @@ # this feature is supported in A2, and eager mode will get better performance. "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))), - # Whether to enable the alltoall_seq flag, this provides a basic framework on the basis of alltoall for easy expansion. - # 0: default, normal init. - # 1: enable moe all2all seq. - "VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": - lambda: bool(int(os.getenv('VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ', '0'))), } # end-env-vars-definition diff --git a/vllm_ascend/lora/punica_wrapper/punica_npu.py b/vllm_ascend/lora/punica_wrapper/punica_npu.py index 8f1eaf901f..9ca747b2d9 100644 --- a/vllm_ascend/lora/punica_wrapper/punica_npu.py +++ b/vllm_ascend/lora/punica_wrapper/punica_npu.py @@ -322,7 +322,7 @@ def add_lora_logits(self, **kwargs) -> None: """ Applies lora specifically for LogitsProcessorWithLoRA. 
- + Semantics: buffer = (x @ lora_a_stacked) * scale y += buffer @ lora_b_stacked @@ -338,27 +338,18 @@ def add_lora_logits(self, y_org = y y = y.view(-1, y.shape[-1]) x = x.view(-1, x.shape[-1]) - - if lora_a_stacked.dim() == 2: - lora_a_stacked = lora_a_stacked.unsqueeze(0) - if lora_b_stacked.dim() == 2: - lora_b_stacked = lora_b_stacked.unsqueeze(0) - - r = lora_a_stacked.size(-1) - + r = lora_b_stacked.size(-1) if buffer is None: + # We set the buffer to be float32 by default, consistent with the + # triton op buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - - indices = self.sampler_indices - if indices.max() >= lora_a_stacked.size(0): - indices = torch.clamp(indices, 0, lora_a_stacked.size(0) - 1) - - lora_a_reshaped = lora_a_stacked.transpose(1, 2) - lora_b_reshaped = lora_b_stacked.transpose(1, 2) - - bgmv_shrink(x, lora_a_reshaped, buffer, indices, scale) - bgmv_expand(buffer, lora_b_reshaped, y, indices, add_inputs=True) - + # LogitsProcessorWithLoRA always using bgmv. + bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) + bgmv_expand(buffer, + lora_b_stacked, + y, + self.sampler_indices, + add_inputs=True) y = y.view_as(y_org) diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index f47e821b34..0b1b67a4f1 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -40,6 +40,7 @@ def register_model(): ModelRegistry.register_model( "DeepseekV3ForCausalLM", "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM") + else: ModelRegistry.register_model( "DeepseekV2ForCausalLM", diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py index 9469e99999..13e5efac62 100644 --- a/vllm_ascend/models/deepseek_dbo.py +++ b/vllm_ascend/models/deepseek_dbo.py @@ -75,6 +75,7 @@ from vllm_ascend.multistream.metadata import (MultiStreamConfig, MultiStreamStepMetadata, make_multistream_metadata_ds) +from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.utils import dispose_tensor @@ -871,9 +872,24 @@ def forward( def can_run_ms(self): attn_metadata = get_forward_context().attn_metadata + # support mla attention and V1 engine at present + if not self.use_mla: + return False # enable prefill overlap - return not (attn_metadata is None or attn_metadata.num_prefills == 0 - or not attn_metadata.enable_dbo_across_dp) + if attn_metadata is None or attn_metadata.num_prefills == 0: + return False + else: + [token_index, seq_index + ] = compute_split_seq_index(attn_metadata.query_lens, + attn_metadata.attn_state, + attn_metadata.num_decode_tokens) + if token_index == 0 or seq_index == 0 or seq_index == len( + attn_metadata.query_lens): + return False + # check whether the total tokens exceed the threshold + if self.multistream_config is None or attn_metadata.num_actual_tokens < self.multistream_config.min_total_tokens_to_split: + return False + return True def _forward_ms_layers( self, diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index ce051c4d84..888697219c 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -313,8 +313,7 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled self.enable_multistream_moe = \ - ascend_config.torchair_graph_config.enable_multistream_moe and \ - self.torchair_graph_enabled + 
ascend_config.torchair_graph_config.enable_multistream_moe self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, @@ -905,8 +904,6 @@ def load_weights(self, weights: Iterable[tuple[str, for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - if "module" in name: - continue spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) if spec_layer is not None: diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index 4629f760eb..d1a94d1dac 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -18,7 +18,7 @@ # limitations under the License. from functools import partial -from typing import Callable, Iterable, Optional, Set, Tuple, Union +from typing import Callable, Iterable, Optional, Set, Tuple import torch import torch.nn as nn @@ -30,8 +30,7 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY, - get_act_and_mul_fn) +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -43,8 +42,6 @@ from vllm.model_executor.models.utils import maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm_ascend.utils import vllm_version_is - MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -200,16 +197,12 @@ def __init__( in_channels=vision_config.in_channels, hidden_size=self.hidden_size, ) - - act_fn = get_act_and_mul_fn(vision_config.hidden_act) - if vllm_version_is("0.10.0"): - act_fn = _ACTIVATION_REGISTRY[vision_config.hidden_act] self.blocks = nn.ModuleList([ AscendQwen2_5_VisionBlock( dim=self.hidden_size, num_heads=self.num_heads, mlp_hidden_dim=vision_config.intermediate_size, - act_fn=act_fn, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], norm_layer=norm_layer, quant_config=quant_config, prefix=f"{prefix}.blocks.{layer_idx}") @@ -298,17 +291,12 @@ def pad_proj_weight(self, data): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - stacked_params_mapping: list[tuple[str, str, Union[str, int]]] = [ + stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), ] - if not vllm_version_is("0.10.0"): - stacked_params_mapping.extend([ - ("mlp.gate_up_proj.", "mlp.gate_proj.", 0), - ("mlp.gate_up_proj.", "mlp.up_proj.", 1), - ]) params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: diff --git a/vllm_ascend/models/qwen2_5_vl_without_padding.py b/vllm_ascend/models/qwen2_5_vl_without_padding.py index 8877456a6d..47ddd4455a 100644 --- a/vllm_ascend/models/qwen2_5_vl_without_padding.py +++ b/vllm_ascend/models/qwen2_5_vl_without_padding.py @@ -30,8 +30,7 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY, - get_act_and_mul_fn) +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import 
QuantizationConfig from vllm.model_executor.models.qwen2_5_vl import ( @@ -42,9 +41,6 @@ from vllm.model_executor.models.utils import maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding -from vllm_ascend.utils import vllm_version_is - class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention): @@ -164,25 +160,18 @@ def __init__( super().__init__(vision_config, norm_eps, quant_config, prefix) norm_layer = partial(RMSNorm, eps=norm_eps) self.interleaved = interleaved - head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim // - 2) self.patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding( patch_size=vision_config.patch_size, temporal_patch_size=vision_config.temporal_patch_size, in_channels=vision_config.in_channels, hidden_size=self.hidden_size, ) - - act_fn = get_act_and_mul_fn(vision_config.hidden_act) - if vllm_version_is("0.10.0"): - act_fn = _ACTIVATION_REGISTRY[vision_config.hidden_act] self.blocks = nn.ModuleList([ AscendQwen2_5_VisionBlock_Without_Padding( dim=self.hidden_size, num_heads=self.num_heads, mlp_hidden_dim=vision_config.intermediate_size, - act_fn=act_fn, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], norm_layer=norm_layer, quant_config=quant_config, prefix=f"{prefix}.blocks.{layer_idx}") @@ -213,66 +202,6 @@ def cal_cos_sin(self, rotary_pos_emb): self.hidden_size_per_attention_head) return cos_new, sin_new - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - pos_ids.append( - torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb - - def get_window_index(self, grid_thw): - window_index: list = [] - cu_window_seqlens: list = [0] - window_index_id = 0 - vit_merger_window_size = (self.window_size // - self.spatial_merge_size // self.patch_size) - - for grid_t, grid_h, grid_w in grid_thw: - llm_grid_h = grid_h // self.spatial_merge_size - llm_grid_w = grid_w // self.spatial_merge_size - index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( - grid_t, llm_grid_h, llm_grid_w) - pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size - pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size - num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size - num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size - index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) - index_padded = index_padded.reshape(grid_t, num_windows_h, - vit_merger_window_size, - num_windows_w, - vit_merger_window_size) - index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( - grid_t, num_windows_h * num_windows_w, vit_merger_window_size, - vit_merger_window_size) - seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) 
- index_padded = index_padded.reshape(-1) - index_new = index_padded[index_padded != -100] - window_index.append(index_new + window_index_id) - cu_seqlens_tmp = seqlens.cumsum( - 0) * self.spatial_merge_unit + cu_window_seqlens[-1] - cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) - window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() - window_index = torch.cat(window_index, dim=0) - return window_index, cu_window_seqlens - def forward( self, x: torch.Tensor, @@ -324,39 +253,6 @@ def forward( x = x[reverse_indices, :] return x - def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: - - grid_thw = image_input["image_grid_thw"] - assert grid_thw.ndim == 2 - - if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) - else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - - # Split concatenated embeddings for each image item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return image_embeds.split(sizes.tolist()) - - def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: - - grid_thw = video_input["video_grid_thw"] - assert grid_thw.ndim == 2 - - if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) - else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - - # Split concatenated embeddings for each video item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return video_embeds.split(sizes.tolist()) - @MULTIMODAL_REGISTRY.register_processor( Qwen2_5_VLMultiModalProcessor, diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index f3598cc623..8ff1b52a7a 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -1,7 +1,6 @@ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,205 +15,8 @@ # limitations under the License. # Adapted from vllm/model_executor/models/qwen3_moe.py # This file is a part of the vllm-ascend project. 
-from typing import Optional - -import torch -from torch import nn -from transformers import PretrainedConfig -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, CompilationLevel, VllmConfig -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, - get_tp_group) -from vllm.forward_context import get_forward_context -from vllm.model_executor.layers.fused_moe.layer import FusedMoE -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ReplicatedLinear -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.models.interfaces import (MixtureOfExperts, - SupportsLoRA, SupportsPP) -from vllm.model_executor.models.qwen3_moe import (Qwen3MoeAttention, - Qwen3MoeDecoderLayer, - Qwen3MoeForCausalLM, - Qwen3MoeMLP, Qwen3MoeModel, - Qwen3MoeSparseMoeBlock) -from vllm.model_executor.models.utils import ( - PPMissingLayer, extract_layer_index, - make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) - -from vllm_ascend.ops.fused_moe import AscendFusedMoE - - -class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): - - def __init__( - self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - nn.Module.__init__(self) - self.tp_size = get_tensor_model_parallel_world_size() - if self.tp_size > config.num_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {config.num_experts}.") - - self.gate = ReplicatedLinear( - config.hidden_size, - config.num_experts, - bias=False, - quant_config=None, - prefix=f"{prefix}.gate", - ) - - self.experts = AscendFusedMoE( - num_experts=config.num_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - prefix=f"{prefix}.experts", - ) - - self.top_k = config.num_experts_per_tok - - self.dp_size = get_dp_group().world_size - - self.tp_group = get_tp_group().device_group - self.tp_rank = get_tp_group().rank_in_group - self.ep_group = get_ep_group() - - self.params_dtype = torch.get_default_dtype() - - def forward( - self, - hidden_states, - attn_metadata=None, - ): - if attn_metadata is None: - attn_metadata = get_forward_context().attn_metadata - # when profile runs, force experts to load balanced tokens - # to avoid high memory consumption on a single rank. 
- enable_force_load_balance = get_forward_context().in_profile_run - is_prefill = get_forward_context().with_prefill - - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) - - hidden_states = self.experts( - hidden_states=hidden_states, - router_logits=router_logits, - is_prefill=is_prefill, - top_k=self.top_k, - enable_force_load_balance=enable_force_load_balance, - shared_experts=None, - ) - return hidden_states - - -class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer): - - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - vllm_config: Optional[VllmConfig] = None, - prefix: str = "", - ) -> None: - - nn.Module.__init__(self) - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = Qwen3MoeAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - rms_norm_eps=config.rms_norm_eps, - qkv_bias=getattr(config, 'attention_bias', False), - head_dim=getattr(config, 'head_dim', None), - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - - # `mlp_only_layers` in the config. - layer_idx = extract_layer_index(prefix) - mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else - config.mlp_only_layers) - use_aclgraph = (vllm_config is not None - and vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE - and not vllm_config.model_config.enforce_eager) - if (layer_idx not in mlp_only_layers) and ( - config.num_experts > 0 and - (layer_idx + 1) % config.decoder_sparse_step == 0): - if not use_aclgraph: - # FIXME: custom sparse moe block doesn't work with aclgraph. 
- self.mlp = CustomSparseMoeBlock(config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") - else: - self.mlp = Qwen3MoeSparseMoeBlock(config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") - else: - self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp") - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - -@support_torch_compile -class CustomQwen3MoeModel(Qwen3MoeModel): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - nn.Module.__init__(self) - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - parallel_config = vllm_config.parallel_config - self.num_redundant_experts = parallel_config.num_redundant_experts - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.config = config - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - prefix=f"{prefix}.embed_tokens") - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: CustomQwen3MoeDecoderLayer( - config=config, - cache_config=cache_config, - quant_config=quant_config, - vllm_config=vllm_config, - prefix=prefix), - prefix=f"{prefix}.layers", - ) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) +from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): @@ -231,45 +33,3 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - nn.Module.__init__(self) - SupportsPP.__init__(self) - SupportsLoRA.__init__(self) - MixtureOfExperts.__init__(self) - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = CustomQwen3MoeModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=maybe_prefix(prefix, "lm_head")) - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - # Set MoE hyperparameters - self.expert_weights: list[torch.Tensor] = [] - - self.moe_layers: list[FusedMoE] = [] - example_layer = None - for layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - - assert isinstance(layer, Qwen3MoeDecoderLayer) - if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): - example_layer = layer.mlp - self.moe_layers.append(layer.mlp.experts) - - if example_layer is None: - raise RuntimeError("No Qwen3MoE layer found in the model.layers.") - - self.num_moe_layers = len(self.moe_layers) - self.num_expert_groups = 1 - self.num_shared_experts = 0 diff --git a/vllm_ascend/multistream/ms_split.py b/vllm_ascend/multistream/ms_split.py index fd32a18abb..3af6337e47 100644 
--- a/vllm_ascend/multistream/ms_split.py +++ b/vllm_ascend/multistream/ms_split.py @@ -96,12 +96,10 @@ def model_input_split_v1_mla_attn( seq_lens = attn_metadata.prefill.seq_lens if attn_metadata.num_prefills > 0 else attn_metadata.decode.seq_lens [seq_lens_pre, seq_lens_post] = split_attn_tensor_type(seq_lens, seq_index) - query_start_loc_pre = query_start_loc_post = None - if attn_metadata.query_start_loc is not None: - query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1] - query_start_loc_post = deepcopy( - attn_metadata.query_start_loc[seq_index:] - ) - attn_metadata.query_start_loc[seq_index] + query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1] + query_start_loc_post = deepcopy( + attn_metadata.query_start_loc[seq_index:] + ) - attn_metadata.query_start_loc[seq_index] [block_table_pre, block_table_post] = split_attn_tensor_type(attn_metadata.block_tables, seq_index) @@ -225,7 +223,6 @@ def model_input_split_v1_mla_attn( attn_mask=attn_mask_pre, prefill=prefill_pre, decode=decode_pre, - enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, ) attention_metadata_post = _metadata_cls( num_actual_tokens=attn_metadata.num_actual_tokens - token_index, @@ -242,6 +239,5 @@ def model_input_split_v1_mla_attn( attn_state=attn_state_post, prefill=prefill_post, decode=decode_post, - enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, ) return [attention_metadata_pre, attention_metadata_post] diff --git a/vllm_ascend/ops/comm_utils.py b/vllm_ascend/ops/comm_utils.py deleted file mode 100644 index e893049ed8..0000000000 --- a/vllm_ascend/ops/comm_utils.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
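# --- Editorial aside (not part of the patch) -------------------------------
# A minimal, self-contained sketch of the query_start_loc split performed in
# the ms_split.py hunk above. It assumes query_start_loc is the usual
# cumulative-offsets tensor [0, len0, len0+len1, ...] and seq_index is the
# request index at which the two micro-batches are split; values are
# illustrative only.
import torch

query_start_loc = torch.tensor([0, 3, 7, 12, 20])  # 4 requests of length 3, 4, 5, 8
seq_index = 2                                       # split after the first two requests

# first micro-batch keeps the original offsets up to the split point
query_start_loc_pre = query_start_loc[:seq_index + 1]        # tensor([0, 3, 7])
# second micro-batch re-bases the remaining offsets so they start from 0 again
query_start_loc_post = (query_start_loc[seq_index:].clone()
                        - query_start_loc[seq_index])        # tensor([0, 5, 13])

assert query_start_loc_pre[-1] + query_start_loc_post[-1] == query_start_loc[-1]
# ---------------------------------------------------------------------------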
-import torch -import torch.distributed -import torch.distributed as dist -import torch_npu - -COMM_STREAM = None - - -def async_all_to_all(input_, - output_split_sizes, - input_split_sizes, - group, - event=None): - if output_split_sizes is None: - # Equal split (all2all) - a2a_out = torch.empty_like(input_) - else: - # Unequal split (all2all-v) - a2a_out = input_.new_empty( - size=[sum(output_split_sizes)] + list(input_.size()[1:]), - dtype=input_.dtype, - device=torch.npu.current_device(), - ) - - if event: - # multi stream wait event - global COMM_STREAM - if COMM_STREAM is None: - COMM_STREAM = torch_npu.npu.Stream( - device=torch.npu.current_device()) - with torch_npu.npu.stream(COMM_STREAM): - event.wait() - handle = dist.all_to_all_single( - a2a_out, - input_.contiguous(), - output_split_sizes=output_split_sizes, - input_split_sizes=input_split_sizes, - group=group, - async_op=True) - else: - handle = dist.all_to_all_single(a2a_out, - input_.contiguous(), - output_split_sizes=output_split_sizes, - input_split_sizes=input_split_sizes, - group=group, - async_op=True) - return input_, a2a_out, handle diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py index eeb8ec3223..49880c72a3 100644 --- a/vllm_ascend/ops/common_fused_moe.py +++ b/vllm_ascend/ops/common_fused_moe.py @@ -22,11 +22,13 @@ from vllm.model_executor.layers.fused_moe.layer import \ UnquantizedFusedMoEMethod -from vllm_ascend.ascend_config import get_ascend_config +import vllm_ascend.envs as envs_ascend from vllm_ascend.ops.fused_moe import (fused_experts, fused_experts_moge, - select_experts) + select_experts, + select_gating_top_k_softmax_experts) from vllm_ascend.utils import is_310p +SELECT_GATING_TOPK_SOTFMAX_EXPERTS: bool = envs_ascend.SELECT_GATING_TOPK_SOTFMAX_EXPERTS original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__ @@ -34,15 +36,7 @@ def unquantized_fused_moe_init_func(self, *args, **kwargs): original_unquantized_fused_moe_init_func(self, *args, **kwargs) vllm_config = get_current_vllm_config() self.max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens - - ascend_config = get_ascend_config() - - if ascend_config.torchair_graph_config.enabled: - self.use_aclgraph = False - else: - self.use_aclgraph = (vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE - and not vllm_config.model_config.enforce_eager) + self.use_aclgraph = vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not vllm_config.model_config.enforce_eager def forward_oot( @@ -67,19 +61,26 @@ def forward_oot( logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor: - topk_weights, topk_ids = select_experts( - global_num_experts=global_num_experts, - hidden_states=x, - router_logits=router_logits, - top_k=top_k, - use_grouped_topk=use_grouped_topk, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias, - ) + if SELECT_GATING_TOPK_SOTFMAX_EXPERTS: + topk_weights, topk_ids = select_gating_top_k_softmax_experts( + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + renormalize=renormalize) + else: + topk_weights, topk_ids = select_experts( + global_num_experts=global_num_experts, + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + 
renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) if topk_ids.shape[1] < top_k or is_310p(): assert global_num_experts is not None diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 04d288b063..6b3338ad63 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -16,7 +16,7 @@ # Adapted from vllm/tests/kernels/test_moe.py import os -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -45,8 +45,6 @@ data_parallel_reduce_scatter from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer -from vllm_ascend.ops.moe_dispatcher.token_dispatcher import ( - MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig) from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor from vllm_ascend.utils import (AscendSocVersion, dispose_tensor, get_all_reduce_merge_state, @@ -54,6 +52,7 @@ get_rm_router_logits_state, is_310p) MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER +SELECT_GATING_TOPK_SOTFMAX_EXPERTS: bool = envs_ascend.SELECT_GATING_TOPK_SOTFMAX_EXPERTS def process_topk_ids(topk_ids: torch.Tensor, expert_num: int, ep_size: int, @@ -275,13 +274,11 @@ def fused_experts_with_mc2( return hidden_states, shared_hidden_states -def apply_mlp( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - group_list: torch.Tensor, - group_list_type: int = 1, -) -> torch.Tensor: +def apply_mlp(hidden_states_wrapper: List[torch.Tensor], + w1: torch.Tensor, + w2: torch.Tensor, + group_list: torch.Tensor, + group_list_type: int = 1) -> torch.Tensor: """ apply MLP: gate_up_proj -> swiglu -> down_proj @@ -303,6 +300,9 @@ def apply_mlp( hidden_states: output hidden states after MLP. """ + assert len(hidden_states_wrapper) == 1 + hidden_states = hidden_states_wrapper.pop() + w1 = w1.transpose(1, 2) hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], @@ -330,8 +330,6 @@ def apply_mlp( return hidden_states -# currently expert parallelism implemented with all2all -# is under-optimized. def fused_experts_with_all2all( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -546,7 +544,10 @@ def fused_experts_with_all2all_buffer( hidden_states = hidden_states[sorted_idx] group_list_type = 0 - hidden_states = apply_mlp(hidden_states, + hidden_states_wrapper = [hidden_states] + del hidden_states + + hidden_states = apply_mlp(hidden_states_wrapper, w1, w2, expert_tokens, @@ -682,24 +683,6 @@ def fused_experts_moge( return final_hidden_states -def fused_experts_with_all2allv( - token_dispatcher, - probs, - routing_map, - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, -): - # Enable moe alltoallv, it's a balanced policy for precision and efficiency. 
- (share_experts_output, dispatched_input, - tokens_per_expert) = (token_dispatcher.token_permutation( - hidden_states, probs, routing_map)) - - expert_output = apply_mlp(dispatched_input, w1, w2, tokens_per_expert) - output, mlp_bias = token_dispatcher.token_unpermutation(expert_output) - return output - - def fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -876,6 +859,39 @@ def fused_experts( return final_hidden_states +def select_gating_top_k_softmax_experts( + hidden_states: torch.Tensor, router_logits: torch.Tensor, top_k: int, + renormalize: bool) -> tuple[torch.Tensor, torch.Tensor]: + """ + Select top-k experts based on router logits. + only supports float16、bfloat16、float32 + + Args: + hidden_states: Hidden states of shape (num_tokens, hidden_size). + router_logits: Router logits of shape (num_tokens, num_experts). + top_k: Number of experts to select. + renormalize: Whether to renormalize the routing weights. + + Returns: + topk_weights: Routing weights of shape (num_tokens, top_k). + topk_ids: Selected expert IDs of shape (num_tokens, top_k). + + Raises: + ValueError: If an unsupported scoring function is provided. + """ + topk_weights, topk_ids, row_idx = torch_npu.npu_moe_gating_top_k_softmax( + router_logits, None, k=top_k) + + # # Required by npu_moe_init_routing + # topk_weights = topk_weights.to(hidden_states.dtype) + # topk_ids = topk_ids.to(torch.int32) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_ids + + def native_grouped_topk( topk_weights: torch.Tensor, num_expert_group: Optional[int], @@ -937,24 +953,8 @@ def select_experts( ValueError: If an unsupported scoring function is provided. """ - def _renormalize_topk_weights( - topk_weights: torch.Tensor, - renormalize: bool, - ): - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, - keepdim=True) - return topk_weights - if scoring_func == "softmax": # NOTE: vLLM use dtype=torch.float here - if not use_grouped_topk and custom_routing_function is None: - topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k_softmax( - x=router_logits, finished=None, k=top_k) - topk_ids = topk_ids.to(torch.int32) - topk_weights = _renormalize_topk_weights(topk_weights, renormalize) - return topk_weights, topk_ids - topk_weights = router_logits.softmax(dim=-1) elif scoring_func == "sigmoid": topk_weights = router_logits.sigmoid() @@ -988,11 +988,10 @@ def _renormalize_topk_weights( k=top_k, dim=-1, sorted=False) - topk_ids = topk_ids.to(torch.int32) - topk_weights = _renormalize_topk_weights(topk_weights, renormalize) - return topk_weights, topk_ids - - if custom_routing_function is not None: + elif custom_routing_function is None: + topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1) + topk_weights = topk_weights.to(hidden_states.dtype) + else: topk_weights, topk_ids = custom_routing_function( hidden_states=hidden_states, gating_output=router_logits, @@ -1003,12 +1002,11 @@ def _renormalize_topk_weights( topk_ids = topk_ids.to(torch.int32) return topk_weights, topk_ids - topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1) - topk_weights = topk_weights.to(hidden_states.dtype) - # Required by npu_moe_init_routing topk_ids = topk_ids.to(torch.int32) - topk_weights = _renormalize_topk_weights(topk_weights, renormalize) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) return topk_weights, topk_ids @@ -1072,18 +1070,23 @@ def apply( if is_deepseek_v3_r1: topk_weights, 
topk_ids, _ = torch_npu.npu_moe_gating_top_k( router_logits, - k=top_k, # topk currently is 8 + k=top_k, # topk当前写8 bias=e_score_correction_bias, k_group=topk_group, # fix: 4 group_count=num_expert_group, # fix 8 - group_select_mode= - 1, # 0: the maximum in the group; 1: topk2.sum(fix) + group_select_mode=1, # 0: group中的最大; 1: topk2.sum(fix) renorm=0, # 0: softmax->topk(fix); 1: topk->softmax norm_type=1, # 0: softmax; 1: sigmoid(fix) - # out_flag=False, # todo new api; should the third output be output - # y2_flag=False, # old api; should the third output be output + # out_flag=False, # todo new api; 第三个输出是否输出 + # y2_flag=False, # old api; 第三个输出是否输出 routed_scaling_factor=1, eps=float(1e-20)) + elif SELECT_GATING_TOPK_SOTFMAX_EXPERTS: + topk_weights, topk_ids = select_gating_top_k_softmax_experts( + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + renormalize=renormalize) else: topk_weights, topk_ids = select_experts( hidden_states=x, @@ -1102,7 +1105,7 @@ def apply( # this is a naive implementation for experts load balance so as # to avoid accumulating too much tokens on a single rank. # currently it is only activated when doing profile runs. - if enable_force_load_balance and not self.use_aclgraph: + if enable_force_load_balance: topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) fused_moe_state = get_forward_context().fused_moe_state @@ -1142,16 +1145,6 @@ def apply( global_batch_size=self.global_batch_size, expert_map=expert_map, ep_group=get_ep_group()) - elif fused_moe_state == FusedMoEState.All2AllSeq: - token_dispatcher = kwargs.get("token_dispatcher") - return fused_experts_with_all2allv( - token_dispatcher=token_dispatcher, - probs=topk_weights, - routing_map=topk_ids, - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - ) else: return fused_experts_with_all2all(hidden_states=x, w1=layer.w13_weight, @@ -1260,8 +1253,7 @@ def __init__( self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled self.enable_multistream_moe = \ - ascend_config.torchair_graph_config.enable_multistream_moe and \ - self.torchair_graph_enabled + ascend_config.torchair_graph_config.enable_multistream_moe if self.scoring_func != "softmax" and not self.use_grouped_topk: raise ValueError("Only softmax scoring function is supported for " @@ -1303,25 +1295,6 @@ def __init__( # NOTE: self.tp_group is not expert_tp_group self.tp_group = get_tp_group().device_group self.quant_method.create_weights(layer=self, **moe_quant_params) - self.token_dispatcher = None - if envs_ascend.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ and isinstance( - self.quant_method, AscendUnquantizedFusedMoEMethod): - self.reduce_results = False - moe_dispatcher_config = ( - MoEDispatcherConfig().set_num_moe_experts( - self.global_num_experts).set_num_local_experts( - self.local_num_experts).set_moe_router_topk( - top_k).set_group_topk(topk_group). 
- set_num_groups(num_expert_group).set_expert_bias( - e_score_correction_bias).set_scaling_factor(1.0).build()) - self.token_dispatcher = MoEAlltoAllSeqOverLapDispatcher( - moe_dispatcher_config) - if envs_ascend.VLLM_ASCEND_ENABLE_DBO: - token_dispatcher1 = MoEAlltoAllSeqOverLapDispatcher( - moe_dispatcher_config) - self.token_dispatchers = [ - self.token_dispatcher, token_dispatcher1 - ] def naive_multicast(self, x: torch.Tensor, cu_tokens_across_dp_cpu: torch.Tensor): @@ -1461,7 +1434,6 @@ def forward(self, shared_experts=shared_experts if self.torchair_graph_enabled and self.enable_multistream_moe and not is_prefill else None, mc2_mask=mc2_mask, - token_dispatcher=self.token_dispatcher, quantized_x_for_share=quantized_x_for_share, dynamic_scale_for_share=dynamic_scale_for_share, ) @@ -1478,11 +1450,11 @@ def forward(self, dist.all_gather(list(chunk_hidden_states), e_hidden_states, self.tp_group) final_hidden_states = torch.cat(chunk_hidden_states, dim=0) - dispose_tensor(e_hidden_states) else: final_hidden_states = e_hidden_states if num_tokens < padding_size: final_hidden_states = final_hidden_states[:num_tokens] + dispose_tensor(e_hidden_states) elif self.dp_size > 1: if fused_moe_state == FusedMoEState.NaiveMulticast: start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[ @@ -1497,8 +1469,6 @@ def forward(self, e_hidden_states, dim=0) final_hidden_states = final_hidden_states[:num_tokens] dispose_tensor(e_hidden_states) - else: - final_hidden_states = e_hidden_states else: final_hidden_states = e_hidden_states @@ -1539,7 +1509,6 @@ def _forward_ms_fused_moe_comp( scoring_func=self.scoring_func, e_score_correction_bias=self.e_score_correction_bias, is_prefill=is_prefill, - enable_force_load_balance=enable_force_load_balance, - ) + enable_force_load_balance=enable_force_load_balance) return hidden_states diff --git a/vllm_ascend/ops/moe_dispatcher/__init__.py b/vllm_ascend/ops/moe_dispatcher/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py b/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py deleted file mode 100644 index 402e8fb93a..0000000000 --- a/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py +++ /dev/null @@ -1,453 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright (c) 2024; NVIDIA CORPORATION. All rights reserved. -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
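# --- Editorial aside (not part of the patch) -------------------------------
# Plain-PyTorch reference for the routing math used by the ops/fused_moe.py
# helpers above (select_gating_top_k_softmax_experts and the softmax branch of
# select_experts): softmax over the router logits, top-k selection, and
# optional renormalization so the kept weights sum to 1. This is only a
# readability aid for the fused torch_npu kernels; names and shapes are
# illustrative.
import torch

def reference_topk_softmax(router_logits: torch.Tensor, top_k: int,
                           renormalize: bool):
    # router_logits: [num_tokens, num_experts]
    scores = router_logits.softmax(dim=-1)
    topk_weights, topk_ids = scores.topk(top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    # npu_moe_init_routing downstream expects int32 expert ids
    return topk_weights, topk_ids.to(torch.int32)

weights, ids = reference_topk_softmax(torch.randn(4, 8), top_k=2, renormalize=True)
assert torch.allclose(weights.sum(dim=-1), torch.ones(4))
# ---------------------------------------------------------------------------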
-from typing import Optional - -import torch -import torch_npu -from vllm.distributed.parallel_state import get_ep_group - -from vllm_ascend.distributed.tensor_parallel import ( - all_gather_last_dim_from_tensor_parallel_region, all_to_all_hp2sp, - all_to_all_sp2hp, gather_from_sequence_parallel_region, - reduce_scatter_last_dim_to_tensor_parallel_region) -from vllm_ascend.ops.comm_utils import async_all_to_all - - -class MoEDispatcherConfig: - - def __init__(self): - self.num_local_experts: int = 0 - self.num_moe_experts: int = 0 - self.moe_pad_expert_input_to_capacity: bool = False - self.moe_expert_capacity_factor: Optional[float] = None - self.moe_router_topk: int = 2 - self.moe_grouped_gemm: bool = False - self.group_topk: int = 0 - self.num_groups: int = 1 - self.expert_bias: torch.Tensor = None - self.scaling_factor: Optional[float] = None - self.is_fused: bool = True - - def set_num_local_experts(self, num_local_experts): - self.num_local_experts = num_local_experts - return self - - def set_num_moe_experts(self, num_moe_experts): - self.num_moe_experts = num_moe_experts - return self - - def set_moe_pad_expert_input_to_capacity(self, - moe_pad_expert_input_to_capacity): - self.moe_pad_expert_input_to_capacity = moe_pad_expert_input_to_capacity - return self - - def set_moe_expert_capacity_factor(self, moe_expert_capacity_factor): - self.moe_expert_capacity_factor = moe_expert_capacity_factor - return self - - def set_moe_router_topk(self, moe_router_topk): - self.moe_router_topk = moe_router_topk - return self - - def set_moe_grouped_gemm(self, moe_grouped_gemm): - self.moe_grouped_gemm = moe_grouped_gemm - return self - - def set_group_topk(self, group_topk): - self.group_topk = group_topk - return self - - def set_num_groups(self, num_groups): - self.num_groups = num_groups - return self - - def set_expert_bias(self, expert_bias): - self.expert_bias = expert_bias - return self - - def set_scaling_factor(self, scaling_factor): - self.scaling_factor = scaling_factor - return self - - def set_is_fused(self, is_fused): - self.is_fused = is_fused - return self - - def build(self): - return self - - -class MoEDispatcher: - - def __init__(self, config: MoEDispatcherConfig) -> None: - """ - Initialize the MoE Token Dispatcher. - """ - self.config = config - self.shared_experts = None - - def set_shared_experts(self, shared_experts): - self.shared_experts = shared_experts - - @property - def ep_group(self): - """Get expert model parallel group.""" - return get_ep_group().device_group - - @property - def ep_rank(self): - return get_ep_group().rank_in_group - - @property - def ep_size(self): - return get_ep_group().world_size - - @property - def tp_ep_group(self): - """Get expert tensor and model parallel group.""" - return None - - @property - def tp_ep_size(self): - return 1 - - -class MoEAlltoAllSeqOverLapDispatcher(MoEDispatcher): - overlap_stream = None - """ - The implementation of the AlltoAll-based token dispatcher, which handles token - dispatching on the sequence level instead of token level. The core of this implementation - lies in each device dispatching on the entire sequence, with the hidden state being partitioned. - - """ - - def __init__(self, config: MoEDispatcherConfig): - """ - Initialize the AlltoAllSeq token dispatcher. - - Args: - config (MoEDispatcherConfig): Configuration for the transformer model. 
- """ - super().__init__(config) - self.num_local_experts = config.num_local_experts - self.config = config - # use MOEAlltoAllSEQTokenDispatcher to init - - self.hidden_shape = None - self.num_input_tokens = None - self.num_experts = config.num_moe_experts - assert self.num_local_experts > 0, "Expected at least one expert" - if self.num_local_experts > 1: - self.expert_ids_per_ep_rank = torch.tensor( - [i % self.num_local_experts for i in range(self.num_experts)], - dtype=torch.int32, - device=torch.npu.current_device(), - ) - - local_expert_indices_offset = (self.ep_rank * self.num_local_experts) - - self.local_expert_indices = [ - local_expert_indices_offset + i - for i in range(self.num_local_experts) - ] - assert (len(self.local_expert_indices) == self.num_local_experts - ), "Invalid local expert indices" - for i in range(len(self.local_expert_indices) - 1): - assert (self.local_expert_indices[i] == - self.local_expert_indices[i + 1] - - 1), "local_expert_indices must be continuous" - self.probs = None - self.input_splits = None - self.output_splits = None - self.routing_map = None - self.hidden_shape_before_permute = None - - # [tp_ep_size * ep_size, num_local_experts]. Represents the number of tokens sent - # to each local expert by all ranks. - self.num_global_tokens_per_local_expert_cpu = None - self.num_global_tokens_per_local_expert = None - - # A cuda stream synchronization is needed in self.token_permutation() - # in some cases, because there are several non-blocking DtoH data - # transfers called in self.preprocess(). The synchronization happens - # at different points based on MoE settings as late as possible. - # Valid sync points are "before_permutation_1", "before_ep_alltoall", - # "before_finish", and "no_sync". - self.device_sync_point = "no_sync" - - # cached intermediate tensors. - self.cached_permutated_local_input_tokens = None - self.cached_global_input_tokens = None - self.cached_shared_expert_output = None - self.tokens_per_expert = None - self.perm1_finish_event = None - self.global_input_tokens_local_experts_indices = None - - if MoEAlltoAllSeqOverLapDispatcher.overlap_stream is None: - MoEAlltoAllSeqOverLapDispatcher.overlap_stream = torch.npu.Stream() - - self.overlap_stream = MoEAlltoAllSeqOverLapDispatcher.overlap_stream - - def preprocess(self, - indices: torch.Tensor, - with_sync=True) -> torch.Tensor: - """ - Preprocess routing map for AlltoAll communication and token permutation. - This method computes the number of tokens assigned to each expert based on - the routing map. It also initializes the necessary data structures for - AlltoAll communication, such as input and output splits, and the mapping - between global tokens and local experts. - - Args: - routing_map (torch.Tensor): The mapping of tokens to experts, with shape - [num_tokens, num_experts]. - - Returns: - torch.Tensor: Tensor containing the number of tokens assigned to local expert. - """ - num_local_tokens_per_expert = torch.histc(indices, - bins=self.num_experts, - min=0, - max=self.num_experts) - - # num_local_tokens_per_expert: [num_experts] - - ep_size = self.ep_size - - # Dropless - self.num_out_tokens = indices.numel() - if self.ep_size > 1 or self.num_local_experts > 1: - # Token dropless and enable ep. A synchronization is needed before expert parallel - # AlltoAll communication to get the `input_splits` and `output_splits` CPU values. - self.device_sync_point = "before_ep_alltoall" - else: - # Token dropless and no ep. 
A synchronization is needed to get the - # `tokens_per_expert` CPU value. - self.device_sync_point = "before_finish" - - if ep_size > 1: - # =================================================== - # Calculate input_splits, output_splits for alltoall-v. - # =================================================== - self.input_splits = (num_local_tokens_per_expert.reshape( - ep_size, self.num_local_experts).sum(axis=1).to( - torch.device("cpu"), non_blocking=True).numpy()) - num_global_tokens_per_expert = gather_from_sequence_parallel_region( - num_local_tokens_per_expert, - group=self.ep_group).reshape(ep_size, self.num_experts) - self.num_global_tokens_per_local_expert = num_global_tokens_per_expert[:, self.local_expert_indices[ - 0]:self.local_expert_indices[-1] + 1] - if self.num_global_tokens_per_local_expert is None: - raise ValueError( - "num_global_tokens_per_local_expert must be set before sum." - ) - self.output_splits = (self.num_global_tokens_per_local_expert.sum( - axis=-1).to(torch.device("cpu"), non_blocking=True).numpy()) - num_tokens_per_local_expert = self.num_global_tokens_per_local_expert.sum( - axis=0) - # =================================================== - # num_global_tokens_per_expert: [ep_size, num_experts] - # num_global_tokens_per_local_expert: [ep_size, num_local_experts] - # num_tokens_per_local_expert: [num_local_experts] - # =================================================== - else: - self.num_global_tokens_per_local_expert = num_local_tokens_per_expert.reshape( - -1, self.num_experts) - num_tokens_per_local_expert = num_local_tokens_per_expert - - if self.num_local_experts > 1 and with_sync: - if self.num_global_tokens_per_local_expert is None: - raise ValueError( - "num_global_tokens_per_local_expert must be set before operations." - ) - self.device_sync_point = "no_sync" - self.global_input_tokens_local_experts_indices = torch.repeat_interleave( - self.expert_ids_per_ep_rank, - self.num_global_tokens_per_local_expert.ravel()) - - return num_tokens_per_local_expert - - def token_permutation( - self, - hidden_states: torch.Tensor, - probs: torch.Tensor, - routing_map: torch.Tensor, - ): - """ - Dispatch tokens to local experts using AlltoAllSeq communication. - - Args: - hidden_states (torch.Tensor): Input token embeddings. - probs (torch.Tensor): Probs of tokens assigned to experts. - Shape: [num_tokens, num_experts]. - routing_map (torch.Tensor): Mapping of tokens assigned to experts. - Shape: [num_tokens, num_experts]. - - Returns: - Tuple[torch.Tensor, torch.Tensor]: - - Permuted token embeddings for local experts. - - Number of tokens per expert. 
- """ - self.hidden_shape = hidden_states.shape - self.probs = probs - self.top_indices = routing_map - assert probs.dim() == 2, "Expected 2D tensor for probs" - assert routing_map.dim() == 2, "Expected 2D tensor for routing map" - - # Permutation 1: input to AlltoAll input - def alltoall_token_permutation1(hidden_states, routing_map): - assert self.hidden_shape is not None - hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) - tokens_per_expert = self.preprocess(routing_map) - if self.tp_ep_size > 1: - hidden_states = all_to_all_sp2hp(hidden_states, - group=self.tp_ep_group) - self.hidden_shape_before_permute = hidden_states.shape - - if self.device_sync_point == "before_permutation_1": - torch.npu.current_stream().synchronize() - - permutated_local_input_tokens, reversed_local_input_permutation_mapping = torch_npu.npu_moe_token_permute( - tokens=hidden_states, - indices=self.top_indices, - num_out_tokens=self.num_out_tokens, - ) - return permutated_local_input_tokens, reversed_local_input_permutation_mapping, tokens_per_expert - - permutated_local_input_tokens, reversed_local_input_permutation_mapping, tokens_per_expert = alltoall_token_permutation1( - hidden_states, routing_map) - self.reversed_local_input_permutation_mapping = reversed_local_input_permutation_mapping - # permute 1 - - ep_group = self.ep_group - - # Perform expert parallel AlltoAll communication - if self.device_sync_point == "before_ep_alltoall": - torch.npu.current_stream().synchronize() - _, global_input_tokens, permute1_ep_all_to_all_handle = async_all_to_all( - permutated_local_input_tokens, - self.output_splits, - self.input_splits, - ep_group, - ) - - # shared experts compute - if self.shared_experts is not None: - (share_experts_output), *_ = self.shared_experts(hidden_states) - else: - share_experts_output = None - - permute1_ep_all_to_all_handle.wait() - permutated_local_input_tokens.untyped_storage().resize_(0) - - def alltoall_token_permutation2(global_input_tokens): - # Permutation 2: Sort tokens by local expert. - if self.num_local_experts > 1: - global_input_tokens, self.reversed_global_input_permutation_mapping = torch_npu.npu_moe_token_permute( - global_input_tokens, - self.global_input_tokens_local_experts_indices) - - # Perform tensor parallel AllGather on the hidden dimension to obtain the input tokens. - # global_input_tokens: [SEQL, H/TP] -> [SEQL, H] - if self.tp_ep_size > 1 and self.config.moe_grouped_gemm: - global_input_tokens = all_gather_last_dim_from_tensor_parallel_region( - global_input_tokens, self.tp_ep_group) - if self.device_sync_point == "before_finish": - torch.npu.current_stream().synchronize() - - return global_input_tokens - - # token premute2 input - global_input_tokens = alltoall_token_permutation2(global_input_tokens) - - return share_experts_output, global_input_tokens, tokens_per_expert - - def token_unpermutation(self, - hidden_states: torch.Tensor, - bias: torch.Tensor = None): - """ - Reverse the token permutation to restore the original order. - - Args: - hidden_states (torch.Tensor): Output from local experts. - bias (torch.Tensor, optional): Bias tensor (not supported). - - Returns: - Tuple[torch.Tensor, Optional[torch.Tensor]]: - - Unpermuted token embeddings in the original order. - - None (bias is not supported). 
- """ - - def alltoall_token_unpermutation1(hidden_states): - assert bias is None, "Bias is not supported in MoEAlltoAllSeqTokenDispatcher" - # Perform tensor parallel Reduce-Scatter - # hidden_states: [SEQL, H] -> [SEQL, H/TP] - if self.tp_ep_size > 1: - hidden_states = reduce_scatter_last_dim_to_tensor_parallel_region( - hidden_states, group=self.tp_ep_group) - - # Unpermutation 2: expert output to AlltoAll input - if hidden_states.shape[0] > 0 and self.num_local_experts > 1: - hidden_states = torch_npu.npu_moe_token_unpermute( - hidden_states, - self.reversed_global_input_permutation_mapping) - - return hidden_states - - hidden_states = alltoall_token_unpermutation1(hidden_states) - - ep_group = self.ep_group - # Perform expert parallel AlltoAll communication - # hidden_states: [SEQL, H] -> [SEQL, H/TP] - _, permutated_local_input_tokens, handle = async_all_to_all( - hidden_states, self.input_splits, self.output_splits, ep_group) - handle.wait() - hidden_states.untyped_storage().resize_(0) - - def alltoall_token_unpermutation2(permutated_local_input_tokens): - # Unpermutation 1: AlltoAll output to output - - output = torch_npu.npu_moe_token_unpermute( - permuted_tokens=permutated_local_input_tokens, - sorted_indices=self.reversed_local_input_permutation_mapping. - to(torch.int32), - probs=self.probs, - restore_shape=self.hidden_shape_before_permute) - - # Perform tensor parallel AlltoAll communication - # output: [S*B, H/TP] -> [S*B/TP, H] - if self.tp_ep_size > 1: - output = all_to_all_hp2sp(output, self.tp_ep_group) - - # Reshape the output tensor - output = output.view(self.hidden_shape) - return output - - output = alltoall_token_unpermutation2(permutated_local_input_tokens) - - self.input_splits = None - self.output_splits = None - self.num_global_tokens_per_local_expert = None - self.num_global_tokens_per_local_expert_cpu = None - - return output, None diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 754a34430c..f22d948f73 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -75,6 +75,20 @@ # Future Plan: # Remove this patch when vllm merged them. # +# ** File: worker/patch_common/patch_utils.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.utils.direct_register_custom_op` +# Why: +# pytorch 2.7.o is not compatible with pytorch 2.5.1. While vllm is based on pytorch 2.7.0, but vllm ascend +# is based on pytorch 2.5.1, so we need to use this patch to make vllm compatible with pytorch 2.5.1. +# How: +# patch __annotations__ check to make it compatible with pytorch 2.5.1. +# Related PR (if no, explain why): +# This is the problem in vllm-ascend +# Future Plan: +# Remove this patch once pytorch 2.7.0 is supported for vllm ascend. +# +# ** File: worker/patch_0_10_0/patch_sampler_gather_logprobs.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.sample.sampler.Sampler.gather_logprobs` # Why: diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py index 78b6fcd991..2533d13e3d 100644 --- a/vllm_ascend/patch/worker/patch_common/__init__.py +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -15,6 +15,9 @@ # limitations under the License. # +# patch_utils should be the first import, because it will be used by other +# patch files. 
+import vllm_ascend.patch.worker.patch_common.patch_utils # noqa isort:skip import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_linear # noqa import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_utils.py b/vllm_ascend/patch/worker/patch_common/patch_utils.py new file mode 100644 index 0000000000..dec618c7ab --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/patch_utils.py @@ -0,0 +1,38 @@ +from typing import Callable, List, Optional, Tuple + +import torch +from torch.library import Library +from vllm import utils +from vllm.utils import vllm_lib + + +def ascend_direct_register_custom_op( + op_name: str, + op_func: Callable, + mutates_args: list[str], + fake_impl: Optional[Callable] = None, + target_lib: Optional[Library] = None, + dispatch_key: str = "CUDA", + tags: Tuple[torch.Tag, ...] = (), +): + # In pytorch 2.5.1, torch.library.infer_schema require the input function to + # have annotations supported by typing library. But in pytorch 2.7.0 which + # vllm using, torch.library.infer_schema require the python builtin type. In + # this case, we should revert built type to typing type for 2.5.1 backward + # compatibility. + for k, v in op_func.__annotations__.items(): + if v == list[int]: + op_func.__annotations__[k] = List[int] + if v == Optional[list[int]]: + op_func.__annotations__[k] = Optional[List[int]] + # TODO: add more type convert here if needed. + import torch.library + schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args) + my_lib = target_lib or vllm_lib + my_lib.define(op_name + schema_str, tags=tags) + my_lib.impl(op_name, op_func, dispatch_key=dispatch_key) + if fake_impl is not None: + my_lib._register_fake(op_name, fake_impl) + + +utils.direct_register_custom_op = ascend_direct_register_custom_op diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index fa369ac10d..8f8b2b45b6 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -153,11 +153,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "Torchair compilation enabled on NPU. Setting level to NO_COMPILATION" ) compilation_config.level = CompilationLevel.NO_COMPILATION - elif parallel_config.distributed_executor_backend == "ray": - logger.warning( - "Ray distributed executor backend is not compatible with ACL Graph mode " - "right now. Setting level to NO_COMPILATION") - compilation_config.level = CompilationLevel.NO_COMPILATION else: logger.info( "PIECEWISE compilation enabled on NPU. use_inductor not supported - " @@ -259,10 +254,16 @@ def stateless_init_device_torch_dist_pg( assert is_hccl_available() + # TODO(Yizhou): The reason we need to set options while vllm does not + # seems to be related to the version of PyTorch. In the latest version, + # there is no need to set options. While in the older version, 2.5.1 + # specifically, we need to set options. 
+ options = ProcessGroup.Options(backend=backend) pg: ProcessGroup = ProcessGroup( prefix_store, group_rank, group_size, + options, ) backend_options = ProcessGroupHCCL.Options() diff --git a/vllm_ascend/quantization/func_wrapper.py b/vllm_ascend/quantization/func_wrapper.py index 8357695b37..77ecca2b17 100644 --- a/vllm_ascend/quantization/func_wrapper.py +++ b/vllm_ascend/quantization/func_wrapper.py @@ -22,39 +22,6 @@ from vllm.logger import logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import UnquantizedLinearMethod -from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, QuantizationConfig) - - -# func refers to vocabParallelEmbedding.__init__ -def wrapper_vocab_parallel_embedding_init(func): - - def init( - self, - num_embeddings: int, - embedding_dim: int, - params_dtype: Optional[torch.dtype] = None, - org_num_embeddings: Optional[int] = None, - padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - func( - self, - num_embeddings, - embedding_dim, - params_dtype, - org_num_embeddings, - padding_size, - quant_config, - prefix, - ) - # TODO: Contact vLLM maintainers to add a `params_dtype` attribute to the `VocabParallelEmbedding` class. - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.params_dtype = params_dtype - - return init # func refers to RMSNorm.__init__ diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index 22c8bc8a08..5984dc74dc 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -15,6 +15,10 @@ # limitations under the License. # This file is a part of the vllm-ascend project. # +# By using quantization case, this file is called before worker patch achieve, +# we need to import patch_utils here first to make sure the patch is applied. 
+import vllm_ascend.patch.worker.patch_common.patch_utils # type: ignore[import] # isort: skip # noqa + from types import MappingProxyType from typing import Any, Callable, Dict, List, Mapping, Optional @@ -201,17 +205,6 @@ def create_weights( layer.register_parameter(perchannel_name, param) set_weight_attrs(param, extra_weight_attrs) - pergroup_dict = self.quant_method.get_pergroup_param( - input_size_per_partition, output_size_per_partition, params_dtype) - for pergroup_name, pergroup_param in pergroup_dict.items(): - param = torch.nn.Parameter(pergroup_param, requires_grad=False) - set_weight_attrs(param, {"output_dim": 0}) - layer.register_parameter(pergroup_name, param) - set_weight_attrs(param, extra_weight_attrs) - if "weight_scale_second" in pergroup_name or "weight_offset_second" in pergroup_name: - setattr(param, "input_dim", 1) - param.input_dim = 1 - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if hasattr(self.quant_method, "process_weights_after_loading"): self.quant_method.process_weights_after_loading(layer) @@ -302,9 +295,6 @@ def create_weights( param = torch.nn.Parameter(param_value, requires_grad=False) layer.register_parameter(param_key, param) set_weight_attrs(param, extra_weight_attrs) - if "weight_scale_second" in param_key or "weight_offset_second" in param_key: - setattr(param, "quant_method", - FusedMoeWeightScaleSupported.GROUP.value) def apply( self, @@ -351,4 +341,4 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str, packed_modules_mapping: Dict[str, Any]) -> None: self.quantizer = AscendQuantizer.get_quantizer( quant_config.quant_description, prefix, packed_modules_mapping) - self.quant_method = self.quantizer.build_linear_method() + self.quant_method = self.quantizer.build_linear_method() \ No newline at end of file diff --git a/vllm_ascend/quantization/quantizer.py b/vllm_ascend/quantization/quantizer.py index 487597cb75..8178d5e7f3 100644 --- a/vllm_ascend/quantization/quantizer.py +++ b/vllm_ascend/quantization/quantizer.py @@ -22,10 +22,7 @@ from vllm.logger import logger -from .func_wrapper import (wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init, - wrapper_vocab_parallel_embedding_init) -from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod, - AscendW4A8DynamicLinearMethod) +from .func_wrapper import wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod, AscendW8A8LinearMethod) from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod, @@ -49,8 +46,14 @@ def get_quantizer(cls, if quantization_algorithm in CUSTOMIZED_QUANTIZER_TYPE: return - return VLLMAscendQuantizer.get_quantizer(quant_config, prefix, - packed_modules_mapping) + try: + module = importlib.import_module("mindie_turbo") + MindIETurboQuantizer = module.MindIETurboQuantizer + return MindIETurboQuantizer.get_quantizer(quant_config, prefix, + packed_modules_mapping) + except ImportError: + return VLLMAscendQuantizer.get_quantizer(quant_config, prefix, + packed_modules_mapping) def build_linear_method(self): raise NotImplementedError @@ -77,9 +80,6 @@ def __init__(self, quant_description): VLLMAscendQuantizer.apply_patch( "vllm.model_executor.layers.layernorm.RMSNorm", "forward_oot", [wrapper_rmsnorm_forward_oot]) - VLLMAscendQuantizer.apply_patch( - "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding", - "__init__", [wrapper_vocab_parallel_embedding_init]) break VLLMAscendQuantizer.patched = True logger.info("Using the vLLM Ascend Quantizer version now!") 
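# --- Editorial aside (not part of the patch) -------------------------------
# Why apply_patch (next hunk) walks sys.modules: a module that did
# `from m import f` holds its own reference, so re-assigning m.f alone is not
# enough; every module still pointing at the original object has to be rebound
# as well. The toy modules below are illustrative only.
import sys
import types

def _original():
    return "original"

def _patched():
    return "patched"

m = types.ModuleType("demo_m")
m.f = _original
user = types.ModuleType("demo_user")
user.f = m.f                              # simulates `from demo_m import f`
sys.modules.update({"demo_m": m, "demo_user": user})

original_id = id(m.f)
m.f = _patched                            # patch the defining module ...
for mod in list(sys.modules.values()):
    if id(getattr(mod, "f", None)) == original_id:
        setattr(mod, "f", _patched)       # ... and rebind any stale import

assert user.f() == "patched"
del sys.modules["demo_m"], sys.modules["demo_user"]
# ---------------------------------------------------------------------------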
@@ -98,15 +98,12 @@ def apply_patch(target_module, target_function, wrappers): if target_function is not None: setattr(original_module, target_function, candidate) - for _, value in sys.modules.copy().items(): - if target_function is None: - continue - try: - attr = getattr(value, target_function, None) - if attr is not None and id(attr) == original_function_id: - setattr(value, target_function, candidate) - except ImportError: - continue + for key, value in sys.modules.copy().items(): + if (target_function is not None + and hasattr(value, target_function) + and id(getattr(value, + target_function)) == original_function_id): + setattr(value, target_function, candidate) @staticmethod def parse_path(module_path, function_name, create_dummy): @@ -266,17 +263,6 @@ def get_quantizer(cls, f"{list(SUPPORT_ASCEND_QUANTIZER_TYPE.keys())}") -class W4A8DYNAMICQuantizer(VLLMAscendQuantizer): - - @staticmethod - def build_linear_method(): - return AscendW4A8DynamicLinearMethod() - - @staticmethod - def build_moe_method(): - return AscendW4A8DynamicFusedMoEMethod() - - class W8A8Quantizer(VLLMAscendQuantizer): @staticmethod @@ -304,7 +290,6 @@ def build_moe_method(): SUPPORT_ASCEND_QUANTIZER_TYPE = { - "W4A8_DYNAMIC": W4A8DYNAMICQuantizer, "W8A8": W8A8Quantizer, "W8A8_DYNAMIC": W8A8DYNAMICQuantizer, "C8": W8A8Quantizer, diff --git a/vllm_ascend/quantization/w4a8_dynamic.py b/vllm_ascend/quantization/w4a8_dynamic.py deleted file mode 100644 index 0b62fe15cf..0000000000 --- a/vllm_ascend/quantization/w4a8_dynamic.py +++ /dev/null @@ -1,396 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from typing import Any, Callable, Dict, Optional - -import numpy as np -import torch -import torch_npu -from vllm.config import get_current_vllm_config -from vllm.distributed import get_ep_group -from vllm.forward_context import get_forward_context - -from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.ascend_forward_context import FusedMoEState -from vllm_ascend.distributed.parallel_state import get_mc2_group -from vllm_ascend.ops.fused_moe import select_experts -from vllm_ascend.quantization.w8a8_dynamic import (fused_experts_with_all2all, - fused_experts_with_mc2) -from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor - - -class AscendW4A8DynamicLinearMethod: - """Linear method for Ascend W4A8_DYNAMIC - """ - - def __init__(self): - self.transpose_weight = True - try: - self.group_size = get_current_vllm_config( - ).quant_config.quant_description.get("group_size", 256) - except AttributeError: - self.group_size = 256 - - @staticmethod - def get_weight(input_size: int, output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - params_dict = { - "weight": torch.empty(output_size, input_size, dtype=torch.int8) - } - return params_dict - - @staticmethod - def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]: - return {} - - @staticmethod - def get_perchannel_param(output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - return {} - - def get_pergroup_param(self, input_size: int, output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - params_dict = {} - params_dict["weight_scale"] = torch.empty(output_size, - 1, - dtype=params_dtype) - params_dict["weight_offset"] = torch.empty(output_size, - 1, - dtype=params_dtype) - params_dict["weight_scale_second"] = torch.empty(output_size, - input_size // - self.group_size, - dtype=params_dtype) - params_dict["weight_offset_second"] = torch.empty(output_size, - input_size // - self.group_size, - dtype=params_dtype) - return params_dict - - @staticmethod - def process_scale_second(weight: torch.Tensor, scale: torch.Tensor, - per_group_scale: torch.Tensor): - k, n = weight.shape - group_num, n = per_group_scale.shape - weight_high = weight.to(torch.float32).reshape( - group_num, -1, n) * per_group_scale.reshape(group_num, 1, n) - weight_high = weight_high.reshape(k, n) - bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0) - antiquant_scale = (scale * per_group_scale).reshape(group_num, n) - return antiquant_scale.npu(), bias - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - tp_rank: Optional[int] = None, - ) -> torch.Tensor: - return torch_npu.npu_weight_quant_batchmatmul( - x, - layer.weight, - antiquant_scale=layer.weight_scale_second.to(x.dtype), - antiquant_group_size=self.group_size, - ) - - def process_weights_after_loading(self, layer: torch.nn.Module): - if self.transpose_weight: - layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight_scale.data = layer.weight_scale.data.flatten().to( - torch.float32) - layer.weight_offset.data = layer.weight_offset.data.flatten() - layer.weight_scale_second.data, scale_bias = self.process_scale_second( - layer.weight.data, - layer.weight_scale.data, - layer.weight_scale_second.data.transpose(0, 1).contiguous(), - ) - param = torch.nn.Parameter(scale_bias, requires_grad=False) - layer.register_parameter("weight_scale_bias", param) - layer.weight.data = torch_npu.npu_convert_weight_to_int4pack( - 
layer.weight.data.to(torch.int32)) - - -class AscendW4A8DynamicFusedMoEMethod: - """FusedMoe method for Ascend W4A8_DYNAMIC. - """ - - def __init__(self): - self.transpose_weight = True - - self.ep_group = get_ep_group() - - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - - try: - device_group = get_mc2_group().device_group - # TODO: Try local_rank = ep_group.rank_in_group - local_rank = torch.distributed.get_rank(group=device_group) - backend = device_group._get_backend(torch.device("npu")) - self.moe_all_to_all_group_name = backend.get_hccl_comm_name( - local_rank) - except AttributeError: - self.moe_all_to_all_group_name = "" - - @staticmethod - def get_weight(num_experts: int, intermediate_size_per_partition: int, - hidden_sizes: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - param_dict = {} - param_dict["w13_weight"] = torch.empty(num_experts, - 2 * - intermediate_size_per_partition, - hidden_sizes, - dtype=torch.int8) - param_dict["w2_weight"] = torch.empty(num_experts, - hidden_sizes, - intermediate_size_per_partition, - dtype=torch.int8) - return param_dict - - @staticmethod - def get_dynamic_quant_param(num_experts: int, - intermediate_size_per_partition: int, - hidden_sizes: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - param_dict = {} - config = get_current_vllm_config() - group_size = config.quant_config.quant_description.get( - "group_size", 256) - - param_dict["w13_weight_scale"] = torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - 1, - dtype=params_dtype) - - param_dict["w13_weight_offset"] = torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - 1, - dtype=params_dtype) - - param_dict["w13_weight_scale_second"] = torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_sizes // group_size, - dtype=params_dtype) - - param_dict["w13_weight_offset_second"] = torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_sizes // group_size, - dtype=params_dtype) - - param_dict["w2_weight_scale"] = torch.empty(num_experts, - hidden_sizes, - 1, - dtype=params_dtype) - param_dict["w2_weight_offset"] = torch.empty(num_experts, - hidden_sizes, - 1, - dtype=params_dtype) - param_dict["w2_weight_scale_second"] = torch.empty( - num_experts, - hidden_sizes, - intermediate_size_per_partition // group_size, - dtype=params_dtype) - param_dict["w2_weight_offset_second"] = torch.empty( - num_experts, - hidden_sizes, - intermediate_size_per_partition // group_size, - dtype=params_dtype) - - return param_dict - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None, - is_prefill: bool = True, - enable_force_load_balance: bool = True, - log2phy: torch.Tensor = None, - global_redundant_expert_num: int = 0, - shared_experts: Optional[Any] = None, - quantized_x_for_share: Optional[Any] = None, - dynamic_scale_for_share: Optional[Any] = None, - **kwargs, - ) -> torch.Tensor: - assert router_logits.shape[ - 1] == global_num_experts, "Number of global experts mismatch" - - # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern - if 
global_num_experts == 256: - topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( - router_logits, - k=top_k, # topk currently is 8 - bias=e_score_correction_bias, - k_group=topk_group, # fix: 4 - group_count=num_expert_group, # fix 8 - group_select_mode= - 1, # 0: the maximum in the group; 1: topk2.sum(fix) - renorm=0, # 0: softmax->topk(fix); 1: topk->softmax - norm_type=1, # 0: softmax; 1: sigmoid(fix) - # out_flag=False, # todo new api; should the third output be output - # y2_flag=False, # old api; should the third output be output - routed_scaling_factor=1, - eps=float(1e-20)) - else: - topk_weights, topk_ids = select_experts( - hidden_states=x, - router_logits=router_logits, - top_k=top_k, - use_grouped_topk=use_grouped_topk, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias, - ) - - fused_moe_state = get_forward_context().fused_moe_state - shared_gate_up, shared_dequant_scale = None, None - if shared_experts is not None and fused_moe_state == FusedMoEState.MC2: - with npu_stream_switch("moe_secondary", 0): - npu_wait_tensor(quantized_x_for_share, router_logits) - share_up_out, _ = shared_experts.gate_up_proj( - (quantized_x_for_share, dynamic_scale_for_share)) - shared_gate_up, shared_dequant_scale = share_up_out[ - 0], share_up_out[1] - - # this is a naive implementation for experts load balance so as - # to avoid accumulating too much tokens on a single rank. - # currently it is only activated when doing profile runs. - if enable_force_load_balance: - topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) - - topk_weights = topk_weights.to(x.dtype) - if fused_moe_state == FusedMoEState.MC2: - return fused_experts_with_mc2( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - w1_scale=layer.w13_weight_scale_second, - w2_scale=layer.w2_weight_scale_second, - w1_scale_bias=layer.w13_scale_bias, - w2_scale_bias=layer.w2_scale_bias, - topk_weights=topk_weights, - topk_ids=topk_ids, - top_k=top_k, - expert_map=expert_map, - moe_all_to_all_group_name=self.moe_all_to_all_group_name, - log2phy=log2phy, - global_redundant_expert_num=global_redundant_expert_num, - shared_experts=shared_experts, - is_torchair=self.torchair_graph_enabled, - quantized_x_for_share=shared_gate_up, - dynamic_scale_for_share=shared_dequant_scale, - mc2_mask=kwargs.get("mc2_mask", None)) - else: - # The current implementation of deepseek moe splits hidden_states - # according to tp_size before they are feed into fused_moe module. - # Therefore, all2all is needed no matter how dp/tp is set so as to - # dispatch/combine tokens. 
- return fused_experts_with_all2all( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - w1_scale=layer.w13_weight_scale_second, - w2_scale=layer.w2_weight_scale_second, - w1_scale_bias=layer.w13_scale_bias, - w2_scale_bias=layer.w2_scale_bias, - topk_weights=topk_weights, - topk_ids=topk_ids, - top_k=top_k, - expert_map=expert_map, - ep_group=self.ep_group, - log2phy=log2phy, - global_redundant_expert_num=global_redundant_expert_num, - ) - - def process_scale(self, weight: torch.Tensor, scale, per_group_scale): - group_num, k, n = weight.shape - per_group_scale = per_group_scale.reshape(group_num, -1, n) - group_num, quantgroup_num, n = per_group_scale.shape - weight_high = weight.to(torch.float32).reshape([group_num, quantgroup_num, -1, n]) * \ - per_group_scale.reshape([group_num, quantgroup_num, 1, n]) - weight_high = weight_high.reshape([group_num, k, n]) - bias = 8 * (weight_high.to(torch.float32) * scale).sum(axis=1) - scale_fp32 = (scale * per_group_scale).to(torch.float16).to( - torch.float32) - scale_fp32_np = scale_fp32.cpu().numpy() - scale_fp32_np.dtype = np.uint32 - sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2), - dtype=np.uint32) - - sscale_uint64[..., ::2] = scale_fp32_np - - sscale_uint64_buffer = np.frombuffer(sscale_uint64.tobytes(), - dtype=np.int64).copy() - sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape( - group_num, quantgroup_num, n) - sscale_uint64_tensor = sscale_uint64_tensor.npu() - return sscale_uint64_tensor, bias - - def process_weights_after_loading(self, layer): - if self.transpose_weight: - layer.w13_weight.data = layer.w13_weight.data.transpose( - 1, 2).contiguous() - layer.w2_weight.data = layer.w2_weight.data.transpose( - 1, 2).contiguous() - layer.w13_weight_scale.data = layer.w13_weight_scale.data.transpose( - 1, 2).contiguous() - layer.w2_weight_scale.data = layer.w2_weight_scale.data.transpose( - 1, 2).contiguous() - layer.w13_weight_offset.data = layer.w13_weight_offset.data.view( - layer.w13_weight_offset.data.shape[0], -1) - layer.w2_weight_offset.data = layer.w2_weight_offset.data.view( - layer.w2_weight_offset.data.shape[0], -1) - layer.w13_weight_scale_second.data = layer.w13_weight_scale_second.data.transpose( - 1, 2).contiguous() - layer.w2_weight_scale_second.data = layer.w2_weight_scale_second.data.transpose( - 1, 2).contiguous() - - layer.w13_weight_scale_second.data, bias = self.process_scale( - layer.w13_weight, layer.w13_weight_scale.data, - layer.w13_weight_scale_second.data) - param = torch.nn.Parameter(bias, requires_grad=False) - layer.register_parameter("w13_scale_bias", param) - layer.w2_weight_scale_second.data, bias1 = self.process_scale( - layer.w2_weight, layer.w2_weight_scale.data, - layer.w2_weight_scale_second.data) - param = torch.nn.Parameter(bias1, requires_grad=False) - layer.register_parameter("w2_scale_bias", param) - - layer.w13_weight.data = torch_npu.npu_quantize( - layer.w13_weight.data.to(torch.float32), - torch.tensor([1.]).npu(), None, torch.quint4x2, -1, False) - layer.w2_weight.data = torch_npu.npu_quantize( - layer.w2_weight.data.to(torch.float32), - torch.tensor([1.]).npu(), None, torch.quint4x2, -1, False) diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py index d3bff93f18..09080def8c 100644 --- a/vllm_ascend/quantization/w8a8.py +++ b/vllm_ascend/quantization/w8a8.py @@ -84,10 +84,6 @@ def get_perchannel_param( dtype=params_dtype) return params_dict - def get_pergroup_param(self, input_size: int, output_size: int, - 
params_dtype: torch.dtype) -> Dict[str, Any]: - return {} - @staticmethod def apply( layer: torch.nn.Module, diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index affc489d5c..36549e7586 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -116,9 +116,7 @@ def apply_mlp(hidden_states: torch.Tensor, w2_scale: torch.Tensor, group_list: torch.Tensor, dynamic_scale: torch.Tensor = None, - group_list_type: int = 1, - w1_scale_bias: torch.Tensor = None, - w2_scale_bias: torch.Tensor = None) -> torch.Tensor: + group_list_type: int = 1) -> torch.Tensor: """ apply MLP: gate_up_proj -> swiglu -> down_proj @@ -152,31 +150,17 @@ def apply_mlp(hidden_states: torch.Tensor, else: pertoken_scale = dynamic_scale - bias1, bias2 = None, None - _output_dtype = w2_scale.dtype - - if w1_scale_bias is not None: - if group_list_type == 0: - group_list = torch.cat( - [group_list[:1], torch.diff(group_list, dim=0)]) - group_list_type = 1 - bias1 = [w1_scale_bias] - bias2 = [w2_scale_bias] - # TODO w4a8 scene: dynamic acquisition of dtype in the future - _output_dtype = torch.bfloat16 - # gmm1: gate_up_proj hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], weight=[w1], scale=[w1_scale], - bias=bias1, per_token_scale=[pertoken_scale], split_item=2, group_list_type=group_list_type, group_type=0, group_list=group_list, - output_dtype=_output_dtype)[0] + output_dtype=w2_scale.dtype)[0] # act_fn: swiglu hidden_states = torch_npu.npu_swiglu(hidden_states) @@ -188,13 +172,12 @@ def apply_mlp(hidden_states: torch.Tensor, x=[hidden_states], weight=[w2], scale=[w2_scale], - bias=bias2, per_token_scale=[swiglu_out_scale], split_item=2, group_list_type=group_list_type, group_type=0, group_list=group_list, - output_dtype=_output_dtype)[0] + output_dtype=w2_scale.dtype)[0] return hidden_states @@ -219,8 +202,6 @@ def fused_experts_with_mc2( mc2_mask: Optional[torch.Tensor] = None, shared_gate_up: Optional[Any] = None, shared_dequant_scale: Optional[Any] = None, - w1_scale_bias: torch.Tensor = None, - w2_scale_bias: torch.Tensor = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: assert mc2_mask is not None if log2phy is not None: @@ -289,25 +270,13 @@ def fused_experts_with_mc2( shared_act, swiglu_out_scale = shared_act_out[0], shared_act_out[1] # `expand_x` will be disposed in the `apply_mlp` function - if w1_scale_bias is None: - down_out_list = apply_mlp_decode(expand_x, - w1, - w1_scale, - w2, - w2_scale, - expert_token_nums, - dynamic_scale=dynamic_scale) - else: - # w4a8 scene, cannot use apply_mlp_decode because the operator is not supported - down_out_list = apply_mlp(expand_x, - w1, - w1_scale, - w2, - w2_scale, - expert_token_nums, - dynamic_scale=dynamic_scale, - w1_scale_bias=w1_scale_bias, - w2_scale_bias=w2_scale_bias) + down_out_list = apply_mlp_decode(expand_x, + w1, + w1_scale, + w2, + w2_scale, + expert_token_nums, + dynamic_scale=dynamic_scale) # moeCombine kwargs_mc2 = { @@ -365,29 +334,6 @@ def fused_experts_with_mc2( return hidden_states, shared_output -def init_routing_quant(hidden_states, top_k, topk_ids, global_num_experts): - num_tokens, _ = hidden_states.shape - row_idx_len = num_tokens * top_k - row_idx = (torch.arange(0, - row_idx_len, - dtype=torch.int32, - device=hidden_states.device).view( - top_k, -1).permute(1, 0).contiguous()) - hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( - hidden_states, - row_idx=row_idx, - 
expert_idx=topk_ids, - active_num=num_tokens) - - expanded_row_idx = (expanded_row_idx.view(top_k, -1).permute( - 1, 0).contiguous().view(-1)) - global_expert_tokens = torch.bincount(expanded_expert_idx, - minlength=global_num_experts) - global_expert_tokens = global_expert_tokens.to(torch.int32) - quantized_tokens, token_scales = torch_npu.npu_dynamic_quant(hidden_states) - return quantized_tokens, expanded_row_idx, global_expert_tokens, token_scales - - # currently expert parallelism implemented with all2all # is under-optimized. def fused_experts_with_all2all( @@ -403,8 +349,6 @@ def fused_experts_with_all2all( ep_group: GroupCoordinator = None, log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, - w1_scale_bias: torch.Tensor = None, - w2_scale_bias: torch.Tensor = None, ): if log2phy is not None: topk_ids = log2phy[topk_ids] @@ -414,54 +358,50 @@ def fused_experts_with_all2all( num_tokens, _ = hidden_states.shape num_experts = w1.shape[0] + device = hidden_states.device if expert_map is not None: global_num_experts = len(expert_map) + global_redundant_expert_num - if hasattr(torch_npu, "npu_moe_init_routing_quant"): - quantized_tokens, expanded_row_idx, global_expert_tokens, _, token_scales = torch_npu.npu_moe_init_routing_quant( - hidden_states, - expert_idx=topk_ids.to(torch.int32), - active_num=0, - expert_capacity=0, - expert_num=global_num_experts, - drop_pad_mode=0, - expert_tokens_num_mode=2, - expert_tokens_before_capacity_flag=False, - quant_mode=1, - ) - else: - quantized_tokens, expanded_row_idx, global_expert_tokens, token_scales = init_routing_quant( - hidden_states, top_k, topk_ids, global_num_experts) - - gather_sizes = global_expert_tokens.new_empty( - global_expert_tokens.shape[0]) - dist.all_to_all_single(gather_sizes, global_expert_tokens) - - token_counts_combined = torch.stack( - [gather_sizes, global_expert_tokens], dim=0) - token_counts_combined = token_counts_combined.view( - 2, ep_group.world_size, -1).sum(dim=2) - token_counts_combined_cpu = token_counts_combined.to( - torch.device("cpu"), non_blocking=True).numpy() - all_tokens = gather_sizes.sum() - - gathered_tokens = quantized_tokens.new_empty(all_tokens.item(), - quantized_tokens.shape[1]) - dynamic_scale = token_scales.new_empty(gathered_tokens.shape[0]) - gather_size_list = token_counts_combined_cpu[1] - scatter_size_list = token_counts_combined_cpu[0] - - dist.all_to_all_single(gathered_tokens, quantized_tokens, - scatter_size_list, gather_size_list) - dist.all_to_all_single(dynamic_scale, token_scales, scatter_size_list, - gather_size_list) - - hidden_states, dynamic_scale, inverse_indices, expert_tokens = torch_npu.npu_moe_re_routing( - gathered_tokens, - gather_sizes.view(ep_group.world_size, -1), - per_token_scales=dynamic_scale) - expert_tokens = expert_tokens.to(torch.int64) - group_list_type = 1 + local_num_experts = global_num_experts // ep_group.world_size + row_idx_len = num_tokens * top_k + row_idx = (torch.arange(0, + row_idx_len, + dtype=torch.int32, + device=device).view(top_k, -1).permute( + 1, 0).contiguous()) + hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( + hidden_states, + row_idx=row_idx, + expert_idx=topk_ids, + active_num=num_tokens) + + global_expert_tokens = torch.bincount(expanded_expert_idx, + minlength=global_num_experts) + scatter_sizes = global_expert_tokens.view(ep_group.world_size, + -1).sum(-1) + + gather_sizes = torch.empty_like(scatter_sizes) + dist.all_to_all_single(gather_sizes, + scatter_sizes, + 
group=ep_group.device_group) + scatter_size_list = scatter_sizes.cpu().tolist() + gather_size_list = gather_sizes.cpu().tolist() + + expanded_expert_idx = expanded_expert_idx % local_num_experts + hidden_states = ep_group.all_to_all(hidden_states, 0, 0, + scatter_size_list, + gather_size_list) + local_expert_idx = ep_group.all_to_all(expanded_expert_idx, 0, 0, + scatter_size_list, + gather_size_list) + + sorted_local_expert_idx, sorted_idx = torch.sort(local_expert_idx) + + expert_tokens = torch_npu.npu_moe_compute_expert_tokens( + sorted_local_expert_idx, local_num_experts).to(torch.int64) + + hidden_states = hidden_states[sorted_idx] + group_list_type = 0 else: row_idx_len = num_tokens * top_k row_idx = torch.arange(0, @@ -479,7 +419,6 @@ def fused_experts_with_all2all( expanded_expert_idx, num_experts) expert_tokens = expert_tokens.to(torch.int64) group_list_type = 0 - dynamic_scale = None # `hidden_states` will be disposed in the `apply_mlp` function hidden_states = apply_mlp( @@ -489,21 +428,14 @@ def fused_experts_with_all2all( w2, w2_scale, expert_tokens, #16 - dynamic_scale=dynamic_scale, - group_list_type=group_list_type, - w1_scale_bias=w1_scale_bias, - w2_scale_bias=w2_scale_bias) + group_list_type=group_list_type) if expert_map is not None: - reordered_outputs = torch.index_select( - hidden_states, - dim=0, - # Workaround: Convert to float so that argsort runs on AI Core instead of slower AICPU - index=inverse_indices.to(torch.float32).argsort().to(torch.int32)) - - hidden_states = reordered_outputs.new_empty(*quantized_tokens.shape) - dist.all_to_all_single(hidden_states, reordered_outputs, - gather_size_list, scatter_size_list) + resorted_idx = torch.argsort(sorted_idx) + hidden_states = hidden_states[resorted_idx] + hidden_states = ep_group.all_to_all(hidden_states, 0, 0, + gather_size_list, + scatter_size_list) final_hidden_states = torch_npu.npu_moe_finalize_routing( hidden_states, @@ -512,8 +444,8 @@ def fused_experts_with_all2all( bias=None, scales=topk_weights, expanded_src_to_dst_row=expanded_row_idx, - export_for_source_row=None, - drop_pad_mode=2) + export_for_source_row=topk_ids, + ) else: # TODO: Reorder device memory 2 times here, replace the current # implementation here when suitable operators become available. 
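# A minimal CPU-only sketch of the reordering trick the fused_experts_with_all2all path
# above relies on: routed tokens are sorted by local expert id before the grouped matmul
# and the original order is later restored via torch.argsort of the sort permutation.
# Names and shapes are invented; npu_moe_compute_expert_tokens is approximated here by a
# cumulative bincount, which is an assumption about its semantics.
import torch

local_num_experts = 4
local_expert_idx = torch.tensor([3, 0, 2, 0, 1, 3])                  # expert id per routed token
hidden_states = torch.arange(6, dtype=torch.float32).unsqueeze(-1)   # stand-in activations

sorted_local_expert_idx, sorted_idx = torch.sort(local_expert_idx)
expert_tokens = torch.cumsum(
    torch.bincount(sorted_local_expert_idx, minlength=local_num_experts), dim=0)

grouped = hidden_states[sorted_idx]         # tokens are now contiguous per expert
# ... grouped gate_up -> swiglu -> down projections would consume `grouped` here ...

resorted_idx = torch.argsort(sorted_idx)    # inverse permutation
assert torch.equal(grouped[resorted_idx], hidden_states)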
@@ -767,10 +699,6 @@ def get_perchannel_param( dtype=params_dtype) return params_dict - def get_pergroup_param(self, input_size: int, output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - return {} - @staticmethod def apply( layer: torch.nn.Module, diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index c082f988ad..862bd03e1b 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -3,8 +3,6 @@ from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample from vllm.v1.sample.sampler import Sampler -from vllm_ascend.utils import is_310p - class AscendSampler(Sampler): @@ -22,8 +20,7 @@ def _apply_top_k_top_p( k: torch.Tensor, p: torch.Tensor, ) -> torch.Tensor: - # npu_top_k_top_p uses the operator aclnnApplyTopKTopP, but aclnnApplyTopKTopP currently does not support 310P - if not is_310p() and p is not None and k is not None: + if p is not None and k is not None: # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p) return torch_npu.npu_top_k_top_p(logits, p, k) diff --git a/vllm_ascend/soc_info.py b/vllm_ascend/soc_info.py new file mode 100644 index 0000000000..ac1317e8e1 --- /dev/null +++ b/vllm_ascend/soc_info.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass + +import torch_npu + + +@dataclass +class NPUSocInfo: + is_a3: bool = False + + def __post_init__(self): + torch_npu.npu._lazy_init() + self.soc_version = torch_npu._C._npu_get_soc_version() + if self.soc_version in (250, 251, 252, 253, 254, 255): + self.is_a3 = True diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py deleted file mode 100644 index 200167438b..0000000000 --- a/vllm_ascend/torchair/torchair_model_runner.py +++ /dev/null @@ -1,29 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py -# - -import torch -from vllm.config import VllmConfig - -from vllm_ascend.worker.model_runner_v1 import NPUModelRunner - - -class NPUTorchairModelRunner(NPUModelRunner): - - def __init__(self, vllm_config: VllmConfig, device: torch.device): - super().__init__(vllm_config, device) diff --git a/vllm_ascend/torchair/torchair_worker.py b/vllm_ascend/torchair/torchair_worker.py index 3488ac7517..f74bc02d39 100644 --- a/vllm_ascend/torchair/torchair_worker.py +++ b/vllm_ascend/torchair/torchair_worker.py @@ -17,7 +17,6 @@ from vllm.logger import logger import vllm_ascend.envs as envs_ascend -from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist, check_torchair_cache_exist, delete_torchair_cache_file, @@ -53,9 +52,3 @@ def determine_available_memory(self) -> int: self.model_runner.new_kv_cache_bytes = available_kv_cache_memory return available_kv_cache_memory - - def init_device(self): - """Override init_device to init torchair model runner""" - device = self._init_device() - # Init ModelRunner here, so that we have access to self.device. - self.model_runner = NPUTorchairModelRunner(self.vllm_config, device) diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index ee620b4bb9..d35616f13a 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -61,10 +61,11 @@ def is_310p(): global _IS_310P - if _IS_310P is None: - from vllm_ascend import _build_info # type: ignore - _IS_310P = _build_info.__soc_version__.lower().startswith("ascend310p") - return _IS_310P + return False + # if _IS_310P is None: + # from vllm_ascend import _build_info # type: ignore + # _IS_310P = _build_info.__soc_version__.lower().startswith("ascend310p") + # return _IS_310P def sleep_mode_enabled(): @@ -479,8 +480,7 @@ def register_ascend_customop(): _ASCEND_CUSTOMOP_IS_REIGISTERED = True -# TODO(zzzzwwjj): Currently there is no clear SOC_VERSION policy for A2 and A3 in CANN. -# So we get the version dynamically. In the future, we should get the version info from _build_info like 310p does. +# TODO(zzzzwwjj): It will be judged with _build_info afterwards. 
class AscendSocVersion(Enum): A2 = 0 A3 = 1 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index b2f730a1b6..7c060fb1ae 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -70,6 +70,7 @@ from vllm.v1.worker.utils import (bind_kv_cache, gather_mm_placeholders, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) +from vllm.distributed.parallel_state import get_context_model_parallel_rank from vllm_ascend import envs from vllm_ascend.ascend_config import get_ascend_config @@ -79,7 +80,6 @@ AscendMetadata) from vllm_ascend.attention.attention_v1_torchair import AscendTorchairMetadata from vllm_ascend.attention.mla_v1 import AscendMLAMetadata -from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler from vllm_ascend.torchair.utils import (check_torchair_cache_exist, @@ -94,8 +94,6 @@ if not vllm_version_is("0.10.0"): from vllm.tasks import GenerationTask, SupportedTask - from vllm.v1.worker.kv_connector_model_runner_mixin import \ - KVConnectorOutput if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] @@ -164,6 +162,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.block_size) self.max_num_tokens = self.scheduler_config.max_num_batched_tokens self.max_num_reqs = self.scheduler_config.max_num_seqs + self.cp_size = int(os.getenv("VLLM_CP_SIZE", '1')) self.dp_size = vllm_config.parallel_config.data_parallel_size self.dp_rank = vllm_config.parallel_config.data_parallel_rank self.device = device @@ -292,8 +291,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): # OPTIMIZATION: Cache the tensors rather than creating them every step. self.arange_np: npt.NDArray[np.int32] = np.arange(max( self.max_num_reqs + 1, self.model_config.max_model_len, - self.max_num_tokens), + self.max_num_tokens) * self.cp_size + self.cp_size * self.max_num_reqs, dtype=np.int32) + self.position_cp = np.zeros(self.max_num_tokens, dtype=np.int32) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. Thus, we should # not make any assumptions about the values in these tensors. 
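# A small sketch of the preallocated index-buffer pattern described in the OPTIMIZATION/NOTE
# comments above: one np.arange is allocated at init and sliced every step instead of being
# re-created. The buffer size and helper name are made up for the example.
import numpy as np

arange_np = np.arange(64, dtype=np.int32)   # allocated once, reused every step

def positions_for_request(num_computed: int, num_scheduled: int) -> np.ndarray:
    # positions of the newly scheduled tokens, derived from the cached buffer
    return num_computed + arange_np[:num_scheduled]

print(positions_for_request(10, 4))   # -> [10 11 12 13]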
@@ -307,7 +307,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): pin_memory=True) self.positions_np = self.positions_cpu.numpy() - self.slot_mapping_cpu = torch.zeros(self.max_num_tokens, + self.slot_mapping_cpu = torch.zeros(self.max_num_tokens * self.cp_size + self.cp_size * self.max_num_reqs, dtype=torch.int32, device="cpu", pin_memory=True) @@ -348,38 +348,13 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): torch._logging.set_logs( recompiles=envs_ascend.VLLM_ASCEND_TRACE_RECOMPILES) - self.check_batch_sizes_consistency() # NOTE: we need to use `in_profile_run` to determine whether `enable_force_load_balance` is True self.in_profile_run = False # kv role self.is_kv_producer = False - self.is_kv_consumer = False if vllm_config.kv_transfer_config is not None: self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer - self.is_kv_consumer = vllm_config.kv_transfer_config.is_kv_consumer - - def check_batch_sizes_consistency(self) -> None: - if not dist.is_initialized(): - return - - local = torch.tensor(self.torchair_graph_batch_sizes, - device="cpu", - dtype=torch.int32) - gathered_graph_batch_size = local.clone() - dist.all_reduce(gathered_graph_batch_size, - group=get_dp_group().cpu_group) - expected = local * self.dp_size - - if not torch.equal(gathered_graph_batch_size, expected): - diff_idxs = (gathered_graph_batch_size != expected).nonzero( - as_tuple=False).flatten().tolist() - raise AssertionError( - f"[Graph BatchSize Mismatch] Found mismatches at indices {diff_idxs}.\n" - f"Local (rank {self.dp_rank}): {local.tolist()}\n" - f"Sum over ranks: {gathered_graph_batch_size.tolist()}\n" - f"Expected if all equal: {[v * self.dp_size for v in local.tolist()]}" - ) def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler @@ -461,6 +436,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: num_computed_tokens=new_req_data.num_computed_tokens, output_token_ids=[], lora_request=new_req_data.lora_request, + num_computed_and_new_tokens_cp=new_req_data.num_computed_and_new_tokens_cp, ) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) @@ -507,6 +483,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: is_last_rank = get_pp_group().is_last_rank for i, req_id in enumerate(req_data.req_ids): req_state = self.requests[req_id] + + #cp param + req_state.kv_rank = req_data.kv_rank[i] + req_state.num_computed_and_new_tokens_cp = req_data.num_computed_and_new_tokens_cp[i] + num_computed_tokens = req_data.num_computed_tokens[i] new_block_ids = req_data.new_block_ids[i] resumed_from_preemption = req_data.resumed_from_preemption[i] @@ -543,6 +524,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_ids_to_add.append(req_id) continue + #cp param + self.input_batch.kv_rank[req_index] = req_state.kv_rank + self.input_batch.num_computed_and_new_tokens_cp[req_index] = req_state.num_computed_and_new_tokens_cp + # Update the persistent batch. 
self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) @@ -595,79 +580,44 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: self.input_batch.refresh_sampling_metadata() def _get_forward_metadata_across_dp( - self, num_tokens: int, with_prefill: bool, - enable_dbo: bool) -> tuple[torch.Tensor, bool, bool]: - - # Compose: all_reduce metadata (num_tokens of each rank, with_prefill, enable_dbo) - num_tokens_across_dp = torch.zeros(self.dp_size + 2, - dtype=torch.int32, - device="cpu") - num_tokens_across_dp[self.dp_rank] = num_tokens - num_tokens_across_dp[-2] = int(with_prefill) - num_tokens_across_dp[-1] = int(not enable_dbo) - dist.all_reduce(num_tokens_across_dp, group=get_dp_group().cpu_group) - with_prefill = bool(num_tokens_across_dp[-2]) - enable_dbo = not bool(num_tokens_across_dp[-1]) - num_tokens_across_dp = num_tokens_across_dp[:-2] - return num_tokens_across_dp, with_prefill, enable_dbo - - def _get_forward_metadata_across_dp_and_pad( - self, num_tokens: int, with_prefill: bool, enable_dbo: bool + self, + maybe_padded_num_tokens: int, + num_tokens: int, + with_prefill: bool, + enable_dbo: bool = False, ) -> tuple[int, Optional[torch.Tensor], bool, bool]: if self.dp_size == 1: - return num_tokens, None, with_prefill, enable_dbo + return maybe_padded_num_tokens, None, with_prefill, enable_dbo - if self.is_kv_producer and not envs_ascend.VLLM_ASCEND_ENABLE_CHUNK_MC2: - num_tokens_across_dp = torch.tensor([num_tokens] * self.dp_size, - device="cpu", - dtype=torch.int32) - return num_tokens, num_tokens_across_dp, True, enable_dbo + num_tokens_across_dp = [0] * self.dp_size * 2 + num_tokens_across_dp[self.dp_rank] = maybe_padded_num_tokens + num_tokens_across_dp[self.dp_size + self.dp_rank] = num_tokens + forward_metadata = torch.tensor(num_tokens_across_dp + + [with_prefill, not enable_dbo], + device="cpu", + dtype=torch.int32) + dist.all_reduce(forward_metadata, group=get_dp_group().cpu_group) + with_prefill = bool(forward_metadata[-2]) + + # NOTE: when with_prefill is false before all_reduce and true after all_reduce, we need to revert pad. + if with_prefill: + num_tokens_across_dp = forward_metadata[self.dp_size:self.dp_size * + 2] + maybe_padded_num_tokens = num_tokens + else: + num_tokens_across_dp = forward_metadata[:self.dp_size] - if self.is_kv_consumer and self.torchair_graph_enabled and len( - self.torchair_graph_batch_sizes - ) == 1 and not self.in_profile_run: - max_num_decode_tokens = self.torchair_graph_batch_sizes[0] - num_tokens_across_dp = torch.tensor([max_num_decode_tokens] * + # NOTE: when in torchair_graph_mode, we need to pad local_num_tokens to + # `max_tokens_across_dp`, in other situation it is not necessary. 
+ if self.torchair_graph_enabled and not with_prefill: + maybe_padded_num_tokens = torch.max(num_tokens_across_dp).item() + num_tokens_across_dp = torch.tensor([maybe_padded_num_tokens] * self.dp_size, device="cpu", dtype=torch.int32) - return max_num_decode_tokens, num_tokens_across_dp, False, enable_dbo - maybe_padded_num_tokens = num_tokens - num_tokens_across_dp, with_prefill, enable_dbo = self._get_forward_metadata_across_dp( - num_tokens, with_prefill, enable_dbo) - - if self.torchair_graph_enabled and not with_prefill: - max_num_token = num_tokens_across_dp.max().item() - maybe_padded_num_tokens = self.select_torchair_padded_batch_size( - max_num_token) - num_tokens_across_dp = torch.full((self.dp_size, ), - maybe_padded_num_tokens, - dtype=torch.int32, - device="cpu") - - return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, enable_dbo - - def _check_dbo_is_valid(self, query_lens: torch.Tensor, - attn_state: AscendAttentionState, - num_tokens: int) -> bool: - # do the checks for dp + dbo - if attn_state in [ - AscendAttentionState.DecodeOnly, - AscendAttentionState.SpecDecoding - ]: - return False - # considering the case that one dp rank may enable dbo while others may not - if not self.vllm_config.model_config.use_mla or not envs_ascend.VLLM_ASCEND_ENABLE_DBO: - return False - # TODO: remove it if token-level microbatch is enabled - [token_index, - seq_index] = compute_split_seq_index(query_lens, attn_state, - num_tokens) - if token_index == 0 or seq_index == 0 or seq_index == len( - query_lens) or num_tokens < 256: - return False - return True + return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, not bool( + forward_metadata[-1]) def get_eagle_atten_dict( self, @@ -1004,6 +954,73 @@ def _gather_mm_embeddings( mm_embeds.append(mm_embeds_item) return mm_embeds + def _num_scheduled_tokens_prefill_cp( + self, + num_tokens, + num_computed_tokens, + ): + num_scheduled_tokens = num_tokens - num_computed_tokens + num_cp_padded_scheduled_tokens = cdiv(num_scheduled_tokens, 2 * self.cp_size) * (2 * self.cp_size) + + cp_pad = num_cp_padded_scheduled_tokens - num_scheduled_tokens # needed later for sampling + + full_indices = list(range(num_computed_tokens, num_computed_tokens + num_cp_padded_scheduled_tokens)) + + chunk_size = num_cp_padded_scheduled_tokens // (2 * self.cp_size) + assert chunk_size > cp_pad, "req_length < 2 * cp_size is not supported" + + req_position_cp = [] + req_position_cp.extend(full_indices[self.cp_rank * chunk_size: (self.cp_rank + 1) * chunk_size]) + req_position_cp.extend(full_indices[num_cp_padded_scheduled_tokens - (self.cp_rank + 1) * chunk_size: num_cp_padded_scheduled_tokens - self.cp_rank * chunk_size]) + + return req_position_cp, num_cp_padded_scheduled_tokens, cp_pad + + + def _slot_mapping_prefill_cp( + self, + num_scheduled_tokens_for_slot, + ): + block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor() + num_computed_and_new_tokens_batch = self.input_batch.num_computed_and_new_tokens_cp + start_index = 0 + for i in range(self.input_batch.num_reqs): + block_table_req = block_table_cpu[i] + block_table_indices = np.repeat(block_table_req, self.block_size) + num_save_tokens_rank = num_computed_and_new_tokens_batch[i][self.cp_rank] + + positions_for_slot = self.arange_np[:num_save_tokens_rank] + block_offsets = positions_for_slot % self.block_size + slot_mapping = (block_table_indices * self.block_size)[:num_save_tokens_rank] + block_offsets + + num_cp_padded_scheduled_tokens = num_scheduled_tokens_for_slot[i] + kv_save_start = 
sum(num_computed_and_new_tokens_batch[i][:self.cp_rank]) + + self.slot_mapping_np[start_index+kv_save_start:start_index+kv_save_start+num_save_tokens_rank] = slot_mapping + + start_index += num_cp_padded_scheduled_tokens + + def _slot_mapping_decode_cp( + self, + num_scheduled_tokens, + ): + block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor() + num_computed_and_new_tokens_batch = self.input_batch.num_computed_and_new_tokens_cp + start_index = 0 + for i in range(self.input_batch.num_reqs): + if self.input_batch.kv_rank[i] == self.cp_rank: + block_table_req = block_table_cpu[i] + block_table_indices = np.repeat(block_table_req, self.block_size) + num_save_tokens_rank = num_computed_and_new_tokens_batch[i][self.cp_rank] + + positions_for_slot = self.arange_np[:num_save_tokens_rank] + block_offsets = positions_for_slot % self.block_size + slot_mapping = (block_table_indices * self.block_size)[:num_save_tokens_rank] + block_offsets + + num_scheduled_tokens_req = num_scheduled_tokens[i] + self.slot_mapping_np[start_index:start_index+num_scheduled_tokens_req] = slot_mapping[-num_scheduled_tokens_req:] + + start_index += num_scheduled_tokens_req + def _process_reqs( self, scheduler_output: "SchedulerOutput", @@ -1014,6 +1031,7 @@ def _process_reqs( Optional[set[str]], Optional[set[str]]]: # Check input valid total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + self.cp_rank = get_context_model_parallel_rank() assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs assert num_reqs > 0 @@ -1039,15 +1057,29 @@ def _process_reqs( # TODO: The Python loop can be slow. Optimize. num_scheduled_tokens = np.empty(num_reqs, dtype=np.int32) num_valid_tokens = np.empty(num_reqs, dtype=np.int32) + num_cp_pads = np.empty(num_reqs, dtype=np.int32) max_num_scheduled_tokens = 0 + start_index = 0 + num_scheduled_tokens_for_slot = np.empty(num_reqs, dtype=np.int32) for i, req_id in enumerate(self.input_batch.req_ids): num_tokens = scheduler_output.num_scheduled_tokens[req_id] + if self.cp_size > 1 and num_tokens > 1: + req_position_cp, num_cp_padded_scheduled_tokens, num_cp_pads[i] = self._num_scheduled_tokens_prefill_cp(num_tokens, self.input_batch.num_computed_tokens_cpu[i]) + num_tokens = len(req_position_cp) + self.position_cp[start_index:start_index+num_tokens] = req_position_cp + start_index += num_tokens + num_scheduled_tokens_for_slot[i] = num_cp_padded_scheduled_tokens + num_scheduled_tokens[i] = num_tokens num_valid_tokens[i] = num_tokens - \ len(scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) max_num_scheduled_tokens = max(max_num_scheduled_tokens, num_tokens) + # update total_num_scheduled_tokens after the CP split + total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs]) + num_input_tokens = total_num_scheduled_tokens # graph mode padding is not considered here yet + # Hot-Swap lora model if self.lora_config: self.set_active_loras(self.input_batch, num_scheduled_tokens) @@ -1058,15 +1090,26 @@ def _process_reqs( cu_num_tokens = np.cumsum(num_scheduled_tokens) cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, num_scheduled_tokens) - logits_indices = cu_num_tokens - 1 + + # sample (logits) indices; skip CP padding for prefill + if self.cp_size > 1 and self.attn_metadata_builder._num_prefills > 0: + logits_indices = cu_num_tokens - num_cp_pads[:num_reqs] - 1 + else: + logits_indices = cu_num_tokens - 1 logits_indices = torch.from_numpy(logits_indices).to(self.device, non_blocking=True) arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets positions_np = 
self.positions_np[:total_num_scheduled_tokens] - np.add(self.input_batch.num_computed_tokens_cpu[req_indices], - arange, - out=positions_np) + + if self.cp_size > 1 and self.attn_metadata_builder._num_prefills > 0: + np.add(self.input_batch.num_computed_tokens_cpu[req_indices], + self.position_cp[:total_num_scheduled_tokens], + out=positions_np) + else: + np.add(self.input_batch.num_computed_tokens_cpu[req_indices], + arange, + out=positions_np) # Calculate M-RoPE positions. # Only relevant for models using M-RoPE (e.g, Qwen2-VL) @@ -1090,15 +1133,26 @@ def _process_reqs( num_scheduled_tokens) seq_lens = self.seq_lens_cpu[:num_reqs] - block_table_indices = (req_indices * self.max_num_blocks_per_req + - positions_np // self.block_size) + if self.cp_size > 1: + if self.attn_metadata_builder._num_prefills > 0: + total_num_scheduled_tokens_for_slot = sum(num_scheduled_tokens_for_slot[:num_reqs]) + self.slot_mapping_np[:total_num_scheduled_tokens_for_slot] = -1 - block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor() - block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() - block_offsets = positions_np % self.block_size - np.add(block_numbers * self.block_size, - block_offsets, - out=self.slot_mapping_np[:total_num_scheduled_tokens]) + self._slot_mapping_prefill_cp(num_scheduled_tokens_for_slot) + elif self.attn_metadata_builder._num_decodes > 0: + self.slot_mapping_np[:total_num_scheduled_tokens] = -1 + self._slot_mapping_decode_cp(num_scheduled_tokens) + + else: + block_table_indices = (req_indices * self.max_num_blocks_per_req + + positions_np // self.block_size) + + block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor() + block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() + block_offsets = positions_np % self.block_size + np.add(block_numbers * self.block_size, + block_offsets, + out=self.slot_mapping_np[:total_num_scheduled_tokens]) ascend_config = get_ascend_config() use_spec_decode = len( @@ -1143,17 +1197,15 @@ def _process_reqs( with_prefill = attn_state not in [ AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding ] - enable_dbo = self._check_dbo_is_valid(self.query_lens.tolist(), - attn_state, - total_num_scheduled_tokens) - enable_dbo = self._check_dbo_is_valid(self.query_lens.tolist(), - attn_state, - total_num_scheduled_tokens) + maybe_padded_num_tokens = total_num_scheduled_tokens + if self.torchair_graph_enabled and not with_prefill: + maybe_padded_num_tokens = self.select_torchair_padded_batch_size( + total_num_scheduled_tokens) (padded_num_tokens_across_dp, num_tokens_across_dp, with_prefill, - enable_dbo) = self._get_forward_metadata_across_dp_and_pad( - total_num_scheduled_tokens, with_prefill, enable_dbo) - extra_builder_kwargs['enable_dbo_across_dp'] = enable_dbo + enable_dbo) = self._get_forward_metadata_across_dp( + maybe_padded_num_tokens, total_num_scheduled_tokens, with_prefill) + if self.torchair_graph_enabled and not with_prefill: graph_pad_size = padded_num_tokens_across_dp - total_num_scheduled_tokens @@ -1177,6 +1229,13 @@ def _process_reqs( **extra_builder_kwargs, ) + # token id pad + if self.cp_size > 1 and self.cp_rank == 0: + for i in range(num_reqs): + if num_scheduled_tokens[i] > 1: # prefill + num_padded_tokens = num_scheduled_tokens_for_slot[i]+self.input_batch.num_computed_tokens_cpu[i] + self.input_batch.token_ids_cpu[i][num_padded_tokens-num_cp_pads[i]: num_padded_tokens] = -1 + # Prepare input_ids token_indices = (positions_np + req_indices * 
self.input_batch.token_ids_cpu.shape[1]) @@ -1413,52 +1472,40 @@ def apply_grammar_bitmask( scheduler_output: "SchedulerOutput", logits: torch.Tensor, ) -> torch.Tensor: + # Serialization of np.ndarray is much more efficient than a tensor, + # so we receive it in that format. grammar_bitmask = scheduler_output.grammar_bitmask - # We receive the structured output bitmask from the scheduler, - # compacted to contain bitmasks only for structured output requests. - # The order of the requests in the bitmask is not guaranteed to be the - # same as the order of the requests in the gpu runner's batch. We need - # to sort the bitmask to match the order of the requests used here. - - # Get the batch indices of the structured output requests. - # Keep track of the number of speculative tokens scheduled for every - # request in the batch, as the logit indices are offset by this amount. + # We receive the structured output bitmask from the scheduler, but the + # indices of the requests in the batch may not match the indices of + # the bitmask since the scheduler doesn't know how the gpu runner is + # ordering the requests in the batch. We need to sort the bitmask to + # match the order of the requests used here. struct_out_req_batch_indices: dict[str, int] = {} - cumulative_offset = 0 - seq = sorted(self.input_batch.req_id_to_index.items(), - key=lambda x: x[1]) - for req_id, batch_index in seq: - logit_index = batch_index + cumulative_offset - cumulative_offset += len( - scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - if req_id in scheduler_output.structured_output_request_ids: - struct_out_req_batch_indices[req_id] = logit_index - - out_indices = [] - - # Reorder the bitmask to match the order of the requests in the batch. - sorted_bitmask = np.zeros_like(grammar_bitmask, - shape=(logits.shape[0], - grammar_bitmask.shape[1])) - cumulative_index = 0 - seq = sorted(scheduler_output.structured_output_request_ids.items(), - key=lambda x: x[1]) - for req_id, _ in seq: - logit_index = struct_out_req_batch_indices[req_id] - num_spec_tokens = len( - scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + i] = \ - grammar_bitmask[cumulative_index + i] - out_indices.append(logit_index + i) - cumulative_index += 1 + num_spec_tokens - grammar_bitmask = sorted_bitmask + indices_match = True + for req_id in self.input_batch.req_ids: + mask_index = scheduler_output.structured_output_request_ids.get( + req_id) + if mask_index is None: + # not a structured output request + continue + batch_index = self.input_batch.req_id_to_index[req_id] + if batch_index != mask_index: + indices_match = False + struct_out_req_batch_indices[req_id] = batch_index + + if not indices_match: + # Sort the bitmask to match the order of the requests + sorted_bitmask = np.zeros_like(grammar_bitmask) + for req_id, batch_index in struct_out_req_batch_indices.items(): + orig_index = scheduler_output.structured_output_request_ids[ + req_id] + sorted_bitmask[batch_index] = grammar_bitmask[orig_index] + grammar_bitmask = sorted_bitmask - # Serialization of np.ndarray is much more efficient than a tensor, - # so we receive it in that format. grammar_bitmask = torch.from_numpy(grammar_bitmask) + # TODO: compatibility with spec decode. # NOTE: # 1. XGrammar bitmask applying only supports CPU and GPU. # 2. The logits and bitmask should be on the same device. 
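# A small numpy sketch of the bitmask re-sorting restored above: when the scheduler's row
# order differs from the runner's batch order, rows are copied into batch order before the
# bitmask is applied to the logits. Request ids and mask values below are invented.
import numpy as np

grammar_bitmask = np.array([[0b1010], [0b0110]], dtype=np.int32)   # rows in scheduler order
structured_output_request_ids = {"req-a": 0, "req-b": 1}           # req id -> bitmask row
req_id_to_index = {"req-a": 1, "req-b": 0}                         # req id -> batch row

sorted_bitmask = np.zeros_like(grammar_bitmask)
for req_id, mask_index in structured_output_request_ids.items():
    sorted_bitmask[req_id_to_index[req_id]] = grammar_bitmask[mask_index]

# the row for "req-b" now sits at batch index 0, matching the logits layout
assert (sorted_bitmask == np.array([[0b0110], [0b1010]])).all()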
@@ -1468,7 +1515,7 @@ def apply_grammar_bitmask( xgr.apply_token_bitmask_inplace( logits, grammar_bitmask, - indices=out_indices, + indices=list(struct_out_req_batch_indices.values()), ) return logits.to(self.device).to(logits_dtype) @@ -1510,9 +1557,6 @@ def _pool( hidden_states: torch.Tensor, num_scheduled_tokens: int, num_scheduled_tokens_np: np.ndarray, - finished_sending: Optional[set[str]] = None, - finished_recving: Optional[set[str]] = None, - kv_connector_output: Optional["KVConnectorOutput"] = None, ) -> ModelRunnerOutput: assert self.input_batch.num_reqs ==\ len(self.input_batch.pooling_params), \ @@ -1538,12 +1582,6 @@ def _pool( pooler_output.append(raw_output.data.cpu()) else: pooler_output.append(None) - extra_args = ({ - "finished_sending": finished_sending, - "finished_recving": finished_recving - } if vllm_version_is("0.10.0") else { - "kv_connector_output": kv_connector_output - }) return ModelRunnerOutput( req_ids=self.input_batch.req_ids, @@ -1553,7 +1591,6 @@ def _pool( logprobs=None, prompt_logprobs_dict={}, pooler_output=pooler_output, - **extra_args, ) @torch.inference_mode() @@ -1578,13 +1615,7 @@ def execute_model( num_scheduled_tokens_np, finished_sending, finished_recving) = (self._process_reqs(scheduler_output, intermediate_tensors)) - kv_connector_output = None - if not vllm_version_is("0.10.0"): - kv_connector_output = KVConnectorOutput( - finished_sending=finished_sending, - finished_recving=finished_recving) - finished_sending = None - finished_recving = None + with ProfileExecuteDuration().capture_async("post process"): # Broadcast PP output for external_launcher (torchrun) # to make sure we are synced across pp ranks @@ -1596,12 +1627,6 @@ def execute_model( if not get_pp_group().is_last_rank: # For mid-pipeline stages, return the hidden states. 
if not broadcast_pp_output: - if kv_connector_output is not None: - hidden_states.kv_connector_output = kv_connector_output - else: - #TODO: Remove this after we drop vllm v0.10.0 - hidden_states.finished_sending = finished_sending - hidden_states.finished_recving = finished_recving return hidden_states assert isinstance(hidden_states, IntermediateTensors) get_pp_group().send_tensor_dict( @@ -1610,9 +1635,7 @@ def execute_model( else: if self.input_batch.pooling_params: return self._pool(hidden_states, num_scheduled_tokens, - num_scheduled_tokens_np, - finished_sending, finished_recving, - kv_connector_output) + num_scheduled_tokens_np) sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None) if broadcast_pp_output: @@ -1746,23 +1769,17 @@ def execute_model( if has_kv_transfer_group(): get_kv_transfer_group().clear_connector_metadata() - extra_args = ({ - "finished_sending": finished_sending, - "finished_recving": finished_recving - } if vllm_version_is("0.10.0") else { - "kv_connector_output": kv_connector_output - }) - - model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=valid_sampled_token_ids, - spec_token_ids=spec_token_ids, - logprobs=logprobs_lists, - prompt_logprobs_dict=prompt_logprobs_dict, - pooler_output=[], - **extra_args, - ) + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + spec_token_ids=spec_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + finished_sending=finished_sending, + finished_recving=finished_recving, + ) durations = ProfileExecuteDuration().pop_captured_sync() if durations: @@ -1827,10 +1844,15 @@ def _dummy_run( with_prefill: bool = False, is_torchair_compile: bool = False, ) -> torch.Tensor: + maybe_padded_num_tokens = num_tokens + if self.torchair_graph_enabled and not with_prefill: + maybe_padded_num_tokens = self.select_torchair_padded_batch_size( + num_tokens) + # Padding for DP (num_tokens, num_tokens_across_dp, with_prefill, - _) = self._get_forward_metadata_across_dp_and_pad( - num_tokens, with_prefill, False) + enable_dbo) = self._get_forward_metadata_across_dp( + maybe_padded_num_tokens, num_tokens, with_prefill, False) # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py index d0acd04cd0..8d5147a233 100644 --- a/vllm_ascend/worker/npu_input_batch.py +++ b/vllm_ascend/worker/npu_input_batch.py @@ -58,6 +58,10 @@ class CachedRequestState: lora_request: Optional[LoRARequest] = None + #cp param + kv_rank: Optional[int] = None + num_computed_and_new_tokens_cp: Optional[int] = None + def __post_init__(self): self.num_prompt_tokens = len(self.prompt_token_ids) @@ -262,6 +266,10 @@ def __init__( self.pooling_params: dict[str, PoolingParams] = {} + #cp param + self.kv_rank: list[bool] = [None] * max_num_reqs + self.num_computed_and_new_tokens_cp: list[int] = [None] * max_num_reqs + @property def req_ids(self) -> list[str]: # None elements should only be present transiently @@ -287,6 +295,10 @@ def add_request( self.req_id_to_index[req_id] = req_index + #cp param + self.kv_rank[req_index] = request.kv_rank + self.num_computed_and_new_tokens_cp[req_index] = 
request.num_computed_and_new_tokens_cp + # Copy the prompt token ids and output token ids. num_prompt_tokens = len(request.prompt_token_ids) self.num_prompt_tokens[req_index] = num_prompt_tokens diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index d9bff3c1b6..d2dd69005c 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -17,7 +17,6 @@ # Adapted from vllm-project/vllm/vllm/worker/gpu_worker.py # -import copy from typing import Optional import torch @@ -28,8 +27,7 @@ from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) -from vllm.distributed.kv_transfer import (ensure_kv_transfer_initialized, - has_kv_transfer_group) +from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized from vllm.distributed.parallel_state import get_pp_group, get_tp_group from vllm.logger import logger from vllm.lora.request import LoRARequest @@ -37,7 +35,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput +from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.worker_base import WorkerBase from vllm_ascend.ascend_config import init_ascend_config @@ -71,10 +69,8 @@ def __init__( from vllm_ascend import ops ops.register_dummy_fusion_op() _register_atb_extensions() - - # init ascend config and soc version + # init ascend config init_ascend_config(vllm_config) - init_ascend_soc_version() super().__init__(vllm_config=vllm_config, local_rank=local_rank, @@ -83,6 +79,9 @@ def __init__( is_driver_worker=is_driver_worker) # Try to import mindie_turbo to accelerate vLLM inference. + local_dp_rank = self.vllm_config.parallel_config.data_parallel_rank_local + world_size = self.vllm_config.parallel_config.world_size + self.local_rank_across_dp = local_dp_rank * world_size + self.local_rank try_register_lib( "mindie_turbo", "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo." @@ -130,19 +129,18 @@ def initialize_cache(self, num_gpu_blocks: int, self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - def _init_device(self): + def init_device(self): device = torch.device(f"npu:{self.local_rank}") NPUPlatform.set_device(device) NPUPlatform.empty_cache() self.init_npu_memory = NPUPlatform.mem_get_info()[0] + + init_ascend_soc_version() # Initialize the distributed environment. self._init_worker_distributed_environment() # Set random seed. NPUPlatform.seed_everything(self.model_config.seed) - return device - def init_device(self): - device = self._init_device() # Init ModelRunner here, so that we have access to self.device. 
self.model_runner = NPUModelRunner(self.vllm_config, device) @@ -206,33 +204,9 @@ def execute_model( assert isinstance(output, IntermediateTensors) get_pp_group().send_tensor_dict(output.tensors, all_gather_group=get_tp_group()) - if not has_kv_transfer_group(): - return None - - is_legacy = vllm_version_is("0.10.0") - - if is_legacy: - finished_sending = output.finished_sending - finished_recving = output.finished_recving - else: - kv_connector_output = output.kv_connector_output - finished_sending = kv_connector_output.finished_sending - finished_recving = kv_connector_output.finished_recving - - if not finished_sending and not finished_recving: - return EMPTY_MODEL_RUNNER_OUTPUT - - new_output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) - - if is_legacy: - new_output.finished_sending = finished_sending - new_output.finished_recving = finished_recving - else: - new_output.kv_connector_output = kv_connector_output - return new_output - + return None assert isinstance(output, ModelRunnerOutput) - return output + return output if self.is_driver_worker else None def load_model(self) -> None: if self.vllm_config.model_config.enable_sleep_mode: @@ -310,7 +284,8 @@ def _init_worker_distributed_environment(self) -> None: self.local_rank, "hccl") ensure_model_parallel_initialized( self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size) + self.parallel_config.pipeline_parallel_size, + self.parallel_config.context_parallel_size,) init_ascend_model_parallel(self.parallel_config) ensure_kv_transfer_initialized(self.vllm_config)
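# A standalone sketch of the context-parallel prefill split introduced in
# _num_scheduled_tokens_prefill_cp (model_runner_v1.py) earlier in this diff: the padded
# request is cut into 2 * cp_size chunks and each rank takes chunk r plus its mirror chunk,
# a layout commonly used to balance causal-attention cost across CP ranks. The function
# and the example values below are illustrative only.
import math

def zigzag_positions(num_tokens: int, num_computed: int, cp_size: int, cp_rank: int):
    num_scheduled = num_tokens - num_computed
    padded = math.ceil(num_scheduled / (2 * cp_size)) * (2 * cp_size)
    full = list(range(num_computed, num_computed + padded))
    chunk = padded // (2 * cp_size)
    head = full[cp_rank * chunk:(cp_rank + 1) * chunk]
    tail = full[padded - (cp_rank + 1) * chunk:padded - cp_rank * chunk]
    return head + tail

# 12 new tokens, cp_size=2 -> 4 chunks of 3 positions each
print(zigzag_positions(12, 0, cp_size=2, cp_rank=0))   # [0, 1, 2, 9, 10, 11]
print(zigzag_positions(12, 0, cp_size=2, cp_rank=1))   # [3, 4, 5, 6, 7, 8]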