
Commit fcc9a0e

Update torch-npu version to 2.7.1 (#3896)
### What this PR does / why we need it?

Upgrade torch-npu to the official release version 2.7.1.

- vLLM version: v0.11.0
- vLLM main: vllm-project/vllm@83f478b

---------

Signed-off-by: wangxiyuan <[email protected]>
1 parent 5f6d1b3 commit fcc9a0e
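
As a quick, illustrative aside (not part of this commit): after reinstalling with the new pins, one way to confirm that an environment picked up the official release rather than the old dev snapshot is to read the installed distribution metadata.

```python
# Illustrative check only -- not code from this PR.
# Verifies that torch and torch-npu resolve to the matching 2.7.1 release
# (torch-npu was previously pinned to 2.7.1.dev20250724).
from importlib.metadata import version

print("torch:    ", version("torch"))      # expected: 2.7.1
print("torch-npu:", version("torch-npu"))  # expected: 2.7.1
```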

File tree

15 files changed (+78, -163 lines)


README.md

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
 - Software:
 * Python >= 3.9, < 3.12
 * CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
-* PyTorch == 2.7.1, torch-npu == 2.7.1.dev20250724
+* PyTorch == 2.7.1, torch-npu == 2.7.1
 * vLLM (the same version as vllm-ascend)
 
 ## Getting Started

README.zh.md

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ The vLLM Ascend plugin (`vllm-ascend`) is a community-maintained plugin for running vLLM on Ascend NP
 - Software:
 * Python >= 3.9, < 3.12
 * CANN >= 8.2.rc1 (for the matching Ascend HDK version, see [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
-* PyTorch == 2.7.1, torch-npu == 2.7.1.dev20250724
+* PyTorch == 2.7.1, torch-npu == 2.7.1
 * vLLM (same version as vllm-ascend)
 
 ## Getting Started

docs/source/installation.md

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ This document describes how to install vllm-ascend manually.
 |---------------|----------------------------------|-------------------------------------------|
 | Ascend HDK | Refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html) | Required for CANN |
 | CANN | >= 8.2.RC1 | Required for vllm-ascend and torch-npu |
-| torch-npu | == 2.7.1.dev20250724 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps |
+| torch-npu | == 2.7.1 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps |
 | torch | == 2.7.1 | Required for torch-npu and vllm |
 
 There are two installation methods:

examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 * Software:
 * Python >= 3.9, < 3.12
 * CANN >= 8.2.rc1
-* PyTorch == 2.7.1, torch-npu == 2.7.1.dev20250724
+* PyTorch == 2.7.1, torch-npu == 2.7.1
 * vLLM (same version as vllm-ascend)
 * mooncake-transfer-engine reference documentation: https://github.com/kvcache-ai/Mooncake/blob/main/doc/zh/ascend_transport.md

examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 * Software:
 * Python >= 3.9, < 3.12
 * CANN >= 8.2.rc1
-* PyTorch == 2.7.1, torch-npu == 2.7.1.dev20250724
+* PyTorch == 2.7.1, torch-npu == 2.7.1
 * vLLM:main branch
 * vLLM-Ascend:main branch
 * Mooncake:[AscendTransport/Mooncake at pooling-async-memcpy](https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy)(Currently available branch code, continuously updated.)

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ requires = [
 "pandas-stubs",
 "setuptools>=64",
 "setuptools-scm>=8",
-"torch-npu==2.7.1.dev20250724",
+"torch-npu==2.7.1",
 "torch==2.7.1",
 "torchvision",
 "wheel",

requirements.txt

Lines changed: 3 additions & 3 deletions
@@ -25,6 +25,6 @@ quart
 numba
 
 # Install torch_npu
---pre
---extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
-torch-npu==2.7.1.dev20250724
+#--pre
+#--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
+torch-npu==2.7.1
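
Because 2.7.1 is now an official release on the default index, the `--pre` flag and the Huawei Cloud extra index above are commented out rather than removed. As a hypothetical illustration (not part of this PR), a small guard script could keep the pin and the commented-out index options from regressing:

```python
# Hypothetical guard, not from the repo: check that requirements.txt keeps the
# torch-npu==2.7.1 pin and no active --pre / --extra-index-url lines.
from pathlib import Path

active = [
    line.strip()
    for line in Path("requirements.txt").read_text().splitlines()
    if line.strip() and not line.strip().startswith("#")
]
assert "torch-npu==2.7.1" in active, "torch-npu pin changed"
assert not any(line.startswith(("--pre", "--extra-index-url")) for line in active), \
    "pre-release index options should stay commented out"
```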

tests/ut/ops/test_layernorm.py

Lines changed: 9 additions & 25 deletions
@@ -7,7 +7,6 @@
 
 from tests.ut.base import PytestBase
 from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
-from vllm_ascend.utils import version_check
 
 
 def mock_rms_norm(x, weight, eps):
@@ -18,15 +17,6 @@ def mock_add_rms_norm(x, residual, weight, eps):
     return 2 * x, None, 2 * residual
 
 
-def mock_add_rms_norm_quant(x, residual, weight, quant_scale, quant_offset,
-                            epsilon):
-    x_out = 2 * x
-    residual_out = 2 * residual
-    x_out_quant = x_out.to(torch.int8)
-    residual_out_quant = residual_out.to(torch.int8)
-    return x_out_quant, None, residual_out_quant
-
-
 def mock_add_rms_norm_quant_with_bias(x, residual, weight, quant_scale,
                                       quant_offset, beta, epsilon):
     x_out = 2 * x
@@ -43,10 +33,8 @@ def context(self, mocker: MockerFixture):
         mocker.patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm)
         mocker.patch("torch_npu.npu_add_rms_norm",
                      side_effect=mock_add_rms_norm)
-        torch_npu_check = version_check()
-        arnq_side_effect = mock_add_rms_norm_quant_with_bias if torch_npu_check else mock_add_rms_norm_quant
         mocker.patch("torch_npu.npu_add_rms_norm_quant",
-                     side_effect=arnq_side_effect)
+                     side_effect=mock_add_rms_norm_quant_with_bias)
         mocker.patch("torch.ops.vllm.maybe_wait_prefetch_done",
                      side_effect=lambda x: None)
 
@@ -82,8 +70,7 @@ def test_forward_oot_with_quant_fusion(self, mocker: MockerFixture):
 
         mock_model_instance = mocker.MagicMock()
         mock_forward_context.model_instance = mock_model_instance
-        torch_npu_check = version_check()
-        num_hidden_layers = 3 if torch_npu_check else 2
+        num_hidden_layers = 3
         mock_model_instance.model.layers = [
             mocker.MagicMock() for _ in range(num_hidden_layers)
         ]
@@ -136,34 +123,31 @@ def test_forward_oot_with_quant_fusion(self, mocker: MockerFixture):
         assert mock_forward_context.fusion_linear == "gate_up_dense"
         assert mock_forward_context.layer_idx == 1
 
-        if torch_npu_check:
-            mock_forward_context.fusion_linear = "gate_moe"
+        mock_forward_context.fusion_linear = "gate_moe"
         x_out, residual_out = layer.forward_oot(x, residual)
 
-        assert mock_get_forward_context.call_count == 6
-        fusion_linear_expected = "qkv_moe" if torch_npu_check else "qkv_dense"
+        assert mock_get_forward_context.call_count == 5
+        fusion_linear_expected = "qkv_moe"
         assert mock_forward_context.fusion_linear == fusion_linear_expected
         assert mock_forward_context.layer_idx == 2
 
         x_out, residual_out = layer.forward_oot(x, residual)
 
-        assert mock_get_forward_context.call_count == 7
-        fusion_linear_expected = "gate_moe" if torch_npu_check else "qkv_dense"
+        assert mock_get_forward_context.call_count == 6
+        fusion_linear_expected = "gate_moe"
         assert mock_forward_context.fusion_linear == fusion_linear_expected
         assert mock_forward_context.layer_idx == 2
 
-        if not torch_npu_check:
-            return
         # last layer returned directly
         x_out, residual_out = layer.forward_oot(x, residual)
 
-        assert mock_get_forward_context.call_count == 8
+        assert mock_get_forward_context.call_count == 7
         assert mock_forward_context.fusion_linear == "qkv_moe"
         assert mock_forward_context.layer_idx == 3
 
         x_out, residual_out = layer.forward_oot(x, residual)
 
-        assert mock_get_forward_context.call_count == 9
+        assert mock_get_forward_context.call_count == 8
         assert mock_forward_context.fusion_linear == "qkv_moe"
         assert mock_forward_context.layer_idx == 3

tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py

Lines changed: 6 additions & 7 deletions
@@ -23,9 +23,9 @@ def setUp(self):
     @patch("torch_npu.npu_swiglu")
     @patch("torch_npu.npu_dynamic_quant")
     @patch("torch_npu.npu_moe_finalize_routing")
-    @patch("torch_npu.npu_moe_init_routing")
+    @patch("torch_npu.npu_moe_init_routing_quant")
     def test_torchair_fused_experts_with_all2all(
-            self, mock_moe_init_routing, mock_moe_finalize_routing,
+            self, mock_npu_moe_init_routing_quant, mock_moe_finalize_routing,
             mock_dynamic_quant, mock_swiglu, mock_grouped_matmul,
             mock_moe_re_routing, mock_all_to_all_single):
 
@@ -38,11 +38,10 @@ def test_torchair_fused_experts_with_all2all(
         placeholder_ones = torch.ones(self.num_tokens, dtype=torch.int32)
         mock_all_to_all_single.side_effect = lambda output, input, *args, **kwargs: output.copy_(
             input)
-        mock_moe_init_routing.return_value = (
-            placeholder_int8,
-            placeholder_ones,
-            placeholder_ones,
-        )
+        mock_npu_moe_init_routing_quant.return_value = (
+            placeholder_int8, placeholder_ones, placeholder_ones,
+            torch.bincount(placeholder_ones, minlength=len(expert_map)),
+            torch.randn(self.num_tokens))
         mock_moe_re_routing.return_value = (placeholder_int8, self.placeholder,
                                             torch.randint(0,
                                                           100,

vllm_ascend/ascend_forward_context.py

Lines changed: 2 additions & 5 deletions
@@ -11,8 +11,7 @@
                                   set_forward_context)
 
 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.utils import (enable_sp, has_layer_idx, is_moe_model,
-                               version_check)
+from vllm_ascend.utils import enable_sp, has_layer_idx, is_moe_model
 
 if TYPE_CHECKING:
     from vllm_ascend.ops.weight_prefetch import WeightPrefetchMethod
@@ -163,9 +162,7 @@ def set_ascend_forward_context(
         # this optim now just support dense models due to the specific operators used.
         # Once the necessary conditions are met, support for MOE models will also be added.
         from vllm_ascend.quantization.quant_config import AscendQuantConfig
-        model_type_scope = ["llama", "qwen2", "qwen3"]
-        if version_check():
-            model_type_scope.append("qwen3_moe")
+        model_type_scope = ["llama", "qwen2", "qwen3", "qwen3_moe"]
         addrmsnorm_quant_fusion_enabled = isinstance(vllm_config.quant_config, AscendQuantConfig) and \
             vllm_config.model_config.hf_config.model_type in model_type_scope and \
             forward_context.layer_idx is not None
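
For readers skimming the hunk above: with the torch-npu version gate removed, fusion eligibility reduces to a fixed model-type allowlist. Below is a simplified sketch of that condition (assumed shape for illustration, not the repo's exact code; the real check lives inline in `set_ascend_forward_context`):

```python
# Simplified sketch (assumed shape, not copied from the repo): after this
# change, "qwen3_moe" is always in scope and no torch-npu version check
# (version_check) is consulted anymore.
from vllm_ascend.quantization.quant_config import AscendQuantConfig

MODEL_TYPE_SCOPE = ["llama", "qwen2", "qwen3", "qwen3_moe"]

def addrmsnorm_quant_fusion_enabled(quant_config, model_type, layer_idx) -> bool:
    # Hypothetical standalone helper mirroring the inline condition.
    return (isinstance(quant_config, AscendQuantConfig)
            and model_type in MODEL_TYPE_SCOPE
            and layer_idx is not None)
```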
