
Commit 29bd923

[v0.11.0][Perf] Delete redundant operations in model_runner and forward_context (#3775)
cherry pick #3677

### What this PR does / why we need it?

Remove redundant operations from `model_runner` and `forward_context`. This optimization significantly reduces the idle time (bubble) before decoding when running models with small parameter counts (e.g., Qwen/Qwen2.5-0.5B).

Tested on 800I A2, the bubble drops from 3.8 ms to 2.8 ms:

Before

<img width="1655" height="696" alt="image" src="https://github.com/user-attachments/assets/d7608e52-2438-46dd-8fc9-391fd6274495" />

After

<img width="1607" height="774" alt="image" src="https://github.com/user-attachments/assets/56daf081-2dba-4d2e-99d4-e055187d9806" />

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

---------

Signed-off-by: realliujiaxu <[email protected]>
1 parent 75de3fa commit 29bd923

File tree

5 files changed: +34 −25 lines


tests/ut/worker/test_model_runner_v1.py

Lines changed: 4 additions & 0 deletions
@@ -68,6 +68,8 @@ def test_select_moe_comm_method(soc_version, enable_expert_parallel,
     with patch('vllm_ascend.worker.model_runner_v1.get_ascend_soc_version',
                return_value=soc_version), \
             patch('vllm_ascend.worker.model_runner_v1.is_global_first_rank',
+                  return_value=True), \
+            patch('vllm_ascend.worker.model_runner_v1.is_moe_model',
                   return_value=True):

         # Bind the real method to the mock object
@@ -102,6 +104,8 @@ def test_select_moe_comm_method_unsupported_soc():
                return_value=unsupported_soc), \
             patch('vllm_ascend.worker.model_runner_v1.is_global_first_rank',
                   return_value=True), \
+            patch('vllm_ascend.worker.model_runner_v1.is_moe_model',
+                  return_value=True), \
             pytest.raises(ValueError, match=f"Unsupported soc_version: {unsupported_soc}"):

         NPUModelRunner._select_moe_comm_method(mock_runner, 100, False)
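The added `is_moe_model` patch is needed because `_select_moe_comm_method` now returns `None` early for dense models, so the soc-version assertions would never be reached without forcing the MoE path. A self-contained sketch of that interaction (the names below are illustrative stand-ins, not the real vllm-ascend APIs):

```python
# Illustrative only: Helpers and select_moe_comm_method are toy stand-ins
# mirroring the early return added by this PR.
from unittest.mock import patch


class Helpers:
    @staticmethod
    def is_moe_model(cfg) -> bool:
        return False  # pretend the loaded model is dense


def select_moe_comm_method(cfg):
    if not Helpers.is_moe_model(cfg):
        return None  # dense model: skip MoE comm selection entirely
    return "allgather"


print(select_moe_comm_method(cfg=None))  # -> None (dense path)
with patch.object(Helpers, "is_moe_model", return_value=True):
    # Tests patch the check so the MoE selection logic is actually exercised.
    print(select_moe_comm_method(cfg=None))  # -> allgather
```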

vllm_ascend/ascend_forward_context.py

Lines changed: 3 additions & 4 deletions
@@ -11,7 +11,8 @@
                                   set_forward_context)

 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.utils import enable_sp, is_moe_model, version_check
+from vllm_ascend.utils import (enable_sp, has_layer_idx, is_moe_model,
+                               version_check)

 if TYPE_CHECKING:
     from vllm_ascend.ops.weight_prefetch import WeightPrefetchMethod
@@ -136,9 +137,7 @@ def set_ascend_forward_context(
     # set layer_idx to enable optimization features that depend on this information.
     # This is only applicable to models that contain these necessary attributes.
     forward_context.layer_idx = None
-    if model_instance is not None and \
-            hasattr(model_instance, "model") and \
-            hasattr(model_instance.model, "start_layer"):
+    if has_layer_idx(model_instance):
         forward_context.layer_idx = model_instance.model.start_layer

     # TODO(rjg-lyh): refactor mlp weight prefetch method

vllm_ascend/ops/moe/moe_comm_method.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@

 def get_moe_comm_method(
         moe_comm_type: Optional[MoECommType]) -> Optional[MoECommMethod]:
-    return _MoECommMethods.get(moe_comm_type)
+    return _MoECommMethods.get(moe_comm_type, None)


 def setup_moe_comm_method(moe_config):
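Note that `dict.get(key)` already defaults to `None`, so passing `None` explicitly does not change behavior; it only makes the None-in/None-out path obvious now that `moe_comm_type` can be `None` for dense models. A quick check of that equivalence (`_MoECommMethods` here is a toy dict, not the real registry):

```python
# dict.get(key) and dict.get(key, None) behave identically; the explicit default
# only documents the None-in/None-out path.
_MoECommMethods = {"mc2": "MC2CommImpl", "allgather": "AllGatherCommImpl"}

assert _MoECommMethods.get(None) is None
assert _MoECommMethods.get(None, None) is None
assert _MoECommMethods.get("mc2") == _MoECommMethods.get("mc2", None) == "MC2CommImpl"
```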

vllm_ascend/utils.py

Lines changed: 12 additions & 0 deletions
@@ -58,6 +58,7 @@
 _MIN_DP_BUFFER_SIZE = 50
 _IS_MOE_MODEL = None
 _ENABLE_SP = None
+_HAS_LAYER_IDX = None


 def is_310p():
@@ -785,3 +786,14 @@ def version_check():
         if full_date >= "20250919":
             return True
     return False
+
+
+def has_layer_idx(model_instance: torch.nn.Module) -> bool:
+    if model_instance is None:
+        return False
+
+    global _HAS_LAYER_IDX
+    if _HAS_LAYER_IDX is None:
+        _HAS_LAYER_IDX = hasattr(model_instance, "model") and \
+            hasattr(model_instance.model, "start_layer")
+    return _HAS_LAYER_IDX
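`has_layer_idx` follows the same module-level caching pattern as `_IS_MOE_MODEL` and `_ENABLE_SP`: the two `hasattr` probes run once per process, and every later call on the hot path returns the cached boolean. A minimal, self-contained sketch of that pattern (`dummy_model` and `probe_count` are illustrative, not part of the repo):

```python
# Minimal sketch of the module-level memoization used by has_layer_idx.
# dummy_model and probe_count are illustrative only.
from types import SimpleNamespace

_HAS_LAYER_IDX = None
probe_count = 0


def has_layer_idx(model_instance) -> bool:
    global _HAS_LAYER_IDX, probe_count
    if model_instance is None:
        return False
    if _HAS_LAYER_IDX is None:  # the attribute probe happens only once
        probe_count += 1
        _HAS_LAYER_IDX = hasattr(model_instance, "model") and \
            hasattr(model_instance.model, "start_layer")
    return _HAS_LAYER_IDX


dummy_model = SimpleNamespace(model=SimpleNamespace(start_layer=0))
for _ in range(1000):  # e.g. one call per forward step
    assert has_layer_idx(dummy_model)
print(probe_count)  # -> 1
```

The cached result is process-wide, taken from the first non-None instance; that matches the existing `_IS_MOE_MODEL` and `_ENABLE_SP` caches, where each worker process serves a single model.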

vllm_ascend/worker/model_runner_v1.py

Lines changed: 14 additions & 20 deletions
@@ -131,7 +131,7 @@
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                AscendSocVersion, ProfileExecuteDuration,
                                enable_sp, get_ascend_soc_version, is_310p,
-                               is_enable_nz, lmhead_tp_enable)
+                               is_enable_nz, is_moe_model, lmhead_tp_enable)
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch

 if TYPE_CHECKING:
@@ -470,11 +470,14 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.in_profile_run = False

         self._init_mc2_tokens_capacity()
-        self.reserved_mc2_mask = torch.zeros(
-            self.mc2_tokens_capacity,
-            dtype=torch.bool,
-            device=self.device,
-        )
+        if is_moe_model(vllm_config):
+            self.reserved_mc2_mask = torch.zeros(
+                self.mc2_tokens_capacity,
+                dtype=torch.bool,
+                device=self.device,
+            )
+        else:
+            self.reserved_mc2_mask = None
         self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
         if self.dynamic_eplb:
             EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb)
@@ -1341,9 +1344,7 @@ def _prepare_inputs(
         self.query_lens = torch.from_numpy(num_scheduled_tokens)

         # Copy the tensors to the NPU.
-        self.input_ids[:total_num_scheduled_tokens].copy_(
-            self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-
+        self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
         self.positions_cpu[total_num_scheduled_tokens:num_input_tokens].zero_()
         self.positions[:num_input_tokens].copy_(
             self.positions_cpu[:num_input_tokens], non_blocking=True)
@@ -1364,16 +1365,6 @@
         self._update_graph_pad_size(with_prefill, maybe_padded_num_tokens)
         attn_metadata: dict[str, Any] = {}

-        # Prepare input_ids
-        token_indices = (positions_np +
-                         req_indices * self.input_batch.token_ids_cpu.shape[1])
-        torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
-                           0,
-                           torch.from_numpy(token_indices),
-                           out=self.input_ids_cpu[:total_num_scheduled_tokens])
-        # Copy the tensors to the NPU.
-        self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
-
         # _prepare_inputs may reorder the batch, so we must gather
         # multi-modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
@@ -1835,7 +1826,7 @@ def _pool(
         )

     def _select_moe_comm_method(self, num_tokens: int,
-                                with_prefill: bool) -> MoECommType:
+                                with_prefill: bool) -> Optional[MoECommType]:
         """1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all
         are designed for expert parallelism.
         2. If expert parallel is enabled, we need to consider the soc version and the
@@ -1858,6 +1849,9 @@ def _select_moe_comm_method(self, num_tokens: int,
         Returns:
             MoECommType: The selected MoE communication method.
         """
+        if not is_moe_model(self.vllm_config):
+            return None
+
         soc_version = get_ascend_soc_version()
         quant_type = getattr(self.vllm_config.model_config.hf_config,
                              'moe_quantize', None)
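Taken together, the runner now skips MoE-only work for dense models: `reserved_mc2_mask` is never allocated on the device, and `_select_moe_comm_method` returns `None` before touching the soc-version or quantization logic. A toy sketch of this gating pattern (`ToyRunner`, `MC2_CAPACITY`, and the selection rule are illustrative, not the real implementation):

```python
# Toy sketch of gating MoE-only setup behind an is_moe flag.
# ToyRunner, MC2_CAPACITY, and the selection rule are illustrative stand-ins.
from enum import Enum
from typing import Optional

import torch

MC2_CAPACITY = 512


class MoECommType(Enum):
    ALLGATHER = "allgather"
    MC2 = "mc2"


class ToyRunner:

    def __init__(self, is_moe: bool, device: str = "cpu"):
        self.is_moe = is_moe
        # Only MoE models pay for the reserved MC2 mask buffer.
        self.reserved_mc2_mask = (torch.zeros(
            MC2_CAPACITY, dtype=torch.bool, device=device) if is_moe else None)

    def select_moe_comm_method(self, num_tokens: int) -> Optional[MoECommType]:
        if not self.is_moe:
            return None  # dense model: nothing to select
        return (MoECommType.MC2
                if num_tokens <= MC2_CAPACITY else MoECommType.ALLGATHER)


print(ToyRunner(is_moe=False).select_moe_comm_method(128))  # -> None
print(ToyRunner(is_moe=True).select_moe_comm_method(128))   # -> MoECommType.MC2
```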
