@@ -1,17 +1,24 @@
 import torch
 import torch.nn.functional as F
+import torch_npu
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_gather,
                               tensor_model_parallel_all_reduce,
                               tensor_model_parallel_reduce_scatter)
 from vllm.forward_context import get_forward_context
 from vllm.utils import direct_register_custom_op
+import vllm_ascend.envs as envs_ascend
 
 
 def _maybe_chunk_residual_impl(x: torch.Tensor,
                                residual: torch.Tensor) -> torch.Tensor:
-    if get_forward_context().flashcomm_v1_enabled:
+    if x.size(0) != residual.size(0):
+        flashcomm_v1_enabled = get_forward_context().flashcomm_v1_enabled
+        assert flashcomm_v1_enabled is True, (
+            "Currently, this situation only occurs "
+            "when flashcomm_v1 is enabled"
+        )
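+        # flashcomm_v1 pads the flattened tokens so they split evenly across
+        # TP ranks; pad the residual the same way before chunking it to match x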
         pad_size = get_forward_context().pad_size
         if pad_size > 0:
             residual = F.pad(residual, (0, 0, 0, pad_size))
@@ -44,6 +51,75 @@ def _maybe_pad_and_reduce_impl(x: torch.Tensor) -> torch.Tensor:
     return tensor_model_parallel_all_reduce(x)
 
 
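+# MLP weight prefetch: a side stream warms upcoming MLP weights in cache while
+# compute is still running on the default stream; the x_dependency tensor
+# orders each prefetch after the op that produces x.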
+def _maybe_prefetch_mlp_gate_up_proj_impl(x_dependency: torch.Tensor,
+                                          prefix: str) -> None:
+    forward_context = get_forward_context()
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    prefetch_model = forward_context.prefetch_model
+    prefetch_stream = forward_context.prefetch_stream
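+    # prefix is a module path such as "model.layers.<idx>.self_attn.o_proj",
+    # so its third field is the decoder-layer index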
+    layer_idx = int(prefix.split('.')[2])
+
+    # start point of gate_up_proj weight prefetch
+    if prefix.split('.')[-2] == "self_attn":
+        forward_context.prefetch_mlp_gate_up_proj = True
+    if forward_context.prefetch_mlp_gate_up_proj:
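+        # order the side stream after work already queued on the compute
+        # stream before issuing the prefetch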
+        prefetch_stream.wait_stream(torch.npu.current_stream())
+
+        with torch.npu.stream(prefetch_stream):
+            MLP_GATE_UP_PREFETCH_SIZE = envs_ascend.MLP_GATE_UP_PREFETCH_SIZE
+            torch_npu.npu_prefetch(prefetch_model.model.layers[layer_idx].mlp.gate_up_proj.weight,
+                                   x_dependency, MLP_GATE_UP_PREFETCH_SIZE)
+    return
+
+
+def _maybe_prefetch_mlp_gate_up_proj_impl_fake(x_dependency: torch.Tensor,
+                                               prefix: str) -> None:
+    return
+
+
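+# Same pattern for down_proj; this op receives no module prefix, so the layer
+# index is a running counter kept on the forward context.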
+def _maybe_prefetch_mlp_down_proj_impl(x_dependency: torch.Tensor) -> None:
+    forward_context = get_forward_context()
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    forward_context.prefetch_mlp_down_proj = True
+    prefetch_model = forward_context.prefetch_model
+    prefetch_stream = forward_context.prefetch_stream
+    layer_idx = forward_context.layer_idx
+
+    # start point of down_proj weight prefetch
+    prefetch_stream.wait_stream(torch.npu.current_stream())
+
+    with torch.npu.stream(prefetch_stream):
+        MLP_DOWN_PREFETCH_SIZE = envs_ascend.MLP_DOWN_PREFETCH_SIZE
+        torch_npu.npu_prefetch(prefetch_model.model.layers[layer_idx].mlp.down_proj.weight,
+                               x_dependency, MLP_DOWN_PREFETCH_SIZE)
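+    # advance the counter so the next call prefetches the next decoder layer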
+    forward_context.layer_idx += 1
+    return
+
+
+def _maybe_prefetch_mlp_down_proj_impl_fake(x_dependency: torch.Tensor) -> None:
+    return
+
+
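+# Join point: runs before a prefetched weight is consumed, making the compute
+# stream wait for the side stream, then clears both prefetch flags.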
+def _maybe_wait_prefetch_done_impl(x: torch.Tensor) -> None:
+    forward_context = get_forward_context()
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    if forward_context.prefetch_mlp_gate_up_proj or \
+            forward_context.prefetch_mlp_down_proj:
+        prefetch_stream = get_forward_context().prefetch_stream
+        # wait until prefetch done
+        torch.npu.current_stream().wait_stream(prefetch_stream)
+        forward_context.prefetch_mlp_gate_up_proj = False
+        forward_context.prefetch_mlp_down_proj = False
+    return
+
+
+def _maybe_wait_prefetch_done_impl_fake(x: torch.Tensor) -> None:
+    return
+
+
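+# The fake impls are tracing-time stand-ins (no stream work) used when these
+# ops are traced; the real impls bind to the NPU's PrivateUse1 dispatch key.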
 direct_register_custom_op(op_name="maybe_chunk_residual",
                           op_func=_maybe_chunk_residual_impl,
                           fake_impl=lambda x, residual: residual,
@@ -60,4 +136,25 @@ def _maybe_pad_and_reduce_impl(x: torch.Tensor) -> torch.Tensor:
                           op_func=_maybe_pad_and_reduce_impl,
                           fake_impl=lambda x: x,
                           mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+
+direct_register_custom_op(op_name="maybe_prefetch_mlp_gate_up_proj",
+                          op_func=_maybe_prefetch_mlp_gate_up_proj_impl,
+                          fake_impl=_maybe_prefetch_mlp_gate_up_proj_impl_fake,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+
+direct_register_custom_op(op_name="maybe_prefetch_mlp_down_proj",
+                          op_func=_maybe_prefetch_mlp_down_proj_impl,
+                          fake_impl=_maybe_prefetch_mlp_down_proj_impl_fake,
+                          mutates_args=[],
+                          dispatch_key="PrivateUse1")
+
+
+direct_register_custom_op(op_name="maybe_wait_prefetch_done",
+                          op_func=_maybe_wait_prefetch_done_impl,
+                          fake_impl=_maybe_wait_prefetch_done_impl_fake,
+                          mutates_args=[],
                           dispatch_key="PrivateUse1")