Commit 14ca1e5

[CI] Fix OOM of deepseek-eplb nightly test (#3884)
### What this PR does / why we need it?

Fix OOM of the deepseek-eplb nightly test.

- vLLM version: v0.11.0rc3
- vLLM main: vllm-project/vllm@83f478b

---------

Signed-off-by: offline0806 <[email protected]>
Co-authored-by: offline0806 <[email protected]>
1 parent dc960e7 · commit 14ca1e5

4 files changed: +4 -4 lines changed


tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -85,7 +85,7 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
         "--quantization", "ascend", "--gpu-memory-utilization", "0.9",
         "--additional-config", '{"enable_weight_nz_layout":true, '
         '"torch_air_graph_config":{"enabled": true, "enable_multistream_mla": true, "graph_batch_size": [16], "use_cached_graph": true},'
-        '"dynamic_eplb": true, "num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200, "init_redundancy_expert": 16}'
+        '"dynamic_eplb": true, "num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
```
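
For reference, a minimal sketch (not part of the commit) showing that the adjacent string literals above still concatenate into one valid JSON document once `init_redundancy_expert` is dropped; the values are copied from the diff:

```python
import json

# Adjacent string literals passed to --additional-config concatenate into one JSON string.
additional_config = (
    '{"enable_weight_nz_layout":true, '
    '"torch_air_graph_config":{"enabled": true, "enable_multistream_mla": true, '
    '"graph_batch_size": [16], "use_cached_graph": true},'
    '"dynamic_eplb": true, "num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200}'
)

config = json.loads(additional_config)
assert "init_redundancy_expert" not in config  # removed by this commit to reduce memory use
print(config["num_iterations_eplb_update"], config["num_wait_worker_iterations"])  # 1000 200
```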

tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -82,8 +82,7 @@ async def test_models(model: str, tp_size: int) -> None:
         "--quantization", "ascend", "--gpu-memory-utilization", "0.9",
         "--additional-config",
         '{"enable_weight_nz_layout":true, "dynamic_eplb": true, '
-        '"num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200, '
-        '"init_redundancy_expert": 16}'
+        '"num_iterations_eplb_update": 1000, "num_wait_worker_iterations": 200}'
     ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
```
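
Why dropping `init_redundancy_expert` helps with the OOM, as a rough back-of-the-envelope sketch (my illustration, not from the commit): assuming `init_redundancy_expert` adds that many redundant expert slots on top of the model's routed experts and the slots are spread evenly across the expert-parallel ranks, each rank must hold extra expert weights. The expert count and EP size below are illustrative placeholders, not measurements:

```python
# Hypothetical illustration: extra expert slots per rank when redundancy is enabled.
# All numbers below are assumptions for illustration, not taken from the commit.
def experts_per_rank(num_routed_experts: int, redundancy: int, ep_size: int) -> int:
    """Expert slots each EP rank hosts, assuming slots are spread evenly."""
    total_slots = num_routed_experts + redundancy
    return -(-total_slots // ep_size)  # ceiling division

EP_SIZE = 16                # assumed expert-parallel world size
NUM_ROUTED_EXPERTS = 256    # assumed routed-expert count for the MoE model

without_redundancy = experts_per_rank(NUM_ROUTED_EXPERTS, 0, EP_SIZE)   # 16 slots per rank
with_redundancy = experts_per_rank(NUM_ROUTED_EXPERTS, 16, EP_SIZE)     # 17 slots per rank
print(with_redundancy - without_redundancy, "extra expert slot(s) of weights per rank")
```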

vllm_ascend/eplb/core/eplb_device_transfer_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -126,7 +126,7 @@ def update_expert_map_and_weight(self, reqs):
                 local_expert_to_replace,
                 buffer_tensor_id)

-        logger.info(
+        logger.debug(
             f"[EPLB] finished update expert weight for layer: {self.layer_id}")

         self.recv_expert_list = []
```
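
With this change the per-layer "finished update expert weight" message is no longer emitted at the default INFO level. If you still want to see it while debugging, one option (a sketch assuming the loader logs through a standard Python logger; the exact logger name is an assumption) is to raise the relevant logger to DEBUG:

```python
import logging

# Sketch only: the exact logger name depends on how vllm_ascend wires its logging
# (it may route through vLLM's "vllm" logger). Adjust the names if needed.
for name in ("vllm", "vllm_ascend"):
    logging.getLogger(name).setLevel(logging.DEBUG)

# Alternatively, setting VLLM_LOGGING_LEVEL=DEBUG in the environment before
# starting the server raises vLLM's log level globally.
```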

vllm_ascend/eplb/eplb_updator.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -77,6 +77,7 @@ def update_iteration(self):
         self.cur_iterations += 1
         if self.cur_iterations == (self.num_iterations_eplb_update + \
                 self.num_wait_worker_iterations + self.num_moe_layers):
+            logger.info("Finish expert parallel load balancing.")
             if self.expert_map_record_path is not None:
                 self.adaptor._export_tensor_to_file(
                     self.shared_dict["expert_maps"],
```
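
To see when the new completion log fires, a small worked example (the MoE layer count is my assumption, not from the commit): with the nightly-test settings `num_iterations_eplb_update=1000` and `num_wait_worker_iterations=200`, the message is logged at the iteration where the counter reaches the sum of the three terms:

```python
# Illustration of the threshold checked in update_iteration().
num_iterations_eplb_update = 1000   # from the nightly-test config above
num_wait_worker_iterations = 200    # from the nightly-test config above
num_moe_layers = 58                 # assumed MoE layer count, for illustration only

finish_iteration = (num_iterations_eplb_update
                    + num_wait_worker_iterations
                    + num_moe_layers)
print(finish_iteration)  # 1258 -> "Finish expert parallel load balancing." is logged here
```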
