@@ -29,7 +29,7 @@
 from vllm.utils.network_utils import get_open_port
 
 MODELS = [
-    "Qwen/Qwen3-30B-A3B",
+    "Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"
 ]
 
 DATA_PARALLELS = [2]
@@ -52,12 +52,20 @@ async def test_single_request_aclgraph(model: str, dp_size: int) -> None:
5252 "TASK_QUEUE_ENABLE" : "1" ,
5353 "HCCL_OP_EXPANSION_MODE" : "AIV" ,
5454 }
55- server_args = [
55+ if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8" :
56+ server_args = [
5657 "--no-enable-prefix-caching" , "--tensor-parallel-size" , "1" ,
57- "--data-parallel-size" ,
58+ "--data-parallel-size" , "quantization" , "ascend" ,
5859 str (dp_size ), "--port" ,
5960 str (port ), "--trust-remote-code" , "--gpu-memory-utilization" , "0.9"
6061 ]
62+ else :
63+ server_args = [
64+ "--no-enable-prefix-caching" , "--tensor-parallel-size" , "1" ,
65+ "--data-parallel-size" ,
66+ str (dp_size ), "--port" ,
67+ str (port ), "--trust-remote-code" , "--gpu-memory-utilization" , "0.9"
68+ ]
6169 request_keyword_args : dict [str , Any ] = {
6270 ** api_keyword_args ,
6371 }
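
Note on the change above: as written in the original patch, `"quantization", "ascend"` was inserted between `--data-parallel-size` and its value `str(dp_size)`, which would make the server reject the argument list; the diff above moves it to a proper `--quantization ascend` flag pair. A possible simplification (a sketch, not part of this diff): since the two branches differ only in those flags, the shared arguments could be built once and the quantization flags appended conditionally.

```python
# Sketch: build the shared server args once, then append the
# Ascend quantization flags only for the quantized model.
server_args = [
    "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
    "--data-parallel-size", str(dp_size),
    "--port", str(port),
    "--trust-remote-code", "--gpu-memory-utilization", "0.9",
]
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
    server_args += ["--quantization", "ascend"]
```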