 else:
     from vllm.utils.network_utils import get_open_port
 
-MODELS = [
-    "Qwen/Qwen3-30B-A3B",
-]
+MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
 
 DATA_PARALLELS = [2]
 
@@ -52,12 +50,21 @@ async def test_single_request_aclgraph(model: str, dp_size: int) -> None:
         "TASK_QUEUE_ENABLE": "1",
         "HCCL_OP_EXPANSION_MODE": "AIV",
     }
-    server_args = [
-        "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
-        "--data-parallel-size",
-        str(dp_size), "--port",
-        str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
-    ]
+    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
+        server_args = [
+            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--data-parallel-size",
+            str(dp_size), "--quantization", "ascend", "--max-model-len",
+            "1024", "--port",
+            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+        ]
+    else:
+        server_args = [
+            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--data-parallel-size",
+            str(dp_size), "--port",
+            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+        ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
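
The two branches above duplicate the shared server flags. A possible simplification (a sketch only, not what the PR does; flag order is not significant to an argparse-style CLI, so appending is equivalent to the inline placement):

    # Sketch: build the common argument list once, then append the flags the
    # W8A8 DeepSeek checkpoint needs. Assumes `model`, `dp_size`, and `port`
    # are in scope as in the test above.
    server_args = [
        "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
        "--data-parallel-size", str(dp_size), "--port", str(port),
        "--trust-remote-code", "--gpu-memory-utilization", "0.9"
    ]
    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        # Quantized weights require Ascend quantization and a capped context.
        server_args += ["--quantization", "ascend", "--max-model-len", "1024"]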