@@ -23,9 +23,7 @@
 
 from tests.e2e.conftest import RemoteOpenAIServer
 
-MODELS = [
-    "Qwen/Qwen3-30B-A3B",
-]
+MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
 
 DATA_PARALLELS = [2]
 
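For context, the hunk header below shows the test signature test_single_request_aclgraph(model: str, dp_size: int); the parametrization itself sits outside the diff. A minimal sketch of the presumed wiring, assuming pytest parametrizes over both lists and that async tests here run under pytest-asyncio (the asyncio marker is an assumption):

import pytest

MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
DATA_PARALLELS = [2]

# Presumed wiring (not shown in the hunk): each model is paired with each
# DP size, so the new W8A8 entry adds one case per DATA_PARALLELS value.
@pytest.mark.asyncio  # assumption: async tests collected via pytest-asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_single_request_aclgraph(model: str, dp_size: int) -> None:
    ...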
@@ -47,12 +45,21 @@ async def test_single_request_aclgraph(model: str, dp_size: int) -> None:
         "TASK_QUEUE_ENABLE": "1",
         "HCCL_OP_EXPANSION_MODE": "AIV",
     }
-    server_args = [
-        "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
-        "--data-parallel-size",
-        str(dp_size), "--port",
-        str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
-    ]
+    if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
+        server_args = [
+            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--data-parallel-size",
+            str(dp_size), "--quantization", "ascend", "--max-model-len",
+            "1024", "--port",
+            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+        ]
+    else:
+        server_args = [
+            "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
+            "--data-parallel-size",
+            str(dp_size), "--port",
+            str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
+        ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
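As a side note, the two nearly identical argument lists above could be collapsed into a shared base plus a per-model extras table, which keeps the common flags in one place if more quantized checkpoints are added later. A minimal sketch under the same assumptions as the diff (EXTRA_SERVER_ARGS is a hypothetical name; model, dp_size, and port are supplied by the surrounding test in reality):

model = "vllm-ascend/DeepSeek-V2-Lite-W8A8"  # provided by parametrization in the real test
dp_size, port = 2, 8000                      # likewise supplied by the test harness

# Flags shared by every model under test.
base_server_args = [
    "--no-enable-prefix-caching", "--tensor-parallel-size", "1",
    "--data-parallel-size", str(dp_size),
    "--port", str(port),
    "--trust-remote-code", "--gpu-memory-utilization", "0.9",
]

# Hypothetical per-model extras: only the W8A8 checkpoint needs the Ascend
# quantization backend and a capped context length.
EXTRA_SERVER_ARGS = {
    "vllm-ascend/DeepSeek-V2-Lite-W8A8":
        ["--quantization", "ascend", "--max-model-len", "1024"],
}

server_args = base_server_args + EXTRA_SERVER_ARGS.get(model, [])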