23 changes: 12 additions & 11 deletions tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
@@ -144,20 +144,21 @@ deployment:
 benchmarks:
   perf:
     case_type: performance
-    dataset_path: vllm-ascend/GSM8K-in3500-bs400
+    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
     request_conf: vllm_api_stream_chat
     dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
-    num_prompts: 1
-    max_out_len: 2
-    batch_size: 1
-    baseline: 5
+    num_prompts: 2800
+    max_out_len: 1500
+    batch_size: 700
+    request_rate: 11.2
+    baseline: 1
     threshold: 0.97
   acc:
     case_type: accuracy
-    dataset_path: vllm-ascend/AIME2024
+    dataset_path: vllm-ascend/gsm8k
     request_conf: vllm_api_general_chat
-    dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
-    max_out_len: 10
-    batch_size: 32
-    baseline: 1
-    threshold: 1
+    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
+    max_out_len: 32768
+    batch_size: 512
+    baseline: 95
+    threshold: 5
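For orientation, the perf case now drives a much heavier load: 2800 prompts offered at 11.2 requests/s implies roughly a 250 s submission window, with a batch_size of 700 (presumably the concurrency cap). A minimal sketch of reading these values back out of the YAML, assuming PyYAML and that `benchmarks` sits at the top level of the file (the hunk context does not confirm the nesting):

```python
# Hedged sketch: inspect the perf case from the YAML diff above. PyYAML and the
# top-level "benchmarks" key are assumptions; only the arithmetic comes from the diff.
import yaml

CONFIG = "tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml"

with open(CONFIG) as f:
    perf = yaml.safe_load(f)["benchmarks"]["perf"]

# 2800 prompts at 11.2 req/s -> about 250 s of offered load.
print(perf["num_prompts"] / perf["request_rate"])  # ~250.0 seconds
print(perf["batch_size"])                          # 700, presumably max in-flight requests
```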
38 changes: 30 additions & 8 deletions tests/e2e/nightly/multi_node/test_multi_node.py
@@ -1,13 +1,24 @@
+import openai
+
 from tests.e2e.conftest import RemoteOpenAIServer
 from tests.e2e.nightly.multi_node.config.multi_node_config import (
     DISAGGREGATED_PREFILL_PROXY_SCRIPT, MultiNodeConfig)
+from tools.aisbench import run_aisbench_cases
+
+prompts = [
+    "San Francisco is a",
+]
+
+api_keyword_args = {
+    "max_tokens": 10,
+}
 
 
-def test_multi_node() -> None:
+async def test_multi_node() -> None:
     config = MultiNodeConfig.from_yaml()
     env_dict = config.envs
-    # perf_cmd = config.perf_cmd
-    # acc_cmd = config.acc_cmd
+    perf_cmd = config.perf_cmd
+    acc_cmd = config.acc_cmd
     nodes_info = config.nodes_info
     disaggregated_prefill = config.disaggregated_prefill
     server_port = config.server_port
@@ -26,11 +37,22 @@ def test_multi_node() -> None:
         nodes_info=nodes_info,
         max_wait_seconds=2000,
     ) as remote_server:
-        # base_url = remote_server.url_root
         if config.is_master:
-            pass
-            # TODO: enable perf and acc test
-            # subprocess.run(perf_cmd, check=True)
-            # subprocess.run(acc_cmd, check=True)
+            port = proxy_port if disaggregated_prefill else server_port
+            base_url = f"http://localhost:{port}/v1/completions"
+            client = openai.AsyncOpenAI(base_url=base_url,
+                                        api_key="token-abc123",
+                                        max_retries=0,
+                                        **{"timeout": 600})
+            batch = await client.completions.create(
+                model=config.model,
+                prompt=prompts,
+                **api_keyword_args,
+            )
+            choices: list[openai.types.CompletionChoice] = batch.choices
+            assert choices[0].text, "empty response"
+            # aisbench test
+            run_aisbench_cases(config.model, port, acc_cmd)
+            run_aisbench_cases(config.model, port, perf_cmd)
Review comment (Contributor, severity: high) on the run_aisbench_cases calls above:

The run_aisbench_cases function is synchronous and performs blocking I/O operations (e.g., subprocess.Popen and reading from stdout). Calling it directly from an async function like test_multi_node will block the asyncio event loop, which can lead to performance issues and defeats the purpose of using asyncio.

You should run blocking functions in a separate thread to avoid blocking the event loop. You can use asyncio.to_thread for this. Note: you'll need to add import asyncio at the top of the file.

Suggested change:
-            run_aisbench_cases(config.model, port, acc_cmd)
-            run_aisbench_cases(config.model, port, perf_cmd)
+            await asyncio.to_thread(run_aisbench_cases, config.model, port, acc_cmd)
+            await asyncio.to_thread(run_aisbench_cases, config.model, port, perf_cmd)
         else:
             remote_server.hang_until_terminated()
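The offloading pattern suggested in the review comment can be tried in isolation. Below is a minimal sketch using only the standard library; `blocking_benchmark` is a hypothetical stand-in for the synchronous run_aisbench_cases, not the real helper:

```python
# Minimal sketch of the asyncio.to_thread pattern from the review comment above.
# blocking_benchmark is a stand-in for a call that blocks on subprocess I/O;
# only the offloading pattern itself is the point.
import asyncio
import time


def blocking_benchmark(name: str) -> str:
    # Simulate a long, synchronous benchmark run.
    time.sleep(1)
    return f"{name} finished"


async def main() -> None:
    # Run the blocking call in a worker thread so the event loop stays responsive.
    result = await asyncio.to_thread(blocking_benchmark, "acc")
    print(result)


if __name__ == "__main__":
    asyncio.run(main())
```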
2 changes: 2 additions & 0 deletions tools/aisbench.py
@@ -231,6 +231,8 @@ def _accuracy_verify(self):


 def run_aisbench_cases(model, port, aisbench_cases):
+    if isinstance(aisbench_cases, dict):
+        aisbench_cases = [aisbench_cases]
     aisbench_results = []
     aisbench_errors = []
     for aisbench_case in aisbench_cases:
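With the normalization above, a caller can hand run_aisbench_cases either a single case dict or a list of them. A hedged usage sketch follows; the model name, port, and case contents are illustrative values modeled on the YAML diff in this PR, not the exact structures the suite builds:

```python
# Hypothetical usage of run_aisbench_cases after the dict-normalization change.
from tools.aisbench import run_aisbench_cases

acc_case = {
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 32768,
    "batch_size": 512,
    "baseline": 95,
    "threshold": 5,
}

# Both calls behave the same: a bare dict is wrapped in a list by the isinstance check.
run_aisbench_cases("DeepSeek-R1-W8A8", 8000, acc_case)    # single case dict
run_aisbench_cases("DeepSeek-R1-W8A8", 8000, [acc_case])  # list of case dicts
```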