diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6cbc25b4b3bf..c6fd0be4d8cf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -205,6 +205,24 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - label: EPLB Algorithm Test # 5min timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" @@ -392,7 +410,7 @@ steps: --ignore=lora/test_deepseekv2_tp.py \ --ignore=lora/test_gptoss.py \ --ignore=lora/test_qwen3moe_tp.py - + parallelism: 4 - label: PyTorch Compilation Unit Tests # 15min @@ -1115,7 +1133,7 @@ steps: - tests/weight_loading commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - + - label: NixlConnector PD accuracy tests (Distributed) # 30min timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py index 295d1637528c..eb7ed969ea4b 100644 --- a/examples/offline_inference/torchrun_dp_example.py +++ b/examples/offline_inference/torchrun_dp_example.py @@ -9,10 +9,76 @@ ```bash $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py ``` + +With custom parallelism settings: +```bash +$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \ + --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep +``` """ +import argparse + from vllm import LLM, SamplingParams + +def parse_args(): + parser = argparse.ArgumentParser( + description="Data-parallel inference with torchrun" + ) + parser.add_argument( + "--tp-size", + type=int, + default=1, + help="Tensor parallel size (default: 1)", + ) + parser.add_argument( + "--pp-size", + type=int, + default=1, + help="Pipeline parallel size (default: 1)", + ) + parser.add_argument( + "--dp-size", + type=int, + default=2, + help="Data parallel size (default: 2)", + ) + parser.add_argument( + "--enable-ep", + action="store_true", + help="Enable expert parallel (default: False)", + ) + parser.add_argument( + "--model", + type=str, + default="microsoft/Phi-mini-MoE-instruct", + help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)", + ) + parser.add_argument( + "--max-model-len", + type=int, + default=4096, + help="Maximum model length (default: 4096)", + ) + parser.add_argument( + "--gpu-memory-utilization", + type=float, + default=0.6, + help="GPU memory utilization (default: 0.6)", + ) + parser.add_argument( + "--seed", + type=int, + default=1, + help="Random seed (default: 1)", + ) + return parser.parse_args() + + +args = parse_args() + + # Create prompts, the same across all ranks prompts = [ "Hello, my name is", @@ -30,15 +96,15 @@ # all ranks have the same random seed, so that sampling can be # deterministic across ranks. llm = LLM( - model="microsoft/Phi-mini-MoE-instruct", - tensor_parallel_size=1, - data_parallel_size=2, - pipeline_parallel_size=1, - enable_expert_parallel=False, + model=args.model, + tensor_parallel_size=args.tp_size, + data_parallel_size=args.dp_size, + pipeline_parallel_size=args.pp_size, + enable_expert_parallel=args.enable_ep, distributed_executor_backend="external_launcher", - max_model_len=4096, - gpu_memory_utilization=0.6, - seed=1, + max_model_len=args.max_model_len, + gpu_memory_utilization=args.gpu_memory_utilization, + seed=args.seed, ) dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank