Commit f7a6682

[CI/Build] Test torchrun with 8 cards (#27548)
Signed-off-by: 22quinn <[email protected]>
1 parent a9fe079

File tree: 2 files changed, +94 / -10 lines


.buildkite/test-pipeline.yaml (20 additions, 2 deletions)

```diff
@@ -205,6 +205,24 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -401,7 +419,7 @@ steps:
     --ignore=lora/test_deepseekv2_tp.py \
     --ignore=lora/test_gptoss.py \
     --ignore=lora/test_qwen3moe_tp.py
-
+
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -1126,7 +1144,7 @@ steps:
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-
+
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
```
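
The new step's flags have to be mutually consistent: with the external_launcher backend each torchrun process hosts one rank, so `--nproc-per-node` must equal tp * pp * dp, here 2 * 1 * 4 = 8, one rank per H100. A minimal sketch of that invariant (the helper function below is illustrative, not part of the diff):

```python
# Illustrative check, not part of the diff: under the external_launcher
# backend each torchrun process hosts exactly one rank, so the process
# count must equal the product of the parallelism degrees.
def required_world_size(tp_size: int, pp_size: int, dp_size: int) -> int:
    return tp_size * pp_size * dp_size

# The 8-GPU CI step: --tp-size=2 --pp-size=1 --dp-size=4 -> 8 processes,
# matching --nproc-per-node=8.
assert required_world_size(tp_size=2, pp_size=1, dp_size=4) == 8
```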

examples/offline_inference/torchrun_dp_example.py (74 additions, 8 deletions)

````diff
@@ -9,10 +9,76 @@
 ```bash
 $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
 ```
+
+With custom parallelism settings:
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """
 
+import argparse
+
 from vllm import LLM, SamplingParams
 
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
@@ -30,15 +96,15 @@
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
     distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )
 
 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
````
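
The final context line above shows each process reading its data-parallel rank from the engine config after construction. Since every rank executes the whole script under external_launcher, the example splits work by rank; a hedged sketch of that pattern, assuming round-robin sharding of the shared prompt list (`llm` and `prompts` are from the example, while the slicing scheme and print format below are illustrative, not taken from the diff):

```python
# Illustrative continuation (assumed, not from the diff): shard the shared
# prompt list so each data-parallel rank generates a disjoint slice.
parallel_config = llm.llm_engine.vllm_config.parallel_config
dp_rank = parallel_config.data_parallel_rank
dp_size = parallel_config.data_parallel_size

# Round-robin shard: rank r handles prompts r, r + dp_size, r + 2*dp_size, ...
local_prompts = prompts[dp_rank::dp_size]

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(local_prompts, sampling_params)
for output in outputs:
    print(f"[DP rank {dp_rank}] {output.prompt!r} -> {output.outputs[0].text!r}")
```

Because all ranks construct the LLM with the same seed, sampling stays deterministic across ranks even though each rank only generates its own slice.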
