```bash
$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
```
+
+With custom parallelism settings (note that `--nproc-per-node` must equal
+`tp_size * pp_size * dp_size`; here 2 * 1 * 4 = 8):
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
"""

+import argparse
+
from vllm import LLM, SamplingParams

+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+
# Create prompts, the same across all ranks
prompts = [
    "Hello, my name is",
...
# all ranks have the same random seed, so that sampling can be
# deterministic across ranks.
llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
    distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
)

dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
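Every rank builds the same `prompts` list; in data-parallel mode each DP rank then typically generates for its own shard. A minimal sketch of how the `dp_rank` retrieved above can be used for that, assuming a `sampling_params` object like the one defined in the elided part of the example (the strided slicing and the `data_parallel_size` lookup illustrate the pattern, not necessarily this commit's exact code):

```python
# Sketch only: shard the shared prompt list so each data-parallel rank
# generates for a disjoint subset of the prompts.
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size

# Strided slice: rank r handles prompts r, r + dp_size, r + 2 * dp_size, ...
# so every prompt is processed by exactly one rank.
local_prompts = prompts[dp_rank::dp_size]

outputs = llm.generate(local_prompts, sampling_params)
for output in outputs:
    print(f"DP rank {dp_rank}: {output.prompt!r} -> {output.outputs[0].text!r}")
```

Because every rank was constructed with the same seed, the per-rank outputs are deterministic even though each rank samples independently on its own shard.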