 ```bash
 $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
 ```
+
+With custom parallelism settings:
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """

+import argparse
+
 from vllm import LLM, SamplingParams

+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+
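+# Note: torchrun's --nproc-per-node must match tp_size * pp_size * dp_size
+# (e.g. 2 x 1 x 4 = 8 for the second command in the docstring above).
+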
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
[...]
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
     distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )

 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
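For orientation, here is a minimal sketch of how a script like this can use `dp_rank` to shard the shared prompt list across data-parallel ranks. It is not part of the commit; the helper name `run_dp_rank`, the sampling settings, and the round-robin sharding are illustrative assumptions.

```python
# Illustrative sketch only (not from the commit): shard prompts across
# data-parallel ranks and run generation on this rank's slice.
from vllm import LLM, SamplingParams


def run_dp_rank(llm: LLM, prompts: list[str]) -> None:
    parallel_cfg = llm.llm_engine.vllm_config.parallel_config
    dp_rank = parallel_cfg.data_parallel_rank
    dp_size = parallel_cfg.data_parallel_size

    # Round-robin shard: rank r keeps prompts r, r + dp_size, r + 2*dp_size, ...
    local_prompts = prompts[dp_rank::dp_size]

    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)
    outputs = llm.generate(local_prompts, sampling_params)
    for output in outputs:
        print(f"DP rank {dp_rank}: {output.prompt!r} -> {output.outputs[0].text!r}")
```

Every rank runs the same program and keeps only its own slice, which fits the SPMD style of launching through torchrun with the `external_launcher` backend.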