intelligent-machine-learning · xhx1022 · Aug 3, 2025 · Aug 17, 2025 · skydoorkai · Aug 7, 2025
@@ -0,0 +1,28 @@
+
+
+
+## Clone DualPipe & Set Up the Environment
+
+```bash
+git clone https://github.com/deepseek-ai/DualPipe.git
+cd DualPipe
+conda create -n dualpipe python=3.10 -y
+conda activate dualpipe
+pip install -r requirements.txt
+pip install -e .
+```
+
+## Naive Implementation for Single-GPU and Multi-GPU Training of MoE Models
+```bash
+MASTER_ADDR=localhost MASTER_PORT=12355 WORLD_SIZE=4 python examples/moe_train_basic.py
+```
+
+### Parameters
+- WORLD_SIZE=4: Uses 4 GPUs for pipeline parallelism
+- MASTER_ADDR: Master node address
+- MASTER_PORT: Communication port
+- `test_moe_basic()`: Tests basic functionality of the MoE model
+
+
+
+
@@ -0,0 +1,17 @@
+__version__ = "1.0.0"
+
+from dualpipe.dualpipe import DualPipe
+from dualpipe.dualpipev import DualPipeV
+from dualpipe.comm import (
+    set_p2p_tensor_shapes,
+    set_p2p_tensor_dtype,
+)
+from dualpipe.utils import WeightGradStore
+
+# Public API of the package. `__all__` must contain the *names* as strings:
+# listing the objects themselves makes `from dualpipe import *` raise
+# TypeError, because the import machinery requires every item to be a str.
+__all__ = [
+    "DualPipe",
+    "DualPipeV",
+    "WeightGradStore",
+    "set_p2p_tensor_shapes",
+    "set_p2p_tensor_dtype",
+]
@@ -0,0 +1,38 @@
+from typing import List, Tuple
+
+import torch
+import torch.distributed as dist
+
+
+# Shapes used by build_from_tensor_shapes() to allocate one buffer per entry.
+# Stays None until set_p2p_tensor_shapes() is called.
+TENSOR_SHAPES: List[Tuple[int]] = None
+# dtype passed to torch.empty() in build_from_tensor_shapes().
+# Stays None until set_p2p_tensor_dtype() is called.
+TENSOR_DTYPE: torch.dtype = None
+
+
+def set_p2p_tensor_shapes(shapes: List[Tuple[int]]):
+    """Record the shapes used to allocate P2P buffers.
+
+    The given list is stored as-is (not copied) in the module-level
+    ``TENSOR_SHAPES``, replacing any previously configured value.
+
+    Args:
+        shapes: One shape per buffer that ``build_from_tensor_shapes`` creates.
+    """
+    global TENSOR_SHAPES
+    TENSOR_SHAPES = shapes
+
+
+def set_p2p_tensor_dtype(dtype: torch.dtype):
+    """Record the dtype used when allocating P2P buffers.
+
+    The value is stored in the module-level ``TENSOR_DTYPE``, replacing any
+    previously configured dtype.
+
+    Args:
+        dtype: dtype handed to ``torch.empty`` by ``build_from_tensor_shapes``.
+    """
+    global TENSOR_DTYPE
+    TENSOR_DTYPE = dtype
+
+
+def build_from_tensor_shapes():
+    """Allocate one uninitialized CUDA buffer per configured tensor shape.
+
+    Shapes are read from the module-level ``TENSOR_SHAPES`` and the dtype from
+    ``TENSOR_DTYPE``. Every buffer is created with ``requires_grad=True``.
+
+    Returns:
+        A list of ``torch.empty`` tensors on the ``"cuda"`` device, in the
+        same order as ``TENSOR_SHAPES``.
+
+    Raises:
+        RuntimeError: If ``TENSOR_SHAPES`` is still ``None``, i.e. no shapes
+            have been configured yet.
+    """
+    # Fail with an actionable message instead of the opaque
+    # "'NoneType' object is not iterable" the comprehension would raise.
+    if TENSOR_SHAPES is None:
+        raise RuntimeError(
+            "P2P tensor shapes are not configured; "
+            "call set_p2p_tensor_shapes() before building buffers."
+        )
+    return [torch.empty(s, dtype=TENSOR_DTYPE, device="cuda", requires_grad=True) for s in TENSOR_SHAPES]
+
+
+def append_irecv(ops: List[dist.P2POp], src: int, group: dist.ProcessGroup) -> List[torch.Tensor]:
+    """Queue a receive op for each freshly allocated buffer and return the buffers.
+
+    Args:
+        ops: List that the new ``dist.P2POp`` receive operations are appended to.
+        src: Rank of the sender within ``group``; it is translated to a global
+            rank with ``get_global_rank`` before the ops are created.
+        group: Process group that ``src`` is relative to.
+
+    Returns:
+        The buffers produced by ``build_from_tensor_shapes()``.
+    """
+    recv_buffers = build_from_tensor_shapes()
+    global_src = dist.distributed_c10d.get_global_rank(group, src)
+    ops.extend(
+        dist.P2POp(dist.irecv, buf, global_src)
+        for buf in recv_buffers
+        if buf is not None
+    )
+    return recv_buffers
+
+
+def append_isend(ops: List[dist.P2POp], tensors: List[torch.Tensor], dst: int, group: dist.ProcessGroup) -> None:
+    """Queue a send op for every non-``None`` tensor.
+
+    Args:
+        ops: List that the new ``dist.P2POp`` send operations are appended to.
+        tensors: Tensors to send; ``None`` entries are skipped.
+        dst: Rank of the receiver within ``group``; it is translated to a
+            global rank with ``get_global_rank`` before the ops are created.
+        group: Process group that ``dst`` is relative to.
+    """
+    global_dst = dist.distributed_c10d.get_global_rank(group, dst)
+    ops.extend(
+        dist.P2POp(dist.isend, payload, global_dst)
+        for payload in tensors
+        if payload is not None
+    )