Commit 3a61b86

WIP
1 parent 71dea16 commit 3a61b86

7 files changed: +444 -30 lines changed

run_train.sh

Lines changed: 5 additions & 2 deletions
@@ -10,8 +10,11 @@ set -ex
 # use envs as local overwrites for convenience
 # e.g.
 # LOG_RANK=0,1 NGPU=4 ./run_train.sh
-NGPU=${NGPU:-"8"}
-export LOG_RANK=${LOG_RANK:-0}
+# NGPU=${NGPU:-"8"}
+NGPU=${NGPU:-"4"}
+# export LOG_RANK=${LOG_RANK:-0,1,2,3,4,5,6,7}
+# export LOG_RANK=${LOG_RANK:-0,1,2,3}
+export LOG_RANK=${LOG_RANK:-1}
 CONFIG_FILE=${CONFIG_FILE:-"./torchtitan/models/llama3/train_configs/debug_model.toml"}
 TRAIN_FILE=${TRAIN_FILE:-"torchtitan.train"}

torchtitan/distributed/expert_parallel.py

Lines changed: 165 additions & 2 deletions
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.


-from typing import Callable, Literal
+from typing import Callable, Literal, Dict

 import torch
 import torch.nn as nn
@@ -22,6 +22,160 @@
 )
 from torch.distributed.tensor.parallel import ParallelStyle

+import threading
+import torch
+from typing import Optional
+import time
+
+class HookSequenceCoordinator:
+    """Coordinates hooks based on a predefined sequence"""
+
+    def __init__(self):
+        self._lock = threading.Lock()
+        self._condition = threading.Condition(self._lock)
+
+        # Define your desired execution sequence matching:
+        # stageB.combine() -> stageA.forward_attention() -> stageB.backward_moe() ->
+        # stageA.dispatch() -> stageB.dispatch() -> stageA.forward_moe() ->
+        # stageB.backward_attention() -> stageA.combine()
+        self._hook_sequence = [
+            "combine_D_bwd",
+            "dispatch_A_fwd",
+            "combine_C_bwd",
+            "dispatch_B_fwd",
+            "dispatch_B_bwd",
+            "combine_C_fwd",
+            "dispatch_A_bwd",
+            "combine_D_fwd",
+        ]
+        # Create a semaphore for each hook in the sequence
+        self._semaphores: Dict[str, threading.Semaphore] = {}
+        self._reset_semaphores()
+
+        # Coordination control - disabled by default
+        self._coordination_enabled = False
+        self._cycle_count = 0
+
+    def _reset_semaphores(self):
+        """Reset all semaphores - first one gets 1 permit, others get 0"""
+        self._semaphores.clear()
+        for i, hook_name in enumerate(self._hook_sequence):
+            # First semaphore starts with 1 permit, others start with 0
+            initial_permits = 1 if i == 0 else 0
+            self._semaphores[hook_name] = threading.Semaphore(initial_permits)
+
+    def enable_coordination(self):
+        """Enable hook coordination"""
+        self._coordination_enabled = True
+        self._reset_semaphores()  # Reset semaphores when enabling
+        print("[COORDINATION] Hook coordination ENABLED")
+
+    def disable_coordination(self):
+        """Disable hook coordination"""
+        self._coordination_enabled = False
+        # Release all semaphores so no threads get stuck
+        for semaphore in self._semaphores.values():
+            try:
+                semaphore.release()
+            except ValueError:
+                pass  # Semaphore was already at max value
+        print("[COORDINATION] Hook coordination DISABLED")
+
+    def is_coordination_enabled(self) -> bool:
+        """Check if coordination is currently enabled"""
+        return self._coordination_enabled
+
+    def reset_coordination(self):
+        """Reset coordination state (useful between training runs)"""
+        self._cycle_count = 0
+        self._reset_semaphores()
+        print("[COORDINATION] Hook coordination state RESET")
+
+    def acquire_execution(self, hook_name: str):
+        """Acquire execution permission using semaphores"""
+        # If coordination is disabled, just pass through
+        if not self._coordination_enabled:
+            print(f"[PASSTHROUGH] {hook_name} executing (coordination disabled)")
+            return
+
+        # Check if hook is in our sequence
+        if hook_name not in self._semaphores:
+            print(f"[WARNING] {hook_name} not in sequence, executing without coordination")
+            return
+
+        # Acquire the semaphore for this hook (blocks until available)
+        print(f"[WAITING] {hook_name} waiting for semaphore")
+        self._semaphores[hook_name].acquire()
+        print(f"[EXECUTING] {hook_name} acquired semaphore")
+
+    def release_execution(self, hook_name: str):
+        """Release execution and signal next hook"""
+        # If coordination is disabled, just pass through
+        if not self._coordination_enabled:
+            return
+
+        # Check if hook is in our sequence
+        if hook_name not in self._semaphores:
+            return
+
+        # Find the next hook in the sequence and release its semaphore
+        try:
+            current_index = self._hook_sequence.index(hook_name)
+            next_index = (current_index + 1) % len(self._hook_sequence)
+            next_hook = self._hook_sequence[next_index]
+
+            print(f"[COMPLETED] {hook_name} completed, signaling {next_hook}")
+            self._semaphores[next_hook].release()
+
+            # Check if we completed a full cycle
+            if next_index == 0:
+                self._cycle_count += 1
+                print(f"[CYCLE] Completed cycle {self._cycle_count}")
+
+        except ValueError:
+            print(f"[ERROR] {hook_name} not found in sequence")
+
+# Global coordinator
+_hook_coordinator = HookSequenceCoordinator()
+
+class SyncHook(torch.autograd.Function):
+    """Sync hook that follows a predefined execution sequence"""
+
+    @staticmethod
+    def forward(ctx, x, hook_name):
+        ctx.hook_name = hook_name
+
+        # Use forward-specific hook name
+        forward_hook_name = f"{hook_name}_fwd"
+        _hook_coordinator.acquire_execution(forward_hook_name)
+
+        try:
+            if _hook_coordinator.is_coordination_enabled():
+                print(f"[FORWARD HOOK] {forward_hook_name} (coordinated)")
+            else:
+                print(f"[FORWARD HOOK] {forward_hook_name} (uncoordinated)")
+            return x
+        finally:
+            _hook_coordinator.release_execution(forward_hook_name)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        hook_name = ctx.hook_name
+
+        # Use backward-specific hook name
+        backward_hook_name = f"{hook_name}_bwd"
+        _hook_coordinator.acquire_execution(backward_hook_name)
+
+        try:
+            if _hook_coordinator.is_coordination_enabled():
+                print(f"[BACKWARD HOOK] {backward_hook_name} (coordinated)")
+            else:
+                print(f"[BACKWARD HOOK] {backward_hook_name} (uncoordinated)")
+            return grad_output, None
+        finally:
+            _hook_coordinator.release_execution(backward_hook_name)
+
+

 TOKEN_GROUP_ALIGN_SIZE_M = 8
 ValidTokenGroupAlignmentSize = Literal[8, 16, 32]
@@ -77,7 +231,6 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
             self._partition_fn,
         )

-
 class ExpertParallel(ParallelStyle):
     def __init__(self):
         super().__init__()
@@ -90,6 +243,9 @@ def _token_dispatch(self, mod, inputs, device_mesh):
         routed_input, num_tokens_per_expert = inputs
         ep_size = device_mesh.shape[0]

+        # HOOK: signal ready for sync
+        routed_input = SyncHook.apply(routed_input, "dispatch_A")
+
         # generate the input splits and output splits for all-to-all
         with torch.no_grad():
             num_tokens_per_expert_group = all_to_all_single(
@@ -135,6 +291,9 @@ def _token_dispatch(self, mod, inputs, device_mesh):
         # generate_permute_indices in moe.py, which also does padding to make sure the number of tokens
         # each expert gets locally is a multiple of ALIGN_SIZE_M.

+        # HOOK: signal ready for sync
+        routed_input = SyncHook.apply(routed_input, "dispatch_B")
+
         return routed_input, num_tokens_per_expert_group

     @staticmethod
@@ -146,12 +305,16 @@ def _partition_fn(name, mod, device_mesh):

     # performing all-to-all combine on the output
     def _token_combine(self, mod, routed_output, device_mesh):
+        # HOOK: signal ready for sync
+        routed_output = SyncHook.apply(routed_output, "combine_C")
         routed_output = all_to_all_single_autograd(
             routed_output,
             self.input_splits,
             self.output_splits,
             device_mesh.get_group(),
         )
+        # HOOK: signal ready for sync
+        routed_output = SyncHook.apply(routed_output, "combine_D")
         return routed_output

     def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
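
For reference, here is a minimal standalone sketch of the semaphore-chaining pattern that the new HookSequenceCoordinator relies on: each named hook blocks on its own semaphore and, when done, releases the semaphore of the next hook in the list, which pins a fixed execution order across threads. The two-hook sequence and the run_hook helper below are illustrative only and are not part of this commit.

import threading

# Toy two-hook sequence (hypothetical names mirroring the commit's "*_fwd" hooks).
sequence = ["dispatch_A_fwd", "combine_C_fwd"]
# First hook starts with one permit, the rest start locked.
semaphores = {
    name: threading.Semaphore(1 if i == 0 else 0)
    for i, name in enumerate(sequence)
}

def run_hook(name: str) -> None:
    # Block until it is this hook's turn in the sequence.
    semaphores[name].acquire()
    print(f"[EXECUTING] {name}")
    # Hand the turn to the next hook in the list (wrapping around,
    # like release_execution does above).
    next_hook = sequence[(sequence.index(name) + 1) % len(sequence)]
    semaphores[next_hook].release()

# combine_C_fwd's thread starts first but cannot run until dispatch_A_fwd releases it.
t = threading.Thread(target=run_hook, args=("combine_C_fwd",))
t.start()
run_hook("dispatch_A_fwd")
t.join()
# Prints "[EXECUTING] dispatch_A_fwd" followed by "[EXECUTING] combine_C_fwd".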

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -11,7 +11,7 @@
 from torchtitan.components.optimizer import build_optimizers_with_moe_load_balancing
 from torchtitan.components.tokenizer import build_hf_tokenizer
 from torchtitan.datasets.hf_datasets import build_hf_dataloader
-from torchtitan.models.llama3.infra.pipeline import pipeline_llama
+from torchtitan.models.llama3.infra.pipeline import pipeline_llama, pipeline_llama_tracer
 from torchtitan.models.moe import MoEArgs

 from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
@@ -32,10 +32,11 @@
 deepseekv3_configs = {
     "debugmodel": DeepSeekV3ModelArgs(
         vocab_size=2000,
-        dim=256,
+        # needs at least dim 8?
+        dim=8,
         inter_dim=1024,
         moe_inter_dim=256,
-        n_layers=6,
+        n_layers=16,
         n_dense_layers=1,
         n_heads=16,
         moe_args=MoEArgs(

torchtitan/models/deepseek_v3/train_configs/debug_model.toml

Lines changed: 10 additions & 9 deletions
@@ -4,9 +4,9 @@ description = "DeepSeek-V3 debug training"
 print_args = false

 [profiling]
-enable_profiling = false
+enable_profiling = true
 save_traces_folder = "profile_trace"
-profile_freq = 10
+profile_freq = 5
 enable_memory_snapshot = false
 save_memory_snapshot_folder = "memory_snapshot"

@@ -36,22 +36,23 @@ decay_type = "linear"
 min_lr_factor = 0.0

 [training]
-local_batch_size = 8
-seq_len = 2048
+local_batch_size = 4
+seq_len = 4
 max_norm = 1.0 # grad norm clipping
-steps = 10
+steps = 6
 dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M)
+# dataset = "c4"

 [parallelism]
 data_parallel_replicate_degree = 1
 data_parallel_shard_degree = -1
 fsdp_reshard_after_forward = "default" # default / never / always
 tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
-pipeline_parallel_degree = 1
-pipeline_parallel_schedule = "1F1B"
+pipeline_parallel_degree = 2
+expert_parallel_degree = 2
 context_parallel_degree = 1
-expert_parallel_degree = 1
+pipeline_parallel_schedule = "DualPipeV"
 expert_tensor_parallel_degree = 1

 [checkpoint]
@@ -63,7 +64,7 @@ export_dtype = "float32"
 async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]

 [activation_checkpoint]
-mode = "selective" # ["none", "selective", "full"]
+mode = "none" # ["none", "selective", "full"]
 selective_ac_option = 'op' # 'int' = ac every positive int layer or 'op', ac based on ops policy

 [compile]

torchtitan/models/llama3/infra/pipeline.py

Lines changed: 75 additions & 0 deletions
@@ -25,6 +25,9 @@
     pipeline_module_split,
 )

+from torch.distributed.pipelining import SplitPoint, pipeline
+from torch.distributed.pipelining.stage import _PipelineStage
+
 from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction
 from torchtitan.tools.logging import logger

@@ -148,3 +151,75 @@ def pipeline_llama(
             has_last_stage = True

     return pp_schedule, model_parts, has_first_stage, has_last_stage
+
+
+def pipeline_llama_tracer(
+    model: nn.Module,
+    parallel_dims: ParallelDims,
+    job_config: JobConfig,
+    device: torch.device,
+    model_args: BaseModelArgs,
+    parallelize_fn: ParallelizeFunction,
+    loss_fn: LossFunction,
+):
+    assert (
+        parallel_dims.pp_enabled
+    ), "can't apply pipeline parallelism if it is not enabled"
+
+    # if job_config.model.norm_type == "fused_rmsnorm":
+    #     # TODO(whc) - torch._dynamo.exc.Unsupported: Illegal getattr invocation stride in strict mode
+    #     # coming from ` if dy.stride(-1) != 1:` in fused_rmsnorm
+    #     raise NotImplementedError(
+    #         "fused_rmsnorm not yet compatible with Pipeline Tracer (strides error). Please use layernorm or rmsnorm."
+    #     )
+    pp_mesh = parallel_dims.world_mesh["pp"]
+    pp_rank = pp_mesh.get_local_rank()
+    stage_idx = pp_mesh.get_local_rank()
+    layers_per_rank = model_args.n_layers // parallel_dims.pp
+    split_spec = {
+        f"layers.{i * layers_per_rank}": SplitPoint.BEGINNING
+        for i in range(1, parallel_dims.pp)
+    }
+    # Get example input
+    input_shape = (job_config.training.local_batch_size, job_config.training.seq_len)
+    assert hasattr(model_args, "vocab_size")
+    input_ids = torch.randint(
+        model_args.vocab_size, input_shape, dtype=torch.int64, device="meta"
+    )
+
+    # Create a pipeline representation from the model
+    pipe = pipeline(
+        model, mb_args=(input_ids,), split_spec=split_spec
+    )
+    model = pipe.get_stage_module(stage_idx)
+    stage = _PipelineStage(
+        stage_module=model,
+        stage_index=pp_rank,
+        pipe_info=pipe.pipe_info,
+        device=device,
+        group=pp_mesh.get_group(),
+    )
+
+    # For PP with looped schedules, each item in model_parts is one stage-model-chunk.
+    # We need to iterate through model_parts to apply SPMD parallelisms, compilation,
+    # optimizer, and checkpointing
+    for i, m in enumerate(model_parts):
+        # apply SPMD-style PT-D techniques
+        m = parallelize_fn(m, parallel_dims, job_config)
+        model_parts[i] = m
+        # NOTE: this is to update the model in the stage
+        # in case the model is modified e.g. by torch.compile
+        stages[i].submod = m
+
+    pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn)
+
+    # This is used in the train loop to determine whether to pass in the input_ids and labels
+    has_first_stage = False
+    has_last_stage = False
+    for stage in stages:
+        if stage.is_first:
+            has_first_stage = True
+        if stage.is_last:
+            has_last_stage = True
+
+    return pp_schedule, model_parts, has_first_stage, has_last_stage
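
Note that pipeline_llama_tracer, as committed, loops over model_parts and stages without ever assigning them. Presumably the single traced stage is meant to be wrapped in one-element lists before that loop, along the lines of the sketch below; this is assumed intent, not part of the diff.

    # Hypothetical completion, inserted after the _PipelineStage(...) construction,
    # so the existing loop over model_parts / stages and build_pipeline_schedule(...) work:
    stages = [stage]        # the one traced stage on this pp rank
    model_parts = [model]   # its corresponding stage module from pipe.get_stage_module(...)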
