Commit 0b91490

deep copy state dict for checkpoint

Summary: Deep copy the state dict when sending a checkpoint: if the replica moves on to the next step, the state dict can change before the checkpoint has been sent.

1 parent fef4abc commit 0b91490

4 files changed, +27 −1 lines changed

torchft/checkpointing/http_transport.py

Lines changed: 9 additions & 1 deletion

@@ -16,6 +16,7 @@
 from typing import Generator, List, Optional, TypeVar, cast

 import torch
+from torch.distributed.tensor import DTensor, distribute_tensor
 from torch.utils._pytree import TreeSpec, tree_flatten, tree_unflatten

 from torchft.checkpointing._rwlock import RWLock
@@ -265,6 +266,13 @@ def recv_checkpoint(

         return tree_unflatten(values, spec)

+def _clone_cpu_tensor(tensor: torch.Tensor) -> torch.Tensor:
+    if isinstance(tensor, DTensor):
+        return distribute_tensor(
+            tensor.to_local().clone(), tensor.device_mesh, tensor.placements
+        )
+    else:
+        return tensor.clone()

 def _to_cpu(values: List[T], pin_memory: bool) -> List[T]:
     out = []
@@ -278,7 +286,7 @@ def _to_cpu(values: List[T], pin_memory: bool) -> List[T]:
                 else:
                     out.append(v.cpu())
             else:
-                out.append(v)
+                out.append(_clone_cpu_tensor(v))
         else:
             out.append(v)
     return out
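
Why the clone matters: on the already-on-CPU branch, _to_cpu previously appended the tensor itself, so the value queued for transfer aliased the live parameter; _clone_cpu_tensor snapshots it instead, cloning a DTensor's local shard and redistributing it over the same mesh and placements. The snippet below is a standalone illustration of the aliasing race the clone closes (plain torch, not torchft code):

import torch

# Standalone illustration (not torchft code) of the race this commit closes.
# A CPU tensor appended without .clone() aliases the live parameter, so an
# in-place update from the next training step rewrites the pending snapshot.
param = torch.ones(3)           # already on CPU: the old code appended it as-is
snapshot_alias = param          # old behavior: out.append(v)
snapshot_clone = param.clone()  # new behavior: out.append(_clone_cpu_tensor(v))

param.add_(1.0)                 # the replica moves on to the next step

print(snapshot_alias)  # tensor([2., 2., 2.]) -- the in-flight checkpoint changed
print(snapshot_clone)  # tensor([1., 1., 1.]) -- a stable copy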

torchft/checkpointing/pg_transport.py

Lines changed: 3 additions & 0 deletions

@@ -194,6 +194,9 @@ def metadata(self) -> str:
     def disallow_checkpoint(self) -> None:
         pass

+    def allow_checkpoint(self) -> None:
+        pass
+
     def send_checkpoint(
         self, dst_ranks: list[int], step: int, state_dict: T, timeout: timedelta
     ) -> None:
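
Note: allow_checkpoint is added here as a no-op, mirroring the pre-existing no-op disallow_checkpoint above it, which keeps PGTransport conforming to the expanded CheckpointTransport interface shown in the next file.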

torchft/checkpointing/transport.py

Lines changed: 6 additions & 0 deletions

@@ -44,6 +44,12 @@ def disallow_checkpoint(self) -> None:
         """
         ...

+    def allow_checkpoint(self) -> None:
+        """
+        Called when the checkpoint is allowed to be sent again, to make sure access to the state_dict is safe.
+        """
+        ...
+
     @abstractmethod
     def recv_checkpoint(
         self, src_rank: int, metadata: str, step: int, timeout: timedelta
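
The interface only documents the contract; how the disallow/allow pair is enforced is up to each transport (http_transport already imports an RWLock, per the first file; pg_transport makes both calls no-ops). Below is a minimal sketch, using a hypothetical GuardedTransport built on threading.Event rather than torchft's actual locking, of how a transport could honor the pair:

import threading

# Hypothetical sketch, not the torchft implementation: block senders while
# the optimizer mutates the state dict, wake them once it is safe again.
class GuardedTransport:
    def __init__(self) -> None:
        self._allowed = threading.Event()
        self._allowed.set()  # sending is allowed until a step begins

    def disallow_checkpoint(self) -> None:
        self._allowed.clear()  # called from the optimizer step pre-hook

    def allow_checkpoint(self) -> None:
        self._allowed.set()  # called from the optimizer step post-hook

    def send_checkpoint(self, state_dict: dict) -> None:  # simplified signature
        self._allowed.wait()  # never read the state dict mid-step
        # ... serialize and transmit state_dict ...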

torchft/local_sgd.py

Lines changed: 9 additions & 0 deletions

@@ -85,6 +85,9 @@ def __init__(
         self._hooks: List[RemovableHandle] = []

     def __enter__(self) -> "LocalSGD":
+        self._hooks.append(
+            self._local_optimizer.register_step_pre_hook(self._step_pre_hook)
+        )
         # Add optimizer hook which increments the local step counter and syncs if necessary
         self._hooks.append(
             self._local_optimizer.register_step_post_hook(self._step_post_hook)
@@ -105,12 +108,18 @@ def __exit__(

         return False  # Propagate exceptions

+    def _step_pre_hook(self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]) -> None:
+        # The checkpoint may transfer model parameters, so we need to make access to them thread safe
+        self._manager._checkpoint_transport.disallow_checkpoint()
+
     def _step_post_hook(
         self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]
     ) -> None:
         """
         This hook is registered on the optimizer and is called after the optimizer step.
         """
+        self._manager._checkpoint_transport.allow_checkpoint()
+
         self._local_step += 1
         if self._local_step >= self._sync_every:
             self.sync()
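
Taken together, every optimizer.step() is now bracketed: the pre-hook calls disallow_checkpoint() before the parameters change and the post-hook calls allow_checkpoint() once the update is complete, so a concurrent checkpoint send never reads a half-updated state dict. A self-contained demo of the PyTorch hook ordering this relies on (plain SGD; an event list stands in for the transport calls):

import torch
from torch import nn, optim

# Demo of the hook ordering the patch relies on: the pre-hook fires before
# each optimizer.step() and the post-hook after it, which is what lets
# LocalSGD bracket the parameter update with disallow/allow_checkpoint.
model = nn.Linear(4, 2)
opt = optim.SGD(model.parameters(), lr=0.1)

events = []
opt.register_step_pre_hook(lambda o, args, kwargs: events.append("disallow_checkpoint"))
opt.register_step_post_hook(lambda o, args, kwargs: events.append("allow_checkpoint"))

model(torch.randn(1, 4)).sum().backward()
opt.step()
print(events)  # ['disallow_checkpoint', 'allow_checkpoint']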
