Commit 1f5854d

make checkpointing thread safe and deterministic
Summary:
- The regression tests fail (on future changes) because they expect either no recovery to happen, or for recovery to happen at the first step.
- Since we validate the parameters at every step, we can't reliably validate them if recovery happens non-deterministically.
- To fix this, copy the state dict before transferring it.
- Checkpointing also wasn't thread safe for the HTTP transport, so lock the model in the pre-step hook and when we want to transfer the checkpoint.
1 parent d358fb4 commit 1f5854d
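To see why both fixes are needed, consider the failure in miniature: the HTTP transport serializes the state dict on a background thread while the training thread keeps stepping the optimizer in place. Without a lock the snapshot can be torn, and without a copy even a clean snapshot aliases live parameters. A minimal sketch of the idea using plain threading primitives (hypothetical names, not the torchft code, and a plain mutex standing in for torchft's reader-writer lock):

import threading

import torch

lock = threading.Lock()  # torchft uses a reader-writer lock; a plain lock shows the idea
param = torch.zeros(4)   # stands in for a live model parameter

def optimizer_step() -> None:
    with lock:  # taken by the step pre-hook, released by the post-hook
        param.add_(1.0)  # in-place parameter update

def transfer_checkpoint() -> torch.Tensor:
    with lock:  # taken while reading the state dict for transfer
        return param.clone()  # copy so later steps can't mutate the snapshot

t = threading.Thread(target=optimizer_step)
t.start()
snapshot = transfer_checkpoint()  # sees the parameters entirely before or after the step
t.join()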

File tree

4 files changed: +68 -6 lines changed

torchft/checkpointing/http_transport.py
torchft/local_sgd.py
torchft/local_sgd_integ_test.py
torchft/manager.py

torchft/checkpointing/http_transport.py

Lines changed: 11 additions & 1 deletion
@@ -16,6 +16,7 @@
 from typing import Generator, List, Optional, TypeVar, cast

 import torch
+from torch.distributed.tensor import DTensor, distribute_tensor
 from torch.utils._pytree import TreeSpec, tree_flatten, tree_unflatten

 from torchft.checkpointing._rwlock import RWLock
@@ -266,6 +267,15 @@ def recv_checkpoint(
     return tree_unflatten(values, spec)


+def _clone_cpu_tensor(tensor: torch.Tensor) -> torch.Tensor:
+    if isinstance(tensor, DTensor):
+        return distribute_tensor(
+            tensor.to_local().clone(), tensor.device_mesh, tensor.placements
+        )
+    else:
+        return tensor.clone()
+
+
 def _to_cpu(values: List[T], pin_memory: bool) -> List[T]:
     out = []
     for v in values:
@@ -278,7 +288,7 @@ def _to_cpu(values: List[T], pin_memory: bool) -> List[T]:
                 else:
                     out.append(v.cpu())
             else:
-                out.append(v)
+                out.append(_clone_cpu_tensor(v))
         else:
             out.append(v)
     return out
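The `_to_cpu` change matters because a tensor that is already on CPU used to be appended by reference: the serialized checkpoint aliased the live parameter, so a later in-place optimizer step could change the checkpoint after it was "taken", making recovery non-deterministic. A small illustration of the aliasing (plain tensors; for a `DTensor`, `_clone_cpu_tensor` additionally clones the local shard and redistributes it):

import torch

param = torch.ones(3)   # a CPU parameter included in the state dict

aliased = param         # old behavior: no copy, shares storage
cloned = param.clone()  # new behavior: independent storage

param.add_(1.0)         # a later in-place optimizer step

print(aliased)  # tensor([2., 2., 2.]) -- the "checkpoint" changed underneath us
print(cloned)   # tensor([1., 1., 1.]) -- a stable snapshot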

torchft/local_sgd.py

Lines changed: 22 additions & 0 deletions
@@ -86,6 +86,9 @@ def __init__(
         self._hooks: List[RemovableHandle] = []

     def __enter__(self) -> "LocalSGD":
+        self._hooks.append(
+            self._local_optimizer.register_step_pre_hook(self._step_pre_hook)
+        )
         # Add optimizer hook which increments the local step counter and syncs if necessary
         self._hooks.append(
             self._local_optimizer.register_step_post_hook(self._step_post_hook)
@@ -106,12 +109,20 @@ def __exit__(

         return False  # Propagate exceptions

+    def _step_pre_hook(
+        self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]
+    ) -> None:
+        # The checkpoint may transfer model parameters, so we need to make access to it thread safe
+        self._manager.disallow_state_dict_read()
+
     def _step_post_hook(
         self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]
     ) -> None:
         """
         This hook is registered on the optimizer and is called after the optimizer step.
         """
+        self._manager.allow_state_dict_read()
+
         self._local_step += 1
         if self._local_step >= self._sync_every:
             self.sync()
@@ -677,12 +688,21 @@ def _restore_parameters(self) -> None:
         fragment.restore_parameters()

     def __enter__(self) -> "DiLoCo":
+        self._hooks.append(
+            self._local_optimizer.register_step_pre_hook(self._step_pre_hook)
+        )
         # Add optimizer hook which increments the local step counter and syncs if necessary
         self._hooks.append(
             self._local_optimizer.register_step_post_hook(self._step_post_hook)
         )
         return self

+    def _step_pre_hook(
+        self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]
+    ) -> None:
+        # The checkpoint may transfer model parameters, so we need to make access to it thread safe
+        self._manager.disallow_state_dict_read()
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
@@ -717,6 +737,8 @@ def _step_post_hook(
         """
         This hook is registered on the optimizer and is called after the optimizer step.
         """
+        self._manager.allow_state_dict_read()
+
         # We need to make sure all nodes send the same fragments in order.
         # This is to avoid deadlocking e.g.
         #
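`register_step_pre_hook` and `register_step_post_hook` are standard `torch.optim.Optimizer` APIs: the pre-hook fires before each `optimizer.step()` and the post-hook after it, which is what lets the write lock bracket the in-place parameter update exactly. Both return a `RemovableHandle`, which is why the handles are collected in `self._hooks` and removed on `__exit__`. A standalone sketch of the bracketing (toy model and print statements in place of the real lock):

import torch
from torch import nn, optim

model = nn.Linear(4, 2)
opt = optim.SGD(model.parameters(), lr=0.1)

def pre_hook(optimizer, args, kwargs) -> None:
    print("before step: take the write lock here")

def post_hook(optimizer, args, kwargs) -> None:
    print("after step: release the write lock here")

handles = [
    opt.register_step_pre_hook(pre_hook),
    opt.register_step_post_hook(post_hook),
]

model(torch.randn(1, 4)).sum().backward()
opt.step()  # prints both messages, bracketing the in-place update

for h in handles:
    h.remove()  # mirrors what __exit__ does with self._hooks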

torchft/local_sgd_integ_test.py

Lines changed: 8 additions & 0 deletions
@@ -36,6 +36,7 @@
 )

 logger: logging.Logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)


 def local_sgd_train_loop(
@@ -143,6 +144,7 @@ def assert_equal_global_state(
                 rep1[step]["user"][f"StreamingDiLoCoFragment_{i}"],
                 rep0[step]["user"][f"StreamingDiLoCoFragment_{i}"],
                 check_device=False,
+                msg=f"{step=} {i=}",
             )
         # Check all outer optimizers
         for i in range(
@@ -574,3 +576,9 @@ def test_streaming_diloco_commit_failure(
         self.assertEqual(
             event_injector.count[EventInjectorEvent.AllreduceFailure], 1
         )
+
+
+if __name__ == "__main__":
+    import unittest
+
+    unittest.main()
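The added `msg=f"{step=} {i=}"` uses the f-string `=` specifier to label any mismatch with the step and fragment index that diverged; `torch.testing.assert_close` accepts a string `msg` and uses it as the failure message. For example:

import torch

step, i = 3, 1
try:
    torch.testing.assert_close(
        torch.tensor([1.0]), torch.tensor([2.0]), msg=f"{step=} {i=}"
    )
except AssertionError as err:
    print(err)  # step=3 i=1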

torchft/manager.py

Lines changed: 27 additions & 5 deletions
@@ -55,6 +55,7 @@

 from torchft._torchft import ManagerClient, ManagerServer
 from torchft.checkpointing import CheckpointTransport, HTTPTransport
+from torchft.checkpointing._rwlock import RWLock
 from torchft.futures import future_timeout
 from torchft.work import _DummyWork

@@ -216,6 +217,9 @@ def __init__(
         self._load_state_dict_fns: Dict[str, Callable[[object], None]] = {}
         self._user_state_dicts: Dict[str, Callable[[], object]] = {}

+        # Protects state dict
+        self._state_dict_lock = RWLock(timeout=timeout.total_seconds())
+
         if load_state_dict and state_dict:
             self.register_state_dict_fn("default", load_state_dict, state_dict)

@@ -324,6 +328,21 @@ def __init__(
         # first step is 1
         self._participating_replica_rank: Optional[int] = None
         self._participating_replica_world_size: int = 0
+        self._is_state_dict_read_allowed = True
+
+    def allow_state_dict_read(self) -> None:
+        if self._is_state_dict_read_allowed:
+            return
+
+        self._is_state_dict_read_allowed = True
+        self._state_dict_lock.w_release()
+
+    def disallow_state_dict_read(self) -> None:
+        if not self._is_state_dict_read_allowed:
+            return
+
+        self._is_state_dict_read_allowed = False
+        self._state_dict_lock.w_acquire()

     def register_state_dict_fn(
         self,
@@ -806,11 +825,14 @@ def load_state_dict(self, state_dict: Dict[str, int]) -> None:
         self._batches_committed = state_dict["batches_committed"]

     def _manager_state_dict(self) -> Dict[str, object]:
-        assert len(self._user_state_dicts) > 0, "user state_dict is not initialized."
-        return {
-            "user": {key: value() for key, value in self._user_state_dicts.items()},
-            "torchft": self.state_dict(),
-        }
+        with self._state_dict_lock.r_lock():
+            assert (
+                len(self._user_state_dicts) > 0
+            ), "user state_dict is not initialized."
+            return {
+                "user": {key: value() for key, value in self._user_state_dicts.items()},
+                "torchft": self.state_dict(),
+            }

     def state_dict(self) -> Dict[str, int]:
         """
