
Commit 91207a2

fix compute/communication overlap for gloo
Summary:
- we currently wait for the pg work's future when preparing a fragment
- with gloo, this blocks the CPU
- move the wait call to when we perform the actual sync of the fragment
- since we still call `work.wait()` in the allreduce call itself, this doesn't completely fix the problem
1 parent: 843854d
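
For context, here is a minimal sketch of the compute/communication overlap pattern this commit moves toward, using only the public torch.distributed API with the gloo backend. The tensor names and the standalone setup are illustrative assumptions, not code from torchft:

# Minimal sketch of compute/communication overlap with gloo (illustrative,
# not torchft code). With gloo, waiting on a work's future blocks the CPU,
# so the wait should be deferred until the result is actually needed.
import torch
import torch.distributed as dist


def sketch() -> None:
    # Assumes the usual env vars (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE).
    dist.init_process_group("gloo")

    grads = torch.randn(1024)
    unrelated = torch.randn(1024)

    # Launch the allreduce asynchronously and do NOT wait on it yet.
    work = dist.all_reduce(grads, op=dist.ReduceOp.SUM, async_op=True)

    # Independent CPU compute can overlap with the gloo communication,
    # because nothing here blocks on the work's future.
    unrelated = unrelated * 2.0

    # Block only once the averaged gradients are actually needed; this is
    # the wait the commit moves from prepare_sync into perform_sync.
    work.get_future().wait()
    grads /= dist.get_world_size()

    dist.destroy_process_group()


if __name__ == "__main__":
    sketch()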

1 file changed: +12 -7 lines

torchft/local_sgd.py

Lines changed: 12 additions & 7 deletions
@@ -401,13 +401,6 @@ def prepare_sync(self) -> None:
         ):
             self._average_grads()

-            for work in self._allreduce_work:
-                work.get_future().wait()
-
-            if self._stream is not None:
-                self._stop_event = torch.cuda.Event()
-                self._stop_event.record()
-
     @torch.profiler.record_function("torchft::local_sgd::perform_sync")
     def perform_sync(self) -> bool:
         """
@@ -417,6 +410,18 @@ def perform_sync(self) -> bool:
         # Waiting for an allreduce before it has been sent is currently not supported.
         assert len(self._allreduce_work) > 0

+        with (
+            torch.cuda.stream(self._stream)
+            if self._stream is not None
+            else nullcontext()
+        ):
+            for work in self._allreduce_work:
+                work.get_future().wait()
+
+            if self._stream is not None:
+                self._stop_event = torch.cuda.Event()
+                self._stop_event.record()
+
         self.wait()

         # save the parameters so they can be used for merging
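
The deferred wait is wrapped in a conditional stream context so the same code path works with and without a CUDA side stream. A brief illustrative sketch of that idiom on its own (the variable names here are assumptions, not torchft's):

# Illustrative sketch of the conditional stream-context idiom added above:
# wait on the futures inside a side CUDA stream when one exists, otherwise
# fall back to plain CPU execution via contextlib.nullcontext.
from contextlib import nullcontext

import torch

stream = torch.cuda.Stream() if torch.cuda.is_available() else None

with torch.cuda.stream(stream) if stream is not None else nullcontext():
    # ... wait on the collected allreduce futures here, as perform_sync now does ...
    if stream is not None:
        # Record an event on the side stream so later code can synchronize
        # with the completed waits without blocking the default stream.
        stop_event = torch.cuda.Event()
        stop_event.record()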
