Skip to content

Commit 43dd4fb

Browse files
committed
fix stream dependencies in callbacks
Summary:
- Call `future.wait()` in callbacks to ensure the continuation executes only after the future has completed.
- Set the stream correctly when executing the callback scheduled by the bucketized allreduce.
1 parent f99b8dd commit 43dd4fb

File tree

3 files changed

+12
-3
lines changed

3 files changed

+12
-3
lines changed

torchft/collectives.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,8 @@ def callback(fut: Future[list[torch.Tensor]]) -> list[torch.Tensor]:
387387
nonlocal tensors, quantized_tensors, world_size, sync_stream
388388

389389
with torch.cuda.stream(sync_stream):
390+
# Setup stream dependency
391+
fut.wait()
390392
# Dequantize the result back to the original precision
391393
fused_dequantize_from_fp8(tensors, quantized_tensors, world_size)
392394
return tensors

torchft/local_sgd.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -524,9 +524,14 @@ def _bucketize_and_allreduce(
524524
)
525525

526526
def callback(fut: torch.futures.Future[torch.Tensor]) -> None:
527-
nonlocal bucket_tensors, flat_buffer
528-
for t, pack_offset, numel in bucket_tensors:
529-
t.copy_(flat_buffer[pack_offset : pack_offset + numel].view_as(t))
527+
with torch.cuda.stream(self._stream) if self._stream else nullcontext():
528+
nonlocal bucket_tensors, flat_buffer
529+
# Setup stream dependency
530+
fut.wait()
531+
for t, pack_offset, numel in bucket_tensors:
532+
t.copy_(
533+
flat_buffer[pack_offset : pack_offset + numel].view_as(t)
534+
)
530535

531536
work = work.then(callback)
532537
self._allreduce_futures.append(work)

torchft/manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,8 @@ def callback(
430430
# change the stream to avoid making the callback stream
431431
# dependent on process group stream running the allreduce
432432
with torch.cuda.stream(stream) if stream is not None else nullcontext():
433+
# Setup stream dependency
434+
fut.wait()
433435
fut.value()
434436
tensor /= num_participants
435437

0 commit comments

Comments (0)