Make managed futures generic

tushar00jain · tushar00jain · commit 15cbf05fe12c · 2025-08-04T14:00:57.000-07:00
diff --git a/torchft/_test/managed_work_test.py b/torchft/_test/managed_work_test.py
@@ -7,7 +7,10 @@
 import types
 import unittest
 from datetime import timedelta
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, TypeVar, cast
+
+# Define a type variable for the Future's value type
+T = TypeVar("T")
 
 import parameterized
 import torch
@@ -65,20 +68,24 @@ def test_callbacks_execute_after_wait(
         )
 
         # Create the managed work
-        managed_work = _ManagedWork(work, manager, [tensor])
+        managed_work = _ManagedWork(manager, work, [tensor])
 
         # Track callback execution
         callback_executed: bool = False
 
-        def callback(fut: Future[List[torch.Tensor]]) -> List[torch.Tensor]:
+        def callback(fut: Future[object]) -> List[torch.Tensor]:
+            # Static-typing cast only (no runtime effect): the value is a tensor list
+            tensor_fut = cast(Future[List[torch.Tensor]], fut)
             nonlocal callback_executed
             callback_executed = True
             # Multiply tensor by 2 to verify the callback ran
-            fut.value()[0].mul_(2)
-            return fut.value()
+            value = tensor_fut.value()
+            value[0].mul_(2)
+            return value
 
         # Add the callback
-        managed_work.add_callback(callback)
+        fut = managed_work.get_future()
+        fut = fut.then(callback)
 
         # Verify callback hasn't executed yet
         self.assertFalse(callback_executed)
@@ -118,30 +125,40 @@ def test_multiple_callbacks_execute_in_order(
         )
 
         # Create the managed work
-        managed_work = _ManagedWork(work, manager, [tensor])
+        managed_work = _ManagedWork(manager, work, [tensor])
 
         # Track execution order
         execution_order: List[int] = []
 
-        def callback1(fut: Future[List[torch.Tensor]]) -> List[torch.Tensor]:
+        def callback1(fut: Future[T]) -> List[torch.Tensor]:
+            # Cast to the expected type
+            tensor_fut = cast(Future[List[torch.Tensor]], fut)
             execution_order.append(1)
-            fut.value()[0].add_(1)
-            return fut.value()
+            value = tensor_fut.value()
+            value[0].add_(1)
+            return value
 
-        def callback2(fut: Future[List[torch.Tensor]]) -> List[torch.Tensor]:
+        def callback2(fut: Future[T]) -> List[torch.Tensor]:
+            # Cast to the expected type
+            tensor_fut = cast(Future[List[torch.Tensor]], fut)
             execution_order.append(2)
-            fut.value()[0].add_(2)
-            return fut.value()
+            value = tensor_fut.value()
+            value[0].add_(2)
+            return value
 
-        def callback3(fut: Future[List[torch.Tensor]]) -> List[torch.Tensor]:
+        def callback3(fut: Future[T]) -> List[torch.Tensor]:
+            # Cast to the expected type
+            tensor_fut = cast(Future[List[torch.Tensor]], fut)
             execution_order.append(3)
-            fut.value()[0].add_(3)
-            return fut.value()
+            value = tensor_fut.value()
+            value[0].add_(3)
+            return value
 
         # Add callbacks
-        managed_work.add_callback(callback1)
-        managed_work.add_callback(callback2)
-        managed_work.add_callback(callback3)
+        fut = managed_work.get_future()
+        fut = fut.then(callback1)
+        fut = fut.then(callback2)
+        fut = fut.then(callback3)
 
         # Verify no callbacks have executed yet
         self.assertEqual(len(execution_order), 0)
@@ -181,29 +198,35 @@ def test_future_then_api(self, name: str, device: torch.device) -> None:
         )
 
         # Create the managed work
-        managed_work = _ManagedWork(work, manager, [tensor])
+        managed_work = _ManagedWork(manager, work, [tensor])
 
         # Get the future
         future = managed_work.get_future()
 
         # Track callback execution
         callback_executed: bool = False
 
-        def callback(fut: Future[List[torch.Tensor]]) -> List[torch.Tensor]:
+        def callback(fut: Future[object]) -> List[torch.Tensor]:
+            # Cast to the expected type
+            tensor_fut = cast(Future[List[torch.Tensor]], fut)
             nonlocal callback_executed
             callback_executed = True
             # Multiply tensor by 3 to verify the callback ran
-            fut.value()[0].mul_(3)
-            return fut.value()
+            value = tensor_fut.value()
+            value[0].mul_(3)
+            return value
 
         # Use the then API
-        future.then(callback)
+        future = future.then(callback)
 
         # Verify callback hasn't executed yet
         self.assertFalse(callback_executed)
         self.assertEqual(tensor.item(), 1.0)
 
-        # Call wait() which should trigger the callback
+        # Call wait() on the managed_work first to set up the future properly
+        managed_work.wait()
+
+        # Now wait on the future
         future.wait()
 
         # Verify callback has executed
diff --git a/torchft/ddp.py b/torchft/ddp.py
@@ -75,11 +75,10 @@ def _comm_hook(
         fut = work.get_future()
 
         def callback(
-            tensors: torch.futures.Future[list[torch.Tensor]],
-        ) -> list[torch.Tensor]:
+            tensor: torch.futures.Future[torch.Tensor],
+        ) -> None:
             nonlocal result_fut
-            result_fut.set_result(tensors.value()[0])
-            return []
+            result_fut.set_result(tensor.value())
 
         fut = fut.then(callback)
 
diff --git a/torchft/manager.py b/torchft/manager.py