Skip to content

Commit 7e5dda1

Browse files
committed
fix managed pg allreduce
Summary: the managed process group's allreduce should simply delegate to the manager's allreduce.
1 parent a31b483 commit 7e5dda1

File tree

1 file changed

+1
-17
lines changed

1 file changed

+1
-17
lines changed

torchft/process_group.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1101,23 +1101,7 @@ def __init__(self, manager: "Manager") -> None:
1101 1101
self._manager = manager
1102 1102

1103 1103
def allreduce(self, tensors: List[torch.Tensor], opts: object) -> Work:
1104-
# Ensure we have a valid quorum and are configured before trying to do
1105-
# any work.
1106-
self._manager.wait_quorum()
1107-
1108-
if self._manager.errored() is not None:
1109-
return _DummyWork(tensors)
1110-
try:
1111-
work = super().allreduce(tensors, opts)
1112-
except Exception as e:
1113-
self._manager.report_error(e)
1114-
return _DummyWork(tensors)
1115-
1116-
return _ManagedWork(
1117-
self._manager,
1118-
work,
1119-
tensors,
1120-
)
1104+
return self._manager.allreduce(tensors)
1121 1105

1122 1106
def size(self) -> int:
1123 1107
return self._manager.num_participants()

0 commit comments

Comments
 (0)