
Commit 3b65b52

tianfengfrank authored and meta-codesync[bot] committed
add reduce_scatter_v support
Summary: tp_overlapping needs to work with the uneven_split support introduced by D84788079. To support that, we need reduce_scatter_v in torchcomm:
- enable reduce_scatter_v to accept input_tensor lists with varying tensor sizes
- add both cpp/py integration UTs

Reviewed By: d4l3k

Differential Revision: D85297838

fbshipit-source-id: 210969573cbec89341825939016a3826ac850331
1 parent 73a225d commit 3b65b52

20 files changed: +547 −0 lines
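
For context, the collective added here behaves like reduce_scatter, except each rank's output size may differ. Below is a minimal single-process sketch of those semantics in plain PyTorch (illustrative only, not part of the diff; it assumes a sum reduction and simulates all ranks locally):

import torch

# Illustrative reference for reduce_scatter_v semantics (sum reduction),
# simulated on a single process. inputs_by_rank[r][i] is the tensor that
# rank r would pass as input_list[i]; sizes may differ across indices i,
# but every rank must use the same size for a given index i.
world_size = 4
split_sizes = [3, 1, 2, 4]  # uneven split: rank i receives split_sizes[i] elements

inputs_by_rank = [
    [torch.full((split_sizes[i],), float(r)) for i in range(world_size)]
    for r in range(world_size)
]

# Rank i's output is the elementwise sum of input_list[i] across all ranks,
# so its size equals split_sizes[i].
outputs = [
    torch.stack([inputs_by_rank[r][i] for r in range(world_size)]).sum(dim=0)
    for i in range(world_size)
]

for i, out in enumerate(outputs):
    assert out.numel() == split_sizes[i]
    print(f"rank {i} output:", out)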

comms/torchcomms/TorchComm.cpp

Lines changed: 9 additions & 0 deletions
@@ -114,6 +114,15 @@ std::shared_ptr<TorchWork> TorchComm::reduce_scatter(
   return impl_->reduce_scatter(output, input_list, op, async_op, options);
 }
 
+std::shared_ptr<TorchWork> TorchComm::reduce_scatter_v(
+    at::Tensor& output,
+    const std::vector<at::Tensor>& input_list,
+    ReduceOp op,
+    bool async_op,
+    const ReduceScatterOptions& options) {
+  return impl_->reduce_scatter_v(output, input_list, op, async_op, options);
+}
+
 std::shared_ptr<TorchWork> TorchComm::reduce_scatter_single(
     at::Tensor& output,
     const at::Tensor& input,

comms/torchcomms/TorchComm.hpp

Lines changed: 6 additions & 0 deletions
@@ -81,6 +81,12 @@ class TorchComm {
       ReduceOp op,
       bool async_op,
       const ReduceScatterOptions& options = {});
+  std::shared_ptr<TorchWork> reduce_scatter_v(
+      at::Tensor& output,
+      const std::vector<at::Tensor>& input_list,
+      ReduceOp op,
+      bool async_op,
+      const ReduceScatterOptions& options = {});
   std::shared_ptr<TorchWork> reduce_scatter_single(
       at::Tensor& output,
       const at::Tensor& input,

comms/torchcomms/TorchCommBackend.hpp

Lines changed: 6 additions & 0 deletions
@@ -94,6 +94,12 @@ class TorchCommBackend {
       ReduceOp op,
       bool async_op,
       const ReduceScatterOptions& options = {}) = 0;
+  virtual std::shared_ptr<TorchWork> reduce_scatter_v(
+      at::Tensor& output,
+      const std::vector<at::Tensor>& input_list,
+      ReduceOp op,
+      bool async_op,
+      const ReduceScatterOptions& options = {}) = 0;
   virtual std::shared_ptr<TorchWork> reduce_scatter_single(
       at::Tensor& output,
       const at::Tensor& input,

comms/torchcomms/TorchCommPy.cpp

Lines changed: 37 additions & 0 deletions
@@ -745,6 +745,43 @@ Reduce, then scatter a list of tensors to all ranks.
     op: Reduction operation.
     async_op: Whether to perform the operation asynchronously.
     hints: Dictionary of string hints for backend-specific options.
+    timeout: Timeout for the operation.
+)",
+          py::arg("output"),
+          py::arg("input_list"),
+          py::arg("op"),
+          py::arg("async_op"),
+          py::arg("hints") = std::nullopt,
+          py::arg("timeout") = std::nullopt,
+          py::call_guard<py::gil_scoped_release>())
+      .def(
+          "reduce_scatter_v",
+          [](TorchComm& self,
+             at::Tensor& output,
+             const std::vector<at::Tensor>& input_list,
+             ReduceOp op,
+             bool async_op,
+             std::optional<std::unordered_map<std::string, std::string>> hints,
+             std::optional<std::chrono::milliseconds> timeout) {
+            ReduceScatterOptions opts;
+            if (hints) {
+              opts.hints = *hints;
+            }
+            if (timeout) {
+              opts.timeout = *timeout;
+            }
+            return self.reduce_scatter_v(
+                output, input_list, op, async_op, opts);
+          },
+          R"(
+Reduce, then scatter a list of tensors to all ranks, supporting variable tensor sizes per rank.
+
+Args:
+    output: Output tensor on each rank; size may differ per rank.
+    input_list: List of tensors to reduce and scatter; the list is the same on all ranks, but tensor sizes may differ between indices.
+    op: Reduction operation.
+    async_op: Whether to perform the operation asynchronously.
+    hints: Dictionary of string hints for backend-specific options.
     timeout: Timeout for the operation.
 )",
           py::arg("output"),
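
A minimal usage sketch of the new Python binding, based on the signature added to _comms.pyi below (the communicator `comm`, the `op` value, and `work.wait()` are assumptions about surrounding torchcomms code that is not part of this diff):

from datetime import timedelta
from typing import List

import torch


def uneven_reduce_scatter(comm, op, rank: int, split_sizes: List[int]) -> torch.Tensor:
    # Hypothetical helper: `comm` is assumed to be an initialized torchcomms
    # communicator exposing the reduce_scatter_v binding added in this commit,
    # and `op` a ReduceOp value (e.g. sum).
    # Every rank passes the same list layout; tensor sizes differ per index.
    input_list = [torch.ones(n, device="cuda") for n in split_sizes]
    # This rank's output must have the same number of elements as input_list[rank].
    output = torch.empty(split_sizes[rank], device="cuda")
    work = comm.reduce_scatter_v(
        output,
        input_list,
        op,
        async_op=True,
        hints={},                       # backend-specific string hints
        timeout=timedelta(seconds=30),
    )
    work.wait()  # assumed TorchWork completion call; not defined in this diff
    return output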

comms/torchcomms/_comms.pyi

Lines changed: 9 additions & 0 deletions
@@ -261,6 +261,15 @@ class TorchComm:
         hints: Dict[str, str] | None = None,
         timeout: timedelta | None = None,
     ) -> TorchWork: ...
+    def reduce_scatter_v(
+        self,
+        output: Any,
+        input_list: List[Any],
+        op: ReduceOp,
+        async_op: bool,
+        hints: Dict[str, str] | None = None,
+        timeout: timedelta | None = None,
+    ) -> TorchWork: ...
     def reduce_scatter_single(
         self,
         output: Any,

comms/torchcomms/gloo/TorchCommGloo.cpp

Lines changed: 10 additions & 0 deletions
@@ -815,6 +815,16 @@ std::shared_ptr<TorchWork> TorchCommGloo::reduce_scatter(
   return reduce_scatter_single(output, input, op, async_op, singleOptions);
 }
 
+std::shared_ptr<TorchWork> TorchCommGloo::reduce_scatter_v(
+    at::Tensor& output,
+    const std::vector<at::Tensor>& input_list,
+    ReduceOp op,
+    bool async_op,
+    const ReduceScatterOptions& options) {
+  throw std::runtime_error(
+      "reduce_scatter_v is not supported in GLOO backend yet");
+}
+
 std::shared_ptr<TorchWork> TorchCommGloo::reduce_scatter_single(
     at::Tensor& output,
     const at::Tensor& input,

comms/torchcomms/gloo/TorchCommGloo.hpp

Lines changed: 6 additions & 0 deletions
@@ -106,6 +106,12 @@ class TorchCommGloo : public TorchCommBackend,
       ReduceOp op,
       bool async_op,
       const ReduceScatterOptions& options = {}) override;
+  std::shared_ptr<TorchWork> reduce_scatter_v(
+      at::Tensor& output,
+      const std::vector<at::Tensor>& input_list,
+      ReduceOp op,
+      bool async_op,
+      const ReduceScatterOptions& options = {}) override;
   std::shared_ptr<TorchWork> reduce_scatter_single(
       at::Tensor& output,
       const at::Tensor& input,

comms/torchcomms/nccl/TorchCommNCCL.cpp

Lines changed: 9 additions & 0 deletions
@@ -818,6 +818,15 @@ std::shared_ptr<TorchWork> TorchCommNCCL::reduce_scatter(
   return work;
 }
 
+std::shared_ptr<TorchWork> TorchCommNCCL::reduce_scatter_v(
+    at::Tensor& output,
+    const std::vector<at::Tensor>& input_list,
+    ReduceOp op,
+    bool async_op,
+    const ReduceScatterOptions& options) {
+  throw std::runtime_error("reduce_scatter_v is not supported in NCCL backend");
+}
+
 std::shared_ptr<TorchWork> TorchCommNCCL::reduce_scatter_single(
     at::Tensor& output,
     const at::Tensor& input,

comms/torchcomms/nccl/TorchCommNCCL.hpp

Lines changed: 6 additions & 0 deletions
@@ -123,6 +123,12 @@ class TorchCommNCCL : public TorchCommBackend,
       ReduceOp op,
       bool async_op,
       const ReduceScatterOptions& options = {}) override;
+  std::shared_ptr<TorchWork> reduce_scatter_v(
+      at::Tensor& output,
+      const std::vector<at::Tensor>& input_list,
+      ReduceOp op,
+      bool async_op,
+      const ReduceScatterOptions& options = {}) override;
   std::shared_ptr<TorchWork> reduce_scatter_single(
       at::Tensor& output,
       const at::Tensor& input,

comms/torchcomms/ncclx/TorchCommNCCLX.cpp

Lines changed: 80 additions & 0 deletions
@@ -888,6 +888,86 @@ std::shared_ptr<TorchWork> TorchCommNCCLX::reduce_scatter(
   return work;
 }
 
+std::shared_ptr<TorchWork> TorchCommNCCLX::reduce_scatter_v(
+    at::Tensor& output,
+    const std::vector<at::Tensor>& input_list,
+    ReduceOp op,
+    bool async_op,
+    const ReduceScatterOptions& options) {
+  checkInitialized();
+  checkAndAbortIfTimedOutOrError();
+  ensureTensorContiguous(output);
+
+  if (input_list.size() != static_cast<size_t>(comm_size_)) {
+    throw std::runtime_error(
+        "input_list size must equal comm_size for reduce_scatter_v");
+  }
+
+  // Check that all input tensors are contiguous
+  for (const auto& t : input_list) {
+    ensureTensorContiguous(t);
+  }
+
+  TorchCommTracingGuard tracingGuard(
+      name_, comm_size_, "reduce_scatter", rank_, input_list, {output});
+
+  cudaStream_t stream = getOperationStream(async_op);
+  auto work = createWork(
+      stream,
+      getOperationTimeout(options.timeout, options_.timeout),
+      input_list);
+
+  work->recordStart();
+
+  // Use multiple reduce operations for reduce_scatter
+  nccl_api_->groupStart();
+
+  for (int i = 0; i < comm_size_; ++i) {
+    const auto dataType = getNcclDataType(input_list[i]);
+    if (i == rank_) {
+      // This rank receives the reduced result.
+      // Assign input/output tensors to support vector reduce_scatter
+      // (reduce_scatter_v), where inputs are reduced and scattered unevenly
+      // among participating ranks.
+      auto& input_tensor = input_list[i];
+      auto& output_tensor = output;
+      if (input_tensor.numel() != output_tensor.numel()) {
+        throw std::runtime_error(
+            "Output tensor size must equal input tensor size for reduce_scatter_v");
+      }
+      nccl_api_->reduce(
+          input_tensor.data_ptr(),
+          output_tensor.data_ptr(),
+          output_tensor.numel(),
+          dataType,
+          getNcclReduceOp(op, nccl_comm_, dataType),
+          i,
+          nccl_comm_,
+          stream);
+    } else {
+      // Other ranks contribute to the reduction
+      nccl_api_->reduce(
+          input_list[i].data_ptr(),
+          nullptr, // Non-root ranks don't receive
+          input_list[i].numel(),
+          dataType,
+          getNcclReduceOp(op, nccl_comm_, dataType),
+          i,
+          nccl_comm_,
+          stream);
+    }
+  }
+
+  nccl_api_->groupEnd();
+
+  work->recordEnd();
+
+  // Enqueue the work after events have been recorded
+  enqueueWork(work, stream);
+
+  return work;
+}
+
 std::shared_ptr<TorchWork> TorchCommNCCLX::reduce_scatter_single(
     at::Tensor& output,
     const at::Tensor& input,
