
Commit 8a12dc9

Add Reducescatter op (NCCL, MPI, Gloo) (horovod#3299)
Signed-off-by: Max H. Gerlach <[email protected]>
Co-authored-by: Jesse Benson (AI) <[email protected]>
Co-authored-by: Jesse Benson <[email protected]>
1 parent e02bdca commit 8a12dc9


49 files changed, +2845 -111 lines changed

CHANGELOG.md

+6
@@ -8,10 +8,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added `hvd.reducescatter()` operation with implementations in NCCL, MPI, and Gloo. ([#3299](https://github.com/horovod/horovod/pull/3299))
+
 ### Changed
 
+- MXNet: Updated allreduce functions to newer `op` API. ([#3299](https://github.com/horovod/horovod/pull/3299))
+
 ### Deprecated
 
+- MXNet: Deprecated `average` argument of allreduce functions. ([#3299](https://github.com/horovod/horovod/pull/3299))
+
 ### Removed
 
 ### Fixed
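
The CHANGELOG entry above introduces the user-facing `hvd.reducescatter()` API. As a rough illustration only (not part of this commit), a call from the TensorFlow frontend might look like the sketch below; the `op=hvd.Sum` keyword follows the pattern visible in the `horovod/_keras/__init__.py` change later in this diff, and the exact signature should be checked against the released API.

    # Hypothetical usage sketch; run with e.g. `horovodrun -np 4 python reducescatter_demo.py`.
    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    # Every rank contributes a tensor of identical shape.
    tensor = tf.ones([8, 4], dtype=tf.float32) * float(hvd.rank() + 1)

    # The inputs are summed element-wise across ranks and the result is
    # scattered, so each rank receives a slice along the first dimension.
    reduced_slice = hvd.reducescatter(tensor, op=hvd.Sum)
    print("rank", hvd.rank(), "received shape", reduced_slice.shape)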

CMakeLists.txt

+5-4
@@ -110,8 +110,9 @@ set_gpu_op(HOROVOD_GPU_ALLREDUCE "MPI;NCCL;DDL")
 set_gpu_op(HOROVOD_GPU_ALLGATHER "MPI;NCCL")
 set_gpu_op(HOROVOD_GPU_BROADCAST "MPI;NCCL")
 set_gpu_op(HOROVOD_GPU_ALLTOALL "MPI;NCCL")
+set_gpu_op(HOROVOD_GPU_REDUCESCATTER "MPI;NCCL")
 
-foreach(VAR in ITEMS HOROVOD_GPU_ALLREDUCE HOROVOD_GPU_ALLGATHER HOROVOD_GPU_BROADCAST HOROVOD_GPU_ALLTOALL)
+foreach(VAR in ITEMS HOROVOD_GPU_ALLREDUCE HOROVOD_GPU_ALLGATHER HOROVOD_GPU_BROADCAST HOROVOD_GPU_ALLTOALL HOROVOD_GPU_REDUCESCATTER)
   if(DEFINED ${VAR})
     string(SUBSTRING ${${VAR}} 0 1 ${VAR})
     convert_to_ascii_dec(ASCII_DEC ${${VAR}})
@@ -197,7 +198,7 @@ macro(ADD_CUDA)
   endif()
 endmacro()
 
-if(DEFINED HOROVOD_GPU_ALLREDUCE OR DEFINED HOROVOD_GPU_ALLGATHER OR DEFINED HOROVOD_GPU_BROADCAST OR DEFINED HOROVOD_GPU_ALLTOALL)
+if(DEFINED HOROVOD_GPU_ALLREDUCE OR DEFINED HOROVOD_GPU_ALLGATHER OR DEFINED HOROVOD_GPU_BROADCAST OR DEFINED HOROVOD_GPU_ALLTOALL OR DEFINED HOROVOD_GPU_REDUCESCATTER)
   if(NOT DEFINED HOROVOD_GPU OR HOROVOD_GPU STREQUAL "CUDA")
     add_cuda()
   elseif(HOROVOD_GPU STREQUAL "ROCM")
@@ -215,7 +216,7 @@ if(DEFINED HOROVOD_GPU_ALLREDUCE OR DEFINED HOROVOD_GPU_ALLGATHER OR DEFINED HOR
 endif()
 
 # NCCL
-if(HOROVOD_GPU_ALLREDUCE STREQUAL "N" OR HOROVOD_GPU_ALLGATHER STREQUAL "N" OR HOROVOD_GPU_BROADCAST STREQUAL "N" OR HOROVOD_GPU_ALLTOALL STREQUAL "N")
+if(HOROVOD_GPU_ALLREDUCE STREQUAL "N" OR HOROVOD_GPU_ALLGATHER STREQUAL "N" OR HOROVOD_GPU_BROADCAST STREQUAL "N" OR HOROVOD_GPU_ALLTOALL STREQUAL "N" OR HOROVOD_GPU_REDUCESCATTER STREQUAL "N")
   if(HAVE_ROCM)
     find_package(rccl REQUIRED)
     include_directories(SYSTEM ${RCCL_INCLUDE_DIRS})
@@ -256,7 +257,7 @@ if(DEFINED CCL_ROOT)
 endif()
 
 set(HOROVOD_ALLOW_MIXED_GPU_IMPL $ENV{HOROVOD_ALLOW_MIXED_GPU_IMPL})
-if(HOROVOD_GPU_ALLREDUCE STREQUAL "N" AND (HOROVOD_GPU_ALLGATHER STREQUAL "M" OR HOROVOD_GPU_BROADCAST STREQUAL "M" OR HOROVOD_GPU_ALLTOALL STREQUAL "M") AND
+if(HOROVOD_GPU_ALLREDUCE STREQUAL "N" AND (HOROVOD_GPU_ALLGATHER STREQUAL "M" OR HOROVOD_GPU_BROADCAST STREQUAL "M" OR HOROVOD_GPU_ALLTOALL STREQUAL "M" OR HOROVOD_GPU_REDUCESCATTER STREQUAL "M") AND
     NOT HOROVOD_ALLOW_MIXED_GPU_IMPL STREQUAL "1")
   message(FATAL_ERROR "You should not mix NCCL and MPI GPU due to a possible deadlock.\n"
                       "If you are sure you want to mix them, set the "

docs/concepts.rst

+4
@@ -31,6 +31,10 @@ a training script on 4 servers, each having 4 GPUs. If we launched one copy of t
 .. image:: http://mpitutorial.com/tutorials/mpi-broadcast-and-collective-communication/broadcast_pattern.png
     :alt: Broadcast Illustration
 
+* *Reducescatter* is an operation that aggregates data among multiple processes and scatters the data across them. *Reducescatter* is used to average dense tensors then split them across processes. Here's an illustration from the `Nvidia developer guide <https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#reducescatter>`__:
+
+.. image:: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/_images/reducescatter.png
+    :alt: Reducescatter Illustration
 
 * *Alltoall* is an operation to exchange data between all processes. *Alltoall* may be useful to implement neural networks with advanced architectures that span multiple devices.
 
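
To make the Reducescatter semantics described in the concepts page concrete, here is a small simulation in plain Python/NumPy (illustrative only, not part of the commit): four simulated processes each contribute a tensor of the same shape, the tensors are summed element-wise, and each process keeps one slice of the result along the first dimension.

    # Simulated Reducescatter across 4 processes (no Horovod needed).
    import numpy as np

    num_procs = 4
    inputs = [np.full((num_procs, 3), rank + 1, dtype=np.float32)
              for rank in range(num_procs)]

    # Reduce: element-wise sum over all contributions.
    reduced = np.sum(inputs, axis=0)          # every element equals 1+2+3+4 = 10

    # Scatter: split along the first dimension; process r keeps slice r.
    outputs = np.array_split(reduced, num_procs, axis=0)
    for rank, out in enumerate(outputs):
        print(f"process {rank} keeps {out.tolist()}")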

docs/gpus.rst

+1-1
@@ -82,7 +82,7 @@ use it instead:
     $ HOROVOD_GPU_ALLREDUCE=MPI pip install --no-cache-dir horovod
 
 
-Additionally, if your MPI vendor's implementation supports *allgather* and *broadcast* operations on GPU, you can
+Additionally, if your MPI vendor's implementation supports *allgather*, *broadcast*, and *reducescatter* operations on GPU, you can
 configure Horovod to use them as well:
 
 .. code-block:: bash

docs/install.rst

+3-1
@@ -245,7 +245,9 @@ Possible values are given in curly brackets: {}.
 * ``HOROVOD_GPU_ALLREDUCE`` - {NCCL, MPI}. Framework to use for GPU tensor allreduce.
 * ``HOROVOD_GPU_ALLGATHER`` - {NCCL, MPI}. Framework to use for GPU tensor allgather.
 * ``HOROVOD_GPU_BROADCAST`` - {NCCL, MPI}. Framework to use for GPU tensor broadcast.
-* ``HOROVOD_ALLOW_MIXED_GPU_IMPL`` - {1}. Allow Horovod to install with NCCL allreduce and MPI GPU allgather / broadcast. Not recommended due to a possible deadlock.
+* ``HOROVOD_GPU_ALLTOALL`` - {NCCL, MPI}. Framework to use for GPU tensor alltoall.
+* ``HOROVOD_GPU_REDUCESCATTER`` - {NCCL, MPI}. Framework to use for GPU tensor reducescatter.
+* ``HOROVOD_ALLOW_MIXED_GPU_IMPL`` - {1}. Allow Horovod to install with NCCL allreduce and MPI GPU allgather / broadcast / alltoall / reducescatter. Not recommended due to a possible deadlock.
 * ``HOROVOD_CPU_OPERATIONS`` - {MPI, GLOO, CCL}. Framework to use for CPU tensor allreduce, allgather, and broadcast.
 * ``HOROVOD_CMAKE`` - path to the CMake binary used to build Horovod.
 * ``HOROVOD_WITH_TENSORFLOW`` - {1}. Require Horovod to install with TensorFlow support enabled.

horovod/_keras/__init__.py

+4
@@ -188,6 +188,10 @@ def broadcast(backend, value, root_rank, name):
     return _eval(backend, hvd.broadcast(tf.constant(value, name=name), root_rank))
 
 
+def reducescatter(backend, value, name, op):
+    return _eval(backend, hvd.reducescatter(tf.constant(value, name=name), op=op))
+
+
 def load_model(keras, wrap_optimizer, optimizer_modules, filepath, custom_optimizers, custom_objects):
     horovod_objects = {
         subclass.__name__.lower(): wrap_optimizer(subclass)

horovod/common/common.cc

+2-2
@@ -101,7 +101,7 @@ int TensorShape::dims() const {
 
 int64_t TensorShape::dim_size(int idx) const {
   assert(idx >= 0);
-  assert(idx < shape_.size());
+  assert(idx < (int)shape_.size());
   return shape_[idx];
 }
 
@@ -165,7 +165,7 @@ void parse_and_set_affinity(const char* affinity, int local_size, int local_rank
     auto core_id_str = strsep(&tmp, ",");
     errno = 0;
     auto core_id = std::strtol(core_id_str, &endptr, 10);
-    if (errno == ERANGE && (core_id == LONG_MAX || core_id == LONG_MIN)
+    if ((errno == ERANGE && (core_id == LONG_MAX || core_id == LONG_MIN))
        || (errno != 0 && core_id == 0)){
      LOG(ERROR) << "Core ID value is invalid in " << HOROVOD_THREAD_AFFINITY
                 << "=" << affinity;

horovod/common/common.h

+2
@@ -83,6 +83,7 @@ namespace common {
 #define MEMCPY_IN_SHARED_BUFFER "MEMCPY_IN_SHARED_BUFFER"
 #define MPI_ALLREDUCE "MPI_ALLREDUCE"
 #define MPI_ADASUM_ALLREDUCE "MPI_ADASUM_ALLREDUCE"
+#define MPI_REDUCESCATTER "MPI_REDUCESCATTER"
 #define MEMCPY_OUT_HOST_BUFFER "MEMCPY_OUT_HOST_BUFFER"
 #define NCCL_ALLREDUCE "NCCL_ALLREDUCE"
 #define MEMCPY_OUT_FUSION_BUFFER "MEMCPY_OUT_FUSION_BUFFER"
@@ -102,6 +103,7 @@ namespace common {
 #define GLOO_ALLREDUCE "GLOO_ALLREDUCE"
 #define GLOO_ALLGATHER "GLOO_ALLGATHER"
 #define GLOO_BCAST "GLOO_BCAST"
+#define GLOO_REDUCESCATTER "GLOO_REDUCESCATTER"
 #define HOROVOD_ELASTIC "HOROVOD_ELASTIC"
 
 // Horovod knobs.

horovod/common/controller.cc

+25-4
@@ -536,11 +536,12 @@ Response Controller::ConstructResponse(const std::string& name, int joined_size)
     }
   }
 
-  // If we are doing an allreduce or broadcast, check that all tensor shapes are
-  // identical.
+  // If we are doing an allreduce, broadcast, or reducescatter check that all
+  // tensor shapes are identical.
   if (message_type == Request::ALLREDUCE ||
       message_type == Request::ADASUM ||
-      message_type == Request::BROADCAST) {
+      message_type == Request::BROADCAST ||
+      message_type == Request::REDUCESCATTER) {
     TensorShape tensor_shape;
     for (auto dim : requests[0].tensor_shape()) {
       tensor_shape.AddDim(dim);
@@ -673,6 +674,19 @@ Response Controller::ConstructResponse(const std::string& name, int joined_size)
     }
   }
 
+  if (message_type == Request::REDUCESCATTER) {
+    if (joined_size > 0) {
+      error = true;
+      error_message_stream << "Reducescatter is not supported with Join at this time.";
+    }
+
+    TensorShape tensor_shape;
+    for (auto dim : requests[0].tensor_shape()) {
+      tensor_shape.AddDim(dim);
+    }
+    tensor_sizes.push_back(tensor_shape.num_elements());
+  }
+
   if (message_type == Request::ALLREDUCE || message_type == Request::ADASUM) {
     TensorShape tensor_shape;
     for (auto dim : requests[0].tensor_shape()) {
@@ -756,6 +770,12 @@ Response Controller::ConstructResponse(const std::string& name, int joined_size)
     response.set_response_type(Response::BROADCAST);
   } else if (message_type == Request::ALLTOALL) {
     response.set_response_type(Response::ALLTOALL);
+  } else if (message_type == Request::REDUCESCATTER) {
+    response.set_response_type(Response::REDUCESCATTER);
+    for (auto dim : tensor_sizes) {
+      response.add_tensor_size(dim);
+    }
+    response.set_tensor_type(data_type);
   } else if (message_type == Request::ADASUM) {
     response.set_response_type(Response::ADASUM);
     for (auto dim : tensor_sizes) {
@@ -815,7 +835,8 @@ void Controller::FuseResponses(std::deque<Response>& responses,
     responses.pop_front();
     int64_t tensor_size = 0;
     if (response.response_type() == Response::ResponseType::ALLREDUCE ||
-        response.response_type() == Response::ResponseType::ADASUM) {
+        response.response_type() == Response::ResponseType::ADASUM ||
+        response.response_type() == Response::ResponseType::REDUCESCATTER) {
       // Attempt to add more responses to this fused response.
 
       tensor_size = response.tensor_sizes()[0] * GetTypeSize(response.tensor_type());
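
A practical consequence of the `ConstructResponse` change above is that every rank in the process set must submit a tensor of identical shape to a reducescatter; a rank-dependent shape would fail coordination. A minimal sketch of this constraint (TensorFlow frontend assumed, not part of this commit):

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    # OK: the shape [16, 8] is the same on every rank.
    same_shape = tf.random.uniform([16, 8])
    slice_of_sum = hvd.reducescatter(same_shape, op=hvd.Sum)

    # Not OK: a shape that differs per rank (e.g. [16 + hvd.rank(), 8]) would be
    # rejected by the shape check added to ConstructResponse above.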

horovod/common/message.cc

+6
@@ -102,6 +102,9 @@ const std::string& Request::RequestType_Name(RequestType value) {
     case RequestType::BROADCAST:
       static const std::string broadcast("BROADCAST");
       return broadcast;
+    case RequestType::REDUCESCATTER:
+      static const std::string reducescatter("REDUCESCATTER");
+      return reducescatter;
     case RequestType::JOIN:
       static const std::string join("JOIN");
       return join;
@@ -294,6 +297,9 @@ const std::string& Response::ResponseType_Name(ResponseType value) {
     case ResponseType::BROADCAST:
       static const std::string broadcast("BROADCAST");
       return broadcast;
+    case ResponseType::REDUCESCATTER:
+      static const std::string reducescatter("REDUCESCATTER");
+      return reducescatter;
     case ResponseType::JOIN:
       static const std::string join("JOIN");
       return join;

horovod/common/message.h

+17-3
@@ -50,10 +50,16 @@ std::size_t DataType_Size(DataType value);
 class Request {
 public:
   enum RequestType {
-    ALLREDUCE = 0, ALLGATHER = 1, BROADCAST = 2, JOIN = 3, ADASUM = 4, ALLTOALL = 5, BARRIER = 6
+    ALLREDUCE = 0,
+    ALLGATHER = 1,
+    BROADCAST = 2,
+    JOIN = 3,
+    ADASUM = 4,
+    ALLTOALL = 5,
+    BARRIER = 6,
+    REDUCESCATTER = 7
   };
 
-
   static const std::string& RequestType_Name(RequestType value);
 
   // The request rank is necessary to create a consistent ordering of results,
@@ -153,7 +159,15 @@ class RequestList {
 class Response {
 public:
   enum ResponseType {
-    ALLREDUCE = 0, ALLGATHER = 1, BROADCAST = 2, JOIN = 3, ADASUM = 4, ALLTOALL= 5, BARRIER=6, ERROR = 7
+    ALLREDUCE = 0,
+    ALLGATHER = 1,
+    BROADCAST = 2,
+    JOIN = 3,
+    ADASUM = 4,
+    ALLTOALL = 5,
+    BARRIER = 6,
+    REDUCESCATTER = 7,
+    ERROR = 8
   };
 
   static const std::string& ResponseType_Name(ResponseType value);

horovod/common/nvtx_op_range.h

+1
@@ -15,6 +15,7 @@ enum class RegisteredNvtxOp {
   HorovodAllgather,
   HorovodBroadcast,
   HorovodAlltoall,
+  HorovodReducescatter,
   // Insert new enum values above this line
   END,
 };

horovod/common/operations.cc

+82-1
@@ -148,6 +148,7 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
   std::vector<std::shared_ptr<AllreduceOp>> allreduce_ops;
   std::vector<std::shared_ptr<AllgatherOp>> allgather_ops;
   std::vector<std::shared_ptr<BroadcastOp>> broadcast_ops;
+  std::vector<std::shared_ptr<ReducescatterOp>> reducescatter_ops;
   std::vector<std::shared_ptr<AllreduceOp>> adasum_ops;
   std::vector<std::shared_ptr<AlltoallOp>> alltoall_ops;
 
@@ -180,6 +181,11 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
     alltoall_ops.push_back(
         std::shared_ptr<AlltoallOp>(new MPI_GPUAlltoall(&gpu_context, &state)));
 #endif
+
+#if HOROVOD_GPU_REDUCESCATTER == 'M'
+    reducescatter_ops.push_back(std::shared_ptr<ReducescatterOp>(
+        new MPI_GPUReduceScatter(&gpu_context, &state)));
+#endif
   }
 #endif
 
@@ -198,6 +204,11 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
       new NCCLAllgather(&nccl_context, &gpu_context, &state)));
 #endif
 
+#if HAVE_NCCL && HOROVOD_GPU_REDUCESCATTER == 'N'
+  reducescatter_ops.push_back(std::shared_ptr<ReducescatterOp>(
+      new NCCLReducescatter(&nccl_context, &gpu_context, &state)));
+#endif
+
 #if HAVE_NCCL && HOROVOD_GPU_ALLTOALL == 'N'
   alltoall_ops.push_back(std::shared_ptr<AlltoallOp>(
       new NCCLAlltoall(&nccl_context, &gpu_context, &state)));
@@ -213,6 +224,8 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
         std::shared_ptr<BroadcastOp>(new GlooBroadcast(&state)));
     alltoall_ops.push_back(
         std::shared_ptr<AlltoallOp>(new GlooAlltoall(&state)));
+    reducescatter_ops.push_back(
+        std::shared_ptr<ReducescatterOp>(new GlooReducescatter(&state)));
   }
 #endif
 
@@ -240,6 +253,8 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
         std::shared_ptr<BroadcastOp>(new MPIBroadcast(&state)));
     alltoall_ops.push_back(
         std::shared_ptr<AlltoallOp>(new MPIAlltoall(&state)));
+    reducescatter_ops.push_back(
+        std::shared_ptr<ReducescatterOp>(new MPIReducescatter(&state)));
   }
 #endif
 
@@ -249,7 +264,8 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
 
   return new OperationManager(&state.parameter_manager, allreduce_ops,
                               allgather_ops, broadcast_ops, alltoall_ops,
-                              join_op, adasum_ops, barrier_op, error_op);
+                              reducescatter_ops, join_op, adasum_ops,
+                              barrier_op, error_op);
 }
 
 // Process a Response by doing a reduction, a gather, a broadcast, or
@@ -1637,6 +1653,71 @@ Status EnqueueTensorBroadcast(std::shared_ptr<OpContext> context,
   return status;
 }
 
+// Contexts and controller must be initialized and the background thread
+// must be running before this function is called.
+Status EnqueueTensorReducescatter(std::shared_ptr<OpContext> context,
+                                  std::shared_ptr<Tensor> tensor,
+                                  ReadyEventList ready_event_list,
+                                  const std::string& name, const int device,
+                                  StatusCallback callback, ReduceOp reduce_op,
+                                  int32_t process_set_id) {
+  if (horovod_global.cpu_operation == LibType::CCL && device == CPU_DEVICE_ID) {
+    return Status::InvalidArgument(
+        "Reducescatter is not supported yet with oneCCL operations.");
+  }
+  if (!horovod_global.process_set_table.Contains(process_set_id)) {
+    return Status::InvalidArgument(
+        "Reducescatter: Process set provided does not "
+        "exist, or has not been registered.");
+  }
+  if (reduce_op != ReduceOp::SUM) {
+    // Note: AVERAGE is supported by enqueuing SUM and performing divide at the
+    // framework level.
+    LOG(ERROR, horovod_global.global_controller->GetRank())
+        << "Reducescatter currently only supports SUM.";
+    return Status::Aborted("Reducescatter currently only supports SUM.");
+  }
+  if (horovod_global.shut_down) {
+    return SHUT_DOWN_ERROR;
+  }
+  auto& process_set = horovod_global.process_set_table.Get(process_set_id);
+
+  if (!process_set.IsCurrentProcessIncluded()) {
+    return Status::InvalidArgument(
+        "Reducescatter: Rank " +
+        std::to_string(horovod_global.global_controller->GetRank()) +
+        " is not a member of the provided process set.");
+  }
+
+  Request message;
+  message.set_request_rank(process_set.controller->GetRank());
+  message.set_tensor_name(name);
+  message.set_tensor_type(tensor->dtype());
+  message.set_device(device);
+  message.set_request_type(Request::REDUCESCATTER);
+  for (int i = 0; i < tensor->shape().dims(); ++i) {
+    message.add_tensor_shape((int64_t)tensor->shape().dim_size(i));
+  }
+
+  TensorTableEntry e;
+  e.tensor_name = name;
+  e.context = context;
+  e.tensor = tensor;
+  e.process_set_id = process_set_id;
+  e.ready_event_list = ready_event_list;
+  e.device = device;
+  e.callback = callback;
+  e.nvtx_op_range.Start(RegisteredNvtxOp::HorovodReducescatter,
+                        e.tensor->size());
+
+  Status status = process_set.tensor_queue.AddToTensorQueue(e, message);
+  if (status.ok()) {
+    LOG(TRACE, horovod_global.global_controller->GetRank())
+        << "Enqueued " << name;
+  }
+  return status;
+}
+
 // Contexts and controller must be initialized and the background thread
 // must be running before this function is called.
 Status EnqueueTensorAlltoall(std::shared_ptr<OpContext> context,
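
The `EnqueueTensorReducescatter` function above only accepts `ReduceOp::SUM`; per its inline comment, an average is meant to be realized at the framework level by dividing the summed result. A hedged sketch of what that framework-level division might look like (TensorFlow frontend assumed, not part of this commit):

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    tensor = tf.random.uniform([32, 16])

    # Enqueue a SUM reducescatter, then divide by the number of participating
    # ranks to obtain the average of each rank's slice.
    summed_slice = hvd.reducescatter(tensor, op=hvd.Sum)
    averaged_slice = summed_slice / hvd.size()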
