KepingYan
diff --git a/horovod/common/controller.cc b/horovod/common/controller.cc
+6-1
diff --git a/horovod/common/message.cc b/horovod/common/message.cc
+16-2
diff --git a/horovod/common/message.h b/horovod/common/message.h
+9-4
diff --git a/horovod/common/operations.cc b/horovod/common/operations.cc
+4-3
diff --git a/horovod/common/operations.h b/horovod/common/operations.h
+1
diff --git a/horovod/common/ops/collective_operations.cc b/horovod/common/ops/collective_operations.cc
+5-1
diff --git a/horovod/common/process_set.h b/horovod/common/process_set.h
+3
diff --git a/horovod/common/tensor_queue.cc b/horovod/common/tensor_queue.cc
+12
diff --git a/horovod/common/wire/message.fbs b/horovod/common/wire/message.fbs
+2-1
@@ -261,6 +261,7 @@ ResponseList Controller::ComputeResponseList(bool this_process_requested_shutdow
 
         if (message.request_type() == Request::JOIN) {
           process_set.joined_size++;
+          process_set.last_joined_rank = global_ranks_[rank_];
           continue;
         }
 
@@ -285,6 +286,7 @@ ResponseList Controller::ComputeResponseList(bool this_process_requested_shutdow
 
           if (received_message.request_type() == Request::JOIN) {
             process_set.joined_size++;
+            process_set.last_joined_rank = global_ranks_[i];
             continue;
           }
 
@@ -401,12 +403,15 @@ ResponseList Controller::ComputeResponseList(bool this_process_requested_shutdow
         responses.push_back(std::move(response));
       }
       if (process_set.joined_size == size_) {
-        // All ranks did Join(). Send the response, reset joined size.
+        // All ranks did Join(). Send the response, reset joined_size and
+        // last_joined_rank.
         Response join_response;
         join_response.set_response_type(Response::JOIN);
         join_response.add_tensor_name(JOIN_TENSOR_NAME);
+        join_response.set_last_joined_rank(process_set.last_joined_rank);
         responses.push_back(std::move(join_response));
         process_set.joined_size = 0;
+        process_set.last_joined_rank = -1;
       }
       FuseResponses(responses, state, response_list);
       response_list.set_shutdown(should_shut_down);
 
@@ -391,9 +391,21 @@ double Response::prescale_factor() const { return prescale_factor_; };
 
 double Response::postscale_factor() const { return postscale_factor_; };
 
-void Response::set_prescale_factor(const double prescale_factor) { prescale_factor_ = prescale_factor; };
+void Response::set_prescale_factor(const double prescale_factor) {
+  prescale_factor_ = prescale_factor; // stored as given, no range check
+}
 
-void Response::set_postscale_factor(const double postscale_factor) { postscale_factor_ = postscale_factor; };
+void Response::set_postscale_factor(const double postscale_factor) {
+  postscale_factor_ = postscale_factor; // stored as given, no range check
+}
+
+int Response::last_joined_rank() const {
+  return last_joined_rank_; // -1 until set_last_joined_rank() overwrites it
+}
+
+void Response::set_last_joined_rank(int value) {
+  last_joined_rank_ = value; // stored as given; -1 is the initial value
+}
 
 void Response_ParseFromWire(Response& response,
                             const wire::Response* obj) {
@@ -409,6 +421,7 @@ void Response_ParseFromWire(Response& response,
                                                  obj->tensor_sizes()->end()));
   response.set_prescale_factor(obj->prescale_factor());
   response.set_postscale_factor(obj->postscale_factor());
+  response.set_last_joined_rank(obj->last_joined_rank());
 }
 
 void Response::ParseFromBytes(Response& response, const uint8_t* input) {
@@ -437,6 +450,7 @@ void Response_SerializeToWire(const Response& response,
   response_builder.add_tensor_sizes(tensor_sizes_wire);
   response_builder.add_prescale_factor(response.prescale_factor());
   response_builder.add_postscale_factor(response.postscale_factor());
+  response_builder.add_last_joined_rank(response.last_joined_rank());
   obj = response_builder.Finish();
 }
 
 
@@ -146,10 +146,10 @@ class RequestList {
 };
 
 // A Response is a message sent from the coordinator (rank zero) to a rank
-// greater than zero, informing the rank of an operation should be performed
-// now. If the operation requested would result in an error (for example, due
-// to a type or shape mismatch), then the Response can contain an error and
-// an error message instead.
+// greater than zero, informing the rank of an operation that should be
+// performed now. If the requested operation would result in an error (for
+// example, due to a type or shape mismatch), then the Response can contain an
+// error and an error message instead.
 class Response {
 public:
   enum ResponseType {
@@ -208,6 +208,10 @@ class Response {
 
   void set_postscale_factor(double postscale_factor);
 
+  int last_joined_rank() const;
+
+  void set_last_joined_rank(int value);
+
   static void ParseFromBytes(Response& response, const uint8_t* input);
 
   static void SerializeToString(const Response& response,
@@ -222,6 +226,7 @@ class Response {
   std::vector<int64_t> tensor_sizes_;
   double prescale_factor_ = 1.0;
   double postscale_factor_ = 1.0;
+  int last_joined_rank_ = -1;
 };
 
 class ResponseList {
 
@@ -257,10 +257,9 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
 void PerformOperation(Response response, ProcessSet& process_set) {
   std::vector<TensorTableEntry> entries;
   auto& timeline = horovod_global.timeline;
+  process_set.tensor_queue.GetTensorEntriesFromResponse(response, entries,
+                                                        process_set.joined);
   if (response.response_type() != Response::JOIN) {
-    process_set.tensor_queue.GetTensorEntriesFromResponse(response, entries,
-                                                          process_set.joined);
-
     for (auto& e : entries) {
       timeline.Start(e.tensor_name, response.response_type(), e.tensor->size());
     }
@@ -1725,6 +1724,7 @@ Status EnqueueTensorAlltoall(std::shared_ptr<OpContext> context,
 // Contexts and controller must be initialized and the background thread
 // must be running before this function is called.
 Status EnqueueJoin(std::shared_ptr<OpContext> context,
+                   std::shared_ptr<Tensor> output_last_joined_rank,
                    ReadyEventList ready_event_list,
                    const std::string& name, const int device,
                    StatusCallback callback,
@@ -1739,6 +1739,7 @@ Status EnqueueJoin(std::shared_ptr<OpContext> context,
   TensorTableEntry e;
   e.tensor_name = name;
   e.context = context;
+  e.output = output_last_joined_rank;
   e.process_set_id = process_set_id;
   e.ready_event_list = ready_event_list;
   e.device = device;
 
@@ -227,6 +227,7 @@ Status EnqueueTensorAlltoall(std::shared_ptr<OpContext> context,
                              int32_t process_set_id = 0);
 
 Status EnqueueJoin(std::shared_ptr<OpContext> context,
+                   std::shared_ptr<Tensor> output_last_joined_rank,
                    ReadyEventList ready_event_list,
                    const std::string& name, int device,
                    StatusCallback callback,
 
@@ -300,10 +300,14 @@ Status JoinOp::Execute(std::vector<TensorTableEntry>& entries,
                        const Response& response, ProcessSet& process_set) {
   WaitForData(entries);
 
-  assert(entries.empty());
+  assert(entries.size() == 1); // exactly one entry: the JOIN tensor entry
+  const auto& e = entries[0];  // bind by reference; copying is unnecessary
+  auto output_ptr = (int*) e.output->data();
+  *output_ptr = response.last_joined_rank(); // write rank into e.output
   if (process_set.joined) {
     process_set.tensor_queue.RemoveJoinTensor();
     process_set.joined = false;
+    process_set.last_joined_rank = -1;
   }
   return Status::OK();
 }
 
@@ -44,6 +44,9 @@ struct ProcessSet {
   // Number of ranks that did Join()
   int joined_size = 0;
 
+  // Last global rank that did Join()
+  int32_t last_joined_rank = -1;
+
   // If a rank is Joined, AllReduce uses temporary 0 tensors for it.
   bool joined = false;
 
 
@@ -90,6 +90,18 @@ void TensorQueue::GetTensorEntriesFromResponse(
   {
     // Lock on the tensor table.
     std::lock_guard<std::mutex> guard(mutex_);
+    if (response.response_type() == Response::JOIN) {
+      assert(response.tensor_names().size() == 1);
+      assert(response.tensor_names()[0] == JOIN_TENSOR_NAME);
+      auto iter = tensor_table_.find(JOIN_TENSOR_NAME);
+      assert(iter != tensor_table_.end());
+
+      entries.push_back(std::move(iter->second));
+
+      // The tensor table will be cleared of the join tensor later in
+      // RemoveJoinTensor().
+      return;
+    }
     int64_t i = 0;
     for (auto& name : response.tensor_names()) {
       assert(response.response_type() == Response::ALLREDUCE ||
 
@@ -62,7 +62,6 @@ table Request {
     // Prescale and postscale factors
     prescale_factor:double;
     postscale_factor:double;
-
 }
 table RequestList {
     requests:[Request];
@@ -110,6 +109,8 @@ table Response {
     // Prescale and postscale factors
     prescale_factor:double;
     postscale_factor:double;
+
+    last_joined_rank:int;
 }
 table ResponseList {
     responses:[Response];