Commit 3cc6f87

check size

1 parent b927121 commit 3cc6f87

File tree

3 files changed: +25 -19 lines changed

client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala
+17 -11

@@ -325,7 +325,9 @@ class ReducePartitionCommitHandler(
         StatusCode.SUCCESS,
         reducerFileGroupsMap.getOrDefault(shuffleId, JavaUtils.newConcurrentHashMap()),
         getMapperAttempts(shuffleId))
-      if (shouldBroadcastGetReducerFileGroup(response)) {
+
+      // only check whether broadcast enabled for the UTs
+      if (conf.getReducerFileGroupBroadcastEnabled) {
         response = broadcastGetReducerFileGroup(shuffleId, response)
       }
 
@@ -335,7 +337,7 @@ class ReducePartitionCommitHandler(
           shuffleId,
           new Callable[ByteBuffer]() {
             override def call(): ByteBuffer = {
-              var response = GetReducerFileGroupResponse(
+              val returnedMsg = GetReducerFileGroupResponse(
                 StatusCode.SUCCESS,
                 reducerFileGroupsMap.getOrDefault(shuffleId, JavaUtils.newConcurrentHashMap()),
                 getMapperAttempts(shuffleId),
@@ -344,23 +346,27 @@ class ReducePartitionCommitHandler(
                 shuffleId,
                 new util.HashMap[String, util.Set[PushFailedBatch]]()))
 
-              if (shouldBroadcastGetReducerFileGroup(response)) {
-                response = broadcastGetReducerFileGroup(shuffleId, response)
+              val serializedMsg =
+                context.asInstanceOf[RemoteNettyRpcCallContext].nettyEnv.serialize(returnedMsg)
+
+              if (conf.getReducerFileGroupBroadcastEnabled &&
+                serializedMsg.capacity() >= conf.getReducerFileGroupBroadcastMiniSize) {
+                val broadcastMsg = broadcastGetReducerFileGroup(shuffleId, returnedMsg)
+                if (broadcastMsg != returnedMsg) {
+                  context.asInstanceOf[RemoteNettyRpcCallContext].nettyEnv.serialize(broadcastMsg)
+                } else {
+                  serializedMsg
+                }
+              } else {
+                serializedMsg
               }
-
-              context.asInstanceOf[RemoteNettyRpcCallContext].nettyEnv.serialize(response)
             }
           })
         context.asInstanceOf[RemoteNettyRpcCallContext].callback.onSuccess(cachedMsg)
       }
     }
   }
 
-  private def shouldBroadcastGetReducerFileGroup(response: GetReducerFileGroupResponse): Boolean = {
-    conf.getReducerFileGroupBroadcastEnabled &&
-      response.partitionIds.size() >= conf.getReducerFileGroupBroadcastMiniPartitions
-  }
-
   private def broadcastGetReducerFileGroup(
       shuffleId: Int,
       response: GetReducerFileGroupResponse): GetReducerFileGroupResponse = {
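For readers skimming the diff: the trigger for broadcasting now keys off the serialized response size rather than the partition count, and the response is serialized once up front so the bytes can be reused when broadcast is skipped or falls back. The old shouldBroadcastGetReducerFileGroup helper decided before serialization using a partition-count proxy; gating on the serialized payload measures the actual size, which is presumably why the helper was inlined. Below is a minimal, self-contained Scala sketch of that flow; serialize, tryBroadcast, broadcastEnabled, and broadcastMiniSize are hypothetical stand-ins for the Celeborn internals, not the project's actual API.

import java.nio.ByteBuffer

// A sketch of the size-gated broadcast decision introduced by this commit.
// All names below are hypothetical stand-ins, not Celeborn's actual API.
object SizeGatedBroadcastSketch {
  val broadcastEnabled: Boolean = true
  // Mirrors the 512k default of ...broadcast.miniSize, expressed in bytes.
  val broadcastMiniSize: Long = 512L * 1024

  // Stand-in for nettyEnv.serialize(msg).
  def serialize(msg: String): ByteBuffer = ByteBuffer.wrap(msg.getBytes("UTF-8"))

  // Stand-in for broadcastGetReducerFileGroup: returns a small "pointer"
  // message on success, or the original message unchanged on failure.
  def tryBroadcast(msg: String): String =
    if (msg.nonEmpty) s"broadcast-id(${msg.length})" else msg

  def encodeResponse(msg: String): ByteBuffer = {
    // Serialize once up front; the size check needs the real byte count.
    val serializedMsg = serialize(msg)
    if (broadcastEnabled && serializedMsg.capacity() >= broadcastMiniSize) {
      val broadcastMsg = tryBroadcast(msg)
      // Only re-serialize when broadcasting actually replaced the message;
      // otherwise reuse the bytes we already have.
      if (broadcastMsg != msg) serialize(broadcastMsg) else serializedMsg
    } else {
      serializedMsg
    }
  }

  def main(args: Array[String]): Unit = {
    println(encodeResponse("small").capacity())            // tiny: raw bytes
    println(encodeResponse("x" * (600 * 1024)).capacity()) // large: broadcast pointer
  }
}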

common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
+7 -7

@@ -1054,8 +1054,8 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se
     get(CLIENT_PUSH_DYNAMIC_WRITE_MODE_PARTITION_NUM_THRESHOLD)
   def getReducerFileGroupBroadcastEnabled =
     get(CLIENT_SHUFFLE_GET_REDUCER_FILE_GROUP_BROADCAST_ENABLED)
-  def getReducerFileGroupBroadcastMiniPartitions =
-    get(CLIENT_SHUFFLE_GET_REDUCER_FILE_GROUP_BROADCAST_MINI_PARTITIONS)
+  def getReducerFileGroupBroadcastMiniSize =
+    get(CLIENT_SHUFFLE_GET_REDUCER_FILE_GROUP_BROADCAST_MINI_SIZE)
   def shufflePartitionType: PartitionType = PartitionType.valueOf(get(SHUFFLE_PARTITION_TYPE))
   def shuffleRangeReadFilterEnabled: Boolean = get(SHUFFLE_RANGE_READ_FILTER_ENABLED)
   def shuffleForceFallbackEnabled: Boolean = get(SPARK_SHUFFLE_FORCE_FALLBACK_ENABLED)
@@ -5225,13 +5225,13 @@ object CelebornConf extends Logging {
       .booleanConf
       .createWithDefault(false)
 
-  val CLIENT_SHUFFLE_GET_REDUCER_FILE_GROUP_BROADCAST_MINI_PARTITIONS =
-    buildConf("celeborn.client.spark.shuffle.getReducerFileGroup.broadcast.miniPartitions")
+  val CLIENT_SHUFFLE_GET_REDUCER_FILE_GROUP_BROADCAST_MINI_SIZE =
+    buildConf("celeborn.client.spark.shuffle.getReducerFileGroup.broadcast.miniSize")
       .categories("client")
-      .doc("The mini partitions size at which to broadcast the GetReducerFileGroupResponse to the executors.")
+      .doc("The size at which we use Broadcast to send the GetReducerFileGroupResponse to the executors.")
       .version("0.6.0")
-      .intConf
-      .createWithDefault(10000)
+      .bytesConf(ByteUnit.BYTE)
+      .createWithDefaultString("512k")
 
   val SPARK_SHUFFLE_WRITER_MODE: ConfigEntry[String] =
     buildConf("celeborn.client.spark.shuffle.writer")

docs/configuration/client.md
+1 -1

@@ -123,7 +123,7 @@ license: |
 | celeborn.client.spark.shuffle.fallback.policy | AUTO | false | Celeborn supports the following kinds of fallback policies. 1. ALWAYS: always use spark built-in shuffle implementation; 2. AUTO: prefer to use celeborn shuffle implementation, and fall back to spark built-in shuffle implementation based on certain factors, e.g. availability of enough workers and quota, shuffle partition number; 3. NEVER: always use celeborn shuffle implementation, and fail fast when it is concluded that fallback is required based on the factors above. | 0.5.0 | |
 | celeborn.client.spark.shuffle.forceFallback.enabled | false | false | Always use spark built-in shuffle implementation. This configuration is deprecated, consider configuring `celeborn.client.spark.shuffle.fallback.policy` instead. | 0.3.0 | celeborn.shuffle.forceFallback.enabled |
 | celeborn.client.spark.shuffle.getReducerFileGroup.broadcast.enabled | false | false | Whether to leverage the Spark broadcast mechanism to send the GetReducerFileGroupResponse. If the response size and the Spark executor count are both large, the Spark driver network may be exhausted because each executor pulls the response from the driver. Broadcasting the GetReducerFileGroupResponse prevents the driver from becoming the bottleneck in sending out multiple copies of the response (one per executor). | 0.6.0 | |
-| celeborn.client.spark.shuffle.getReducerFileGroup.broadcast.miniPartitions | 10000 | false | The mini partitions size at which to broadcast the GetReducerFileGroupResponse to the executors. | 0.6.0 | |
+| celeborn.client.spark.shuffle.getReducerFileGroup.broadcast.miniSize | 512k | false | The size at which we use Broadcast to send the GetReducerFileGroupResponse to the executors. | 0.6.0 | |
 | celeborn.client.spark.shuffle.writer | HASH | false | Celeborn supports the following kinds of shuffle writers. 1. hash: hash-based shuffle writer works fine when shuffle partition count is normal; 2. sort: sort-based shuffle writer works fine when memory pressure is high or shuffle partition count is huge. This configuration only takes effect when celeborn.client.spark.push.dynamicWriteMode.enabled is false. | 0.3.0 | celeborn.shuffle.writer |
 | celeborn.client.spark.stageRerun.enabled | true | false | Whether to enable stage rerun. If true, client throws FetchFailedException instead of CelebornIOException. | 0.4.0 | celeborn.client.spark.fetch.throwsFetchFailure |
 | celeborn.identity.provider | org.apache.celeborn.common.identity.DefaultIdentityProvider | false | IdentityProvider class name. Default class is `org.apache.celeborn.common.identity.DefaultIdentityProvider`. Optional values: org.apache.celeborn.common.identity.HadoopBasedIdentityProvider user name will be obtained by UserGroupInformation.getUserName; org.apache.celeborn.common.identity.DefaultIdentityProvider user name and tenant id are default values or user-specific values. | 0.6.0 | celeborn.quota.identity.provider |
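For completeness, a hypothetical example of how a Spark job might opt in to the renamed setting. It assumes the usual convention of prefixing Celeborn client keys with spark. when submitting through Spark, and the 1m value is only an illustration, not a recommendation:

import org.apache.spark.SparkConf

object CelebornBroadcastConfExample {
  // Hypothetical setup: both keys below come from the table above; the
  // "spark." prefix is the assumed pass-through convention for Spark jobs.
  val conf: SparkConf = new SparkConf()
    .set("spark.celeborn.client.spark.shuffle.getReducerFileGroup.broadcast.enabled", "true")
    // Broadcast only responses whose serialized size is at least 1 MiB
    // (an assumed example value; the default is 512k).
    .set("spark.celeborn.client.spark.shuffle.getReducerFileGroup.broadcast.miniSize", "1m")
}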
