
Commit 47a1028

add rampup batch size
1 parent 151fa9f commit 47a1028

6 files changed: +276, -36 lines changed

src/MaxText/configs/base.yml

Lines changed: 12 additions & 0 deletions
@@ -507,6 +507,18 @@ packing: True
 num_epoch: 1 # only grain and tfds pipeline supports num_epoch > 1
 generate_padding_batch_train: False
 generate_padding_batch_eval: False
+# Ramp-up batch size, similar to Megatron-LM, see
+# https://github.com/NVIDIA/Megatron-LM/blob/2a01637aa54ccdaf7ea9afc1f1b80f58c53d7f3c/megatron/core/num_microbatches_calculator.py#L233-L237
+# The ramp-up proceeds in stages from `per_device_batch_size_start` up to
+# the final `per_device_batch_size`. For a clean ramp-up, the total range
+# (`per_device_batch_size` - `per_device_batch_size_start`)
+# should be evenly divisible by `per_device_batch_size_increment`.
+enable_rampup_batch_size: False
+per_device_batch_size_start: 4.0
+per_device_batch_size_increment: 2.0
+# The target number of training samples to process during the ramp-up phase.
+# There is no strict rule for this value; it only needs to be positive.
+global_rampup_samples: 500

 # direct preference optimization (DPO)
 use_dpo: False
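As a worked example of the divisibility note above: with the defaults shown here and a hypothetical final `per_device_batch_size` of 8.0, the range 8.0 - 4.0 = 4.0 is evenly divisible by the 2.0 increment, so the ramp-up runs in two stages (per-device batch sizes 4.0 and then 6.0), and each stage targets roughly 250 of the 500 `global_rampup_samples` before training settles at the final batch size.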

src/MaxText/data_loader.py

Lines changed: 102 additions & 8 deletions
@@ -20,7 +20,6 @@
 from jax.experimental import checkify

 from MaxText import exceptions
-from MaxText import sharding
 from MaxText.utils.goodput_utils import (
     GoodputEvent,
     maybe_record_goodput,
@@ -37,7 +36,6 @@ def __init__(self, config, mesh, data_iterator, goodput_recorder):
     self.goodput_recorder = goodput_recorder
     self.data_iterator = data_iterator
     self.last_batch = None
-    self.input_data_shardings = sharding.get_input_data_sharding(config, mesh)

   def load_next_batch(self):
     """Loads the next batch. Can keep reusing the same batch for performance reasons."""
@@ -47,12 +45,7 @@ def load_next_batch(self):
           example_batch = self.last_batch
         else:
           example_batch = next(self.data_iterator)
-          # Reshard data from loaded sharding to performant activation sharding
-          self.last_batch = sharding.maybe_shard_with_name(
-              example_batch,
-              self.input_data_shardings,
-              self.config.shard_mode,
-          )
+          self.last_batch = example_batch
         self.check_example_batch()
       except Exception as e: # pylint: disable=broad-except
         if isinstance(e, StopIteration):
@@ -68,3 +61,104 @@ def check_example_batch(self):
     # pylint: disable=not-callable
     err, _ = jax.jit(jittable_f)(self.last_batch["inputs"][: self.config.global_batch_size_to_train_on, :])
     err.throw()
+
+
+class RampUpDataLoader(DataLoader):
+  """
+  A DataLoader that implements batch size ramp-up.
+
+  It dynamically increases its current global batch size based on the number
+  of samples seen so far, and slices the batches returned by the parent
+  DataLoader down to that size until the final batch size is reached.
+  """
+
+  def __init__(self, config, mesh, data_iterator, goodput_recorder):
+    # Call parent constructor
+    super().__init__(config, mesh, data_iterator, goodput_recorder)
+
+    # Get ramp-up parameters from config
+    self.global_batch_size_end = config.global_batch_size_to_load
+    self.global_batch_size_start = config.global_batch_size_to_load_start
+    self.increment = config.global_batch_size_to_load_increment
+    self.samples_per_increment = config.rampup_samples_per_increment_to_load
+
+    # Check if ramp-up is active
+    self.rampup_active = self.global_batch_size_start < self.global_batch_size_end
+
+    # State for tracking ramp-up
+    self.accum_samples = 0
+    self.global_batch_size_current = self.global_batch_size_start
+    self.batch_buffer = None
+    self.buffer_start = 0
+
+  def load_next_batch(self):
+    """
+    Updates the batch size based on the schedule and then loads the next
+    batch using the parent method.
+    """
+    # If ramp-up is not active, just behave like the parent
+    if not self.rampup_active:
+      return super().load_next_batch()
+
+    # During the ramp-up phase, a batch buffer holds the loaded data.
+    # Check whether it is time to increment the batch size.
+    is_time_to_increment = self.accum_samples >= self.samples_per_increment
+
+    if is_time_to_increment:
+      # Update the current batch size and reset the accumulated sample count
+      self.global_batch_size_current += self.increment
+      self.accum_samples = 0
+      self.rampup_active = self.global_batch_size_current < self.global_batch_size_end
+
+    self.accum_samples += self.global_batch_size_current
+    slice_start, slice_end = self.buffer_start, self.buffer_start + self.global_batch_size_current
+
+    # Load a new batch if batch_buffer is None or the slice runs past the buffer end
+    if self.batch_buffer is None:
+      self.batch_buffer = super().load_next_batch()
+      slice_start, slice_end = 0, self.global_batch_size_current
+
+    if slice_end > self.global_batch_size_end:
+      old_buffer, self.batch_buffer = self.batch_buffer, super().load_next_batch()

+      # self.global_batch_size_end is the batch_buffer size
+      def _slice_and_concat(old_data, new_data):
+        sliced_old_data = jax.lax.dynamic_slice_in_dim(
+            old_data,
+            slice_start,
+            self.global_batch_size_end - slice_start,
+            axis=0,
+        )
+        sliced_new_data = jax.lax.dynamic_slice_in_dim(
+            new_data,
+            0,
+            slice_end - self.global_batch_size_end,
+            axis=0,
+        )
+        return jax.lax.concatenate((sliced_old_data, sliced_new_data), dimension=0)
+
+      self.buffer_start = slice_end - self.global_batch_size_end
+      return jax.tree.map(_slice_and_concat, old_buffer, self.batch_buffer)
+    else:
+
+      def _slice(data):
+        return jax.lax.dynamic_slice_in_dim(
+            data,
+            slice_start,
+            self.global_batch_size_current,
+            axis=0,
+        )
+
+      self.buffer_start = slice_end
+      return jax.tree.map(_slice, self.batch_buffer)
+
+
+def create_dataloader(config, mesh, data_iterator, goodput_recorder):
+  """
+  Create the dataloader
+  """
+  if config.enable_rampup_batch_size:
+    return RampUpDataLoader(config, mesh, data_iterator, goodput_recorder)
+  else:
+    return DataLoader(config, mesh, data_iterator, goodput_recorder)
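To make the buffer slicing above concrete, here is a minimal standalone sketch (not part of the commit) of the per-leaf operation `RampUpDataLoader` relies on: `jax.tree.map` applies `jax.lax.dynamic_slice_in_dim` along the batch axis of every array in the loaded batch, carving the smaller current logical batch out of the full-size loaded buffer. The field names, shapes, and offsets below are invented for illustration.

import jax
import jax.numpy as jnp

# Hypothetical loaded buffer: the full global batch of 8 examples per load.
loaded_batch = {
    "inputs": jnp.arange(8 * 4).reshape(8, 4),
    "targets": jnp.arange(8 * 4).reshape(8, 4),
}

buffer_start = 2          # where the previous slice ended
current_batch_size = 4    # current (ramped-up) logical batch size

def _slice(data):
  # Take `current_batch_size` rows starting at `buffer_start` along axis 0.
  return jax.lax.dynamic_slice_in_dim(data, buffer_start, current_batch_size, axis=0)

small_batch = jax.tree.map(_slice, loaded_batch)
print(small_batch["inputs"].shape)  # (4, 4)

When the requested slice would run past the end of the loaded buffer, the loader instead concatenates the tail of the old buffer with the head of a freshly loaded batch (the `_slice_and_concat` branch above), so no loaded example is skipped during ramp-up.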

src/MaxText/metric_logger.py

Lines changed: 25 additions & 15 deletions
@@ -105,13 +105,22 @@ def log_metrics(self, metrics, step, is_training):
     """Logs metrics via max_logging."""
     if is_training:
       loss = metrics["scalar"]["learning/loss"]
-      log_message = (
-          f"completed step: {step}, seconds: {metrics['scalar']['perf/step_time_seconds']:.3f}, "
-          f"TFLOP/s/device: {metrics['scalar']['perf/per_device_tflops_per_sec']:.3f}, "
-          f"Tokens/s/device: {metrics['scalar']['perf/per_device_tokens_per_sec']:.3f}, "
-          f"total_weights: {metrics['scalar']['learning/total_weights']}, "
-          f"loss: {loss:.3f}"
-      )
+      # Do not show flops and tokens during batch size rampup
+      if step >= self.config.rampup_end_step:
+        log_message = (
+            f"completed step: {step}, seconds: {metrics['scalar']['perf/step_time_seconds']:.3f}, "
+            f"TFLOP/s/device: {metrics['scalar']['perf/per_device_tflops_per_sec']:.3f}, "
+            f"Tokens/s/device: {metrics['scalar']['perf/per_device_tokens_per_sec']:.3f}, "
+            f"total_weights: {metrics['scalar']['learning/total_weights']}, "
+            f"loss: {loss:.3f}"
+        )
+      else:
+        log_message = (
+            "[Rampup Batch Size Phase]: "
+            f"completed step: {step}, seconds: {metrics['scalar']['perf/step_time_seconds']:.3f}, "
+            f"total_weights: {metrics['scalar']['learning/total_weights']}, "
+            f"loss: {loss:.3f}"
+        )

       if self.config.mtp_num_layers > 0:
         mtp_loss = metrics["scalar"].get("learning/mtp_loss", 0.0)
@@ -213,15 +222,16 @@ def buffer_and_write_train_metrics(self, metrics, step, step_time_delta):
   def record_train_metrics(self, metrics, step, step_time):
     """Records training metrics for the current step."""
     metrics["scalar"].update({"perf/step_time_seconds": step_time})
-    metrics["scalar"].update({"perf/per_device_tflops": self.metadata[MetadataKey.PER_DEVICE_TFLOPS]})
-    metrics["scalar"].update(
-        {"perf/per_device_tflops_per_sec": (self.metadata[MetadataKey.PER_DEVICE_TFLOPS] / step_time)}
-    )
-    metrics["scalar"].update({"perf/per_device_tokens": self.metadata[MetadataKey.PER_DEVICE_TOKENS]})
-    metrics["scalar"].update(
-        {"perf/per_device_tokens_per_sec": (self.metadata[MetadataKey.PER_DEVICE_TOKENS] / step_time)}
-    )
     metrics["scalar"].update({"learning/current_learning_rate": self.learning_rate_schedule(step)})
+    if step >= self.config.rampup_end_step:
+      metrics["scalar"].update({"perf/per_device_tflops": self.metadata[MetadataKey.PER_DEVICE_TFLOPS]})
+      metrics["scalar"].update(
+          {"perf/per_device_tflops_per_sec": (self.metadata[MetadataKey.PER_DEVICE_TFLOPS] / step_time)}
+      )
+      metrics["scalar"].update({"perf/per_device_tokens": self.metadata[MetadataKey.PER_DEVICE_TOKENS]})
+      metrics["scalar"].update(
+          {"perf/per_device_tokens_per_sec": (self.metadata[MetadataKey.PER_DEVICE_TOKENS] / step_time)}
+      )
     if self.performance_metric_queue:
       self.performance_metric_queue.put(step_time)
src/MaxText/pyconfig.py

Lines changed: 80 additions & 0 deletions
@@ -194,6 +194,21 @@ def validate_vocab_tiling(num_vocab_tiling: int, per_device_batch_size: int, max
     raise ValueError("We currently don't support vocab tiling on NNX module.")


+def validate_rampup_batch_size(batch_size_start, batch_size_end, batch_size_increment, global_rampup_samples):
+  assert batch_size_start > 0, f"per_device_batch_size_start should be positive, got {batch_size_start}."
+  assert batch_size_increment > 0, f"per_device_batch_size_increment should be positive, got {batch_size_increment}."
+  assert global_rampup_samples > 0, f"global_rampup_samples should be positive, got {global_rampup_samples}."
+  diff_batch_size = batch_size_end - batch_size_start
+  assert diff_batch_size > 0, (
+      "per_device_batch_size must be greater than per_device_batch_size_start. "
+      f"Got per_device_batch_size={batch_size_end} and per_device_batch_size_start={batch_size_start}."
+  )
+  assert diff_batch_size % batch_size_increment == 0, (
+      "Expected the ramp-up batch size range to be divisible by the batch size increment. "
+      f"Got per_device_batch_size={batch_size_end} and per_device_batch_size_start={batch_size_start}."
+  )
+
+
 def validate_keys(keys):
   validate_attention_kernel(keys["attention"])
   validate_attention_type(keys["attention_type"])
@@ -212,6 +227,13 @@ def validate_keys(keys):
   validate_vocab_tiling(
       keys["num_vocab_tiling"], keys["per_device_batch_size"], keys["max_target_length"], keys["enable_nnx"]
   )
+  if keys["enable_rampup_batch_size"]:
+    validate_rampup_batch_size(
+        keys["per_device_batch_size_start"],
+        keys["per_device_batch_size"],
+        keys["per_device_batch_size_increment"],
+        keys["global_rampup_samples"],
+    )

   # TODO remove after b/435512699 resolved
   if keys["context_parallel_size"] > 1 and keys["context_parallel_load_balance"] and keys["attention_type"] == "chunk":
@@ -706,6 +728,43 @@ def user_init(raw_keys):
       raw_keys["gradient_accumulation_steps"],
   )

+  # Initialize the starting global batch size and the global increments if
+  # rampup batch size is enabled
+  if raw_keys["enable_rampup_batch_size"]:
+    (
+        raw_keys["global_batch_size_to_load_start"],
+        raw_keys["global_batch_size_to_train_on_start"],
+        raw_keys["micro_batch_size_to_train_on_start"],
+    ) = calculate_global_batch_sizes(
+        raw_keys["per_device_batch_size_start"],
+        raw_keys["expansion_factor_real_data"],
+        get_num_target_devices(raw_keys),
+        raw_keys["gradient_accumulation_steps"],
+    )
+
+    (
+        raw_keys["global_batch_size_to_load_increment"],
+        raw_keys["global_batch_size_to_train_on_increment"],
+        raw_keys["micro_batch_size_to_train_on_increment"],
+    ) = calculate_global_batch_sizes(
+        raw_keys["per_device_batch_size_increment"],
+        raw_keys["expansion_factor_real_data"],
+        get_num_target_devices(raw_keys),
+        raw_keys["gradient_accumulation_steps"],
+    )
+
+    (
+        raw_keys["rampup_samples_per_increment_to_load"],
+        raw_keys["rampup_end_step"],
+    ) = calculate_rampup_samples_and_steps(
+        raw_keys["global_batch_size_to_load_start"],
+        raw_keys["global_batch_size_to_load"],
+        raw_keys["global_batch_size_to_load_increment"],
+        raw_keys["global_rampup_samples"],
+    )
+  else:
+    raw_keys["rampup_end_step"] = 0
+
   if raw_keys["eval_per_device_batch_size"] <= 0:
     raw_keys["eval_per_device_batch_size"] = raw_keys["per_device_batch_size"]
@@ -1253,6 +1312,27 @@ def calculate_global_batch_sizes(
   return global_batch_size_to_load, global_batch_size_to_train_on, micro_batch_size_to_train_on


+def calculate_rampup_samples_and_steps(
+    batch_size_start,
+    batch_size_end,
+    batch_size_increment,
+    global_rampup_samples,
+):
+  """Calculates the number of samples per increment and the number of steps for batch ramp-up."""
+  diff_batch_size = batch_size_end - batch_size_start
+  num_increments = diff_batch_size // batch_size_increment
+  rampup_samples_per_increment = global_rampup_samples / num_increments
+  total_rampup_steps = 0
+  current_batch_size = batch_size_start
+
+  for _ in range(num_increments):
+    steps_for_this_stage = math.ceil(rampup_samples_per_increment / current_batch_size)
+    total_rampup_steps += steps_for_this_stage
+    current_batch_size += batch_size_increment
+
+  return rampup_samples_per_increment, total_rampup_steps
+
+
 def get_num_target_devices(raw_keys):
   # In AOT case compile_topology is set (e.g. is not the empty string), and we determine the
   # number of devices from the compile_topology. In non-AOT settings we simply can use jax.devices().
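To illustrate `calculate_rampup_samples_and_steps`, here is a small standalone sketch (not part of the commit) that reproduces its arithmetic for hypothetical global batch sizes; the actual values depend on the per-device settings and the device count, so the numbers below are only an example.

import math

def rampup_samples_and_steps(start, end, increment, global_rampup_samples):
  # Mirrors the arithmetic of calculate_rampup_samples_and_steps above.
  num_increments = (end - start) // increment                      # e.g. (64 - 32) // 16 = 2 stages
  samples_per_increment = global_rampup_samples / num_increments   # e.g. 500 / 2 = 250.0
  total_steps, current = 0, start
  for _ in range(num_increments):
    total_steps += math.ceil(samples_per_increment / current)      # ceil(250/32)=8, then ceil(250/48)=6
    current += increment
  return samples_per_increment, total_steps

print(rampup_samples_and_steps(32, 64, 16, 500))  # (250.0, 14), i.e. rampup_end_step would be 14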

src/MaxText/train.py

Lines changed: 8 additions & 2 deletions
@@ -53,7 +53,7 @@
 from MaxText import sharding
 from MaxText.layers.multi_token_prediction import calculate_mtp_acceptance_rate, calculate_mtp_loss
 from MaxText.common_types import ShardMode
-from MaxText.data_loader import DataLoader
+from MaxText.data_loader import create_dataloader
 from MaxText.globals import EPS
 from MaxText.metric_logger import MetricLogger
 from MaxText.utils import gcs_utils
@@ -391,7 +391,7 @@ def train_loop(config, recorder, state=None):

   start_step = get_first_step(state) # this is the start_step for training
   prof = profiler.Profiler(config, offset_step=start_step)
-  data_loader = DataLoader(config, mesh, data_iterator, recorder)
+  data_loader = create_dataloader(config, mesh, data_iterator, recorder)
   metric_logger = MetricLogger(config=config, learning_rate_schedule=learning_rate_schedule)

   # Write train config params, num model params, and XLA flags to tensorboard
@@ -404,6 +404,12 @@

     with jax.profiler.StepTraceAnnotation("train", step_num=step):
       example_batch = data_loader.load_next_batch()
+      # Reshard data from loaded sharding to performant activation sharding
+      example_batch = sharding.maybe_shard_with_name(
+          example_batch,
+          sharding.get_input_data_sharding(config, mesh),
+          shard_mode=config.shard_mode,
+      )
       # pylint: disable=not-callable
       nextrng = jax.jit(jax.random.fold_in)(init_rng, step)
       with maybe_record_goodput(recorder, GoodputEvent.STEP, step):
