Commit cc9a196

Merge pull request #2482 from AI-Hypercomputer:mohit/new_resharding
PiperOrigin-RevId: 820471060
2 parents 8c57202 + af9d942 commit cc9a196

File tree

10 files changed: +635 -105 lines changed


end_to_end/tpu/test_grpo.sh

Lines changed: 22 additions & 14 deletions
@@ -5,10 +5,14 @@
 # External users can update pre-trained model checkpoint GCS path (gs://) to your accessible locations.
 # Usage:
 HF_TOKEN=<huggingface access token> \
-MODEL=llama3.3-70b TOKENIZER=meta-llama/Llama-3.3-70B \
-NUM_SAMPLERS=4 DEVICES_PER_SAMPLER=8 \
+MODEL=llama3.1-8b TOKENIZER=meta-llama/Llama-3.1-8B-Instruct \
+NUM_SAMPLERS=2 DEVICES_PER_SAMPLER=8 \
 TRAINING_PER_DEVICE_BATCH_SIZE=1 \
-INFERENCE_PER_DEVICE_BATCH_SIZE=4 \
+INFERENCE_PER_DEVICE_BATCH_SIZE=1 \
+TRAINING_SUBSLICE=2,8 \
+INFERENCE_SUBSLICE=2,8 \
+MAX_PREFILL_LENGTH=128 \
+MAX_TARGET_LENGTH=256 \
 STEPS=20 \
 bash end_to_end/tpu/test_grpo.sh
 '
@@ -23,10 +27,11 @@ JAX_BACKEND_TARGET=grpc://127.0.0.1:29000
 ENABLE_PATHWAYS_PERSISTENCE='1'
 HF_TOKEN=${HF_TOKEN}

-MAX_PREFILL_LENGTH=128
-MAX_TARGET_LENGTH=256
+MAX_PREFILL_LENGTH=${MAX_PREFILL_LENGTH:-128}
+MAX_TARGET_LENGTH=${MAX_TARGET_LENGTH:-256}
 NUM_GENERATIONS=2

+INFERENCE_PER_DEVICE_BS=$((${INFERENCE_PER_DEVICE_BATCH_SIZE} * ${NUM_GENERATIONS}))

 COMMON_ARGS="model_name=${MODEL} base_output_directory=${BASE_OUTPUT_DIRECTORY} \
 max_prefill_predict_length=${MAX_PREFILL_LENGTH} max_target_length=${MAX_TARGET_LENGTH} \
@@ -35,19 +40,22 @@ tokenizer_type=huggingface tokenizer_path=${TOKENIZER} \
 dataset_type=hf hf_path='trl-lib/tldr' \
 enable_single_controller=true \
 dtype=bfloat16 weight_dtype=bfloat16 \
-allow_split_physical_axes=true enable_goodput_recording=false monitor_goodput=false \
-profiler=xplane skip_first_n_steps_for_profiler=10 profiler_steps=5"
+allow_split_physical_axes=true enable_goodput_recording=false monitor_goodput=false"

 TRAINING_ARGS="run_name=${RUN_NAME} scan_layers=true \
-inference_replicas=${NUM_SAMPLERS} inference_devices_per_replica=${DEVICES_PER_SAMPLER} \
-inference_rollouts=5 \
-per_device_batch_size=${TRAINING_PER_DEVICE_BATCH_SIZE} num_generations=${NUM_GENERATIONS} steps=${STEPS}"
+inference_replicas=${NUM_SAMPLERS} inference_devices_per_replica=${DEVICES_PER_SAMPLER} subslice_shape=${TRAINING_SUBSLICE} \
+inference_rollouts=1 \
+per_device_batch_size=${TRAINING_PER_DEVICE_BATCH_SIZE} num_generations=${NUM_GENERATIONS} steps=${STEPS} \
+profiler=xplane skip_first_n_steps_for_profiler=5 profiler_steps=3"

+# Make sure profiles on inference TPUs are not captured while profiling the trainer TPUs.
+# Set a small number for profiler_steps during inference, as the profiles turn out large in size.
 INFERENCE_ARGS="run_name=grpo scan_layers=false \
-per_device_batch_size=${INFERENCE_PER_DEVICE_BATCH_SIZE} \
-ici_data_parallelism=${NUM_SAMPLERS} ici_tensor_parallelism=${DEVICES_PER_SAMPLER}"
+per_device_batch_size=${INFERENCE_PER_DEVICE_BS} num_generations=${NUM_GENERATIONS} \
+ici_data_parallelism=${NUM_SAMPLERS} ici_tensor_parallelism=${DEVICES_PER_SAMPLER} subslice_shape=${INFERENCE_SUBSLICE} \
+profiler=xplane skip_first_n_steps_for_profiler=10 profiler_steps=2"

 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \
-python3 -m MaxText.experimental.rl.grpo_trainer "${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}"/experimental/rl/grpo.yml \
-${COMMON_ARGS} ${TRAINING_ARGS} ${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}/experimental/rl/grpo_inference.yml \
+python3 src/MaxText/experimental/rl/grpo_trainer.py src/MaxText/experimental/rl/grpo.yml \
+${COMMON_ARGS} ${TRAINING_ARGS} src/MaxText/experimental/rl/grpo_inference.yml \
 ${COMMON_ARGS} ${INFERENCE_ARGS}
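Note on the batch-size bookkeeping introduced above: the script now multiplies the per-device inference batch by NUM_GENERATIONS before passing it to the inference config, and the trainer divides it back out when trimming prompts (see grpo_trainer.py below). A minimal sketch of the arithmetic, using the example values from the usage block; the variable names are illustrative:

```python
# Illustrative values copied from the usage block above; not part of the change itself.
NUM_SAMPLERS = 2                      # inference replicas
DEVICES_PER_SAMPLER = 8               # devices per inference replica
INFERENCE_PER_DEVICE_BATCH_SIZE = 1   # prompts per inference device
NUM_GENERATIONS = 2                   # completions sampled per prompt

# What the script passes to the inference config as per_device_batch_size:
inference_per_device_bs = INFERENCE_PER_DEVICE_BATCH_SIZE * NUM_GENERATIONS  # 2

# Prompts kept per rollout (mirrors the trimming in grpo_trainer.py):
prompts_per_rollout = (inference_per_device_bs // NUM_GENERATIONS) * NUM_SAMPLERS * DEVICES_PER_SAMPLER  # 16
print(inference_per_device_bs, prompts_per_rollout)
```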

src/MaxText/experimental/rl/grpo_trainer.py

Lines changed: 35 additions & 30 deletions
@@ -33,6 +33,8 @@
 """


+import pathwaysutils
+
 import datetime
 import time
 import os
@@ -586,8 +588,6 @@ def generate_completions(
     worker_tokenizer_model,
     worker_config_inference,
     worker_config_train,
-    worker_data_buffer,
-    worker_data_buffer_lock,
     worker_input_data_shardings,
     engine_lock,
 ):
@@ -604,8 +604,6 @@ def generate_completions(
     worker_tokenizer_model: The tokenizer model.
     worker_config_inference: The configuration for the inference process.
     worker_config_train: The main training configuration.
-    worker_data_buffer: A list acting as a shared buffer to store generated data.
-    worker_data_buffer_lock: A lock to ensure thread-safe access to the buffer.
     worker_input_data_shardings: Sharding specifications for the data.
     engine_lock: A lock to ensure thread-safe use of the inference engine.
   """
@@ -615,7 +613,7 @@ def generate_completions(
   thread_example_batch_trimmed = jax.tree_util.tree_map(
       lambda arr: arr[
           : int(
-              worker_config_inference.per_device_batch_size
+              (worker_config_inference.per_device_batch_size // worker_config_inference.num_generations)
               * worker_config_train.inference_replicas
              * worker_config_train.inference_devices_per_replica
          )
@@ -626,13 +624,7 @@ def generate_completions(
       worker_config_inference, worker_tokenizer_model, worker_inference_engine, thread_example_batch_trimmed
   )
   processed_batch = jax.device_put(processed_batch, worker_input_data_shardings)
-  with worker_data_buffer_lock:
-    if not worker_data_buffer:
-      worker_data_buffer.append(processed_batch)
-    else:
-      worker_data_buffer[0] = jax.tree_util.tree_map(
-          lambda a, b: np.concatenate([a, b], axis=0), worker_data_buffer[0], processed_batch
-      )
+  return processed_batch


 def train_loop(config, config_inference, recorder, state=None):
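The two hunks above change generate_completions so that it trims the sampled batch to the number of prompts the inference mesh can hold and returns the processed batch instead of appending it to a shared buffer. A small, hedged sketch of the trimming pattern with toy shapes; the names and values here are illustrative, not the repository's exact config:

```python
# Hedged sketch: slice every leaf of a batch pytree down to the prompt count implied by
# the inference topology. Toy shapes and config values are illustrative only.
import jax
import numpy as np

per_device_batch_size = 2      # inference per-device batch, already multiplied by num_generations
num_generations = 2
inference_replicas = 2
inference_devices_per_replica = 8

rows = (per_device_batch_size // num_generations) * inference_replicas * inference_devices_per_replica  # 16

batch = {"prompt_tokens": np.zeros((64, 128), dtype=np.int32), "prompt_length": np.full((64,), 128)}
trimmed = jax.tree_util.tree_map(lambda arr: arr[:rows], batch)
print(trimmed["prompt_tokens"].shape)  # (16, 128)
```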
@@ -705,6 +697,7 @@ def train_loop(config, config_inference, recorder, state=None):

   start_step = get_first_step(state)  # this is the start_step for training
   prof = profiler.Profiler(config, offset_step=start_step)
+  inference_prof = profiler.Profiler(config_inference, offset_step=start_step)
   data_loader = DataLoader(config_inference, inference_mesh, data_iterator, recorder)
   metric_logger = MetricLogger(config=config, learning_rate_schedule=learning_rate_schedule)

@@ -721,6 +714,7 @@ def generation_worker_fn(
       worker_input_data_shardings,
       engine_lock,
       stop_event,
+      profiler_object,
   ):
     """The target function for the data generation worker thread.

@@ -738,21 +732,40 @@ def generation_worker_fn(
       worker_input_data_shardings: Sharding specs for the generated data.
       engine_lock: A lock for thread-safe inference engine access.
       stop_event: A threading.Event to signal when the worker should stop.
+      profiler_object: The profiler used to capture profiles on the inference devices.
     """
+    worker_step = 0
+    is_profiling = False
     while not stop_event.is_set():
       try:
-        with jax.profiler.StepTraceAnnotation("inference"):
-          generate_completions(
+        if worker_step == profiler_object.start_initial_profile_step and not is_profiling:
+          profiler_object.activate()
+          is_profiling = True
+        elif worker_step == profiler_object.finished_initial_profile_step and is_profiling:
+          profiler_object.deactivate()
+          is_profiling = False
+        with jax.profiler.StepTraceAnnotation("inference", step_num=worker_step):
+          processed_batch = generate_completions(
               data_loader,
               worker_inference_engine,
               worker_tokenizer_model,
               worker_config_inference,
               worker_config_train,
-              worker_data_buffer,
-              worker_data_buffer_lock,
               worker_input_data_shardings,
               engine_lock,
           )
+        jax.block_until_ready(processed_batch)
+
+        with worker_data_buffer_lock:
+          if not worker_data_buffer:
+            worker_data_buffer.append(processed_batch)
+          else:
+            worker_data_buffer[0] = jax.tree_util.tree_map(
+                lambda a, b: np.concatenate([a, b], axis=0),
+                worker_data_buffer[0],
+                processed_batch,
+            )
+        worker_step += 1
       except StopIteration:
         max_logging.log("Data iterator exhausted in generation worker. Stopping.")
         break
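With this change the generation worker owns all buffer management: it annotates each rollout with a step counter, drives the inference-side profiler from that counter, blocks until the rollout is ready, and then folds it into the single shared buffer slot under the lock. A condensed, hedged sketch of that producer pattern; the helper names and toy batch are illustrative, not the repository's exact code:

```python
# Hedged sketch of the producer loop: a daemon thread generates batches and concatenates
# each one into a shared single-slot buffer under a lock. generate_batch() is a stand-in.
import threading
import numpy as np
import jax

data_buffer, data_buffer_lock = [], threading.Lock()
stop_event = threading.Event()

def generation_worker(generate_batch):
  step = 0
  while not stop_event.is_set():
    # A real worker would also call profiler.activate()/deactivate() at configured steps.
    with jax.profiler.StepTraceAnnotation("inference", step_num=step):
      batch = generate_batch()
    jax.block_until_ready(batch)  # make sure the rollout is finished before buffering it
    with data_buffer_lock:
      if not data_buffer:
        data_buffer.append(batch)
      else:  # grow the buffered batch along the example axis
        data_buffer[0] = jax.tree_util.tree_map(
            lambda a, b: np.concatenate([a, b], axis=0), data_buffer[0], batch)
    step += 1

thread = threading.Thread(target=generation_worker, args=(lambda: {"x": np.ones((4, 8))},), daemon=True)
thread.start()  # the trainer thread later sets stop_event and joins
```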
@@ -764,19 +777,6 @@ def generation_worker_fn(
   stop_event = threading.Event()
   inference_engine_lock = threading.Lock()

-  max_logging.log("Inference Rollout")
-  generate_completions(
-      data_loader,
-      inference_engine,
-      tokenizer_model,
-      config_inference,
-      config,
-      data_buffer,
-      data_buffer_lock,
-      data_sharding,
-      inference_engine_lock,
-  )
-
   required_batch_size = int(config.per_device_batch_size * config.num_generations * mesh.size)
   generation_thread = threading.Thread(
       target=generation_worker_fn,
@@ -790,6 +790,7 @@ def generation_worker_fn(
           data_sharding,  # Sharding for the data put into the buffer
           inference_engine_lock,
           stop_event,
+          inference_prof,  # profiler object
       ),
       daemon=True,  # So it exits when the main thread exits
   )
@@ -830,8 +831,10 @@ def generation_worker_fn(
           {"params": state.params["params"]},
           {"params": state_mesh_shardings.params["params"]},
           mesh,
-          inference_state_mesh_shardings,
+          {"params": inference_state_mesh_shardings.params["params"]},
       )
+      with data_buffer_lock:
+        data_buffer.clear()

       step_time_delta = datetime.datetime.now() - last_step_completion
       last_step_completion = datetime.datetime.now()
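On the trainer side, the resharded inference parameters are now passed as a {"params": ...} pytree, and the shared buffer is cleared under its lock right after the updated weights are pushed to the samplers, presumably so that later steps only consume rollouts from the new policy. A hedged sketch of the consumer side this implies; the helper names and polling loop are assumptions for illustration, not the repository's code:

```python
# Hedged sketch of the consumer side: the trainer waits until the buffer holds enough
# rollout examples, removes what it needs, and clears whatever is left after resharding.
import time
import jax

def take_training_batch(data_buffer, data_buffer_lock, required_batch_size, poll_seconds=0.1):
  """Blocks until enough buffered rollout rows exist, then removes and returns them."""
  while True:
    with data_buffer_lock:
      if data_buffer and jax.tree_util.tree_leaves(data_buffer[0])[0].shape[0] >= required_batch_size:
        batch = jax.tree_util.tree_map(lambda a: a[:required_batch_size], data_buffer[0])
        data_buffer[0] = jax.tree_util.tree_map(lambda a: a[required_batch_size:], data_buffer[0])
        return batch
    time.sleep(poll_seconds)

def on_weights_resharded(data_buffer, data_buffer_lock):
  # Mirrors the added `data_buffer.clear()`: presumably drops rollouts sampled from the
  # previous policy weights so the next training batch comes from fresh ones.
  with data_buffer_lock:
    data_buffer.clear()
```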
@@ -895,6 +898,7 @@ def main(argv: Sequence[str]) -> None:
   training and inference, sets up system environment variables, and launches
   the `train_loop`.
   """
+  pathwaysutils.initialize()
   jax.config.update("jax_default_prng_impl", "unsafe_rbg")
   # TF allocates extraneous GPU memory when using TFDS data
   # this leads to CUDA OOMs. WAR for now is to hide GPUs from TF
@@ -923,6 +927,7 @@ def main(argv: Sequence[str]) -> None:
       f"with {jax.device_count()} devices"
   )
   config_inference = pyconfig.initialize(configs_argv[1])
+
   if config.per_device_batch_size < 1.0 or config_inference.per_device_batch_size < 1.0:
     raise ValueError("GRPO does not support setting per_device_batch_size < 1.0")
   jax.config.update("jax_use_shardy_partitioner", config.shardy)