From f083246eee85a1e449263f19d0abb58b8687ec8a Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 5 Mar 2026 16:13:52 +0000 Subject: [PATCH 01/57] feat: add grpo algorithm --- container-images/tpu/rl-on-tpu/Dockerfile | 32 +++++++++ .../tpu/rl-on-tpu/cloudbuild.yaml | 31 +++++++++ container-images/tpu/rl-on-tpu/src/app.py | 66 +++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 container-images/tpu/rl-on-tpu/Dockerfile create mode 100644 container-images/tpu/rl-on-tpu/cloudbuild.yaml create mode 100644 container-images/tpu/rl-on-tpu/src/app.py diff --git a/container-images/tpu/rl-on-tpu/Dockerfile b/container-images/tpu/rl-on-tpu/Dockerfile new file mode 100644 index 000000000..baf33f651 --- /dev/null +++ b/container-images/tpu/rl-on-tpu/Dockerfile @@ -0,0 +1,32 @@ +# syntax=docker.io/docker/dockerfile:1.17.1 + +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM python:3.12-slim + +WORKDIR /app + +RUN pip install --no-cache-dir uv + +RUN uv pip install --system "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html + +RUN uv pip install --system maxtext --resolution=lowest +RUN install_maxtext_github_deps + +RUN uv pip install --system torch --index-url https://download.pytorch.org/whl/cpu + +COPY --from=primary app.py ./ + +ENTRYPOINT ["python", "app.py"] diff --git a/container-images/tpu/rl-on-tpu/cloudbuild.yaml b/container-images/tpu/rl-on-tpu/cloudbuild.yaml new file mode 100644 index 000000000..90a458d31 --- /dev/null +++ b/container-images/tpu/rl-on-tpu/cloudbuild.yaml @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +images: + - ${_DESTINATION} + +options: + logging: CLOUD_LOGGING_ONLY + machineType: E2_HIGHCPU_8 + +steps: + - args: + - build + - --build-context=primary=container-images/tpu/rl-on-tpu/src + - --file=container-images/tpu/rl-on-tpu/Dockerfile + - --tag=${_DESTINATION} + - . 
+ id: "Build RL on TPU image" + name: "docker.io/docker:28.3.3-dind-alpine3.22" + waitFor: ["-"] diff --git a/container-images/tpu/rl-on-tpu/src/app.py b/container-images/tpu/rl-on-tpu/src/app.py new file mode 100644 index 000000000..2bf645b69 --- /dev/null +++ b/container-images/tpu/rl-on-tpu/src/app.py @@ -0,0 +1,66 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime + import os + import jax + import MaxText + from huggingface_hub import login + from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices + + # Environment variables for cleaner logging + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" + os.environ["SKIP_JAX_PRECOMPILE"] = "1" + os.environ["VLLM_LOGGING_LEVEL"] = "ERROR" + + HF_TOKEN = os.environ.get("HF_TOKEN", "") + if HF_TOKEN: + login(token=HF_TOKEN) + + MAXTEXT_PKG_DIR = os.path.dirname(MaxText.__file__) + MAXTEXT_REPO_ROOT = os.sep.join(["maxtext" if p == "MaxText" else p for p in MAXTEXT_PKG_DIR.split(os.sep)]) + + MODEL_NAME = "llama3.1-8b" + TOKENIZER_PATH = "meta-llama/Llama-3.1-8B-Instruct" + RUN_NAME = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + LOSS_ALGO = "grpo" + + CHAT_TEMPLATE_PATH = f"{MAXTEXT_REPO_ROOT}/examples/chat_templates/gsm8k_rl.json" + MODEL_CHECKPOINT_PATH = "/workspace/llama_checkpoint" + OUTPUT_DIRECTORY = "/workspace/rl_llama3_output" + + config_argv = [ + "", + f"{MAXTEXT_PKG_DIR}/configs/post_train/rl.yml", + f"model_name={MODEL_NAME}", + 
f"tokenizer_path={TOKENIZER_PATH}", + f"run_name={RUN_NAME}", + f"chat_template_path={CHAT_TEMPLATE_PATH}", + f"load_parameters_path={MODEL_CHECKPOINT_PATH}/0/items", + f"base_output_directory={OUTPUT_DIRECTORY}", + f"hf_access_token={HF_TOKEN}", + "debug.rl=False", + f"rl.loss_algo={LOSS_ALGO}", + "use_pathways=False" + ] + + trainer_config, sampler_config, trainer_devices, sampler_devices = setup_configs_and_devices(config_argv) + + print(f"🚀 Starting {LOSS_ALGO} Training...") + try: + rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices) + print("✅ Training Completed Successfully!") + except Exception as e: + print(f"❌ Training Failed: {str(e)}") + raise From 4092d9c7b234e7d5b50df9b83532407c76d61c8d Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 5 Mar 2026 16:14:51 +0000 Subject: [PATCH 02/57] feat: add reinforcement learning image build --- .../_shared_config/_cloudbuild.auto.tfvars | 1 + .../_shared_config/_cloudbuild_variables.tf | 1 + .../_shared_config/_cluster.auto.tfvars | 1 + .../_shared_config/_cluster_variables.tf | 1 + .../_shared_config/_networking.auto.tfvars | 1 + .../_shared_config/_networking_variables.tf | 1 + .../_shared_config/_platform.auto.tfvars | 1 + .../_shared_config/_platform_variables.tf | 1 + .../terraform/_shared_config/outputs.tf | 17 +++++++ .../reinforcement_learning.auto.tfvars | 0 .../reinforcement_learning_variables.tf | 23 +++++++++ .../scripts/set_environment_variables.sh | 31 ++++++++++++ .../tpu/rl_on_tpu/_cloudbuild.auto.tfvars | 1 + .../tpu/rl_on_tpu/_cloudbuild_variables.tf | 1 + .../tpu/rl_on_tpu/_platform.auto.tfvars | 1 + .../tpu/rl_on_tpu/_platform_variables.tf | 1 + .../_reinforcement_learning.auto.tfvars | 1 + .../_reinforcement_learning_variables.tf | 1 + .../images/tpu/rl_on_tpu/cloudbuild.tf | 47 +++++++++++++++++++ .../images/tpu/rl_on_tpu/local_file.tf | 17 +++++++ .../images/tpu/rl_on_tpu/versions.tf | 32 +++++++++++++ 21 files changed, 181 insertions(+) create mode 120000 
platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform_variables.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning.auto.tfvars create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf create mode 100755 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform_variables.tf create mode 120000 
platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning_variables.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/cloudbuild.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/local_file.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/versions.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..c730c32e8 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild_variables.tf new file mode 120000 index 000000000..5a143590a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster.auto.tfvars new file mode 120000 index 000000000..98a694db9 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster.auto.tfvars @@ -0,0 +1 @@ 
+../../../../_shared_config/cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster_variables.tf new file mode 120000 index 000000000..00625515b --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking.auto.tfvars new file mode 120000 index 000000000..9cbd92baf --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/networking.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking_variables.tf new file mode 120000 index 000000000..1e170e71d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/networking_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform.auto.tfvars new file mode 120000 index 000000000..125a652cf --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/platform.auto.tfvars \ No newline at end of file diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform_variables.tf new file mode 120000 index 000000000..486b3eaef --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf new file mode 100644 index 000000000..8a55d158e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +output "rl_tpu_rl_on_tpu_image_url" { + value = var.rl_tpu_rl_on_tpu_image_url +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning.auto.tfvars new file mode 100644 index 000000000..e69de29bb diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf new file mode 100644 index 000000000..21c56f74e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -0,0 +1,23 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + rl_tpu_rl_on_tpu_image_url = var.rl_tpu_rl_on_tpu_image_url != null ? var.rl_tpu_rl_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" +} + +variable "rl_tpu_rl_on_tpu_image_url" { + default = null + description = "The URL for the RL on TPU container image." 
+ type = string +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh new file mode 100755 index 000000000..1796c0b4e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+MY_PATH_IRA_ENV="$( + cd "$(dirname "${BASH_SOURCE}")" >/dev/null 2>&1 + pwd -P +)" + +ACP_REPO_DIR="$(realpath ${MY_PATH_IRA_ENV}/../../../../../../../../)" +ACP_PLATFORM_BASE_DIR="${ACP_REPO_DIR}/platforms/gke/base" +ACP_PLATFORM_USE_CASE_DIR="${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning" + +declare -a SHARED_CONFIG_PATHS=( + "${ACP_PLATFORM_BASE_DIR}/_shared_config" + "${ACP_PLATFORM_USE_CASE_DIR}/terraform/_shared_config" +) +export SHARED_CONFIG_PATHS + +source "${ACP_PLATFORM_BASE_DIR}/_shared_config/scripts/set_environment_variables.sh" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..238bf8e95 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild_variables.tf new file mode 120000 index 000000000..8fade6147 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform.auto.tfvars new file mode 120000 index 000000000..c9c406bba --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform.auto.tfvars @@ -0,0 +1 @@ 
+../../../_shared_config/_platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform_variables.tf new file mode 120000 index 000000000..7ec64070d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..171a27a35 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..79960dd37 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/cloudbuild.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/cloudbuild.tf new file mode 100644 index 000000000..4ea88e08d --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/cloudbuild.tf @@ -0,0 +1,47 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + image_destination = local.rl_tpu_rl_on_tpu_image_url +} + +resource "terraform_data" "submit_docker_build" { + input = { + acp_root = local.acp_root + cloudbuild_project_id = local.cloudbuild_project_id + cloudbuild_service_account_id = local.cloudbuild_service_account_id + cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name + image_destination = local.image_destination + } + + provisioner "local-exec" { + command = <<-EOT +gcloud builds submit \ +--config="container-images/tpu/rl-on-tpu/cloudbuild.yaml" \ +--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ +--project="${self.input.cloudbuild_project_id}" \ +--quiet \ +--service-account="${self.input.cloudbuild_service_account_id}" \ +--substitutions=_DESTINATION="${self.input.image_destination}" +EOT + interpreter = ["bash", "-c"] + working_dir = self.input.acp_root + } + + triggers_replace = { + cloudbuild_yaml_hash = filebase64sha256("${local.acp_root}/container-images/tpu/rl-on-tpu/cloudbuild.yaml") + dockerfile_hash = filebase64sha256("${local.acp_root}/container-images/tpu/rl-on-tpu/Dockerfile") + source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/tpu/rl-on-tpu/src", "**") : 
filesha256("${local.acp_root}/container-images/tpu/rl-on-tpu/src/${file}")])) + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/local_file.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/local_file.tf new file mode 100644 index 000000000..2635bb2b3 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/local_file.tf @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + acp_root = "${path.module}/../../../../../../../../.." +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/versions.tf new file mode 100644 index 000000000..971a10c8e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/versions.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "6.49.2" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/acp_rl_images_tpu_rl_on_tpu_deploy-v1" + } +} From 7af72f0cfae8db0d5fb7751bd3114f2495a26662 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 5 Mar 2026 16:24:34 +0000 Subject: [PATCH 03/57] feat: add reinforcement learning dictionary --- .github/workflows/dictionary/reinforcement-learning.txt | 3 +++ cspell.json | 4 ++++ 2 files changed, 7 insertions(+) create mode 100644 .github/workflows/dictionary/reinforcement-learning.txt diff --git a/.github/workflows/dictionary/reinforcement-learning.txt b/.github/workflows/dictionary/reinforcement-learning.txt new file mode 100644 index 000000000..b3bc2040a --- /dev/null +++ b/.github/workflows/dictionary/reinforcement-learning.txt @@ -0,0 +1,3 @@ +grpo +maxtext +strftime diff --git a/cspell.json b/cspell.json index 7e46f38d1..7d93e0f6c 100644 --- a/cspell.json +++ b/cspell.json @@ -70,6 +70,10 @@ "name": "ray", "path": ".github/workflows/dictionary/ray.txt" }, + { + "name": "reinforcement-learning", + "path": ".github/workflows/dictionary/reinforcement-learning.txt" + }, { "name": "shell", "path": ".github/workflows/dictionary/shell.txt" From c7b7a7f2f35faddf63cacb83966a11c76efff604 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 5 Mar 2026 16:24:58 +0000 Subject: [PATCH 04/57] fix: formatting python algorithm --- container-images/tpu/rl-on-tpu/src/app.py | 91 ++++++++++++----------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/container-images/tpu/rl-on-tpu/src/app.py b/container-images/tpu/rl-on-tpu/src/app.py index 2bf645b69..63e482f9b 100644 --- a/container-images/tpu/rl-on-tpu/src/app.py +++ 
b/container-images/tpu/rl-on-tpu/src/app.py @@ -13,54 +13,59 @@ # limitations under the License. import datetime - import os - import jax - import MaxText - from huggingface_hub import login - from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices +import os - # Environment variables for cleaner logging - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" - os.environ["SKIP_JAX_PRECOMPILE"] = "1" - os.environ["VLLM_LOGGING_LEVEL"] = "ERROR" +import jax +import MaxText +from huggingface_hub import login +from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices - HF_TOKEN = os.environ.get("HF_TOKEN", "") - if HF_TOKEN: - login(token=HF_TOKEN) +# Environment variables for cleaner logging +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" +os.environ["SKIP_JAX_PRECOMPILE"] = "1" +os.environ["VLLM_LOGGING_LEVEL"] = "ERROR" - MAXTEXT_PKG_DIR = os.path.dirname(MaxText.__file__) - MAXTEXT_REPO_ROOT = os.sep.join(["maxtext" if p == "MaxText" else p for p in MAXTEXT_PKG_DIR.split(os.sep)]) +HF_TOKEN = os.environ.get("HF_TOKEN", "") +if HF_TOKEN: + login(token=HF_TOKEN) - MODEL_NAME = "llama3.1-8b" - TOKENIZER_PATH = "meta-llama/Llama-3.1-8B-Instruct" - RUN_NAME = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - LOSS_ALGO = "grpo" +MAXTEXT_PKG_DIR = os.path.dirname(MaxText.__file__) +MAXTEXT_REPO_ROOT = os.sep.join( + ["maxtext" if p == "MaxText" else p for p in MAXTEXT_PKG_DIR.split(os.sep)] +) - CHAT_TEMPLATE_PATH = f"{MAXTEXT_REPO_ROOT}/examples/chat_templates/gsm8k_rl.json" - MODEL_CHECKPOINT_PATH = "/workspace/llama_checkpoint" - OUTPUT_DIRECTORY = "/workspace/rl_llama3_output" +MODEL_NAME = "llama3.1-8b" +TOKENIZER_PATH = "meta-llama/Llama-3.1-8B-Instruct" +RUN_NAME = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") +LOSS_ALGO = "grpo" - config_argv = [ - "", - f"{MAXTEXT_PKG_DIR}/configs/post_train/rl.yml", - f"model_name={MODEL_NAME}", - f"tokenizer_path={TOKENIZER_PATH}", - f"run_name={RUN_NAME}", - 
f"chat_template_path={CHAT_TEMPLATE_PATH}", - f"load_parameters_path={MODEL_CHECKPOINT_PATH}/0/items", - f"base_output_directory={OUTPUT_DIRECTORY}", - f"hf_access_token={HF_TOKEN}", - "debug.rl=False", - f"rl.loss_algo={LOSS_ALGO}", - "use_pathways=False" - ] +CHAT_TEMPLATE_PATH = f"{MAXTEXT_REPO_ROOT}/examples/chat_templates/gsm8k_rl.json" +MODEL_CHECKPOINT_PATH = "/workspace/llama_checkpoint" +OUTPUT_DIRECTORY = "/workspace/rl_llama3_output" - trainer_config, sampler_config, trainer_devices, sampler_devices = setup_configs_and_devices(config_argv) +config_argv = [ + "", + f"{MAXTEXT_PKG_DIR}/configs/post_train/rl.yml", + f"model_name={MODEL_NAME}", + f"tokenizer_path={TOKENIZER_PATH}", + f"run_name={RUN_NAME}", + f"chat_template_path={CHAT_TEMPLATE_PATH}", + f"load_parameters_path={MODEL_CHECKPOINT_PATH}/0/items", + f"base_output_directory={OUTPUT_DIRECTORY}", + f"hf_access_token={HF_TOKEN}", + "debug.rl=False", + f"rl.loss_algo={LOSS_ALGO}", + "use_pathways=False", +] - print(f"🚀 Starting {LOSS_ALGO} Training...") - try: - rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices) - print("✅ Training Completed Successfully!") - except Exception as e: - print(f"❌ Training Failed: {str(e)}") - raise +trainer_config, sampler_config, trainer_devices, sampler_devices = ( + setup_configs_and_devices(config_argv) +) + +print(f"🚀 Starting {LOSS_ALGO} Training...") +try: + rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices) + print("✅ Training Completed Successfully!") +except Exception as e: + print(f"❌ Training Failed: {str(e)}") + raise From 0cb1dcb319c96bf6832bf40e101a9a2bf386610c Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Thu, 12 Mar 2026 15:57:11 +0100 Subject: [PATCH 05/57] fix: add reinforcement learning dictionary --- cspell.json | 1 + 1 file changed, 1 insertion(+) diff --git a/cspell.json b/cspell.json index 69863053a..22adb091d 100644 --- a/cspell.json +++ b/cspell.json @@ -117,6 +117,7 @@ "nvidia", "python", 
"ray", + "reinforcement-learning", "shell", "svg", "terraform", From 8d07e7a3acdbeabe18ae5a041a54f2afef123731 Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 10:47:47 +0100 Subject: [PATCH 06/57] feat: add first experiment --- .../reinforcement-learning/README.md | 168 ++++++++++++++++++ .../kubernetes-manifests/rl-on-tpu/job.yaml | 26 +++ .../reinforcement_learning_variables.tf | 7 + .../rl_on_tpu/_cloudbuild.auto.tfvars | 1 + .../rl_on_tpu/_cloudbuild_variables.tf | 1 + .../terraform/rl_on_tpu/_cluster.auto.tfvars | 1 + .../terraform/rl_on_tpu/_cluster_variables.tf | 1 + .../terraform/rl_on_tpu/_platform.auto.tfvars | 1 + .../rl_on_tpu/_platform_variables.tf | 1 + .../_reinforcement_learning.auto.tfvars | 1 + .../_reinforcement_learning_variables.tf | 1 + .../terraform/rl_on_tpu/iam.tf | 20 +++ .../terraform/rl_on_tpu/kubernetes.tf | 87 +++++++++ .../terraform/rl_on_tpu/mlflow.tf | 43 +++++ .../templates/kubernetes/namespace.tftpl.yaml | 18 ++ .../kubernetes/serviceaccount.tftpl.yaml | 19 ++ .../templates/mlflow/manifests.tftpl.yaml | 75 ++++++++ 17 files changed, 471 insertions(+) create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/README.md create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform_variables.tf 
create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning_variables.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/namespace.tftpl.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/serviceaccount.tftpl.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/README.md b/platforms/gke/base/use-cases/reinforcement-learning/README.md new file mode 100644 index 000000000..dd5a9d59f --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/README.md @@ -0,0 +1,168 @@ +# Llama 3.1 8B GRPO Training on GKE (TPU v5e) + +This repository contains a production-ready, end-to-end Reinforcement Learning +(GRPO) pipeline for a single-node smoke test of Llama 3.1 8B on Google +Kubernetes Engine (GKE) using a TPU v5e-8 slice. + +It integrates **MaxText** (for FSDP model training), **vLLM** (for +high-throughput rollout generation), and **Tunix** (the RL bridge). + +_Note: This script is currently configured as a single-batch smoke test. Scaling +up `rl.num_generations` or `per_device_batch_size` for a full training run +triggers an upstream Tunix API mismatch that requires a custom vLLM Monkey +Patch._ + +## 🚀 Quick Start & Environment Setup + +### 1. 
Provision & Connect to the GKE Cluster + +This pipeline is designed to run on the Accelerated Platforms training reference +architecture, which comes pre-configured with CCC and all necessary topology +routing. + +If you do not already have a cluster running, follow the official infrastructure +provisioning guide to spin up a TPU v5e cluster: 👉 +**[Accelerated Platforms GKE Training Architecture README](https://github.com/GoogleCloudPlatform/accelerated-platforms/blob/kr-rl/platforms/gke/base/use-cases/training-ref-arch/terraform/README.md)** + +Once your cluster is up and running, fetch your cluster credentials (replace +with your actual cluster name and region/zone): + +```bash +export PROJECT_ID="" +gcloud config set project $PROJECT_ID +gcloud container clusters get-credentials --location +``` + +### 2. Configure the Hugging Face Secret + +You must have access to the Meta Llama 3.1 weights. The training job securely +pulls your token from a Kubernetes secret. Create it in your active namespace: + +```bash +kubectl create secret generic hf-secret --from-literal=token="" + +``` + +### 3. Hardware & Storage Prerequisites + +- **Hardware:** This configuration is strictly tuned for a **TPU v5e-8** + topology. +- **Storage:** The container requires local ephemeral storage (or a mounted SSD) + at `/workspace` to handle the 16GB checkpoint conversions. + +--- + +## 🛠️ How to Deploy and Run + +### 1. Deploy the MLflow Tracking Server + +Before starting the training job, you must spin up the MLflow service so the +training pod has somewhere to send its metrics and artifacts. + +```bash +kubectl apply -f mlflow.yaml + +``` + +_(Note: This uses a `ClusterIP` configuration, meaning the dashboard is kept +completely internal and secure inside our GKE cluster. The training pod will +automatically discover it at `mlflow-service:5000`)_. + +### 2. Build and Push the Training Image + +```bash +docker build -t your-registry/maxtext-grpo:latest . 
+docker push your-registry/maxtext-grpo:latest + +``` + +### 3. Submit the GKE Training Job + +```bash +kubectl apply -f v5e-job.yaml + +``` + +### 4. Tail the Logs + +```bash +kubectl logs -f job/maxtext-grpo-job-v5e + +``` + +--- + +## 📊 Viewing Metrics (MLflow & TensorBoard) + +MaxText uses a custom C++ backend that logs directly to a local TensorBoard +folder. To make this visible to the team, the `train.py` script automatically +zips this folder and attaches it to **MLflow** as an artifact when the run +completes. + +### Accessing the MLflow UI + +Because MLflow is running securely inside the cluster, you need to port-forward +it to your local machine to view the dashboard: + +1. **Port-forward the MLflow Service:** + +```bash +kubectl port-forward svc/mlflow-service 5000:5000 + +``` + +2. **Open your Browser:** Navigate to `http://localhost:5000` +3. **View the Run Data:** + +- Go to the `MaxText-RL-GRPO` experiment. +- Click on your specific run (e.g., `Llama3.1-8B-grpo`). +- Scroll down to the **Artifacts** section. You will see the `tensorboard_logs` + folder attached there. + +### Live Tracking (During Training) + +If you want to watch the loss curves in real-time _before_ the job finishes and +uploads to MLflow, you can port-forward TensorBoard directly from the running +pod: + +```bash +kubectl exec -it job/maxtext-grpo-job-v5e -- tensorboard --logdir /workspace/rl_llama3_output --host 0.0.0.0 --port 6006 +kubectl port-forward job/maxtext-grpo-job-v5e 6006:6006 + +``` + +--- + +## ⚠️ Critical Architecture Notes & Patches (Do Not Remove) + +Because we are bridging experimental research frameworks (MaxText/Tunix) with +open-source inference (vLLM), several runtime patches are applied in `train.py` +and the `Dockerfile`. **If you modify this pipeline, keep these constraints in +mind:** + +### 1. The C++ Protobuf Shield + +vLLM uses `os.fork()` for its background workers, which fatally crashes the C++ +Protobuf engine loaded by JAX (`SIGABRT`). 
+ +- **The Fix:** We force Python protobufs and `spawn` multiprocessing at the + absolute top of `train.py`. + +### 2. JAX Version Pinning (`0.4.25`) + +Newer versions of JAX strictly enforce `with_sharding_constraint` as an +assertion. Tunix currently violates this when mapping weights to vLLM, causing a +fatal mesh crash. + +- **The Fix:** The `Dockerfile` explicitly pins `jax[tpu]==0.4.25` using the + `--prerelease=allow` flag to grab the stable nightly drivers. + +### 3. Memory & Mesh Tuning + +To prevent vLLM from causing `RESOURCE_EXHAUSTED` (OOM) errors and starving +MaxText's FSDP optimizer: + +- `rollout_tensor_parallelism=8`: Maps vLLM across all 8 chips. +- `hbm_utilization_vllm=0.4`: Restricts vLLM to 40% of the TPU memory. +- _Note:_ The `ici_tensor_parallelism` flag is intentionally omitted so MaxText + defaults to FSDP for training. diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml new file mode 100644 index 000000000..6aaf554f0 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml @@ -0,0 +1,26 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: maxtext-grpo-job-v5e +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + nodeSelector: + cloud.google.com/compute-class: "tpu-v5e-2x4" + containers: + - name: grpo-trainer + image: us-central1-docker.pkg.dev/accelerated-platforms-dev/ml-repo/maxtext-grpo:latest + resources: + limits: + google.com/tpu: 8 + env: + - name: MLFLOW_TRACKING_URI + value: "http://mlflow-service:5000" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: token + \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf index 21c56f74e..ed462c27e 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -14,6 +14,7 @@ locals { rl_tpu_rl_on_tpu_image_url = var.rl_tpu_rl_on_tpu_image_url != null ? var.rl_tpu_rl_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" + rl_kubernetes_namespace = var.rl_kubernetes_namespace != null ? var.rl_kubernetes_namespace : "${local.unique_identifier_prefix}-rl" } variable "rl_tpu_rl_on_tpu_image_url" { @@ -21,3 +22,9 @@ variable "rl_tpu_rl_on_tpu_image_url" { description = "The URL for the RL on TPU container image." type = string } + +variable "rl_kubernetes_namespace" { + default = null + description = "The Kubernetes namespace for the RL on TPU resources." 
+ type = string +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..2af7bbaaa --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/_cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild_variables.tf new file mode 120000 index 000000000..dd199215c --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../_shared_config/_cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster.auto.tfvars new file mode 120000 index 000000000..04c4ae417 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/_cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster_variables.tf new file mode 120000 index 000000000..6713167a1 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster_variables.tf @@ -0,0 +1 @@ +../_shared_config/_cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform.auto.tfvars 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform.auto.tfvars new file mode 120000 index 000000000..f898b3b5a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/_platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform_variables.tf new file mode 120000 index 000000000..f928d86dd --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform_variables.tf @@ -0,0 +1 @@ +../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..f56697856 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..f7d4bb73a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf new file mode 100644 
index 000000000..f124ae43f --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + gsa_build_roles = [ + "roles/logging.logWriter", + ] + wi_member_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject/ns/${local.mft_kubernetes_namespace}/sa" +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf new file mode 100644 index 000000000..06977cfb2 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf @@ -0,0 +1,87 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + kubeconfig_directory = "${path.module}/../../../../kubernetes/kubeconfig/" + kubeconfig_file = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}" + + workloads = { + rl_on_tpu = { + directory = "${local.namespaces_directory}/${local.rl_kubernetes_namespace}" + namespace = local.rl_kubernetes_namespace + service_account = local.rl_kubernetes_service_account_name + } + } + + manifests_directory_root = "${path.module}/../../../../kubernetes/manifests" + namespaces_directory = "${local.manifests_directory_root}/namespace" +} + +data "local_file" "kubeconfig" { + filename = local.kubeconfig_file +} + +resource "local_file" "namespace_yaml" { + for_each = local.workloads + content = templatefile( + "${path.module}/templates/kubernetes/namespace.tftpl.yaml", + { + name = each.value.namespace + } + ) + filename = "${local.namespaces_directory}/namespace-${each.value.namespace}.yaml" +} + +module "kubectl_apply_namespace" { + for_each = local.workloads + depends_on = [ + local_file.namespace_yaml, + ] + + source = "../../../../modules/kubectl_apply" + + apply_server_side = true + delete_timeout = "60s" + error_on_delete_failure = false + kubeconfig_file = data.local_file.kubeconfig.filename + manifest = "${local.namespaces_directory}/namespace-${each.value.namespace}.yaml" + manifest_includes_namespace = true +} + +resource "local_file" "serviceaccount_yaml" { + for_each = local.workloads + content = templatefile( + "${path.module}/templates/kubernetes/serviceaccount.tftpl.yaml", + { + name = each.value.service_account + namespace = each.value.namespace + } + ) + filename = "${each.value.directory}/serviceaccount-${each.value.service_account}.yaml" +} + +module "kubectl_apply_service_account" { + for_each = local.workloads + depends_on = [ + local_file.serviceaccount_yaml, + module.kubectl_apply_namespace, + ] + + source = "../../../../modules/kubectl_apply" + + apply_server_side = true + kubeconfig_file = data.local_file.kubeconfig.filename + 
manifest = "${each.value.directory}/serviceaccount-${each.value.service_account}.yaml" + manifest_includes_namespace = true +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf new file mode 100644 index 000000000..835be0412 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf @@ -0,0 +1,43 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +resource "google_storage_bucket_iam_member" "data_bucket_mlflow_storage_object_admin" { + bucket = google_storage_bucket.data.name + member = "${local.wi_member_principal_prefix}/${local.rl_kubernetes_service_accounts["mlflow"].service_account_name}" + role = "roles/storage.objectAdmin" +} + +resource "local_file" "mlflow_manifest" { + content = templatefile( + "${path.module}/templates/mlflow/manifests.tftpl.yaml", + { + bucket_name = google_storage_bucket.data.name, + service_account_name = local.rl_kubernetes_service_accounts["mlflow"].service_account_name, + } + ) + filename = "${local.rl_kubernetes_namespace_manifests_directory}/mlflow.yaml" +} + +module "kubectl_apply_mlflow_manifest" { + depends_on = [ + module.kubectl_apply_namespace_manifest, + ] + + source = "../../../../modules/kubectl_apply" + + kubeconfig_file = data.local_file.kubeconfig.filename + manifest = local_file.mlflow_manifest.filename + manifest_includes_namespace = false + namespace = local.rl_kubernetes_namespace +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/namespace.tftpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/namespace.tftpl.yaml new file mode 100644 index 000000000..e7dff839d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/namespace.tftpl.yaml @@ -0,0 +1,18 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: v1 +kind: Namespace +metadata: + name: ${name} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/serviceaccount.tftpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/serviceaccount.tftpl.yaml new file mode 100644 index 000000000..a0f63c9dc --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/serviceaccount.tftpl.yaml @@ -0,0 +1,19 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ${name} + namespace: ${namespace} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml new file mode 100644 index 000000000..4383b0ee3 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml @@ -0,0 +1,75 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mlflow-tracking +spec: + replicas: 1 + selector: + matchLabels: + app: mlflow-tracking + strategy: + type: RollingUpdate + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + labels: + app: mlflow-tracking + spec: + containers: + - args: + - | + mlflow server --host 0.0.0.0 --port 5000 --backend-store-uri sqlite:///mlruns/mlflow.db + command: ["sh", "-c"] + image: ghcr.io/mlflow/mlflow:v3.10.1-full + name: mlflow + resources: + limits: + cpu: "2" + memory: 10Gi + requests: + cpu: "2" + memory: 10Gi + volumeMounts: + - mountPath: /mlruns + name: gcs-fuse-csi-ephemeral + serviceAccountName: ${service_account_name} + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + volumes: + - csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: ${bucket_name} + gcsfuseLoggingSeverity: warning + mountOptions: implicit-dirs + name: gcs-fuse-csi-ephemeral +--- +apiVersion: v1 +kind: Service +metadata: + name: mlflow-tracking-svc +spec: + ports: + - port: 5000 + protocol: TCP + targetPort: 5000 + selector: + app: mlflow-tracking From 05ac3b29c385e154a91ebcda875e3d535406b24f Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 10:48:15 +0100 Subject: [PATCH 07/57] feat: add training script --- container-images/tpu/rl-on-tpu/Dockerfile | 40 ++++- container-images/tpu/rl-on-tpu/src/app.py | 199 +++++++++++++++++----- 2 files changed, 191 insertions(+), 48 deletions(-) diff --git 
a/container-images/tpu/rl-on-tpu/Dockerfile b/container-images/tpu/rl-on-tpu/Dockerfile index baf33f651..3ea11e817 100644 --- a/container-images/tpu/rl-on-tpu/Dockerfile +++ b/container-images/tpu/rl-on-tpu/Dockerfile @@ -16,17 +16,41 @@ FROM python:3.12-slim -WORKDIR /app +WORKDIR /workspace -RUN pip install --no-cache-dir uv +# Install system dependencies +RUN apt-get update && apt-get install -y wget curl build-essential git && rm -rf /var/lib/apt/lists/* -RUN uv pip install --system "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html +# Upgrade pip and install the incredibly fast 'uv' package manager +RUN pip install --upgrade pip uv -RUN uv pip install --system maxtext --resolution=lowest -RUN install_maxtext_github_deps +# 1. Install standard Torch CPU and specific JAX TPU drivers first (Our safety net!) +RUN uv pip install --system torch torchvision --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --system "jax[tpu]==0.4.25" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html --prerelease=allow -RUN uv pip install --system torch --index-url https://download.pytorch.org/whl/cpu +# 2. THE GOOGLER WAY: Clone the repo +RUN git clone https://github.com/google/maxtext.git /workspace/maxtext -COPY --from=primary app.py ./ +# Shift Docker's working directory INSIDE the repo +WORKDIR /workspace/maxtext -ENTRYPOINT ["python", "app.py"] +# 3. Apply the Googlers' lowest flag (no 'cd' needed anymore!) +RUN uv pip install --system -e ".[tpu-post-train]" --resolution=lowest + +# 4. Run the script (it will natively find the 'src' folder now!) +RUN install_maxtext_tpu_post_train_extra_deps + +# Shift back to your main workspace for your custom files +WORKDIR /workspace + +# 5. Install our specific MLOps tracking tools +RUN uv pip install --system mlflow huggingface_hub math_verify + +# 6. 
Download the chat template directly +RUN wget https://raw.githubusercontent.com/google/maxtext/main/src/maxtext/examples/chat_templates/gsm8k_rl.json -O /workspace/gsm8k_rl.json + +# Copy our training script +COPY train.py /workspace/train.py + +# Execute the script +CMD ["python3", "train.py"] diff --git a/container-images/tpu/rl-on-tpu/src/app.py b/container-images/tpu/rl-on-tpu/src/app.py index 63e482f9b..6f839ab55 100644 --- a/container-images/tpu/rl-on-tpu/src/app.py +++ b/container-images/tpu/rl-on-tpu/src/app.py @@ -1,48 +1,130 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import datetime import os +# --- SYSTEM SHIELDS (Must be at the very top!) --- +os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +import sys +import datetime +import subprocess +import mlflow import jax -import MaxText from huggingface_hub import login -from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices +import jax.numpy as jnp + +# --- Replace the old tunix import with this --- +from maxtext.inference.vllm_decode import VllmRollout + +# 1. 
Save the original method +original_get_logps = VllmRollout.get_per_token_logps + +def patched_get_per_token_logps(self, *args, **kwargs): + # Fix A: Intercept the mask to use as a blueprint + completion_mask = kwargs.pop('completion_mask', None) + + # Call the actual vLLM execution + results = original_get_logps(self, *args, **kwargs) + + # Extract target length (defaults to 768 if mask is missing) + target_len = completion_mask.shape[-1] if completion_mask is not None else 768 + + def pad_sequence(seq): + seq_arr = jnp.array(seq) + + # If vLLM returned an empty array, return a zeroed array of correct shape + if seq_arr.size == 0: + return jnp.zeros(target_len) + + # Pad with zeros if too short, or truncate if too long + pad_amount = target_len - seq_arr.shape[0] + if pad_amount > 0: + return jnp.pad(seq_arr, (0, pad_amount), constant_values=0.0) + elif pad_amount < 0: + return seq_arr[:target_len] + return seq_arr + + # Fix B: Process ragged lists and perfectly pad them into a rigid JAX block + if isinstance(results, list): + padded_results = [pad_sequence(seq) for seq in results] + return jnp.stack(padded_results) + + elif isinstance(results, dict): + return {k: jnp.stack([pad_sequence(seq) for seq in v]) if isinstance(v, list) else v for k, v in results.items()} + + return results + +# 2. 
Apply the patch +VllmRollout.get_per_token_logps = patched_get_per_token_logps -# Environment variables for cleaner logging -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" -os.environ["SKIP_JAX_PRECOMPILE"] = "1" -os.environ["VLLM_LOGGING_LEVEL"] = "ERROR" +print("🔧 Applied Monkey Patch v3: Intercepted kwargs and perfectly padded ragged JAX arrays.") + +try: + import vllm + print(f"✅ vLLM Version: {vllm.__version__}") + print(f"✅ JAX TPU Devices: {len(jax.devices())}") +except ImportError as e: + print(f"🚨 FATAL: vLLM is not installed correctly: {e}") + +# --- CORE IMPORTS --- +import maxtext +from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices +import maxtext.checkpoint_conversion.to_maxtext as to_maxtext_module +from etils import epath -HF_TOKEN = os.environ.get("HF_TOKEN", "") -if HF_TOKEN: - login(token=HF_TOKEN) +HF_TOKEN = os.environ.get("HF_TOKEN") +if not HF_TOKEN: + raise ValueError("HF_TOKEN environment variable not set.") +login(token=HF_TOKEN) +# Delete this line: +# MAXTEXT_PKG_DIR = os.path.dirname(maxtext.__file__) -MAXTEXT_PKG_DIR = os.path.dirname(MaxText.__file__) -MAXTEXT_REPO_ROOT = os.sep.join( - ["maxtext" if p == "MaxText" else p for p in MAXTEXT_PKG_DIR.split(os.sep)] -) +# Replace it with the hardcoded absolute path where we cloned the repo: +MAXTEXT_PKG_DIR = "/workspace/maxtext/src/maxtext" MODEL_NAME = "llama3.1-8b" TOKENIZER_PATH = "meta-llama/Llama-3.1-8B-Instruct" RUN_NAME = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") -LOSS_ALGO = "grpo" +LOSS_ALGO = "grpo" -CHAT_TEMPLATE_PATH = f"{MAXTEXT_REPO_ROOT}/examples/chat_templates/gsm8k_rl.json" +# Paths are localized to the workspace +CHAT_TEMPLATE_PATH = "/workspace/gsm8k_rl.json" MODEL_CHECKPOINT_PATH = "/workspace/llama_checkpoint" OUTPUT_DIRECTORY = "/workspace/rl_llama3_output" +mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")) +mlflow.set_experiment("MaxText-RL-GRPO") + +# --- CHECKPOINT CONVERSION (With 
FP32 casting patch) --- +target_checkpoint_items = f"{MODEL_CHECKPOINT_PATH}/0/items" + +if not epath.Path(target_checkpoint_items).exists(): + print(f"Downloading and converting Llama 3.1 to MaxText format...") + to_maxtext_path = to_maxtext_module.__file__ + + with open(to_maxtext_path, "r") as f: + script_content = f.read() + + if "v.numpy()" in script_content: + script_content = script_content.replace("v.numpy()", "v.float().numpy()") + + with open(to_maxtext_path, "w") as f: + f.write(script_content) + + conversion_command = ( + f"JAX_PLATFORMS=cpu python3 -m maxtext.checkpoint_conversion.to_maxtext " + f"{MAXTEXT_PKG_DIR}/configs/base.yml " + f"model_name={MODEL_NAME} " + f"base_output_directory={MODEL_CHECKPOINT_PATH} " + f"hf_access_token={HF_TOKEN} " + f"use_multimodal=false scan_layers=true skip_jax_distributed_system=True" + ) + + result = subprocess.run(conversion_command, shell=True, executable='/bin/bash') + if result.returncode != 0: + raise RuntimeError("Checkpoint conversion failed! Check the logs above.") +else: + print(f"✅ Found existing Orbax checkpoint at {target_checkpoint_items}") + +# --- MAXTEXT RL CONFIGURATION --- config_argv = [ "", f"{MAXTEXT_PKG_DIR}/configs/post_train/rl.yml", @@ -51,21 +133,58 @@ f"run_name={RUN_NAME}", f"chat_template_path={CHAT_TEMPLATE_PATH}", f"load_parameters_path={MODEL_CHECKPOINT_PATH}/0/items", - f"base_output_directory={OUTPUT_DIRECTORY}", + #f"base_output_directory={OUTPUT_DIRECTORY}", + # --- DIRECT TO GCS ROUTING --- + f"base_output_directory=gs://accelerated-platforms-dev-trn-rl-gpu-hf-hub-models/my-grpo-checkpoints/rl_llama3_output/{RUN_NAME}", f"hf_access_token={HF_TOKEN}", "debug.rl=False", f"rl.loss_algo={LOSS_ALGO}", + "rl.rollout_engine=vllm", "use_pathways=False", + "rollout_expert_parallelism=1", + # --- THE MESH & MEMORY FIX --- + # 1. We keep vLLM sliced across all 8 chips + "rollout_tensor_parallelism=8", + + # 2. DELETE the ici_tensor_parallelism line! We let MaxText default to FSDP. + + # 3. 
Restrict vLLM to 40% memory so MaxText has room to train + "hbm_utilization_vllm=0.4", + + # 4. Give vLLM the blueprint it needs to build its mesh scaffolding + f"vllm_hf_config_path={TOKENIZER_PATH}", + + # --- SCALING UP FOR GRPO --- + #"per_device_batch_size=4", # Increased from 1 to test micro-batching + #"rl.num_generations=4", # Crucial for GRPO: Generates 4 reasoning chains per prompt + # --- THE SCALE UP (Real Training Run) --- + "per_device_batch_size=2", # Doubling the throughput (pushes your 60% HBM limit) + "num_batches=200", # Process 200 batches instead of the tiny default + "rl.num_generations=8", # GRPO magic: vLLM generates 8 different answers per prompt to compare + "rl.num_iterations=2", # Train the actor model for 2 iterations on those 8 answers + "learning_rate=1e-6", # A standard, safe learning rate for RL fine-tuning + "save_checkpoint_on_completion=True", # Ensure the final weights are saved! + "return_log_prob=True", # <-- The crucial GRPO math flag! + # --- THE MLFLOW FIX --- + "log_period=10", # Force TensorBoard to write metrics to disk every 10 steps + "checkpoint_period=50", + "profiler=True", + "profiler_steps=100,110", # Takes a massive hardware snapshot between step 100 and 110 ] -trainer_config, sampler_config, trainer_devices, sampler_devices = ( - setup_configs_and_devices(config_argv) -) +trainer_config, sampler_config, trainer_devices, sampler_devices = setup_configs_and_devices(config_argv) -print(f"🚀 Starting {LOSS_ALGO} Training...") -try: +# --- EXECUTE TRAINING --- +with mlflow.start_run(run_name=f"Llama3.1-8B-{LOSS_ALGO}"): + mlflow.log_params({ + "model_name": MODEL_NAME, + "loss_algo": LOSS_ALGO, + "tpu_devices": len(jax.devices()), + "rollout_engine": "vllm" + }) + + print(f"🚀 Starting {LOSS_ALGO} Training on {len(jax.devices())} TPUs...") rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices) - print("✅ Training Completed Successfully!") -except Exception as e: - print(f"❌ Training Failed: 
{str(e)}") - raise + + mlflow.log_artifacts(trainer_config.tensorboard_dir, artifact_path="tensorboard_logs") + print("✅ Training Completed and Logged to MLflow!") From cc8ad6b5b42257dfb3c068ef30c8902c9c6176ef Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 11:00:00 +0100 Subject: [PATCH 08/57] fix: cicd errors --- .../dictionary/reinforcement-learning.txt | 6 ++++++ container-images/tpu/rl-on-tpu/Dockerfile | 8 ++++---- container-images/tpu/rl-on-tpu/src/app.py | 14 ++++++++++++++ .../kubernetes-manifests/rl-on-tpu/job.yaml | 1 - 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/.github/workflows/dictionary/reinforcement-learning.txt b/.github/workflows/dictionary/reinforcement-learning.txt index b3bc2040a..af1028915 100644 --- a/.github/workflows/dictionary/reinforcement-learning.txt +++ b/.github/workflows/dictionary/reinforcement-learning.txt @@ -1,3 +1,9 @@ +epath grpo +logps maxtext +MULTIPROC +Orbax +returncode strftime +tunix diff --git a/container-images/tpu/rl-on-tpu/Dockerfile b/container-images/tpu/rl-on-tpu/Dockerfile index 3ea11e817..037a0d68a 100644 --- a/container-images/tpu/rl-on-tpu/Dockerfile +++ b/container-images/tpu/rl-on-tpu/Dockerfile @@ -28,13 +28,13 @@ RUN pip install --upgrade pip uv RUN uv pip install --system torch torchvision --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --system "jax[tpu]==0.4.25" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html --prerelease=allow -# 2. THE GOOGLER WAY: Clone the repo +# 2. Clone the repo RUN git clone https://github.com/google/maxtext.git /workspace/maxtext # Shift Docker's working directory INSIDE the repo WORKDIR /workspace/maxtext -# 3. Apply the Googlers' lowest flag (no 'cd' needed anymore!) +# 3. Apply the lowest flag (no 'cd' needed anymore!) RUN uv pip install --system -e ".[tpu-post-train]" --resolution=lowest # 4. Run the script (it will natively find the 'src' folder now!) 
@@ -50,7 +50,7 @@ RUN uv pip install --system mlflow huggingface_hub math_verify RUN wget https://raw.githubusercontent.com/google/maxtext/main/src/maxtext/examples/chat_templates/gsm8k_rl.json -O /workspace/gsm8k_rl.json # Copy our training script -COPY train.py /workspace/train.py +COPY train.py /workspace/app.py # Execute the script -CMD ["python3", "train.py"] +CMD ["python3", "app.py"] diff --git a/container-images/tpu/rl-on-tpu/src/app.py b/container-images/tpu/rl-on-tpu/src/app.py index 6f839ab55..6aeb67fbe 100644 --- a/container-images/tpu/rl-on-tpu/src/app.py +++ b/container-images/tpu/rl-on-tpu/src/app.py @@ -1,3 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os # --- SYSTEM SHIELDS (Must be at the very top!) 
--- os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml index 6aaf554f0..806178788 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml @@ -23,4 +23,3 @@ spec: secretKeyRef: name: hf-secret key: token - \ No newline at end of file From a29e6e1599da09cae938bcb2ffb769d21e74251d Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 16 Mar 2026 10:22:16 +0000 Subject: [PATCH 09/57] fix: formatting --- container-images/tpu/rl-on-tpu/src/app.py | 123 ++++++++++++---------- 1 file changed, 70 insertions(+), 53 deletions(-) diff --git a/container-images/tpu/rl-on-tpu/src/app.py b/container-images/tpu/rl-on-tpu/src/app.py index 6aeb67fbe..9c8ff1a18 100644 --- a/container-images/tpu/rl-on-tpu/src/app.py +++ b/container-images/tpu/rl-on-tpu/src/app.py @@ -13,66 +13,77 @@ # limitations under the License. import os + # --- SYSTEM SHIELDS (Must be at the very top!) --- os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -import sys + import datetime import subprocess -import mlflow +import sys + import jax -from huggingface_hub import login import jax.numpy as jnp +import mlflow +from huggingface_hub import login # --- Replace the old tunix import with this --- from maxtext.inference.vllm_decode import VllmRollout -# 1. Save the original method +# 1. 
Save the original method original_get_logps = VllmRollout.get_per_token_logps + def patched_get_per_token_logps(self, *args, **kwargs): # Fix A: Intercept the mask to use as a blueprint - completion_mask = kwargs.pop('completion_mask', None) - + completion_mask = kwargs.pop("completion_mask", None) + # Call the actual vLLM execution results = original_get_logps(self, *args, **kwargs) - + # Extract target length (defaults to 768 if mask is missing) target_len = completion_mask.shape[-1] if completion_mask is not None else 768 - + def pad_sequence(seq): seq_arr = jnp.array(seq) - + # If vLLM returned an empty array, return a zeroed array of correct shape if seq_arr.size == 0: return jnp.zeros(target_len) - + # Pad with zeros if too short, or truncate if too long pad_amount = target_len - seq_arr.shape[0] if pad_amount > 0: return jnp.pad(seq_arr, (0, pad_amount), constant_values=0.0) elif pad_amount < 0: - return seq_arr[:target_len] + return seq_arr[:target_len] return seq_arr # Fix B: Process ragged lists and perfectly pad them into a rigid JAX block if isinstance(results, list): padded_results = [pad_sequence(seq) for seq in results] return jnp.stack(padded_results) - + elif isinstance(results, dict): - return {k: jnp.stack([pad_sequence(seq) for seq in v]) if isinstance(v, list) else v for k, v in results.items()} - + return { + k: jnp.stack([pad_sequence(seq) for seq in v]) if isinstance(v, list) else v + for k, v in results.items() + } + return results + # 2. Apply the patch VllmRollout.get_per_token_logps = patched_get_per_token_logps -print("🔧 Applied Monkey Patch v3: Intercepted kwargs and perfectly padded ragged JAX arrays.") - +print( + "🔧 Applied Monkey Patch v3: Intercepted kwargs and perfectly padded ragged JAX arrays." 
+) + try: import vllm + print(f"✅ vLLM Version: {vllm.__version__}") print(f"✅ JAX TPU Devices: {len(jax.devices())}") except ImportError as e: @@ -80,9 +91,9 @@ def pad_sequence(seq): # --- CORE IMPORTS --- import maxtext -from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices import maxtext.checkpoint_conversion.to_maxtext as to_maxtext_module from etils import epath +from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices HF_TOKEN = os.environ.get("HF_TOKEN") if not HF_TOKEN: @@ -97,7 +108,7 @@ def pad_sequence(seq): MODEL_NAME = "llama3.1-8b" TOKENIZER_PATH = "meta-llama/Llama-3.1-8B-Instruct" RUN_NAME = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") -LOSS_ALGO = "grpo" +LOSS_ALGO = "grpo" # Paths are localized to the workspace CHAT_TEMPLATE_PATH = "/workspace/gsm8k_rl.json" @@ -113,16 +124,16 @@ def pad_sequence(seq): if not epath.Path(target_checkpoint_items).exists(): print(f"Downloading and converting Llama 3.1 to MaxText format...") to_maxtext_path = to_maxtext_module.__file__ - + with open(to_maxtext_path, "r") as f: script_content = f.read() - + if "v.numpy()" in script_content: script_content = script_content.replace("v.numpy()", "v.float().numpy()") - + with open(to_maxtext_path, "w") as f: f.write(script_content) - + conversion_command = ( f"JAX_PLATFORMS=cpu python3 -m maxtext.checkpoint_conversion.to_maxtext " f"{MAXTEXT_PKG_DIR}/configs/base.yml " @@ -131,8 +142,8 @@ def pad_sequence(seq): f"hf_access_token={HF_TOKEN} " f"use_multimodal=false scan_layers=true skip_jax_distributed_system=True" ) - - result = subprocess.run(conversion_command, shell=True, executable='/bin/bash') + + result = subprocess.run(conversion_command, shell=True, executable="/bin/bash") if result.returncode != 0: raise RuntimeError("Checkpoint conversion failed! 
Check the logs above.") else: @@ -147,58 +158,64 @@ def pad_sequence(seq): f"run_name={RUN_NAME}", f"chat_template_path={CHAT_TEMPLATE_PATH}", f"load_parameters_path={MODEL_CHECKPOINT_PATH}/0/items", - #f"base_output_directory={OUTPUT_DIRECTORY}", + # f"base_output_directory={OUTPUT_DIRECTORY}", # --- DIRECT TO GCS ROUTING --- f"base_output_directory=gs://accelerated-platforms-dev-trn-rl-gpu-hf-hub-models/my-grpo-checkpoints/rl_llama3_output/{RUN_NAME}", f"hf_access_token={HF_TOKEN}", "debug.rl=False", f"rl.loss_algo={LOSS_ALGO}", - "rl.rollout_engine=vllm", + "rl.rollout_engine=vllm", "use_pathways=False", "rollout_expert_parallelism=1", # --- THE MESH & MEMORY FIX --- # 1. We keep vLLM sliced across all 8 chips - "rollout_tensor_parallelism=8", - + "rollout_tensor_parallelism=8", # 2. DELETE the ici_tensor_parallelism line! We let MaxText default to FSDP. - # 3. Restrict vLLM to 40% memory so MaxText has room to train - "hbm_utilization_vllm=0.4", - + "hbm_utilization_vllm=0.4", # 4. Give vLLM the blueprint it needs to build its mesh scaffolding - f"vllm_hf_config_path={TOKENIZER_PATH}", - + f"vllm_hf_config_path={TOKENIZER_PATH}", # --- SCALING UP FOR GRPO --- - #"per_device_batch_size=4", # Increased from 1 to test micro-batching - #"rl.num_generations=4", # Crucial for GRPO: Generates 4 reasoning chains per prompt + # "per_device_batch_size=4", # Increased from 1 to test micro-batching + # "rl.num_generations=4", # Crucial for GRPO: Generates 4 reasoning chains per prompt # --- THE SCALE UP (Real Training Run) --- "per_device_batch_size=2", # Doubling the throughput (pushes your 60% HBM limit) - "num_batches=200", # Process 200 batches instead of the tiny default - "rl.num_generations=8", # GRPO magic: vLLM generates 8 different answers per prompt to compare - "rl.num_iterations=2", # Train the actor model for 2 iterations on those 8 answers - "learning_rate=1e-6", # A standard, safe learning rate for RL fine-tuning - "save_checkpoint_on_completion=True", # 
Ensure the final weights are saved! - "return_log_prob=True", # <-- The crucial GRPO math flag! + "num_batches=200", # Process 200 batches instead of the tiny default + "rl.num_generations=8", # GRPO magic: vLLM generates 8 different answers per prompt to compare + "rl.num_iterations=2", # Train the actor model for 2 iterations on those 8 answers + "learning_rate=1e-6", # A standard, safe learning rate for RL fine-tuning + "save_checkpoint_on_completion=True", # Ensure the final weights are saved! + "return_log_prob=True", # <-- The crucial GRPO math flag! # --- THE MLFLOW FIX --- - "log_period=10", # Force TensorBoard to write metrics to disk every 10 steps + "log_period=10", # Force TensorBoard to write metrics to disk every 10 steps "checkpoint_period=50", "profiler=True", - "profiler_steps=100,110", # Takes a massive hardware snapshot between step 100 and 110 + "profiler_steps=100,110", # Takes a massive hardware snapshot between step 100 and 110 ] -trainer_config, sampler_config, trainer_devices, sampler_devices = setup_configs_and_devices(config_argv) +trainer_config, sampler_config, trainer_devices, sampler_devices = ( + setup_configs_and_devices(config_argv) +) # --- EXECUTE TRAINING --- with mlflow.start_run(run_name=f"Llama3.1-8B-{LOSS_ALGO}"): - mlflow.log_params({ - "model_name": MODEL_NAME, - "loss_algo": LOSS_ALGO, - "tpu_devices": len(jax.devices()), - "rollout_engine": "vllm" - }) - + mlflow.log_params( + { + "model_name": MODEL_NAME, + "loss_algo": LOSS_ALGO, + "tpu_devices": len(jax.devices()), + "rollout_engine": "vllm", + } + ) + print(f"🚀 Starting {LOSS_ALGO} Training on {len(jax.devices())} TPUs...") rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices) - - mlflow.log_artifacts(trainer_config.tensorboard_dir, artifact_path="tensorboard_logs") + + mlflow.log_artifacts( + trainer_config.tensorboard_dir, artifact_path="tensorboard_logs" + ) + print("✅ Training Completed and Logged to MLflow!") + mlflow.log_artifacts( + 
trainer_config.tensorboard_dir, artifact_path="tensorboard_logs" + ) print("✅ Training Completed and Logged to MLflow!") From 237840a61bd0d424d2d89cd1e13d1c20e66b8cb4 Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 11:24:50 +0100 Subject: [PATCH 10/57] fix: cicd errors --- .../dictionary/reinforcement-learning.txt | 4 ++-- .../kubernetes-manifests/rl-on-tpu/job.yaml | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dictionary/reinforcement-learning.txt b/.github/workflows/dictionary/reinforcement-learning.txt index af1028915..11c4ca2c5 100644 --- a/.github/workflows/dictionary/reinforcement-learning.txt +++ b/.github/workflows/dictionary/reinforcement-learning.txt @@ -1,9 +1,9 @@ +Orbax epath grpo logps maxtext -MULTIPROC -Orbax +multiproc returncode strftime tunix diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml index 806178788..2873b755c 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml @@ -1,3 +1,17 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- apiVersion: batch/v1 kind: Job metadata: From 4437bf6c12f4f4daca9186e401a5a98436fab7c9 Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 11:26:05 +0100 Subject: [PATCH 11/57] fix: cicd errors --- .github/workflows/dictionary/reinforcement-learning.txt | 1 - container-images/tpu/rl-on-tpu/src/app.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/dictionary/reinforcement-learning.txt b/.github/workflows/dictionary/reinforcement-learning.txt index 11c4ca2c5..4eed6369e 100644 --- a/.github/workflows/dictionary/reinforcement-learning.txt +++ b/.github/workflows/dictionary/reinforcement-learning.txt @@ -1,4 +1,3 @@ -Orbax epath grpo logps diff --git a/container-images/tpu/rl-on-tpu/src/app.py b/container-images/tpu/rl-on-tpu/src/app.py index 9c8ff1a18..14262ef4a 100644 --- a/container-images/tpu/rl-on-tpu/src/app.py +++ b/container-images/tpu/rl-on-tpu/src/app.py @@ -147,7 +147,7 @@ def pad_sequence(seq): if result.returncode != 0: raise RuntimeError("Checkpoint conversion failed! 
Check the logs above.") else: - print(f"✅ Found existing Orbax checkpoint at {target_checkpoint_items}") + print(f"✅ Found existing checkpoint at {target_checkpoint_items}") # --- MAXTEXT RL CONFIGURATION --- config_argv = [ From 1d4d355c2601e945a5ce448961d2655ac8b479a6 Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 11:30:04 +0100 Subject: [PATCH 12/57] fix: cicd errors --- .github/workflows/dictionary/reinforcement-learning.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/dictionary/reinforcement-learning.txt b/.github/workflows/dictionary/reinforcement-learning.txt index 4eed6369e..240fb3a40 100644 --- a/.github/workflows/dictionary/reinforcement-learning.txt +++ b/.github/workflows/dictionary/reinforcement-learning.txt @@ -1,8 +1,11 @@ epath +etils grpo +logdir logps maxtext multiproc returncode +sigabrt strftime tunix From 73f602a9855f56849d723ebc6241c42bfe2d34c9 Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 14:30:10 +0100 Subject: [PATCH 13/57] fix: removing unnecessary line --- container-images/tpu/rl-on-tpu/src/app.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/container-images/tpu/rl-on-tpu/src/app.py b/container-images/tpu/rl-on-tpu/src/app.py index 14262ef4a..7ea803688 100644 --- a/container-images/tpu/rl-on-tpu/src/app.py +++ b/container-images/tpu/rl-on-tpu/src/app.py @@ -99,8 +99,6 @@ def pad_sequence(seq): if not HF_TOKEN: raise ValueError("HF_TOKEN environment variable not set.") login(token=HF_TOKEN) -# Delete this line: -# MAXTEXT_PKG_DIR = os.path.dirname(maxtext.__file__) # Replace it with the hardcoded absolute path where we cloned the repo: MAXTEXT_PKG_DIR = "/workspace/maxtext/src/maxtext" From c6104a2c0657be3bd4fda91d1941a4f2f6de040f Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 14:33:09 +0100 Subject: [PATCH 14/57] feat: add doc for rl on tpu --- README.md | 4 ++-- .../gke/base/use-cases/reinforcement-larning/README.md | 1 + 
.../use-cases/reinforcement-larning/rl-on-tpu}/README.md | 7 ------- 3 files changed, 3 insertions(+), 9 deletions(-) create mode 100644 docs/platforms/gke/base/use-cases/reinforcement-larning/README.md rename {platforms/gke/base/use-cases/reinforcement-learning => docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu}/README.md (99%) diff --git a/README.md b/README.md index 0a5223ae8..d729b3a92 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,6 @@ the primary runtime. - [ComfyUI reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/examples/comfyui/README.md) - [Federated learning](/docs/platforms/gke/base/use-cases/federated-learning/README.md) - [Inference reference architecture](/docs/platforms/gke/base/use-cases/inference-ref-arch/README.md) - - [Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md) - [Online inference with GPUs](/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/README.md) - [Online inference using Diffusers with GPUs on Google Kubernetes Engine (GKE)](/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md) @@ -54,13 +53,14 @@ the primary runtime. 
- [Batch inference with GPUs](/docs/platforms/gke/base/use-cases/inference-ref-arch/batch-inference/README.md) - [Offline batch inference with GPUs](/docs/platforms/gke/base/use-cases/inference-ref-arch/offline-batch/README.md) - [Intelligent inference scheduling quickstart using llm-d](/docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/README.md) - - [Training reference architecture](/docs/platforms/gke/base/use-cases/training-ref-arch/README.md) - [Model fine tuning](/docs/platforms/gke/base/use-cases/training-ref-arch/model-fine-tuning/README.md) - [Data processing](/docs/platforms/gke/base/use-cases/training-ref-arch/model-fine-tuning/data-processing.md) - [Data preparation](/docs/platforms/gke/base/use-cases/training-ref-arch/model-fine-tuning/data-preparation.md) - [Fine tuning](/docs/platforms/gke/base/use-cases/training-ref-arch/model-fine-tuning/fine-tuning.md) - [Model evaluation](/docs/platforms/gke/base/use-cases/training-ref-arch/model-fine-tuning/model-evaluation.md) +- [Reinforcement Learning reference architecture](/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md) + - [RL on TPU](/docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu/README.md) ### Guides diff --git a/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md b/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md new file mode 100644 index 000000000..b979d0027 --- /dev/null +++ b/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md @@ -0,0 +1 @@ +# Reinforcement Learning reference architecture \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/README.md b/docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu/README.md similarity index 99% rename from platforms/gke/base/use-cases/reinforcement-learning/README.md rename to docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu/README.md index dd5a9d59f..578e219af 100644 --- 
a/platforms/gke/base/use-cases/reinforcement-learning/README.md +++ b/docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu/README.md @@ -40,7 +40,6 @@ pulls your token from a Kubernetes secret. Create it in your active namespace: ```bash kubectl create secret generic hf-secret --from-literal=token="" - ``` ### 3. Hardware & Storage Prerequisites @@ -61,7 +60,6 @@ training pod has somewhere to send its metrics and artifacts. ```bash kubectl apply -f mlflow.yaml - ``` _(Note: This uses a `ClusterIP` configuration, meaning the dashboard is kept @@ -73,21 +71,18 @@ automatically discover it at `mlflow-service:5000`)_. ```bash docker build -t your-registry/maxtext-grpo:latest . docker push your-registry/maxtext-grpo:latest - ``` ### 3. Submit the GKE Training Job ```bash kubectl apply -f v5e-job.yaml - ``` ### 4. Tail the Logs ```bash kubectl logs -f job/maxtext-grpo-job-v5e - ``` --- @@ -108,7 +103,6 @@ it to your local machine to view the dashboard: ```bash kubectl port-forward svc/mlflow-service 5000:5000 - ``` 2. 
**Open your Browser:** Navigate to `http://localhost:5000` @@ -128,7 +122,6 @@ pod: ```bash kubectl exec -it job/maxtext-grpo-job-v5e -- tensorboard --logdir /workspace/rl_llama3_output --host 0.0.0.0 --port 6006 kubectl port-forward job/maxtext-grpo-job-v5e 6006:6006 - ``` --- From ce364614fb2197b81884436f13182839b974cac2 Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 14:33:51 +0100 Subject: [PATCH 15/57] feat: add reinforcement learning cluster deployment --- .../reinforcement_learning_variables.tf | 19 +++++++++++++++++-- .../terraform/rl_on_tpu/iam.tf | 2 +- .../terraform/rl_on_tpu/kubernetes.tf | 5 +++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf index ed462c27e..143f413d9 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -13,8 +13,17 @@ # limitations under the License. locals { - rl_tpu_rl_on_tpu_image_url = var.rl_tpu_rl_on_tpu_image_url != null ? var.rl_tpu_rl_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" - rl_kubernetes_namespace = var.rl_kubernetes_namespace != null ? var.rl_kubernetes_namespace : "${local.unique_identifier_prefix}-rl" + rl_kubernetes_namespace_manifests_directory = "${path.module}/manifests/${local.rl_kubernetes_namespace}" + rl_kubernetes_namespace = var.rl_kubernetes_namespace != null ? var.rl_kubernetes_namespace : "${local.unique_identifier_prefix}-rl" + rl_kubernetes_service_account_name = var.rl_kubernetes_service_account_name != null ? 
var.rl_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-sa" + rl_tpu_rl_on_tpu_image_url = var.rl_tpu_rl_on_tpu_image_url != null ? var.rl_tpu_rl_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" + + rl_kubernetes_service_accounts = { + mlflow = { + automount_service_account_token = false + service_account_name = "${local.unique_identifier_prefix}-mlflow" + } + } } variable "rl_tpu_rl_on_tpu_image_url" { @@ -28,3 +37,9 @@ variable "rl_kubernetes_namespace" { description = "The Kubernetes namespace for the RL on TPU resources." type = string } + +variable "rl_kubernetes_service_account_name" { + default = null + description = "The Kubernetes service account name for the RL on TPU resources." + type = string +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf index f124ae43f..c1d693321 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf @@ -16,5 +16,5 @@ locals { gsa_build_roles = [ "roles/logging.logWriter", ] - wi_member_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject/ns/${local.mft_kubernetes_namespace}/sa" + wi_member_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject/ns/${local.rl_kubernetes_namespace}/sa" } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf index 06977cfb2..a60d47100 100644 --- 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf @@ -24,8 +24,9 @@ locals { } } - manifests_directory_root = "${path.module}/../../../../kubernetes/manifests" - namespaces_directory = "${local.manifests_directory_root}/namespace" + manifests_directory_root = "${path.module}/../../../../kubernetes/manifests" + namespaces_directory = "${local.manifests_directory_root}/namespace" + rl_kubernetes_namespace_manifests_directory = "${local.namespaces_directory}/${local.rl_kubernetes_namespace}" } data "local_file" "kubeconfig" { From 6ecb8154e34dca9b20a524aa907e5cd3c7863f54 Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 14:39:51 +0100 Subject: [PATCH 16/57] fix: missing final newline --- .../gke/base/use-cases/reinforcement-larning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md b/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md index b979d0027..46dd442f3 100644 --- a/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md +++ b/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md @@ -1 +1 @@ -# Reinforcement Learning reference architecture \ No newline at end of file +# Reinforcement Learning reference architecture From 64f6ba7c323c97ae964440ef54ae4795bba1baf1 Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 15:43:55 +0100 Subject: [PATCH 17/57] feat: add storage for dataset --- .../_shared_config/_huggingface.auto.tfvars | 1 + .../_shared_config/_huggingface_variables.tf | 1 + .../reinforcement_learning_variables.tf | 14 ++++++++++ .../rl_on_tpu/_huggingface.auto.tfvars | 1 + .../rl_on_tpu/_huggingface_variables.tf | 1 + .../terraform/rl_on_tpu/storage.tf | 26 +++++++++++++++++++ 6 files changed, 44 insertions(+) create mode 120000 
platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface_variables.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface.auto.tfvars new file mode 120000 index 000000000..276530b81 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/huggingface.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface_variables.tf new file mode 120000 index 000000000..f384bc7e1 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/huggingface_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf index 143f413d9..d0fb0e3ae 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -13,9 +13,11 @@ # limitations under the License. locals { + rl_dataset_bucket_name = var.rl_dataset_bucket_name != null ? var.rl_dataset_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-dataset" rl_kubernetes_namespace_manifests_directory = "${path.module}/manifests/${local.rl_kubernetes_namespace}" rl_kubernetes_namespace = var.rl_kubernetes_namespace != null ? var.rl_kubernetes_namespace : "${local.unique_identifier_prefix}-rl" rl_kubernetes_service_account_name = var.rl_kubernetes_service_account_name != null ? var.rl_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-sa" + rl_project_id = var.rl_project_id != null ? var.rl_project_id : var.platform_default_project_id rl_tpu_rl_on_tpu_image_url = var.rl_tpu_rl_on_tpu_image_url != null ? var.rl_tpu_rl_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" rl_kubernetes_service_accounts = { @@ -43,3 +45,15 @@ variable "rl_kubernetes_service_account_name" { description = "The Kubernetes service account name for the RL on TPU resources." type = string } + +variable "rl_project_id" { + default = null + description = "The GCP project ID for the RL on TPU resources." + type = string +} + +variable "rl_dataset_bucket_name" { + default = null + description = "The GCP bucket name for the RL dataset." 
+ type = string +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface.auto.tfvars new file mode 120000 index 000000000..488145ca9 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/_huggingface.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface_variables.tf new file mode 120000 index 000000000..91b00dc64 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface_variables.tf @@ -0,0 +1 @@ +../_shared_config/_huggingface_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf new file mode 100644 index 000000000..d9d4e4f0e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf @@ -0,0 +1,26 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +data "google_storage_bucket" "hub_models" { + name = local.huggingface_hub_models_bucket_name + project = local.huggingface_hub_models_bucket_project_id +} + +resource "google_storage_bucket" "dataset" { + name = local.rl_dataset_bucket_name + project = local.rl_project_id + location = local.cluster_region + + uniform_bucket_level_access = true +} From e8a0094b01bffd09a2e8ba6ec1afe8cf5a22db7f Mon Sep 17 00:00:00 2001 From: laurentgrangeau Date: Mon, 16 Mar 2026 15:47:06 +0100 Subject: [PATCH 18/57] feat: add dataset downloader for gsm8k --- .../Dockerfile | 55 +++++++ .../cloudbuild.yaml | 31 ++++ .../src/app.py | 136 ++++++++++++++++++ .../src/requirements.txt | 2 + 4 files changed, 224 insertions(+) create mode 100644 container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile create mode 100644 container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml create mode 100644 container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py create mode 100644 container-images/cpu/reinforcement-learning-dataset-downloader/src/requirements.txt diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile b/container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile new file mode 100644 index 000000000..b2992c32d --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile @@ -0,0 +1,55 @@ +# syntax=docker.io/docker/dockerfile:1.17.1 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# --- STAGE 1: Build Stage --- +# Use a Python image that includes tools for installing dependencies +FROM python:3.14.0-slim-trixie as builder + +# Set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV APP_HOME /usr/src/app + +# Create and set the working directory +WORKDIR $APP_HOME + +# Copy only the requirements file first to leverage Docker cache +COPY --from=primary requirements.txt . + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# --- STAGE 2: Final Runtime Stage --- +# Use a minimal runtime image for security and size +FROM python:3.14.0-slim-trixie + +# Set environment variables for the runtime +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV APP_HOME /usr/src/app + +# Create and set the working directory +WORKDIR $APP_HOME + +# Copy installed dependencies from the builder stage +COPY --from=builder /usr/local/lib/python3.14/site-packages /usr/local/lib/python3.14/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy the application script itself +COPY --from=primary app.py . + +# Command to run the application when the container starts +CMD ["python", "app.py"] diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml b/container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml new file mode 100644 index 000000000..132075a69 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +images: + - ${_DESTINATION} + +options: + logging: CLOUD_LOGGING_ONLY + machineType: E2_HIGHCPU_8 + +steps: + - args: + - build + - --build-context=primary=container-images/cpu/reinforcement-learning-dataset-downloader/src + - --file=container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile + - --tag=${_DESTINATION} + - . + id: "Build Reinforcement Learning Dataset Downloader image" + name: "docker.io/docker:28.3.3-dind-alpine3.22" + waitFor: ["-"] diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py new file mode 100644 index 000000000..eebc6aa57 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py @@ -0,0 +1,136 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging +import logging.config +import math +import os + +from datasets import load_dataset +from google.cloud import storage + +# --- LOGGING CONFIGURATION --- +ROOT_LEVEL = "INFO" +LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": True, + "formatters": { + "standard": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"}, + }, + "handlers": { + "default": { + "level": "INFO", + "formatter": "standard", + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", # Default is stderr + }, + }, + "loggers": { + "": { # root logger + "level": ROOT_LEVEL, # "INFO", + "handlers": ["default"], + "propagate": False, + }, + "uvicorn.error": { + "level": "DEBUG", + "handlers": ["default"], + }, + "uvicorn.access": { + "level": "DEBUG", + "handlers": ["default"], + }, + }, +} + +logging.config.dictConfig(LOGGING_CONFIG) + +LOG = logging.getLogger(__name__) + +# --- Configuration --- +# Fetch bucket name from environment variable +DATASET_BUCKET_NAME = os.getenv("DATASET_BUCKET_NAME") +GCS_PREFIX = "gsm8k" +NUM_SHARDS = 10 + + +def validate_config(): + if not DATASET_BUCKET_NAME: + LOG.error("❌ Error: Environment variable 'DATASET_BUCKET_NAME' is not set.") + raise ValueError("DATASET_BUCKET_NAME environment variable is required.") + + +def prepare_and_upload_shards(): + validate_config() + + # 1. Initialize GCS Client + try: + storage_client = storage.Client() + bucket = storage_client.bucket(DATASET_BUCKET_NAME) + # fast check if bucket exists (optional, but good for fail-fast) + if not bucket.exists(): + LOG.error( + f"❌ Error: Bucket '{DATASET_BUCKET_NAME}' does not exist or you lack permissions." + ) + raise ValueError(f"Bucket '{DATASET_BUCKET_NAME}' is not accessible.") + except Exception as e: + LOG.error(f"❌ Error connecting to GCS: {e}") + raise e + + # 2. 
Load Dataset (GSM8K) + LOG.info("⬇️ Downloading dataset from Hugging Face...") + try: + dataset = load_dataset("openai/gsm8k", split="train") + except Exception as e: + LOG.error(f"❌ Error loading dataset: {e}") + raise e + + total_records = len(dataset) + shard_size = math.ceil(total_records / NUM_SHARDS) + + LOG.info(f"✅ Dataset loaded. Total records: {total_records}") + LOG.info(f"⚡ Splitting into {NUM_SHARDS} shards of ~{shard_size} records each.") + + # 3. Shard and Upload + LOG.info(f"🚀 Uploading to gs://{DATASET_BUCKET_NAME}/{GCS_PREFIX}/ ...") + + for i in range(NUM_SHARDS): + start_idx = i * shard_size + end_idx = min((i + 1) * shard_size, total_records) + + subset = dataset.select(range(start_idx, end_idx)) + shard_data = list(subset) + + # Serialize to JSON + json_data = json.dumps(shard_data, indent=2) + + # Define GCS path + blob_name = f"{GCS_PREFIX}/input_shard_{i}.json" + blob = bucket.blob(blob_name) + + try: + # Upload string directly to GCS + blob.upload_from_string(data=json_data, content_type="application/json") + LOG.info( + f" • Uploaded shard {i}: {blob_name} ({len(shard_data)} records)" + ) + except Exception as e: + LOG.error(f" ❌ Failed to upload shard {i}: {e}") + raise e + + LOG.info("\n✨ All shards uploaded successfully.") + + +if __name__ == "__main__": + prepare_and_upload_shards() diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/requirements.txt b/container-images/cpu/reinforcement-learning-dataset-downloader/src/requirements.txt new file mode 100644 index 000000000..17c983e20 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/requirements.txt @@ -0,0 +1,2 @@ +datasets==4.5.0 +google-cloud-storage==3.8.0 From ff820d02a97573c187c0d659e5950e5744b583c1 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 16 Mar 2026 16:21:47 +0000 Subject: [PATCH 19/57] feat: add mlflow deployment and creation of dataset bucket --- .../reinforcement_learning_variables.tf 
| 17 +++++++--- .../terraform/rl_on_tpu/kubernetes.tf | 5 ++- .../terraform/rl_on_tpu/mlflow.tf | 6 ++-- .../terraform/rl_on_tpu/project.tf | 17 ++++++++++ .../terraform/rl_on_tpu/storage.tf | 8 +++++ .../terraform/rl_on_tpu/versions.tf | 32 +++++++++++++++++++ 6 files changed, 74 insertions(+), 11 deletions(-) create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/project.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/versions.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf index d0fb0e3ae..3b563f203 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -17,20 +17,21 @@ locals { rl_kubernetes_namespace_manifests_directory = "${path.module}/manifests/${local.rl_kubernetes_namespace}" rl_kubernetes_namespace = var.rl_kubernetes_namespace != null ? var.rl_kubernetes_namespace : "${local.unique_identifier_prefix}-rl" rl_kubernetes_service_account_name = var.rl_kubernetes_service_account_name != null ? var.rl_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-sa" + rl_mlflow_data_bucket_name = var.rl_mlflow_data_bucket_name != null ? var.rl_mlflow_data_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-mlflow-data" rl_project_id = var.rl_project_id != null ? var.rl_project_id : var.platform_default_project_id rl_tpu_rl_on_tpu_image_url = var.rl_tpu_rl_on_tpu_image_url != null ? 
var.rl_tpu_rl_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" rl_kubernetes_service_accounts = { mlflow = { automount_service_account_token = false - service_account_name = "${local.unique_identifier_prefix}-mlflow" + service_account_name = "${local.rl_kubernetes_service_account_name}" } } } -variable "rl_tpu_rl_on_tpu_image_url" { +variable "rl_dataset_bucket_name" { default = null - description = "The URL for the RL on TPU container image." + description = "The GCP bucket name for the RL dataset." type = string } @@ -46,14 +47,20 @@ variable "rl_kubernetes_service_account_name" { type = string } +variable "rl_mlflow_data_bucket_name" { + default = null + description = "The GCP bucket name for the MLflow data." + type = string +} + variable "rl_project_id" { default = null description = "The GCP project ID for the RL on TPU resources." type = string } -variable "rl_dataset_bucket_name" { +variable "rl_tpu_rl_on_tpu_image_url" { default = null - description = "The GCP bucket name for the RL dataset." + description = "The URL for the RL on TPU container image." 
type = string } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf index a60d47100..06977cfb2 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf @@ -24,9 +24,8 @@ locals { } } - manifests_directory_root = "${path.module}/../../../../kubernetes/manifests" - namespaces_directory = "${local.manifests_directory_root}/namespace" - rl_kubernetes_namespace_manifests_directory = "${local.namespaces_directory}/${local.rl_kubernetes_namespace}" + manifests_directory_root = "${path.module}/../../../../kubernetes/manifests" + namespaces_directory = "${local.manifests_directory_root}/namespace" } data "local_file" "kubeconfig" { diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf index 835be0412..86ab44709 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf @@ -13,7 +13,7 @@ # limitations under the License. 
resource "google_storage_bucket_iam_member" "data_bucket_mlflow_storage_object_admin" { - bucket = google_storage_bucket.data.name + bucket = google_storage_bucket.mlflow_data.name member = "${local.wi_member_principal_prefix}/${local.rl_kubernetes_service_accounts["mlflow"].service_account_name}" role = "roles/storage.objectAdmin" } @@ -22,7 +22,7 @@ resource "local_file" "mlflow_manifest" { content = templatefile( "${path.module}/templates/mlflow/manifests.tftpl.yaml", { - bucket_name = google_storage_bucket.data.name, + bucket_name = google_storage_bucket.mlflow_data.name, service_account_name = local.rl_kubernetes_service_accounts["mlflow"].service_account_name, } ) @@ -31,7 +31,7 @@ resource "local_file" "mlflow_manifest" { module "kubectl_apply_mlflow_manifest" { depends_on = [ - module.kubectl_apply_namespace_manifest, + module.kubectl_apply_namespace, ] source = "../../../../modules/kubectl_apply" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/project.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/project.tf new file mode 100644 index 000000000..4c878f945 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/project.tf @@ -0,0 +1,17 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +data "google_project" "cluster" { + project_id = local.cluster_project_id +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf index d9d4e4f0e..8a147be2b 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf @@ -24,3 +24,11 @@ resource "google_storage_bucket" "dataset" { uniform_bucket_level_access = true } + +resource "google_storage_bucket" "mlflow_data" { + name = local.rl_mlflow_data_bucket_name + project = local.rl_project_id + location = local.cluster_region + + uniform_bucket_level_access = true +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/versions.tf new file mode 100644 index 000000000..7da69f06b --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/versions.tf @@ -0,0 +1,32 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "7.6.0" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/rl_on_tpu_deploy-v1" + } +} From c1ab6168d4c3dd8268bd702a6537dced83fd61aa Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Tue, 17 Mar 2026 13:06:58 +0000 Subject: [PATCH 20/57] feat: renaming of folder for consistency --- .../Dockerfile | 2 +- .../cloudbuild.yaml | 0 .../src/app.py | 0 .../_cloudbuild.auto.tfvars | 0 .../_cloudbuild_variables.tf | 0 .../_platform.auto.tfvars | 0 .../_platform_variables.tf | 0 .../_reinforcement_learning.auto.tfvars | 0 .../_reinforcement_learning_variables.tf | 0 .../cloudbuild.tf | 10 +++++----- .../local_file.tf | 0 .../versions.tf | 0 12 files changed, 6 insertions(+), 6 deletions(-) rename container-images/tpu/{rl-on-tpu => reinforcement-learning-on-tpu}/Dockerfile (98%) rename container-images/tpu/{rl-on-tpu => reinforcement-learning-on-tpu}/cloudbuild.yaml (100%) rename container-images/tpu/{rl-on-tpu => reinforcement-learning-on-tpu}/src/app.py (100%) rename platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/_cloudbuild.auto.tfvars (100%) rename platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/_cloudbuild_variables.tf (100%) rename platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/_platform.auto.tfvars (100%) rename platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/_platform_variables.tf (100%) rename platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/_reinforcement_learning.auto.tfvars (100%) rename 
platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/_reinforcement_learning_variables.tf (100%) rename platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/cloudbuild.tf (77%) rename platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/local_file.tf (100%) rename platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/{rl_on_tpu => reinforcement_learning_on_tpu}/versions.tf (100%) diff --git a/container-images/tpu/rl-on-tpu/Dockerfile b/container-images/tpu/reinforcement-learning-on-tpu/Dockerfile similarity index 98% rename from container-images/tpu/rl-on-tpu/Dockerfile rename to container-images/tpu/reinforcement-learning-on-tpu/Dockerfile index 037a0d68a..9f3177b68 100644 --- a/container-images/tpu/rl-on-tpu/Dockerfile +++ b/container-images/tpu/reinforcement-learning-on-tpu/Dockerfile @@ -50,7 +50,7 @@ RUN uv pip install --system mlflow huggingface_hub math_verify RUN wget https://raw.githubusercontent.com/google/maxtext/main/src/maxtext/examples/chat_templates/gsm8k_rl.json -O /workspace/gsm8k_rl.json # Copy our training script -COPY train.py /workspace/app.py +COPY --from=primary app.py . 
# Execute the script CMD ["python3", "app.py"] diff --git a/container-images/tpu/rl-on-tpu/cloudbuild.yaml b/container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml similarity index 100% rename from container-images/tpu/rl-on-tpu/cloudbuild.yaml rename to container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml diff --git a/container-images/tpu/rl-on-tpu/src/app.py b/container-images/tpu/reinforcement-learning-on-tpu/src/app.py similarity index 100% rename from container-images/tpu/rl-on-tpu/src/app.py rename to container-images/tpu/reinforcement-learning-on-tpu/src/app.py diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild.auto.tfvars similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild.auto.tfvars rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild.auto.tfvars diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild_variables.tf similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_cloudbuild_variables.tf rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild_variables.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform.auto.tfvars similarity index 100% rename from 
platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform.auto.tfvars rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform.auto.tfvars diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform_variables.tf similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_platform_variables.tf rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform_variables.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning.auto.tfvars similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning.auto.tfvars rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning.auto.tfvars diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning_variables.tf similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/_reinforcement_learning_variables.tf rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning_variables.tf diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/cloudbuild.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/cloudbuild.tf similarity index 77% rename from platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/cloudbuild.tf rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/cloudbuild.tf index 4ea88e08d..23c7661c5 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/cloudbuild.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/cloudbuild.tf @@ -13,7 +13,7 @@ # limitations under the License. locals { - image_destination = local.rl_tpu_rl_on_tpu_image_url + image_destination = local.rl_tpu_reinforcement_learning_on_tpu_image_url } resource "terraform_data" "submit_docker_build" { @@ -28,7 +28,7 @@ resource "terraform_data" "submit_docker_build" { provisioner "local-exec" { command = <<-EOT gcloud builds submit \ ---config="container-images/tpu/rl-on-tpu/cloudbuild.yaml" \ +--config="container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml" \ --gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ --project="${self.input.cloudbuild_project_id}" \ --quiet \ @@ -40,8 +40,8 @@ EOT } triggers_replace = { - cloudbuild_yaml_hash = filebase64sha256("${local.acp_root}/container-images/tpu/rl-on-tpu/cloudbuild.yaml") - dockerfile_hash = filebase64sha256("${local.acp_root}/container-images/tpu/rl-on-tpu/Dockerfile") - source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/tpu/rl-on-tpu/src", "**") : filesha256("${local.acp_root}/container-images/tpu/rl-on-tpu/src/${file}")])) + cloudbuild_yaml_hash = filebase64sha256("${local.acp_root}/container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml") + dockerfile_hash = 
filebase64sha256("${local.acp_root}/container-images/tpu/reinforcement-learning-on-tpu/Dockerfile") + source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/tpu/reinforcement-learning-on-tpu/src", "**") : filesha256("${local.acp_root}/container-images/tpu/reinforcement-learning-on-tpu/src/${file}")])) } } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/local_file.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/local_file.tf similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/local_file.tf rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/local_file.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/versions.tf similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/rl_on_tpu/versions.tf rename to platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/versions.tf From 4065f21d6ff0c3c5997d9977e6db9890c90ccd83 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Tue, 17 Mar 2026 13:08:18 +0000 Subject: [PATCH 21/57] feat: add dataset downloader for reinforcement learning --- .../_cloudbuild.auto.tfvars | 1 + .../_cloudbuild_variables.tf | 1 + .../_platform.auto.tfvars | 1 + .../_platform_variables.tf | 1 + .../_reinforcement_learning.auto.tfvars | 1 + .../_reinforcement_learning_variables.tf | 1 + .../cloudbuild.tf | 47 +++++++++++++++++++ .../local_file.tf | 17 +++++++ .../versions.tf | 32 +++++++++++++ 9 files changed, 102 insertions(+) create mode 120000 
platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning_variables.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/local_file.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/versions.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..238bf8e95 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild.auto.tfvars \ No newline at end of file diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild_variables.tf new file mode 120000 index 000000000..8fade6147 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform.auto.tfvars new file mode 120000 index 000000000..c9c406bba --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform_variables.tf new file mode 120000 index 000000000..7ec64070d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning.auto.tfvars 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..171a27a35 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..79960dd37 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.tf new file mode 100644 index 000000000..6267d950e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.tf @@ -0,0 +1,47 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + image_destination = local.rl_cpu_reinforcement_learning_dataset_downloader_image_url +} + +resource "terraform_data" "submit_docker_build" { + input = { + acp_root = local.acp_root + cloudbuild_project_id = local.cloudbuild_project_id + cloudbuild_service_account_id = local.cloudbuild_service_account_id + cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name + image_destination = local.image_destination + } + + provisioner "local-exec" { + command = <<-EOT +gcloud builds submit \ +--config="container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml" \ +--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ +--project="${self.input.cloudbuild_project_id}" \ +--quiet \ +--service-account="${self.input.cloudbuild_service_account_id}" \ +--substitutions=_DESTINATION="${self.input.image_destination}" +EOT + interpreter = ["bash", "-c"] + working_dir = self.input.acp_root + } + + triggers_replace = { + cloudbuild_yaml_hash = filebase64sha256("${local.acp_root}/container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml") + dockerfile_hash = filebase64sha256("${local.acp_root}/container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile") + source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/cpu/reinforcement-learning-dataset-downloader/src", "**") : filesha256("${local.acp_root}/container-images/cpu/reinforcement-learning-dataset-downloader/src/${file}")])) + } +} diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/local_file.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/local_file.tf new file mode 100644 index 000000000..2635bb2b3 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/local_file.tf @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + acp_root = "${path.module}/../../../../../../../../.." +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/versions.tf new file mode 100644 index 000000000..35e8c5d4d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/versions.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "6.49.2" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/acp_rl_images_cpu_reinforcement_learning_dataset_downloader_deploy-v1" + } +} From 59ec65f95232786977735359572eeaaa34b81d01 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Tue, 17 Mar 2026 13:09:15 +0000 Subject: [PATCH 22/57] feat: renaming variable for consistency --- .../reinforcement_learning_variables.tf | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf index 3b563f203..7303b73db 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -19,7 +19,9 @@ locals { rl_kubernetes_service_account_name = var.rl_kubernetes_service_account_name != null ? var.rl_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-sa" rl_mlflow_data_bucket_name = var.rl_mlflow_data_bucket_name != null ? var.rl_mlflow_data_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-mlflow-data" rl_project_id = var.rl_project_id != null ? 
var.rl_project_id : var.platform_default_project_id - rl_tpu_rl_on_tpu_image_url = var.rl_tpu_rl_on_tpu_image_url != null ? var.rl_tpu_rl_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" + + rl_cpu_reinforcement_learning_dataset_downloader_image_url = var.rl_cpu_reinforcement_learning_dataset_downloader_image_url != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-dataset-downloader:latest" + rl_tpu_reinforcement_learning_on_tpu_image_url = var.rl_tpu_reinforcement_learning_on_tpu_image_url != null ? var.rl_tpu_reinforcement_learning_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" rl_kubernetes_service_accounts = { mlflow = { @@ -59,7 +61,13 @@ variable "rl_project_id" { type = string } -variable "rl_tpu_rl_on_tpu_image_url" { +variable "rl_cpu_reinforcement_learning_dataset_downloader_image_url" { + default = null + description = "The URL for the RL dataset downloader container image." + type = string +} + +variable "rl_tpu_reinforcement_learning_on_tpu_image_url" { default = null description = "The URL for the RL on TPU container image." 
type = string From 732febed0bb7aa109f318ccf1927bd913310d08f Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Tue, 17 Mar 2026 13:12:14 +0000 Subject: [PATCH 23/57] feat: add namespace on mlflow deployment --- .../reinforcement-learning/terraform/rl_on_tpu/mlflow.tf | 1 + .../terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf index 86ab44709..5b3ead6da 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf @@ -24,6 +24,7 @@ resource "local_file" "mlflow_manifest" { { bucket_name = google_storage_bucket.mlflow_data.name, service_account_name = local.rl_kubernetes_service_accounts["mlflow"].service_account_name, + namespace = local.rl_kubernetes_namespace, } ) filename = "${local.rl_kubernetes_namespace_manifests_directory}/mlflow.yaml" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml index 4383b0ee3..d604075ba 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml @@ -16,6 +16,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: mlflow-tracking + namespace: ${namespace} spec: replicas: 1 selector: @@ -66,6 +67,7 @@ apiVersion: v1 kind: Service metadata: name: mlflow-tracking-svc + namespace: ${namespace} spec: ports: - port: 5000 From be7927e9c113f5f2ab2cedbe0ecafa455c49963b Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Tue, 17 Mar 2026 
15:16:46 +0000 Subject: [PATCH 24/57] feat: add dataset downloader kubernetes manifests --- .../base/job.yaml | 38 ++++++++++++ .../base/kustomization.yaml | 61 +++++++++++++++++++ .../base/patch-nodeselector.yaml | 24 ++++++++ ...cement-learning-dataset-downloader.tpl.env | 4 ++ .../configure_dataset_downloader.sh | 33 ++++++++++ 5 files changed, 160 insertions(+) create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/job.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env create mode 100755 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/job.yaml new file mode 100644 index 000000000..ddc728070 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/job.yaml @@ -0,0 +1,38 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this 
file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: reinforcement-learning-dataset-downloader + namespace: replaced-by-kustomize +spec: + template: + metadata: + labels: + app: reinforcement-learning-dataset-downloader + spec: + restartPolicy: OnFailure + containers: + - env: + - name: DATASET_BUCKET_NAME + valueFrom: + configMapKeyRef: + key: DATASET_BUCKET_NAME + name: reinforcement-learning-dataset-downloader + image: replaced-by-kustomize + imagePullPolicy: Always + name: reinforcement-learning-dataset-downloader + resources: {} + serviceAccountName: replaced-by-kustomize diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/kustomization.yaml new file mode 100644 index 000000000..825561157 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/kustomization.yaml @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - reinforcement-learning-dataset-downloader.env + name: reinforcement-learning-dataset-downloader + namespace: replaced-by-kustomize + +replacements: + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: reinforcement-learning-dataset-downloader + targets: + - fieldPaths: + - spec.template.spec.containers.[name=reinforcement-learning-dataset-downloader].image + select: + kind: Job + - source: + fieldPath: data.DATASET_DOWNLOADER_KUBERNETES_NAMESPACE + kind: ConfigMap + name: reinforcement-learning-dataset-downloader + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Job + - source: + fieldPath: data.DATASET_DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: reinforcement-learning-dataset-downloader + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + +patches: + - path: patch-nodeselector.yaml + +resources: + - job.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml new file mode 100644 index 000000000..f66df7f6c --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: reinforcement-learning-dataset-downloader + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: cpu-e2-s-16-co diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env new file mode 100644 index 000000000..ce591328d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env @@ -0,0 +1,4 @@ +DATASET_DOWNLOADER_KUBERNETES_NAMESPACE=${ira_offline_batch_cpu_dataset_downloader_kubernetes_namespace_name} +DATASET_DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT=${ira_offline_batch_cpu_dataset_downloader_kubernetes_service_account_name} 
+CONTAINER_IMAGE_URL=${ira_offline_batch_cpu_dataset_downloader_image_url} +DATASET_BUCKET_NAME=${ira_offline_batch_dataset_bucket_name} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh new file mode 100755 index 000000000..0bd57a955 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o errexit +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +RANDOM_HASH=$(openssl rand -hex 4) +echo "${RANDOM_HASH}" > job_random_hash.txt + +source "${MY_PATH}/../../../terraform/_shared_config/scripts/set_environment_variables.sh" + +envsubst <"${MY_PATH}/base/templates/reinforcement-learning-dataset-downloader.tpl.env" | sponge "${MY_PATH}/base/reinforcement-learning-dataset-downloader.env" + +cd "${MY_PATH}/base" +kustomize edit set nameprefix "${RANDOM_HASH}-" From d1cadec1d8f6ffcd36354eb094d1e6e6bdd084a5 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Tue, 17 Mar 2026 15:17:15 +0000 Subject: [PATCH 25/57] feat: add rl on tpu kubernetes manifests --- .../rl-on-tpu/base}/job.yaml | 2 +- .../rl-on-tpu/base/kustomization.yaml | 0 .../kustomization.yaml | 131 ++++++++++++++++++ .../patch-nodeselector.yaml | 24 ++++ .../patch-resources.yaml | 29 ++++ 5 files changed, 185 insertions(+), 1 deletion(-) rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{rl-on-tpu => reinforcement-learning/rl-on-tpu/base}/job.yaml (95%) create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml 
b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml similarity index 95% rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml index 2873b755c..1ffc2db3b 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/job.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml @@ -22,7 +22,7 @@ spec: spec: restartPolicy: Never nodeSelector: - cloud.google.com/compute-class: "tpu-v5e-2x4" + cloud.google.com/compute-class: "tpu-v5e-2x4" containers: - name: grpo-trainer image: us-central1-docker.pkg.dev/accelerated-platforms-dev/ml-repo/maxtext-grpo:latest diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml new file mode 100644 index 000000000..b7ebbf4b3 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml @@ -0,0 +1,131 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: "-v5e-2x4-llama-3-1-8b-instruct" + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - 
spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml new file mode 100644 index 000000000..832e2fceb --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v5e-1x1 diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml new file mode 100644 index 000000000..b3371ea99 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "1" + requests: + google.com/tpu: "1" From c37c20c66f3ed6c62ff8a19b333f654072e27862 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Tue, 17 Mar 2026 15:30:09 +0000 Subject: [PATCH 26/57] fix: add license header --- .../rl-on-tpu/base/kustomization.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml index e69de29bb..48a2ea4da 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml @@ -0,0 +1,14 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- From 7e7b74ccedec472fa09761985ef2df02a1b9c8c2 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Wed, 18 Mar 2026 10:09:22 +0000 Subject: [PATCH 27/57] feat: add reinforcement learning dataset downloader --- ...cement-learning-dataset-downloader.tpl.env | 8 ++--- .../rl-on-tpu/base/job.yaml | 32 +++++++++++-------- .../terraform/_shared_config/outputs.tf | 20 ++++++++++-- .../reinforcement_learning_variables.tf | 31 +++++++++++++----- 4 files changed, 63 insertions(+), 28 deletions(-) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env index ce591328d..c40591a1d 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env @@ -1,4 +1,4 @@ -DATASET_DOWNLOADER_KUBERNETES_NAMESPACE=${ira_offline_batch_cpu_dataset_downloader_kubernetes_namespace_name} -DATASET_DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT=${ira_offline_batch_cpu_dataset_downloader_kubernetes_service_account_name} -CONTAINER_IMAGE_URL=${ira_offline_batch_cpu_dataset_downloader_image_url} -DATASET_BUCKET_NAME=${ira_offline_batch_dataset_bucket_name} +DATASET_DOWNLOADER_KUBERNETES_NAMESPACE=${rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name} 
+DATASET_DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT=${rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name} +CONTAINER_IMAGE_URL=${rl_cpu_reinforcement_learning_dataset_downloader_image_url} +DATASET_BUCKET_NAME=${rl_dataset_bucket_name} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml index 1ffc2db3b..20904ba21 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml @@ -15,7 +15,8 @@ apiVersion: batch/v1 kind: Job metadata: - name: maxtext-grpo-job-v5e + name: reinforcement-learning-maxtext-grpo + namespace: replaced-by-kustomize spec: backoffLimit: 0 template: @@ -24,16 +25,19 @@ spec: nodeSelector: cloud.google.com/compute-class: "tpu-v5e-2x4" containers: - - name: grpo-trainer - image: us-central1-docker.pkg.dev/accelerated-platforms-dev/ml-repo/maxtext-grpo:latest - resources: - limits: - google.com/tpu: 8 - env: - - name: MLFLOW_TRACKING_URI - value: "http://mlflow-service:5000" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: token + - name: grpo-trainer + image: replaced-by-kustomize + resources: + requests: + google.com/tpu: 8 + limits: + google.com/tpu: 8 + env: + - name: MLFLOW_TRACKING_URI + value: "http://mlflow-service:5000" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: token + serviceAccountName: replaced-by-kustomize diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf index 8a55d158e..3f09275af 100644 --- 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf @@ -12,6 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -output "rl_tpu_rl_on_tpu_image_url" { - value = var.rl_tpu_rl_on_tpu_image_url +output "rl_cpu_reinforcement_learning_dataset_downloader_image_url" { + value = var.rl_cpu_reinforcement_learning_dataset_downloader_image_url +} + +output "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name" { + value = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name +} + +output "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name" { + value = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name +} + +output "rl_kubernetes_namespace" { + value = var.rl_kubernetes_namespace +} + +output "rl_tpu_reinforcement_learning_on_tpu_image_url" { + value = var.rl_tpu_reinforcement_learning_on_tpu_image_url } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf index 7303b73db..8e7c3f45a 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -13,6 +13,10 @@ # limitations under the License. locals { + rl_cpu_reinforcement_learning_dataset_downloader_image_url = var.rl_cpu_reinforcement_learning_dataset_downloader_image_url != null ? 
var.rl_cpu_reinforcement_learning_dataset_downloader_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-dataset-downloader:latest" + rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-dataset-downloader" + rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-dataset-downloader-sa" + rl_dataset_bucket_name = var.rl_dataset_bucket_name != null ? var.rl_dataset_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-dataset" rl_kubernetes_namespace_manifests_directory = "${path.module}/manifests/${local.rl_kubernetes_namespace}" rl_kubernetes_namespace = var.rl_kubernetes_namespace != null ? var.rl_kubernetes_namespace : "${local.unique_identifier_prefix}-rl" @@ -20,8 +24,7 @@ locals { rl_mlflow_data_bucket_name = var.rl_mlflow_data_bucket_name != null ? var.rl_mlflow_data_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-mlflow-data" rl_project_id = var.rl_project_id != null ? var.rl_project_id : var.platform_default_project_id - rl_cpu_reinforcement_learning_dataset_downloader_image_url = var.rl_cpu_reinforcement_learning_dataset_downloader_image_url != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-dataset-downloader:latest" - rl_tpu_reinforcement_learning_on_tpu_image_url = var.rl_tpu_reinforcement_learning_on_tpu_image_url != null ? 
var.rl_tpu_reinforcement_learning_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" + rl_tpu_reinforcement_learning_on_tpu_image_url = var.rl_tpu_reinforcement_learning_on_tpu_image_url != null ? var.rl_tpu_reinforcement_learning_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" rl_kubernetes_service_accounts = { mlflow = { @@ -31,6 +34,24 @@ locals { } } +variable "rl_cpu_reinforcement_learning_dataset_downloader_image_url" { + default = null + description = "The URL for the RL dataset downloader container image." + type = string +} + +variable "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name" { + default = null + description = "The Kubernetes namespace name for the RL dataset downloader." + type = string +} + +variable "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name" { + default = null + description = "The Kubernetes service account name for the RL dataset downloader." + type = string +} + variable "rl_dataset_bucket_name" { default = null description = "The GCP bucket name for the RL dataset." @@ -61,12 +82,6 @@ variable "rl_project_id" { type = string } -variable "rl_cpu_reinforcement_learning_dataset_downloader_image_url" { - default = null - description = "The URL for the RL dataset downloader container image." - type = string -} - variable "rl_tpu_reinforcement_learning_on_tpu_image_url" { default = null description = "The URL for the RL on TPU container image." 
From 946ca2b76ff60c100bbc99e3d0e188ab0b6add8a Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Wed, 18 Mar 2026 16:42:21 +0100 Subject: [PATCH 28/57] fix: rendering of deployment on kustomize (#390) --- .../gke/base/use-cases/inference-ref-arch/validate_kustomize.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh index 78f80bb7d..2d2b7f386 100755 --- a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh +++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh @@ -60,6 +60,7 @@ export ACCELERATOR="GPU" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/configure_benchmark.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/configure_jobset.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-dataset-downloader/configure_dataset_downloader.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/configure_worker.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/configure_vllm_spec_decoding.sh" find "${ACP_PLATFORM_BASE_DIR}/use-cases/inference-ref-arch/kubernetes-manifests" -name "kustomization.yaml" -print0 | while read -d $'\0' file; do From 8d948308be3aabfb0e70ae5c76b0f1a35c8ecaf1 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Wed, 18 Mar 2026 19:34:54 +0100 Subject: [PATCH 29/57] fix: change accelerator type and model id for offline batch inference (#391) --- .../use-cases/inference-ref-arch/validate_kustomize.sh | 7 ++++++- 1 file changed, 6 
insertions(+), 1 deletion(-) diff --git a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh index 2d2b7f386..d3223a9d0 100755 --- a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh +++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh @@ -58,10 +58,15 @@ export ACCELERATOR_TYPE="v5e" export ACCELERATOR_TYPE="rtx-pro-6000" export ACCELERATOR="GPU" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/configure_benchmark.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/configure_vllm_spec_decoding.sh" + +# Validate offline-batch-inference-gpu kustomize +export ACCELERATOR_TYPE="rtx-pro-6000" +export HF_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" +source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/configure_jobset.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-dataset-downloader/configure_dataset_downloader.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/configure_worker.sh" -"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/configure_vllm_spec_decoding.sh" find "${ACP_PLATFORM_BASE_DIR}/use-cases/inference-ref-arch/kubernetes-manifests" -name "kustomization.yaml" -print0 | while read -d $'\0' file; do kustomize_directory_path="$(dirname "${file}")" From 
49990f768468c827991558f51ebb1b5c3a370c9e Mon Sep 17 00:00:00 2001 From: Shobhit Gupta Date: Wed, 18 Mar 2026 19:32:24 -0400 Subject: [PATCH 30/57] fix kustomization CI failures (#392) --- .../offline-batch-worker/h100-llama-3-3-70b-instruct/runtime.env | 1 + .../offline-batch-worker/h200-llama-3-3-70b-instruct/runtime.env | 1 + 2 files changed, 2 insertions(+) diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/h100-llama-3-3-70b-instruct/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/h100-llama-3-3-70b-instruct/runtime.env index 08eab7846..d2bee6d01 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/h100-llama-3-3-70b-instruct/runtime.env +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/h100-llama-3-3-70b-instruct/runtime.env @@ -4,3 +4,4 @@ MAX_MODEL_LEN=131072 MODEL_ID=meta-llama/llama-3.3-70b-instruct MODEL_NAME=llama-3.3-70b-instruct TENSOR_PARALLEL_SIZE=4 +COMPUTE_CLASS=gpu-h100-80gb-high-x4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/h200-llama-3-3-70b-instruct/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/h200-llama-3-3-70b-instruct/runtime.env index a428dcbf1..f251d80a2 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/h200-llama-3-3-70b-instruct/runtime.env +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/h200-llama-3-3-70b-instruct/runtime.env @@ -4,3 +4,4 @@ MAX_MODEL_LEN=131072 
MODEL_ID=meta-llama/llama-3.3-70b-instruct MODEL_NAME=llama-3.3-70b-instruct TENSOR_PARALLEL_SIZE=4 +COMPUTE_CLASS=gpu-h200-141gb-ultra-x8 From 5ee2a60b6dea81d5c0de119e9356eb1264ce53ef Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 19 Mar 2026 16:09:42 +0000 Subject: [PATCH 31/57] feat: add consistency on deployment --- .../terraform/_shared_config/outputs.tf | 12 ++--- .../reinforcement_learning_variables.tf | 53 +++++++++++-------- .../terraform/rl_on_tpu/iam.tf | 13 +++-- .../terraform/rl_on_tpu/kubernetes.tf | 16 ++++-- .../terraform/rl_on_tpu/mlflow.tf | 10 ++-- 5 files changed, 61 insertions(+), 43 deletions(-) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf index 3f09275af..5f1ed3d4b 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf @@ -13,21 +13,17 @@ # limitations under the License. 
output "rl_cpu_reinforcement_learning_dataset_downloader_image_url" { - value = var.rl_cpu_reinforcement_learning_dataset_downloader_image_url + value = local.rl_cpu_reinforcement_learning_dataset_downloader_image_url } output "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name" { - value = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name + value = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name } output "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name" { - value = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name -} - -output "rl_kubernetes_namespace" { - value = var.rl_kubernetes_namespace + value = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name } output "rl_tpu_reinforcement_learning_on_tpu_image_url" { - value = var.rl_tpu_reinforcement_learning_on_tpu_image_url + value = local.rl_tpu_reinforcement_learning_on_tpu_image_url } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf index 8e7c3f45a..f4ac09734 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -13,25 +13,32 @@ # limitations under the License. locals { + rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name = var.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name != null ? 
var.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-mlflow" + rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name = var.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name != null ? var.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-mlflow-sa" + rl_cpu_reinforcement_learning_dataset_downloader_image_url = var.rl_cpu_reinforcement_learning_dataset_downloader_image_url != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-dataset-downloader:latest" rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-dataset-downloader" rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-dataset-downloader-sa" - rl_dataset_bucket_name = var.rl_dataset_bucket_name != null ? var.rl_dataset_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-dataset" - rl_kubernetes_namespace_manifests_directory = "${path.module}/manifests/${local.rl_kubernetes_namespace}" - rl_kubernetes_namespace = var.rl_kubernetes_namespace != null ? var.rl_kubernetes_namespace : "${local.unique_identifier_prefix}-rl" - rl_kubernetes_service_account_name = var.rl_kubernetes_service_account_name != null ? var.rl_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-sa" - rl_mlflow_data_bucket_name = var.rl_mlflow_data_bucket_name != null ? 
var.rl_mlflow_data_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-mlflow-data" - rl_project_id = var.rl_project_id != null ? var.rl_project_id : var.platform_default_project_id + rl_dataset_bucket_name = var.rl_dataset_bucket_name != null ? var.rl_dataset_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-dataset" + rl_mlflow_data_bucket_name = var.rl_mlflow_data_bucket_name != null ? var.rl_mlflow_data_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-mlflow-data" + rl_project_id = var.rl_project_id != null ? var.rl_project_id : var.platform_default_project_id + + rl_tpu_reinforcement_learning_on_tpu_image_url = var.rl_tpu_reinforcement_learning_on_tpu_image_url != null ? var.rl_tpu_reinforcement_learning_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" + rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name = var.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name != null ? var.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-on-tpu" + rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name = var.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name != null ? var.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-on-tpu-sa" +} - rl_tpu_reinforcement_learning_on_tpu_image_url = var.rl_tpu_reinforcement_learning_on_tpu_image_url != null ? var.rl_tpu_reinforcement_learning_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest" +variable "rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name" { + default = null + description = "The Kubernetes namespace name for the RL MLflow deployment." 
+ type = string +} - rl_kubernetes_service_accounts = { - mlflow = { - automount_service_account_token = false - service_account_name = "${local.rl_kubernetes_service_account_name}" - } - } +variable "rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name" { + default = null + description = "The Kubernetes service account name for the RL MLflow deployment." + type = string } variable "rl_cpu_reinforcement_learning_dataset_downloader_image_url" { @@ -58,32 +65,32 @@ variable "rl_dataset_bucket_name" { type = string } -variable "rl_kubernetes_namespace" { +variable "rl_mlflow_data_bucket_name" { default = null - description = "The Kubernetes namespace for the RL on TPU resources." + description = "The GCP bucket name for the MLflow data." type = string } -variable "rl_kubernetes_service_account_name" { +variable "rl_project_id" { default = null - description = "The Kubernetes service account name for the RL on TPU resources." + description = "The GCP project ID for the RL on TPU resources." type = string } -variable "rl_mlflow_data_bucket_name" { +variable "rl_tpu_reinforcement_learning_on_tpu_image_url" { default = null - description = "The GCP bucket name for the MLflow data." + description = "The URL for the RL on TPU container image." type = string } -variable "rl_project_id" { +variable "rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name" { default = null - description = "The GCP project ID for the RL on TPU resources." + description = "The Kubernetes namespace name for the RL on TPU deployment." type = string } -variable "rl_tpu_reinforcement_learning_on_tpu_image_url" { +variable "rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name" { default = null - description = "The URL for the RL on TPU container image." + description = "The Kubernetes service account name for the RL on TPU deployment." 
type = string } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf index c1d693321..ba780452c 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf @@ -13,8 +13,13 @@ # limitations under the License. locals { - gsa_build_roles = [ - "roles/logging.logWriter", - ] - wi_member_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject/ns/${local.rl_kubernetes_namespace}/sa" + cluster_wi_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject" + rl_on_tpu_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name}/sa/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name}" + rl_mlflow_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name}/sa/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name}" +} + +resource "google_storage_bucket_iam_member" "hub_models_rl_on_tpu_ksa" { + bucket = data.google_storage_bucket.hub_models.name + member = local.rl_on_tpu_ksa_member + role = local.cluster_gcsfuse_user_role } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf index 06977cfb2..f41822b13 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf @@ -17,10 +17,20 @@ locals { kubeconfig_file = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}" workloads = { + rl_reinforcement_learning_mlflow = { + directory = "${local.namespaces_directory}/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name}" + namespace = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name + service_account = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name + } rl_on_tpu = { - directory = "${local.namespaces_directory}/${local.rl_kubernetes_namespace}" - namespace = local.rl_kubernetes_namespace - service_account = local.rl_kubernetes_service_account_name + directory = "${local.namespaces_directory}/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name}" + namespace = local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name + service_account = local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name + } + rl_reinforcement_learning_dataset_downloader = { + directory = "${local.namespaces_directory}/${local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name}" + namespace = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name + service_account = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name } } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf index 5b3ead6da..2fe2a6484 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf @@ -14,7 +14,7 @@ resource "google_storage_bucket_iam_member" "data_bucket_mlflow_storage_object_admin" { bucket = google_storage_bucket.mlflow_data.name - member = 
"${local.wi_member_principal_prefix}/${local.rl_kubernetes_service_accounts["mlflow"].service_account_name}" + member = local.rl_mlflow_ksa_member role = "roles/storage.objectAdmin" } @@ -23,11 +23,11 @@ resource "local_file" "mlflow_manifest" { "${path.module}/templates/mlflow/manifests.tftpl.yaml", { bucket_name = google_storage_bucket.mlflow_data.name, - service_account_name = local.rl_kubernetes_service_accounts["mlflow"].service_account_name, - namespace = local.rl_kubernetes_namespace, + service_account_name = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name, + namespace = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name, } ) - filename = "${local.rl_kubernetes_namespace_manifests_directory}/mlflow.yaml" + filename = "${local.namespaces_directory}/mlflow.yaml" } module "kubectl_apply_mlflow_manifest" { @@ -40,5 +40,5 @@ module "kubectl_apply_mlflow_manifest" { kubeconfig_file = data.local_file.kubeconfig.filename manifest = local_file.mlflow_manifest.filename manifest_includes_namespace = false - namespace = local.rl_kubernetes_namespace + namespace = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name } From 87a43d5570ba1035f0b8088e241023a2c61eee06 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Fri, 20 Mar 2026 09:25:25 +0000 Subject: [PATCH 32/57] feat: add scripts to deploy standard cluster in reinforcement learning use case --- .../terraform/_shared_config/outputs.tf | 4 + .../terraform/deploy-standard.sh | 81 ++++++++++++++ .../terraform/teardown-standard.sh | 100 ++++++++++++++++++ 3 files changed, 185 insertions(+) create mode 100755 platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh create mode 100755 platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf index 5f1ed3d4b..8e11022ef 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf @@ -24,6 +24,10 @@ output "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_acco value = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name } +output "rl_dataset_bucket_name" { + value = local.rl_dataset_bucket_name +} + output "rl_tpu_reinforcement_learning_on_tpu_image_url" { value = local.rl_tpu_reinforcement_learning_on_tpu_image_url } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh new file mode 100755 index 000000000..6fed3c08c --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o errexit +set -o nounset + +start_timestamp=$(date +%s) + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +# Set repository values +export ACP_REPO_DIR="$(realpath ${MY_PATH}/../../../../../../)" +export ACP_PLATFORM_BASE_DIR="${ACP_REPO_DIR}/platforms/gke/base" +export ACP_PLATFORM_CORE_DIR="${ACP_PLATFORM_BASE_DIR}/core" +export ACP_PLATFORM_USE_CASE_DIR="${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning" + +# Enable Terraform plugin caching and specify the location of the plugin cache directory +export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + +# Set use-case specific values +export TF_VAR_initialize_backend_use_case_name="reinforcement-learning/terraform" +export TF_VAR_resource_name_prefix="${TF_VAR_resource_name_prefix:-rl}" + +declare -a CORE_TERRASERVICES_APPLY=( + "networking" + "container_cluster" + "workloads/cluster_credentials" + "cloudbuild/initialize" + "huggingface/initialize" + "huggingface/hub_downloader" + "custom_compute_class" + "workloads/auto_monitoring" + "workloads/custom_metrics_adapter" + "workloads/inference_gateway" + "workloads/jobset" + "workloads/lws" + "workloads/priority_class" + "workloads/kueue" +) +CORE_TERRASERVICES_APPLY="${CORE_TERRASERVICES_APPLY[*]}" "${ACP_PLATFORM_CORE_DIR}/deploy.sh" + +# shellcheck disable=SC1091 +source "${ACP_PLATFORM_USE_CASE_DIR}/terraform/_shared_config/scripts/set_environment_variables.sh" + +declare -a use_case_terraservices=( + "initialize" +) +for terraservice in "${use_case_terraservices[@]}"; do + cd "${ACP_PLATFORM_USE_CASE_DIR}/terraform/${terraservice}" && + echo "Current directory: $(pwd)" && + rm -rf .terraform/ && + terraform init && + terraform plan -input=false -out=tfplan && + terraform apply -input=false tfplan || exit 1 + rm tfplan +done + +# shellcheck disable=SC2154 +gcloud container clusters get-credentials "${cluster_name}" \ + --region "${cluster_region}" \ + --project "${cluster_project_id}" \ + --dns-endpoint + 
+end_timestamp=$(date +%s) +total_runtime_value=$((end_timestamp - start_timestamp)) +echo "reinforcement-learning deploy total runtime: $(date -d@${total_runtime_value} -u +%H:%M:%S)" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh new file mode 100755 index 000000000..cb212af3c --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o errexit
+set -o nounset
+
+start_timestamp=$(date +%s)
+
+MY_PATH="$(
+  cd "$(dirname "$0")" >/dev/null 2>&1
+  pwd -P
+)"
+
+# Set repository values
+export ACP_REPO_DIR="$(realpath "${MY_PATH}/../../../../../../")"
+export ACP_PLATFORM_BASE_DIR="${ACP_REPO_DIR}/platforms/gke/base"
+export ACP_PLATFORM_CORE_DIR="${ACP_PLATFORM_BASE_DIR}/core"
+export ACP_PLATFORM_USE_CASE_DIR="${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning"
+
+# Enable Terraform plugin caching and specifies location of the plugin cache directory
+export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+
+# Set use-case specific values
+export TF_VAR_initialize_backend_use_case_name="reinforcement-learning/terraform"
+export TF_VAR_resource_name_prefix="${TF_VAR_resource_name_prefix:-rl}"
+
+# Set execution specific values
+export ACP_TEARDOWN_CORE_PLATFORM=${ACP_TEARDOWN_CORE_PLATFORM:-"true"}
+
+# shellcheck disable=SC1091
+source "${ACP_PLATFORM_USE_CASE_DIR}/terraform/_shared_config/scripts/set_environment_variables.sh"
+
+# shellcheck disable=SC2154
+cd "${ACP_PLATFORM_CORE_DIR}/initialize" &&
+  echo "Current directory: $(pwd)" &&
+  sed -i "s/^\([[:blank:]]*bucket[[:blank:]]*=\).*$/\1 \"${terraform_bucket_name}\"/" "${ACP_PLATFORM_CORE_DIR}/initialize/backend.tf.bucket" &&
+  cp backend.tf.bucket backend.tf &&
+  rm -rf .terraform/ &&
+  terraform init &&
+  terraform plan -input=false -out=tfplan &&
+  terraform apply -input=false tfplan || exit 1
+rm tfplan
+
+declare -a use_case_terraservices=(
+  "initialize"
+)
+for terraservice in "${use_case_terraservices[@]}"; do
+  cd "${ACP_PLATFORM_USE_CASE_DIR}/terraform/${terraservice}" &&
+    echo "Current directory: $(pwd)" &&
+    rm -rf .terraform/ &&
+    terraform init &&
+    terraform destroy -auto-approve || exit 1
+  rm -rf .terraform/ \
+    "terraform.tfstate"*
+done
+
+if [ "${ACP_TEARDOWN_CORE_PLATFORM}" = "true" ]; then
+  declare -a CORE_TERRASERVICES_DESTROY=(
+    "workloads/kueue"
+    "workloads/priority_class"
+    "workloads/lws"
+    "workloads/jobset"
+    "workloads/inference_gateway"
+    "workloads/custom_metrics_adapter"
+    "workloads/auto_monitoring"
+    "custom_compute_class"
+    "huggingface/hub_downloader"
+    "huggingface/initialize"
+    "cloudbuild/initialize"
+    "workloads/cluster_credentials"
+    "container_cluster"
+    "networking"
+    "initialize"
+  )
+  CORE_TERRASERVICES_DESTROY="${CORE_TERRASERVICES_DESTROY[*]}" "${ACP_PLATFORM_CORE_DIR}/teardown.sh"
+else
+  echo "Skipping core platform teardown."
+fi
+
+rm -rf \
+  "${ACP_PLATFORM_USE_CASE_DIR}/kubernetes-manifests/model-download/huggingface/downloader.env" \
+  "${ACP_PLATFORM_USE_CASE_DIR}/kubernetes-manifests/model-download/huggingface/secretproviderclass-huggingface-tokens.yaml" \
+  "${ACP_PLATFORM_USE_CASE_DIR}/kubernetes-manifests/online-inference-gpu/base/deployment.env" \
+  "${ACP_PLATFORM_USE_CASE_DIR}/kubernetes-manifests/online-inference-tpu/base/deployment.env"
+
+end_timestamp=$(date +%s)
+total_runtime_value=$((end_timestamp - start_timestamp))
+echo "reinforcement-learning teardown total runtime: $(date -d "@${total_runtime_value}" -u +%H:%M:%S)"
From a44b2a72f7d612e983cc7990b6d5884b8556bf77 Mon Sep 17 00:00:00 2001
From: Laurent Grangeau
Date: Mon, 23 Mar 2026 11:56:45 +0000
Subject: [PATCH 33/57] feat: add initialize folder

---
 .../initialize/_cloudbuild.auto.tfvars | 1 +
 .../initialize/_cloudbuild_variables.tf | 1 +
 .../terraform/initialize/_cluster.auto.tfvars | 1 +
 .../initialize/_cluster_variables.tf | 1 +
 .../initialize/_huggingface.auto.tfvars | 1 +
 .../initialize/_huggingface_variables.tf | 1 +
 .../initialize/_networking.auto.tfvars | 1 +
 .../initialize/_networking_variables.tf | 1 +
 .../initialize/_platform.auto.tfvars | 1 +
 .../initialize/_platform_variables.tf | 1 +
 .../_reinforcement_learning.auto.tfvars | 1 +
 .../_reinforcement_learning_variables.tf | 1 +
 .../terraform/initialize/versions.tf | 28 +++++++++++++++++++
 13 files changed, 40 insertions(+)
 create mode 120000 
platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning_variables.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/versions.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..c730c32e8 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/cloudbuild.auto.tfvars \ No newline at end of file diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild_variables.tf new file mode 120000 index 000000000..5a143590a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster.auto.tfvars new file mode 120000 index 000000000..98a694db9 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster_variables.tf new file mode 120000 index 000000000..00625515b --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface.auto.tfvars new file mode 120000 index 000000000..276530b81 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/huggingface.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface_variables.tf 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface_variables.tf new file mode 120000 index 000000000..f384bc7e1 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/huggingface_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking.auto.tfvars new file mode 120000 index 000000000..9cbd92baf --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/networking.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking_variables.tf new file mode 120000 index 000000000..1e170e71d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/networking_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform.auto.tfvars new file mode 120000 index 000000000..125a652cf --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform_variables.tf new file mode 120000 index 
000000000..486b3eaef --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..f56697856 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..f7d4bb73a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/versions.tf new file mode 100644 index 000000000..efe70345a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/versions.tf @@ -0,0 +1,28 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "7.6.0" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/rl_initialize_deploy-v1" + } +} From a1be42644348507c0a8f476c955d45e930ca78f3 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 12:09:50 +0000 Subject: [PATCH 34/57] feat: add usecase for reinforcement learning --- .../standard-scripts.yaml | 153 ++++++++++++++++++ .../populate_huggingface_token_secrets.sh | 44 +++++ 2 files changed, 197 insertions(+) create mode 100644 test/ci-cd/cloudbuild/platforms/gke/base/use-cases/reinforcement-learning/standard-scripts.yaml create mode 100755 test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh diff --git a/test/ci-cd/cloudbuild/platforms/gke/base/use-cases/reinforcement-learning/standard-scripts.yaml b/test/ci-cd/cloudbuild/platforms/gke/base/use-cases/reinforcement-learning/standard-scripts.yaml new file mode 100644 index 000000000..d849b9d00 --- /dev/null +++ b/test/ci-cd/cloudbuild/platforms/gke/base/use-cases/reinforcement-learning/standard-scripts.yaml @@ -0,0 +1,153 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +options: + automapSubstitutions: true + logging: CLOUD_LOGGING_ONLY + +steps: + - args: + - "${_WAIT_FOR_TRIGGER}" + entrypoint: "test/ci-cd/scripts/cloudbuild/wait_for_trigger.sh" + env: + - "LOCATION=${LOCATION}" + - "PROJECT_ID=${PROJECT_ID}" + id: "Check triggers" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: ["-"] + + - args: + - DEBUG=${_DEBUG} + - TF_VAR_platform_default_project_id="${PROJECT_ID}-$${PROJECT_SUFFIX}" + - TF_VAR_platform_name="ch${SHORT_SHA}" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/configure_build_environment.sh" + env: + - ACP_PLATFORM_DIR="$${ACP_REPO_DIR}/platforms/gke/base" + - BUILD_ID=${BUILD_ID} + - DEBUG=${_DEBUG} + - PROJECT_ID=${PROJECT_ID} + - SHORT_SHA=${SHORT_SHA} + id: "Configure the build environment" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Check triggers" + + - args: + - "Deploy platforms/gke/base/use-cases/reinforcement-learning Standard" + - "platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/run_deploy_script.sh" + id: "Deploy platforms/gke/base/use-cases/reinforcement-learning Standard" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Configure the build environment" + + - args: + - "Populate platforms/gke/base/use-cases/reinforcement-learning Hugging Face token secrets" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh" + id: 
"Populate platforms/gke/base/use-cases/reinforcement-learning Hugging Face token secrets" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Deploy platforms/gke/base/use-cases/reinforcement-learning Standard" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_gpu + entrypoint: "test/ci-cd/scripts/terraservice/apply.sh" + id: "Apply reinforcement-learning Terraservice 'online_gpu'" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Populate platforms/gke/base/use-cases/reinforcement-learning Hugging Face token secrets" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_tpu + entrypoint: "test/ci-cd/scripts/terraservice/apply.sh" + id: "Apply reinforcement-learning Terraservice 'online_tpu'" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Populate platforms/gke/base/use-cases/reinforcement-learning Hugging Face token secrets" + + - args: + - "Validate Kustomize" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh" + id: "Validate Kustomize" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Apply reinforcement-learning Terraservice 'online_gpu'" + - "Apply reinforcement-learning Terraservice 'online_tpu'" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_gpu + entrypoint: "test/ci-cd/scripts/terraservice/plan.sh" + id: "reinforcement-learning Terraservice 'online_gpu' check for changes" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Validate Kustomize" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_tpu + entrypoint: "test/ci-cd/scripts/terraservice/plan.sh" + id: "reinforcement-learning Terraservice 'online_tpu' check for changes" + name: 
"${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Validate Kustomize" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_gpu + entrypoint: "test/ci-cd/scripts/terraservice/destroy.sh" + id: "Destroy reinforcement-learning Terraservice 'online_gpu'" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "reinforcement-learning Terraservice 'online_gpu' check for changes" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_tpu + entrypoint: "test/ci-cd/scripts/terraservice/destroy.sh" + id: "Destroy reinforcement-learning Terraservice 'online_tpu'" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "reinforcement-learning Terraservice 'online_tpu' check for changes" + + - args: + - "Teardown platforms/gke/base/use-cases/reinforcement-learning Standard" + - "platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/run_teardown_script.sh" + id: "Teardown platforms/gke/base/use-cases/reinforcement-learning Standard" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Destroy reinforcement-learning Terraservice 'online_gpu'" + - "Destroy reinforcement-learning Terraservice 'online_tpu'" + + - args: + - "Cleanup the build environment" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/cleanup_build_environment.sh" + id: "Cleanup the build environment" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Teardown platforms/gke/base/use-cases/reinforcement-learning Standard" + + - entrypoint: "test/ci-cd/scripts/platforms/gke/base/set_build_status.sh" + id: "Set the build status" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Cleanup the build environment" + +substitutions: + _DEBUG: "false" + +timeout: 90m diff --git 
a/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh
new file mode 100755
index 000000000..1492fe390
--- /dev/null
+++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+source /workspace/build.env
+if [ "${DEBUG,,}" == "true" ]; then
+  set -o xtrace
+fi
+
+STEP_ID=${1}
+
+exit_handler() {
+  exit_code=$?
+
+  if [ "${exit_code}" -ne 0 ]; then
+    echo "${STEP_ID}" >>/workspace/build-failed.lock
+  fi
+
+  exit 0
+}
+trap exit_handler EXIT
+
+set --
+
+source "${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh"
+
+echo "HF_TOKEN_READ" | gcloud secrets versions add "${huggingface_hub_access_token_read_secret_manager_secret_name}" \
+--data-file=- \
+--project="${huggingface_secret_manager_project_id}"
From 620a9697952556b9233db95dcd76e9b904a3f892 Mon Sep 17 00:00:00 2001
From: Laurent Grangeau
Date: Mon, 23 Mar 2026 12:13:31 +0000
Subject: [PATCH 35/57] feat: add reinforcement downloader

---
 .../base/job.yaml | 0
 .../base/kustomization.yaml | 61 +++++++++++++++++++
 .../base/patch-nodeselector.yaml | 0
 ...cement-learning-dataset-downloader.tpl.env | 0
 .../configure_dataset_downloader.sh | 0
 .../base/kustomization.yaml | 61 -------------------
 6 files changed, 61 insertions(+), 61 deletions(-)
 rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/reinforcement-learning-dataset-downloader/base/job.yaml (100%)
 create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/kustomization.yaml
 rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml (100%)
 rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env (100%)
 rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh (100%)
 delete mode 100644 
platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/kustomization.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/job.yaml similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/job.yaml rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/job.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/kustomization.yaml new file mode 100644 index 000000000..104e572b7 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/kustomization.yaml @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - reinforcement-learning-dataset-downloader.env
+    name: reinforcement-learning-dataset-downloader
+    namespace: replaced-by-kustomize
+
+replacements:
+  - source:
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: reinforcement-learning-dataset-downloader
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=reinforcement-learning-dataset-downloader].image
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.DATASET_DOWNLOADER_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: reinforcement-learning-dataset-downloader
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.DATASET_DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: reinforcement-learning-dataset-downloader
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Job
+
+patches:
+  - path: patch-nodeselector.yaml
+
+resources:
+  - job.yaml
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml
similarity index 100%
rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml
rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml
diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/kustomization.yaml deleted file mode 100644 index 825561157..000000000 --- 
a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/reinforcement-learning-dataset-downloader/base/kustomization.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -configMapGenerator: - - envs: - - reinforcement-learning-dataset-downloader.env - name: reinforcement-learning-dataset-downloader - namespace: replaced-by-kustomize - -replacements: - - source: - fieldPath: data.CONTAINER_IMAGE_URL - kind: ConfigMap - name: reinforcement-learning-dataset-downloader - targets: - - fieldPaths: - - spec.template.spec.containers.[name=reinforcement-learning-dataset-downloader].image - select: - kind: Job - - source: - fieldPath: data.DATASET_DOWNLOADER_KUBERNETES_NAMESPACE - kind: ConfigMap - name: reinforcement-learning-dataset-downloader - targets: - - fieldPaths: - - metadata.namespace - select: - kind: ConfigMap - - fieldPaths: - - metadata.namespace - select: - kind: Job - - source: - fieldPath: data.DATASET_DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT - kind: ConfigMap - name: reinforcement-learning-dataset-downloader - targets: - - fieldPaths: - - spec.template.spec.serviceAccountName - select: - kind: Job - -patches: - - path: patch-nodeselector.yaml - -resources: - - job.yaml From ce30a7c4b449635917cb17ac3077e3072a411bf0 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 
12:14:03 +0000 Subject: [PATCH 36/57] feat: add job for llama 3.1 --- .../{reinforcement-learning => }/rl-on-tpu/base/job.yaml | 0 .../rl-on-tpu/base/kustomization.yaml | 0 .../rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml | 0 .../v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml | 0 .../rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/rl-on-tpu/base/job.yaml (100%) rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/rl-on-tpu/base/kustomization.yaml (100%) rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml (100%) rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml (100%) rename platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/{reinforcement-learning => }/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml (100%) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/job.yaml similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/job.yaml rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/job.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/kustomization.yaml similarity 
index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/base/kustomization.yaml rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/kustomization.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml similarity index 100% rename from platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml similarity index 100% rename from 
platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml rename to platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml From 772d9401a2e721d48bea118ac55d2abfdf2518c1 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 12:14:24 +0000 Subject: [PATCH 37/57] feat: add tests use case --- .../validate_kustomize.sh | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100755 test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh diff --git a/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh new file mode 100755 index 000000000..f818b409f --- /dev/null +++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -o errexit +set -o nounset +set -o pipefail + +source /workspace/build.env +if [ "${DEBUG,,}" == "true" ]; then + set -o xtrace +fi + +STEP_ID=${1} + +exit_handler() { + exit_code=$? 
+ + if [ ${exit_code} -ne 0 ]; then + echo "${STEP_ID}" >>/workspace/build-failed.lock + fi + + exit 0 +} +trap exit_handler EXIT + +set -- + +export HF_MODEL_ID="google/gemma-3-27b-it" + +source "${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh" + +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/configure_huggingface.sh" + +export ACCELERATOR_TYPE="l4" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/batch-inference-gpu/batch-load-generator/configure_load_generator.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/batch-inference-gpu/batch-pubsub-subscriber/configure_pubsub_subscriber.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/batch-inference-gpu/vllm/configure_vllm.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-gpu/vllm/configure_vllm.sh" + +export ACCELERATOR_TYPE="v5e" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-tpu/max-diffusion/configure_max_diffusion.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-tpu/vllm/configure_vllm.sh" + +# Validate inference-perf kustomize +export ACCELERATOR_TYPE="rtx-pro-6000" +export ACCELERATOR="GPU" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/inference-perf-bench/configure_benchmark.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/configure_vllm_spec_decoding.sh" + +# Validate offline-batch-inference-gpu 
kustomize +export ACCELERATOR_TYPE="rtx-pro-6000" +export HF_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" +source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/offline-batch-inference-gpu/configure_jobset.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-dataset-downloader/configure_dataset_downloader.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/configure_worker.sh" + +find "${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning/kubernetes-manifests" -name "kustomization.yaml" -print0 | while read -d $'\0' file; do + kustomize_directory_path="$(dirname "${file}")" + rendered_kubernetes_manifests_file_path="/tmp/rendered-kustomize.yaml" + + # Basic validation: + # - Render manifests with Kustomize + # - Validate manifests with kubectl-validate + kubectl kustomize "${kustomize_directory_path}" | tee "${rendered_kubernetes_manifests_file_path}" + kubectl validate "${rendered_kubernetes_manifests_file_path}" +done From db22d777f437b797b9992a0765588a7f6890cfe6 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 12:53:43 +0000 Subject: [PATCH 38/57] feat: add model downloader --- .../model-download/configure_huggingface.sh | 44 ++++++ .../huggingface/configmap-scripts.yaml | 128 ++++++++++++++++++ .../model-download/huggingface/job.yaml | 80 +++++++++++ .../huggingface/kustomization.yaml | 69 ++++++++++ .../huggingface/set-compute-class.yaml | 24 ++++ .../huggingface/templates/downloader.tpl.env | 6 + ...tproviderclass-huggingface-tokens.tpl.yaml | 37 +++++ 7 files changed, 388 insertions(+) create mode 100755 
platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/configure_huggingface.sh create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/configmap-scripts.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/job.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/set-compute-class.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/downloader.tpl.env create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/configure_huggingface.sh b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/configure_huggingface.sh new file mode 100755 index 000000000..59a2eefd9 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/configure_huggingface.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +if [[ ! -v HF_MODEL_ID ]]; then + echo "HF_MODEL_ID is not set, exiting!" + exit 1 +fi + +source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" + +secret_version_found=$(gcloud secrets versions list "${huggingface_hub_access_token_read_secret_manager_secret_name}" \ +--project="${huggingface_secret_manager_project_id}" 2>/dev/null | grep "enabled" | wc -l) + +if [[ ${secret_version_found} == 0 ]]; then + echo "Hugging Face Hub read token secret '${huggingface_hub_access_token_read_secret_manager_secret_name}' version is missing or not enabled! Please add the token to the secret, exiting." + exit 1 +fi + +envsubst < "${MY_PATH}/huggingface/templates/downloader.tpl.env" | sponge "${MY_PATH}/huggingface/downloader.env" + +envsubst < "${MY_PATH}/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/huggingface/secretproviderclass-huggingface-tokens.yaml" + +cd "${MY_PATH}/huggingface" +kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/configmap-scripts.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/configmap-scripts.yaml new file mode 100644 index 000000000..f08d03c67 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/configmap-scripts.yaml @@ -0,0 +1,128 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: hf-model-to-gcs + namespace: replaced-by-kustomize +data: + download.sh: | + #!/usr/bin/env bash + set -o errexit + set -o nounset + set -o pipefail + + start_download=$(date +%s) + + echo "Starting download of ${MODEL_ID}..." + hf download \ + --local-dir /local/hf/model \ + --max-workers ${HF_MAX_WORKERS:-"8"} \ + --repo-type model \ + ${MODEL_ID} + + end_download=$(date +%s) + runtime_download=$((end_download - start_download)) + echo "Download runtime: $(date -d@${runtime_download} -u +%H:%M:%S)" + + echo "Removing cache directory" + rm -rf /local/hf/model/.cache + install_packages.sh: | + #!/usr/bin/env bash + set -o errexit + set -o nounset + set -o pipefail + + echo "Installing required packages..." + + pip3 install \ + --break-system-packages \ + --root-user-action=ignore \ + --upgrade \ + huggingface_hub + run.sh: | + #!/usr/bin/env bash + set -o errexit + set -o nounset + set -o pipefail + MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P + )" + + start=$(date +%s) + + if [ -z "${MODEL_ID:-}" ]; then + echo "Error: MODEL_ID is not set." + exit 1 + fi + if [ -z "${MODEL_BUCKET_NAME:-}" ]; then + echo "Error: MODEL_BUCKET_NAME is not set." 
+ exit 1 + fi + + export MODEL_ID=${MODEL_ID,,} + echo "Preparing to download '${MODEL_ID}' from Hugging Face to the '${MODEL_BUCKET_NAME}' Cloud Storage bucket" + + echo "Creating '${MODEL_ID}' model folder in '${MODEL_BUCKET_NAME}' bucket" + if [[ "${REPLACE_EXISTING:-false}" == "true" ]]; then + gcloud storage folders create --recursive "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/" || echo "Bucket already exists" + else + gcloud storage folders create --recursive "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/" + fi + + "${MY_PATH}/install_packages.sh" + + "${MY_PATH}/download.sh" + + "${MY_PATH}/transfer.sh" + + end=$(date +%s) + runtime=$((end - start)) + echo "Total runtime: $(date -d@${runtime} -u +%H:%M:%S)" + transfer.sh: | + #!/usr/bin/env bash + set -o errexit + set -o nounset + set -o pipefail + + start_transfer=$(date +%s) + + if [[ "${REPLACE_EXISTING:-false}" == "true" ]]; then + echo "Removing existing model files..." + gcloud storage rm \ + --recursive \ + "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/*" || echo "No existing model files" + fi + + echo "Transferring model to the bucket" + gcloud config set storage/parallel_composite_upload_enabled True + gcloud config set storage/parallel_composite_upload_component_prefix parallel_composite_uploads + + gcloud storage cp \ + --gzip-in-flight-all \ + --recursive \ + /local/hf/model/* \ + "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/" + + echo "Removing temporary files" + gcloud storage rm \ + --recursive \ + "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/parallel_composite_uploads" \ + "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/**/parallel_composite_uploads" || echo "No temporary files to removes" + + end_transfer=$(date +%s) + runtime_transfer=$((end_transfer - start_transfer)) + echo "Transfer runtime: $(date -d@${runtime_transfer} -u +%H:%M:%S)" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/job.yaml 
b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/job.yaml new file mode 100644 index 000000000..319ec4270 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/job.yaml @@ -0,0 +1,80 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: hf-model-to-gcs + namespace: replaced-by-kustomize +spec: + backoffLimit: 0 + template: + metadata: + labels: + app: hf-model-to-gcs + spec: + containers: + - args: ["/scripts/run.sh"] + command: ["/bin/sh", "-c"] + env: + - name: HF_TOKEN_PATH + value: /var/run/secrets/huggingface.co/token + - name: HF_MAX_WORKERS + value: "2" + - name: HF_XET_CACHE + value: /local/hf/xet + - name: HF_XET_NUM_CONCURRENT_RANGE_GETS + value: "4" + - name: HF_XET_HIGH_PERFORMANCE + value: "0" + - name: MODEL_ID + valueFrom: + configMapKeyRef: + key: MODEL_ID + name: download + - name: MODEL_BUCKET_NAME + valueFrom: + configMapKeyRef: + key: MODEL_BUCKET_NAME + name: download + - name: REPLACE_EXISTING + value: "true" + image: gcr.io/google.com/cloudsdktool/cloud-sdk:slim + name: hf-model-to-gcs + resources: + requests: + cpu: 2000m + ephemeral-storage: 1Gi + memory: 10Gi + volumeMounts: + - mountPath: /scripts + name: scripts + - mountPath: /var/run/secrets/huggingface.co + name: huggingface-token + restartPolicy: OnFailure + securityContext: + fsGroup: 
10000 + serviceAccountName: replaced-by-kustomize + terminationGracePeriodSeconds: 0 + volumes: + - configMap: + defaultMode: 0744 + name: hf-model-to-gcs + name: scripts + - csi: + driver: secrets-store-gke.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: huggingface-token-read + name: huggingface-token diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/kustomization.yaml new file mode 100644 index 000000000..73e8fd288 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/kustomization.yaml @@ -0,0 +1,69 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - downloader.env + name: download + namespace: replaced-by-kustomize + +patches: + - path: set-compute-class.yaml + +replacements: + - source: + fieldPath: data.DOWNLOADER_KUBERNETES_NAMESPACE + kind: ConfigMap + name: download + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Job + - fieldPaths: + - metadata.namespace + select: + kind: SecretProviderClass + - source: + fieldPath: data.DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: download + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + name: hf-model-to-gcs + - source: + kind: SecretProviderClass + name: huggingface-token-read + fieldPath: metadata.name + targets: + - select: + kind: Job + name: hf-model-to-gcs + fieldPaths: + - spec.template.spec.volumes.[name=huggingface-token].csi.volumeAttributes.secretProviderClass + +resources: + - configmap-scripts.yaml + - job.yaml + - secretproviderclass-huggingface-tokens.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/set-compute-class.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/set-compute-class.yaml new file mode 100644 index 000000000..739d3164a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/set-compute-class.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: hf-model-to-gcs + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: model-download diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/downloader.tpl.env b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/downloader.tpl.env new file mode 100644 index 000000000..969cf873f --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/downloader.tpl.env @@ -0,0 +1,6 @@ +DOWNLOADER_KUBERNETES_NAMESPACE=${huggingface_hub_downloader_kubernetes_namespace_name} +DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT=${huggingface_hub_downloader_kubernetes_service_account_name} +HUGGINGFACE_TOKEN_READ_SECRET_PROVIDER_CLASS_NAME=huggingface-token-read +HUGGINGFACE_TOKEN_WRITE_SECRET_PROVIDER_CLASS_NAME=huggingface-token-write +MODEL_BUCKET_NAME=${huggingface_hub_models_bucket_name} +MODEL_ID=${HF_MODEL_ID} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml new file mode 100644 index 000000000..dd8db6665 --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: huggingface-token-read + namespace: replaced-by-kustomize +spec: + parameters: + secrets: | + - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_read_secret_manager_secret_name}/versions/latest" + path: "token" + provider: gke +--- +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: huggingface-token-write + namespace: replaced-by-kustomize +spec: + parameters: + secrets: | + - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_write_secret_manager_secret_name}/versions/latest" + path: "token" + provider: gke From 8a68a13ef773f5c81da91d91dc2b9b3d212ff2be Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 13:34:26 +0000 Subject: [PATCH 39/57] fix: environment script path --- .../configure_dataset_downloader.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh 
b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh index 0bd57a955..e8090bc79 100755 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh @@ -25,7 +25,7 @@ MY_PATH="$( RANDOM_HASH=$(openssl rand -hex 4) echo "${RANDOM_HASH}" > job_random_hash.txt -source "${MY_PATH}/../../../terraform/_shared_config/scripts/set_environment_variables.sh" +source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" envsubst <"${MY_PATH}/base/templates/reinforcement-learning-dataset-downloader.tpl.env" | sponge "${MY_PATH}/base/reinforcement-learning-dataset-downloader.env" From 85993f853d91fa644bca0cc808667b8582bf2c1e Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 13:34:41 +0000 Subject: [PATCH 40/57] feat: add iam role for dataset downloader --- .../terraform/rl_on_tpu/iam.tf | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf index ba780452c..b0d1ac1eb 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf @@ -13,9 +13,10 @@ # limitations under the License. 
locals { - cluster_wi_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject" - rl_on_tpu_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name}/sa/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name}" - rl_mlflow_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name}/sa/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name}" + cluster_wi_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject" + rl_on_tpu_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name}/sa/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name}" + rl_mlflow_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name}/sa/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name}" + rl_dataset_downloader_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name}/sa/${local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name}" } resource "google_storage_bucket_iam_member" "hub_models_rl_on_tpu_ksa" { @@ -23,3 +24,9 @@ resource "google_storage_bucket_iam_member" "hub_models_rl_on_tpu_ksa" { member = local.rl_on_tpu_ksa_member role = local.cluster_gcsfuse_user_role } + +resource "google_project_iam_member" "gcsfuse_user_member_ira_offline_batch_cpu_dataset_downloader_ksa" { + project = data.google_project.cluster.project_id + member = 
local.rl_dataset_downloader_ksa_member + role = local.cluster_gcsfuse_user_role +} From f67c8701aebbae8359c48300981235b3ec787a32 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 14:00:04 +0000 Subject: [PATCH 41/57] fix: naming issue --- .../use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf index b0d1ac1eb..06514827a 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf @@ -25,7 +25,7 @@ resource "google_storage_bucket_iam_member" "hub_models_rl_on_tpu_ksa" { role = local.cluster_gcsfuse_user_role } -resource "google_project_iam_member" "gcsfuse_user_member_ira_offline_batch_cpu_dataset_downloader_ksa" { +resource "google_project_iam_member" "gcsfuse_user_member_rl_cpu_dataset_downloader_ksa" { project = data.google_project.cluster.project_id member = local.rl_dataset_downloader_ksa_member role = local.cluster_gcsfuse_user_role From fbf187a07e62cf001df90460e3f86b9ed3ff8a55 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 14:09:37 +0000 Subject: [PATCH 42/57] feat: remove sharding --- .../src/app.py | 73 +++++++------------ 1 file changed, 25 insertions(+), 48 deletions(-) diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py index eebc6aa57..17f2f002d 100644 --- a/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py @@ -15,7 +15,6 @@ import json import logging import logging.config -import math import os from datasets import load_dataset @@ -34,35 +33,25 @@ 
"level": "INFO", "formatter": "standard", "class": "logging.StreamHandler", - "stream": "ext://sys.stdout", # Default is stderr + "stream": "ext://sys.stdout", }, }, "loggers": { - "": { # root logger - "level": ROOT_LEVEL, # "INFO", + "": { + "level": ROOT_LEVEL, "handlers": ["default"], "propagate": False, }, - "uvicorn.error": { - "level": "DEBUG", - "handlers": ["default"], - }, - "uvicorn.access": { - "level": "DEBUG", - "handlers": ["default"], - }, }, } logging.config.dictConfig(LOGGING_CONFIG) - LOG = logging.getLogger(__name__) # --- Configuration --- -# Fetch bucket name from environment variable DATASET_BUCKET_NAME = os.getenv("DATASET_BUCKET_NAME") GCS_PREFIX = "gsm8k" -NUM_SHARDS = 10 +OUTPUT_FILENAME = "gsm8k_full.json" def validate_config(): @@ -71,66 +60,54 @@ def validate_config(): raise ValueError("DATASET_BUCKET_NAME environment variable is required.") -def prepare_and_upload_shards(): +def prepare_and_upload_dataset(): validate_config() # 1. Initialize GCS Client try: storage_client = storage.Client() bucket = storage_client.bucket(DATASET_BUCKET_NAME) - # fast check if bucket exists (optional, but good for fail-fast) if not bucket.exists(): - LOG.error( - f"❌ Error: Bucket '{DATASET_BUCKET_NAME}' does not exist or you lack permissions." - ) + LOG.error(f"❌ Error: Bucket '{DATASET_BUCKET_NAME}' is not accessible.") raise ValueError(f"Bucket '{DATASET_BUCKET_NAME}' is not accessible.") except Exception as e: LOG.error(f"❌ Error connecting to GCS: {e}") raise e - # 2. Load Dataset (Alpaca Cleaned) + # 2. 
Load Dataset (GSM8K from Hugging Face) LOG.info("⬇️ Downloading dataset from Hugging Face...") try: - dataset = load_dataset("openai/gsm8k", split="train") + # Loading the full 'main' split + dataset = load_dataset("openai/gsm8k", "main", split="train") except Exception as e: - LOG.error(f"❌ Error loading dataset: {e}") - raise e + LOG.info("Attempting alternative split loading...") + dataset = load_dataset("openai/gsm8k", split="train") total_records = len(dataset) - shard_size = math.ceil(total_records / NUM_SHARDS) - LOG.info(f"✅ Dataset loaded. Total records: {total_records}") - LOG.info(f"⚡ Splitting into {NUM_SHARDS} shards of ~{shard_size} records each.") - - # 3. Shard and Upload - LOG.info(f"🚀 Uploading to gs://{DATASET_BUCKET_NAME}/{GCS_PREFIX}/ ...") - for i in range(NUM_SHARDS): - start_idx = i * shard_size - end_idx = min((i + 1) * shard_size, total_records) + # 3. Convert to List and Upload + LOG.info(f"🚀 Uploading to gs://{DATASET_BUCKET_NAME}/{GCS_PREFIX}/{OUTPUT_FILENAME} ...") - subset = dataset.select(range(start_idx, end_idx)) - shard_data = list(subset) + try: + # Convert the entire dataset to a list of dicts + dataset_list = list(dataset) # Serialize to JSON - json_data = json.dumps(shard_data, indent=2) + json_data = json.dumps(dataset_list, indent=2) # Define GCS path - blob_name = f"{GCS_PREFIX}/input_shard_{i}.json" + blob_name = f"{GCS_PREFIX}/{OUTPUT_FILENAME}" blob = bucket.blob(blob_name) - try: - # Upload string directly to GCS - blob.upload_from_string(data=json_data, content_type="application/json") - LOG.info( - f" • Uploaded shard {i}: {blob_name} ({len(shard_data)} records)" - ) - except Exception as e: - LOG.error(f" ❌ Failed to upload shard {i}: {e}") - raise e + # Upload string directly to GCS + blob.upload_from_string(data=json_data, content_type="application/json") + LOG.info(f"✨ Successfully uploaded {total_records} records to {blob_name}") - LOG.info("\n✨ All shards uploaded successfully.") + except Exception as e: + 
LOG.error(f"❌ Failed to process or upload dataset: {e}") + raise e if __name__ == "__main__": - prepare_and_upload_shards() + prepare_and_upload_dataset() From c2a30ed1bffd90ddec70ee6f3b5e809e2deaebd3 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 14:11:58 +0000 Subject: [PATCH 43/57] fix: file formatting --- .../cpu/reinforcement-learning-dataset-downloader/src/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py index 17f2f002d..774d8d3ac 100644 --- a/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py @@ -87,7 +87,9 @@ def prepare_and_upload_dataset(): LOG.info(f"✅ Dataset loaded. Total records: {total_records}") # 3. Convert to List and Upload - LOG.info(f"🚀 Uploading to gs://{DATASET_BUCKET_NAME}/{GCS_PREFIX}/{OUTPUT_FILENAME} ...") + LOG.info( + f"🚀 Uploading to gs://{DATASET_BUCKET_NAME}/{GCS_PREFIX}/{OUTPUT_FILENAME} ..." 
+ ) try: # Convert the entire dataset to a list of dicts From f9aa29e196779ab4623ce75cf4028e367cb59a2a Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Mon, 23 Mar 2026 15:19:49 +0000 Subject: [PATCH 44/57] feat: refactoring --- .../src/app.py | 46 ++++++++----------- .../src/logging.conf | 35 ++++++++++++++ 2 files changed, 54 insertions(+), 27 deletions(-) create mode 100644 container-images/cpu/reinforcement-learning-dataset-downloader/src/logging.conf diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py index 774d8d3ac..87fa1e997 100644 --- a/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py @@ -21,31 +21,7 @@ from google.cloud import storage # --- LOGGING CONFIGURATION --- -ROOT_LEVEL = "INFO" -LOGGING_CONFIG = { - "version": 1, - "disable_existing_loggers": True, - "formatters": { - "standard": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"}, - }, - "handlers": { - "default": { - "level": "INFO", - "formatter": "standard", - "class": "logging.StreamHandler", - "stream": "ext://sys.stdout", - }, - }, - "loggers": { - "": { - "level": ROOT_LEVEL, - "handlers": ["default"], - "propagate": False, - }, - }, -} - -logging.config.dictConfig(LOGGING_CONFIG) +logging.config.fileConfig("logging.conf", disable_existing_loggers=True) LOG = logging.getLogger(__name__) # --- Configuration --- @@ -54,13 +30,29 @@ OUTPUT_FILENAME = "gsm8k_full.json" -def validate_config(): +def validate_config() -> None: + """Validates that required environment variables are set. + + Raises: + ValueError: If the DATASET_BUCKET_NAME environment variable is missing or empty. 
+ """ if not DATASET_BUCKET_NAME: LOG.error("❌ Error: Environment variable 'DATASET_BUCKET_NAME' is not set.") raise ValueError("DATASET_BUCKET_NAME environment variable is required.") -def prepare_and_upload_dataset(): +def prepare_and_upload_dataset() -> None: + """Downloads the GSM8K dataset from Hugging Face and uploads it to Google Cloud Storage. + + This function initializes a GCS client, attempts to fetch the GSM8K dataset + from the Hugging Face hub, converts the records into a single JSON string, + and uploads the resulting file to the configured GCS bucket. + + Raises: + ValueError: If the specified GCS bucket does not exist or is inaccessible. + Exception: If an error occurs during GCS client initialization, dataset + download, or the final upload process. + """ validate_config() # 1. Initialize GCS Client diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/logging.conf b/container-images/cpu/reinforcement-learning-dataset-downloader/src/logging.conf new file mode 100644 index 000000000..2c8872eeb --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/logging.conf @@ -0,0 +1,35 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[loggers] +keys=root + +[handlers] +keys=defaultHandler + +[formatters] +keys=standardFormatter + +[logger_root] +level=INFO +handlers=defaultHandler + +[handler_defaultHandler] +class=StreamHandler +level=INFO +formatter=standardFormatter +args=(sys.stdout,) + +[formatter_standardFormatter] +format=%(asctime)s [%(levelname)s] %(name)s: %(message)s From 09c0092206da666c773b2e088a3d76673619250e Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Wed, 25 Mar 2026 10:30:24 +0000 Subject: [PATCH 45/57] feat: update to latest version of core components --- .../base/_shared_config/workloads_variables.tf | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/platforms/gke/base/_shared_config/workloads_variables.tf b/platforms/gke/base/_shared_config/workloads_variables.tf index 21962f00f..eb3b053d8 100644 --- a/platforms/gke/base/_shared_config/workloads_variables.tf +++ b/platforms/gke/base/_shared_config/workloads_variables.tf @@ -17,7 +17,7 @@ locals { } variable "custom_metrics_adapter_version" { - default = "0.16.2" + default = "0.16.5" description = "Version of Custom Metrics Adapter (https://github.com/GoogleCloudPlatform/k8s-stackdriver) to install." type = string } @@ -29,31 +29,37 @@ variable "inference_gateway_kubernetes_namespace" { } variable "inference_gateway_version" { - default = "1.1.0" + default = "1.4.0" description = "Version of Gateway API Inference Extension (https://github.com/kubernetes-sigs/gateway-api-inference-extension) to install." type = string } variable "jobset_version" { - default = "0.10.1" + default = "0.11.1" description = "Version of JobSet (https://github.com/kubernetes-sigs/jobset/) to install." type = string } variable "kuberay_version" { - default = "1.5.1" + default = "1.6.0" description = "Version of KubeRay (https://github.com/ray-project/kuberay) to install." 
type = string } variable "kueue_version" { - default = "0.14.4" + default = "0.16.4" description = "Version of Kueue (https://kueue.sigs.k8s.io/) to install." type = string } variable "lws_version" { - default = "0.7.0" + default = "0.8.0" description = "Version of LeaderWorkerSet (LWS) (https://github.com/kubernetes-sigs/lws/) to install." type = string } + +variable "pathways_version" { + default = "0.1.4" + description = "Version of Pathways (https://github.com/google/pathways-job) to install." + type = string +} From ebc401504923e1a9a910e98b700a4c40ea828623 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Wed, 25 Mar 2026 10:30:38 +0000 Subject: [PATCH 46/57] feat: add core workload pathways --- .../workloads/pathways/_cluster.auto.tfvars | 1 + .../workloads/pathways/_cluster_variables.tf | 1 + .../workloads/pathways/_platform.auto.tfvars | 1 + .../workloads/pathways/_platform_variables.tf | 1 + .../workloads/pathways/_workloads.auto.tfvars | 1 + .../pathways/_workloads_variables.tf | 1 + .../gke/base/core/workloads/pathways/main.tf | 129 ++++++++++++++++++ .../templates/namespace-jobset-system.yaml | 22 +++ .../templates/workload/kustomization.yaml | 24 ++++ .../gke-managed-components-toleration.yaml | 28 ++++ .../workload/patch/no-create-namespace.yaml | 19 +++ .../base/core/workloads/pathways/versions.tf | 32 +++++ 12 files changed, 260 insertions(+) create mode 120000 platforms/gke/base/core/workloads/pathways/_cluster.auto.tfvars create mode 120000 platforms/gke/base/core/workloads/pathways/_cluster_variables.tf create mode 120000 platforms/gke/base/core/workloads/pathways/_platform.auto.tfvars create mode 120000 platforms/gke/base/core/workloads/pathways/_platform_variables.tf create mode 120000 platforms/gke/base/core/workloads/pathways/_workloads.auto.tfvars create mode 120000 platforms/gke/base/core/workloads/pathways/_workloads_variables.tf create mode 100644 platforms/gke/base/core/workloads/pathways/main.tf create mode 100644 
platforms/gke/base/core/workloads/pathways/templates/namespace-jobset-system.yaml create mode 100644 platforms/gke/base/core/workloads/pathways/templates/workload/kustomization.yaml create mode 100644 platforms/gke/base/core/workloads/pathways/templates/workload/patch/gke-managed-components-toleration.yaml create mode 100644 platforms/gke/base/core/workloads/pathways/templates/workload/patch/no-create-namespace.yaml create mode 100644 platforms/gke/base/core/workloads/pathways/versions.tf diff --git a/platforms/gke/base/core/workloads/pathways/_cluster.auto.tfvars b/platforms/gke/base/core/workloads/pathways/_cluster.auto.tfvars new file mode 120000 index 000000000..4d9954e5a --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_cluster.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_cluster_variables.tf b/platforms/gke/base/core/workloads/pathways/_cluster_variables.tf new file mode 120000 index 000000000..3f2c29e19 --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_cluster_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_platform.auto.tfvars b/platforms/gke/base/core/workloads/pathways/_platform.auto.tfvars new file mode 120000 index 000000000..c3133e727 --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_platform_variables.tf b/platforms/gke/base/core/workloads/pathways/_platform_variables.tf new file mode 120000 index 000000000..c68738baa --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/platform_variables.tf \ No newline at end of file diff --git 
a/platforms/gke/base/core/workloads/pathways/_workloads.auto.tfvars b/platforms/gke/base/core/workloads/pathways/_workloads.auto.tfvars new file mode 120000 index 000000000..b65551f53 --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_workloads.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/workloads.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_workloads_variables.tf b/platforms/gke/base/core/workloads/pathways/_workloads_variables.tf new file mode 120000 index 000000000..fec5c48ce --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_workloads_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/workloads_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/main.tf b/platforms/gke/base/core/workloads/pathways/main.tf new file mode 100644 index 000000000..f72efb6d5 --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/main.tf @@ -0,0 +1,129 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + kubeconfig_directory = "${path.module}/../../../kubernetes/kubeconfig" + kubeconfig_file = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}" + + manifests_directory = "${local.namespace_directory}/pathways-system" + namespace_directory = "${local.manifests_directory_root}/namespace" + version_manifests_directory = "${path.module}/manifests/pathways-${var.jobset_version}" +} + +data "local_file" "kubeconfig" { + filename = local.kubeconfig_file +} + +resource "terraform_data" "namespace" { + input = { + manifests_dir = local.namespace_directory + } + + provisioner "local-exec" { + command = < Date: Wed, 25 Mar 2026 10:30:54 +0000 Subject: [PATCH 47/57] feat: add deployment of pathways for reinforcement learning --- .../reinforcement-learning/terraform/deploy-standard.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh index 6fed3c08c..9ff8fe538 100755 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh @@ -51,6 +51,7 @@ declare -a CORE_TERRASERVICES_APPLY=( "workloads/lws" "workloads/priority_class" "workloads/kueue" + "workloads/pathways" ) CORE_TERRASERVICES_APPLY="${CORE_TERRASERVICES_APPLY[*]}" "${ACP_PLATFORM_CORE_DIR}/deploy.sh" From 3b00021959120c63d458aab0bd8e2d0b0cd701f3 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Wed, 25 Mar 2026 11:04:59 +0000 Subject: [PATCH 48/57] fix: change yaml name for pathways namespace --- ...amespace-jobset-system.yaml => namespace-pathways-system.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename platforms/gke/base/core/workloads/pathways/templates/{namespace-jobset-system.yaml => namespace-pathways-system.yaml} (100%) diff --git 
a/platforms/gke/base/core/workloads/pathways/templates/namespace-jobset-system.yaml b/platforms/gke/base/core/workloads/pathways/templates/namespace-pathways-system.yaml similarity index 100% rename from platforms/gke/base/core/workloads/pathways/templates/namespace-jobset-system.yaml rename to platforms/gke/base/core/workloads/pathways/templates/namespace-pathways-system.yaml From 89dd00c2d38b97020b387c7f57ea9019ef696eb2 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 26 Mar 2026 08:15:00 +0000 Subject: [PATCH 49/57] feat: add reinforcement learning model converter --- .../Dockerfile | 74 +++++++++++++++++++ .../cloudbuild.yaml | 30 ++++++++ 2 files changed, 104 insertions(+) create mode 100644 container-images/cpu/reinforcement-learning-model-converter/Dockerfile create mode 100644 container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml diff --git a/container-images/cpu/reinforcement-learning-model-converter/Dockerfile b/container-images/cpu/reinforcement-learning-model-converter/Dockerfile new file mode 100644 index 000000000..dd898d3c8 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-model-converter/Dockerfile @@ -0,0 +1,74 @@ +# syntax=docker.io/docker/dockerfile:1.17.1 + +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# ========================================== +# Stage 1: Builder +# ========================================== +FROM python:3.12.13-slim-trixie AS builder + +# Install build dependencies and git +RUN apt-get update && apt-get install -y \ + git \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +# Clone the MaxText repository and immediately remove the hidden .git folder +RUN git clone https://github.com/AI-Hypercomputer/maxtext.git . \ + && rm -rf .git + +# Create a virtual environment to isolate dependencies +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install 'uv' (the fast Python package installer) +RUN pip install --no-cache-dir uv + +# Install MaxText with the [tpu] dependency group +# (This includes absl-py, jax, etc., which are needed for conversion) +RUN uv pip install .[tpu] --resolution=lowest + +# MaxText uses a custom CLI command to install unreleased dependencies from GitHub +RUN install_maxtext_tpu_github_deps + +# Explicitly install CPU-only PyTorch (Required to read HF Checkpoints) +RUN uv pip install torch --index-url https://download.pytorch.org/whl/cpu + +# ========================================== +# Stage 2: Final Runtime +# ========================================== +FROM python:3.12.13-slim-trixie + +WORKDIR /workspace + +# Copy the pre-built virtual environment from the builder stage +COPY --from=builder /opt/venv /opt/venv + +# Copy the necessary source code from the builder stage +COPY --from=builder /workspace /workspace + +# Activate the virtual environment by default +ENV PATH="/opt/venv/bin:$PATH" + +# Set PYTHONPATH so Python can locate the maxtext modules +ENV PYTHONPATH="/workspace/src:${PYTHONPATH}" + +# Set the entrypoint to the checkpoint conversion script +ENTRYPOINT ["python", "src/maxtext/checkpoint_conversion/to_maxtext.py"] + +# Fallback command to display help flags +CMD ["--help"] diff --git a/container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml 
b/container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml new file mode 100644 index 000000000..0f0e71978 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml @@ -0,0 +1,30 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +images: + - ${_DESTINATION} + +options: + logging: CLOUD_LOGGING_ONLY + machineType: E2_HIGHCPU_8 + +steps: + - args: + - build + - --file=container-images/cpu/reinforcement-learning-model-converter/Dockerfile + - --tag=${_DESTINATION} + - . 
+ id: "Build Reinforcement Learning Model Converter image" + name: "docker.io/docker:28.3.3-dind-alpine3.22" + waitFor: ["-"] From 826194a96639e815c33e41a7e738258048678c5c Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 26 Mar 2026 08:22:52 +0000 Subject: [PATCH 50/57] feat: add reinforcement learning model converter --- .../base/job.yaml | 80 +++++++++++++++++++ .../base/kustomization.yaml | 58 ++++++++++++++ .../base/set-compute-class.yaml | 24 ++++++ .../base/templates/model_converter.tpl.env | 5 ++ .../configure_model_converter.sh | 34 ++++++++ 5 files changed, 201 insertions(+) create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/templates/model_converter.tpl.env create mode 100755 platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/configure_model_converter.sh diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml new file mode 100644 index 000000000..2a2c521f0 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml @@ -0,0 +1,80 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: hf-model-to-maxtext + namespace: replaced-by-kustomize +spec: + backoffLimit: 0 + template: + metadata: + labels: + app: hf-model-to-maxtext + spec: + containers: + - args: ["/scripts/run.sh"] + command: ["/bin/sh", "-c"] + env: + - name: HF_TOKEN_PATH + value: /var/run/secrets/huggingface.co/token + - name: HF_MAX_WORKERS + value: "2" + - name: HF_XET_CACHE + value: /local/hf/xet + - name: HF_XET_NUM_CONCURRENT_RANGE_GETS + value: "4" + - name: HF_XET_HIGH_PERFORMANCE + value: "0" + - name: MODEL_ID + valueFrom: + configMapKeyRef: + key: MODEL_ID + name: download + - name: MODEL_BUCKET_NAME + valueFrom: + configMapKeyRef: + key: MODEL_BUCKET_NAME + name: download + - name: REPLACE_EXISTING + value: "true" + image: gcr.io/google.com/cloudsdktool/cloud-sdk:slim + name: hf-model-to-maxtext + resources: + requests: + cpu: 2000m + ephemeral-storage: 1Gi + memory: 10Gi + volumeMounts: + - mountPath: /scripts + name: scripts + - mountPath: /var/run/secrets/huggingface.co + name: huggingface-token + restartPolicy: OnFailure + securityContext: + fsGroup: 10000 + serviceAccountName: replaced-by-kustomize + terminationGracePeriodSeconds: 0 + volumes: + - configMap: + defaultMode: 0744 + name: hf-model-to-maxtext + name: scripts + - csi: + driver: secrets-store-gke.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: huggingface-token-read + name: huggingface-token diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml new file mode 100644 index 000000000..5024a063c --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml @@ -0,0 +1,58 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - model_converter.env + name: reinforcement-learning-model-converter + namespace: replaced-by-kustomize + +patches: + - path: set-compute-class.yaml + +replacements: +- source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: reinforcement-learning-model-converter + targets: + - fieldPaths: + - spec.template.spec.containers.[name=reinforcement-learning-model-converter].image + select: + kind: Job + - source: + fieldPath: data.MODEL_CONVERTER_KUBERNETES_NAMESPACE + kind: ConfigMap + name: reinforcement-learning-model-converter + targets: + - fieldPaths: + - metadata.namespace + select: + kind: Job + - source: + fieldPath: data.MODEL_CONVERTER_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: reinforcement-learning-model-converter + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + name: reinforcement-learning-model-converter + +resources: + - job.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml new file mode 100644 index 000000000..dc25b4b0e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: hf-model-to-maxtext + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: model-converter diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/templates/model_converter.tpl.env b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/templates/model_converter.tpl.env new file mode 100644 index 000000000..1551b7a22 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/templates/model_converter.tpl.env @@ -0,0 +1,5 @@ +MODEL_CONVERTER_KUBERNETES_NAMESPACE=${rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name} +MODEL_CONVERTER_KUBERNETES_SERVICE_ACCOUNT=${rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name} +CONTAINER_IMAGE_URL=${rl_cpu_reinforcement_learning_model_converter_image_url} +MODEL_BUCKET_NAME=${huggingface_hub_models_bucket_name} +MODEL_ID=${HF_MODEL_ID} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/configure_model_converter.sh b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/configure_model_converter.sh new file mode 100755 index 000000000..d9c9e7938 --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/configure_model_converter.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +if [[ ! -v HF_MODEL_ID ]]; then + echo "HF_MODEL_ID is not set, exiting!" + exit 1 +fi + +source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" + +envsubst < "${MY_PATH}/base/templates/model_converter.tpl.env" | sponge "${MY_PATH}/base/model_converter.env" + +cd "${MY_PATH}/base" +kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" From dcdf7aa5cc4300d1ea6a9ee94fe2d00d27ae902e Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 26 Mar 2026 08:23:42 +0000 Subject: [PATCH 51/57] feat: add model downloader terraform image builder --- .../reinforcement_learning_variables.tf | 22 +++++++++ .../_cloudbuild.auto.tfvars | 1 + .../_cloudbuild_variables.tf | 1 + .../_platform.auto.tfvars | 1 + .../_platform_variables.tf | 1 + .../_reinforcement_learning.auto.tfvars | 1 + .../_reinforcement_learning_variables.tf | 1 + .../cloudbuild.tf | 46 +++++++++++++++++++ .../local_file.tf | 17 +++++++ .../versions.tf | 32 +++++++++++++ 10 files changed, 123 insertions(+) create mode 120000 
platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform_variables.tf create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning.auto.tfvars create mode 120000 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning_variables.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/cloudbuild.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/local_file.tf create mode 100644 platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/versions.tf diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf index f4ac09734..f89fae510 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf @@ -20,6 +20,10 @@ locals { rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name = 
var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-dataset-downloader" rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-dataset-downloader-sa" + rl_cpu_reinforcement_learning_model_converter_image_url = var.rl_cpu_reinforcement_learning_model_converter_image_url != null ? var.rl_cpu_reinforcement_learning_model_converter_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-model-converter:latest" + rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name = var.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name != null ? var.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-model-converter" + rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name = var.rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name != null ? var.rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-model-converter-sa" + rl_dataset_bucket_name = var.rl_dataset_bucket_name != null ? var.rl_dataset_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-dataset" rl_mlflow_data_bucket_name = var.rl_mlflow_data_bucket_name != null ? var.rl_mlflow_data_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-mlflow-data" rl_project_id = var.rl_project_id != null ? 
var.rl_project_id : var.platform_default_project_id @@ -59,6 +63,24 @@ variable "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_ac type = string } +variable "rl_cpu_reinforcement_learning_model_converter_image_url" { + default = null + description = "The URL for the RL model converter container image." + type = string +} + +variable "rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name" { + default = null + description = "The Kubernetes namespace name for the RL model converter." + type = string +} + +variable "rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name" { + default = null + description = "The Kubernetes service account name for the RL model converter." + type = string +} + variable "rl_dataset_bucket_name" { default = null description = "The GCP bucket name for the RL dataset." diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..238bf8e95 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild_variables.tf new file mode 120000 index 000000000..8fade6147 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild_variables.tf @@ -0,0 +1 @@ 
+../../../_shared_config/_cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform.auto.tfvars new file mode 120000 index 000000000..c9c406bba --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform_variables.tf new file mode 120000 index 000000000..7ec64070d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..171a27a35 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..79960dd37 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/cloudbuild.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/cloudbuild.tf new file mode 100644 index 000000000..81c2c2ded --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/cloudbuild.tf @@ -0,0 +1,46 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + image_destination = local.rl_cpu_reinforcement_learning_model_converter_image_url +} + +resource "terraform_data" "submit_docker_build" { + input = { + acp_root = local.acp_root + cloudbuild_project_id = local.cloudbuild_project_id + cloudbuild_service_account_id = local.cloudbuild_service_account_id + cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name + image_destination = local.image_destination + } + + provisioner "local-exec" { + command = <<-EOT +gcloud builds submit \ +--config="container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml" \ +--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ +--project="${self.input.cloudbuild_project_id}" \ +--quiet \ +--service-account="${self.input.cloudbuild_service_account_id}" \ +--substitutions=_DESTINATION="${self.input.image_destination}" +EOT + interpreter = ["bash", "-c"] + working_dir = self.input.acp_root + } + + triggers_replace = { + cloudbuild_yaml_hash = filebase64sha256("${local.acp_root}/container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml") + dockerfile_hash = filebase64sha256("${local.acp_root}/container-images/cpu/reinforcement-learning-model-converter/Dockerfile") + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/local_file.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/local_file.tf new file mode 100644 index 000000000..2635bb2b3 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/local_file.tf @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + acp_root = "${path.module}/../../../../../../../../.." +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/versions.tf new file mode 100644 index 000000000..178937b81 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/versions.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "6.49.2" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/acp_rl_images_cpu_reinforcement_learning_model_converter_deploy-v1" + } +} From b2595cf0311d98e5b4e2ea9e3b59a470358330ee Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 2 Apr 2026 15:09:00 +0000 Subject: [PATCH 52/57] feat: add model converter --- .../base/job.yaml | 96 ++++++++++--------- .../base/kustomization.yaml | 31 +++--- .../base/set-compute-class.yaml | 2 +- .../terraform/_shared_config/outputs.tf | 12 +++ .../scripts/set_environment_variables.sh | 10 ++ .../terraform/rl_on_tpu/kubernetes.tf | 5 + 6 files changed, 94 insertions(+), 62 deletions(-) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml index 2a2c521f0..9c14ff05e 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml @@ -1,10 +1,10 @@ -# Copyright 2025 Google LLC +# Copyright 2026 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# https://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -15,66 +15,68 @@ apiVersion: batch/v1 kind: Job metadata: - name: hf-model-to-maxtext + name: model-converter namespace: replaced-by-kustomize spec: - backoffLimit: 0 + ttlSecondsAfterFinished: 3600 template: metadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" labels: - app: hf-model-to-maxtext + app: model-converter spec: + restartPolicy: Never + serviceAccountName: replaced-by-kustomize containers: - - args: ["/scripts/run.sh"] - command: ["/bin/sh", "-c"] + - name: model-converter + image: replaced-by-kustomize # Replace with your pushed model-converter image + command: + - python + - src/maxtext/checkpoint_conversion/to_maxtext.py + args: + - "--hf_model_path=/gcs/$(MODEL_ID)" + - "--lazy_load_tensors=True" + # Add output flags based on your specific MaxText version, e.g., + # - "--base_output_directory=/gcs/converted_models/" env: - - name: HF_TOKEN_PATH - value: /var/run/secrets/huggingface.co/token - - name: HF_MAX_WORKERS - value: "2" - - name: HF_XET_CACHE - value: /local/hf/xet - - name: HF_XET_NUM_CONCURRENT_RANGE_GETS - value: "4" - - name: HF_XET_HIGH_PERFORMANCE - value: "0" - name: MODEL_ID valueFrom: configMapKeyRef: key: MODEL_ID - name: download - - name: MODEL_BUCKET_NAME - valueFrom: - configMapKeyRef: - key: MODEL_BUCKET_NAME - name: download - - name: REPLACE_EXISTING - value: "true" - image: gcr.io/google.com/cloudsdktool/cloud-sdk:slim - name: hf-model-to-maxtext + name: reinforcement-learning-model-converter resources: requests: - cpu: 2000m - ephemeral-storage: 1Gi - memory: 10Gi + cpu: "4" + memory: "32Gi" + limits: + cpu: "8" + memory: "64Gi" volumeMounts: - 
- mountPath: /scripts - name: scripts - - mountPath: /var/run/secrets/huggingface.co - name: huggingface-token - restartPolicy: OnFailure - securityContext: - fsGroup: 10000 - serviceAccountName: replaced-by-kustomize - terminationGracePeriodSeconds: 0 + - mountPath: /dev/shm + name: dev-shm + - mountPath: /gcs + name: huggingface-hub-model-bucket volumes: - - configMap: - defaultMode: 0744 - name: hf-model-to-maxtext - name: scripts + - emptyDir: + medium: Memory + name: dev-shm - csi: - driver: secrets-store-gke.csi.k8s.io - readOnly: true + driver: gcsfuse.csi.storage.gke.io volumeAttributes: - secretProviderClass: huggingface-token-read - name: huggingface-token + bucketName: cloud-storage-bucket-name + mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,only-dir:replaced-by-kustomize + skipCSIBucketAccessCheck: "true" + name: huggingface-hub-model-bucket + - emptyDir: + medium: Memory + name: gke-gcsfuse-cache + - emptyDir: + medium: Memory + name: gke-gcsfuse-tmp + - emptyDir: + medium: Memory + name: gke-gcsfuse-buffer diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml index 5024a063c..77ef591ea 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml @@ -25,15 +25,15 @@ patches: - path: set-compute-class.yaml replacements: -- 
source: - fieldPath: data.CONTAINER_IMAGE_URL - kind: ConfigMap - name: reinforcement-learning-model-converter - targets: - - fieldPaths: - - spec.template.spec.containers.[name=reinforcement-learning-model-converter].image - select: - kind: Job + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: reinforcement-learning-model-converter + targets: + - fieldPaths: + - spec.template.spec.containers.[name=model-converter].image + select: + kind: Job - source: fieldPath: data.MODEL_CONVERTER_KUBERNETES_NAMESPACE kind: ConfigMap @@ -41,6 +41,10 @@ replacements: targets: - fieldPaths: - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace select: kind: Job - source: @@ -48,11 +52,10 @@ replacements: kind: ConfigMap name: reinforcement-learning-model-converter targets: - - fieldPaths: - - spec.template.spec.serviceAccountName - select: - kind: Job - name: reinforcement-learning-model-converter + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job resources: - job.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml index dc25b4b0e..00b104052 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml @@ -15,7 +15,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: hf-model-to-maxtext + name: model-converter namespace: replaced-by-kustomize spec: template: diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf index 8e11022ef..7668e9748 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf @@ -24,6 +24,18 @@ output "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_acco value = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name } +output "rl_cpu_reinforcement_learning_model_converter_image_url" { + value = local.rl_cpu_reinforcement_learning_model_converter_image_url +} + +output "rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name" { + value = local.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name +} + +output "rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name" { + value = local.rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name +} + output "rl_dataset_bucket_name" { value = local.rl_dataset_bucket_name } diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh index 1796c0b4e..2d397ec39 100755 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh @@ -29,3 +29,13 @@ declare -a SHARED_CONFIG_PATHS=( export SHARED_CONFIG_PATHS source "${ACP_PLATFORM_BASE_DIR}/_shared_config/scripts/set_environment_variables.sh" + +if [[ -v HF_MODEL_ID ]]; then + HF_MODEL_ID_HASH=$(echo "${HF_MODEL_ID}" | md5sum | cut -c1-8) + export HF_MODEL_ID_HASH + + HF_MODEL_NAME="${HF_MODEL_ID##*/}" + HF_MODEL_NAME="${HF_MODEL_NAME//./-}" + 
HF_MODEL_NAME="${HF_MODEL_NAME,,}" + export HF_MODEL_NAME +fi diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf index f41822b13..7f61cac7d 100644 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf @@ -32,6 +32,11 @@ locals { namespace = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name service_account = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name } + rl_reinforcement_learning_model_converter = { + directory = "${local.namespaces_directory}/${local.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name}" + namespace = local.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name + service_account = local.rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name + } } manifests_directory_root = "${path.module}/../../../../kubernetes/manifests" From e57c04e610c14780f049bda244aa2dbb5b91fa4d Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 2 Apr 2026 15:19:31 +0000 Subject: [PATCH 53/57] feat: add ccc for model converter --- .../cpu/custom-compute-model-converter.yaml | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml new file mode 100644 index 000000000..27f3e4a97 --- /dev/null +++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml @@ -0,0 +1,38 @@ +# Copyright 2025 
Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: cloud.google.com/v1 +kind: ComputeClass +metadata: + name: model-converter +spec: + activeMigration: + optimizeRulePriority: true + nodePoolConfig: + imageStreaming: + enabled: true + nodePoolAutoCreation: + enabled: true + priorities: + - machineType: c4-highmem-8-lssd + maxPodsPerNode: 32 + spot: false + storage: + localSSDCount: 1 + + - machineType: c3-highmem-8-lssd + maxPodsPerNode: 32 + spot: false + storage: + localSSDCount: 1 From aae9d4818cf0078d93886fd4c97aecf1f54d0f14 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 2 Apr 2026 15:20:39 +0000 Subject: [PATCH 54/57] feat: teardown pathways --- .../reinforcement-learning/terraform/teardown-standard.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh index cb212af3c..d327ca761 100755 --- a/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh @@ -68,6 +68,7 @@ done if [ "${ACP_TEARDOWN_CORE_PLATFORM}" = "true" ]; then declare -a CORE_TERRASERVICES_DESTROY=( + "workloads/pathways" "workloads/kueue" "workloads/priority_class" "workloads/lws" From 8219f3e412ba676d5738d2d945e7389f1fa8e0a2 Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 2 
Apr 2026 16:38:25 +0000 Subject: [PATCH 55/57] fix: readme formatting --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 27e18125f..7b7b8e6d9 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ the primary runtime. - [ComfyUI reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/examples/comfyui/README.md) - [Federated learning](/docs/platforms/gke/base/use-cases/federated-learning/README.md) - [Inference reference architecture](/docs/platforms/gke/base/use-cases/inference-ref-arch/README.md) + - [Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md) - [Online inference with GPUs](/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/README.md) - [Online inference using Diffusers with GPUs on Google Kubernetes Engine (GKE)](/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md) From 37151062f465692f51d849127bbfe8fef339327e Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 2 Apr 2026 16:39:54 +0000 Subject: [PATCH 56/57] feat: add highmem to reinforcement learning dictionary --- .github/workflows/dictionary/reinforcement-learning.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/dictionary/reinforcement-learning.txt b/.github/workflows/dictionary/reinforcement-learning.txt index 240fb3a40..e8fc1eff9 100644 --- a/.github/workflows/dictionary/reinforcement-learning.txt +++ b/.github/workflows/dictionary/reinforcement-learning.txt @@ -1,6 +1,7 @@ epath etils grpo +highmem logdir logps maxtext From 11f71a9fe2fe965bf5777583433660b6a8a9c20a Mon Sep 17 00:00:00 2001 From: Laurent Grangeau Date: Thu, 2 Apr 2026 17:15:48 +0000 Subject: [PATCH 57/57] fix: change ccc for model converter --- .../templates/manifests/cpu/custom-compute-model-converter.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml index 27f3e4a97..3c38b1cd3 100644 --- a/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml +++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml @@ -31,7 +31,7 @@ spec: storage: localSSDCount: 1 - - machineType: c3-highmem-8-lssd + - machineType: c3-highmem-8 maxPodsPerNode: 32 spot: false storage: