diff --git a/README.md b/README.md index b8c8324a8..2d7536c51 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,8 @@ the primary runtime. - [Data preparation](/docs/platforms/gke/base/use-cases/training-ref-arch/model-fine-tuning/data-preparation.md) - [Fine tuning](/docs/platforms/gke/base/use-cases/training-ref-arch/model-fine-tuning/fine-tuning.md) - [Model evaluation](/docs/platforms/gke/base/use-cases/training-ref-arch/model-fine-tuning/model-evaluation.md) +- [Reinforcement Learning reference architecture](/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md) + - [RL on TPU](/docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu/README.md) - [Reinforcement Learning reference architecture](/docs/platforms/gke/base/use-cases/reinforcement-learning/README.md) - [RL on TPU](/docs/platforms/gke/base/use-cases/reinforcement-learning/single-host-tpu-grpo/README.md) diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile b/container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile new file mode 100644 index 000000000..b2992c32d --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile @@ -0,0 +1,55 @@ +# syntax=docker.io/docker/dockerfile:1.17.1 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# --- STAGE 1: Build Stage --- +# Use a Python image that includes tools for installing dependencies +FROM python:3.14.0-slim-trixie as builder + +# Set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV APP_HOME /usr/src/app + +# Create and set the working directory +WORKDIR $APP_HOME + +# Copy only the requirements file first to leverage Docker cache +COPY --from=primary requirements.txt . + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# --- STAGE 2: Final Runtime Stage --- +# Use a minimal runtime image for security and size +FROM python:3.14.0-slim-trixie + +# Set environment variables for the runtime +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV APP_HOME /usr/src/app + +# Create and set the working directory +WORKDIR $APP_HOME + +# Copy installed dependencies from the builder stage +COPY --from=builder /usr/local/lib/python3.14/site-packages /usr/local/lib/python3.14/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin + +# Copy the application script itself +COPY --from=primary app.py . + +# Command to run the application when the container starts +CMD ["python", "app.py"] diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml b/container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml new file mode 100644 index 000000000..132075a69 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +images: + - ${_DESTINATION} + +options: + logging: CLOUD_LOGGING_ONLY + machineType: E2_HIGHCPU_8 + +steps: + - args: + - build + - --build-context=primary=container-images/cpu/reinforcement-learning-dataset-downloader/src + - --file=container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile + - --tag=${_DESTINATION} + - . + id: "Build Reinforcement Learning Dataset Downloader image" + name: "docker.io/docker:28.3.3-dind-alpine3.22" + waitFor: ["-"] diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py new file mode 100644 index 000000000..87fa1e997 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/app.py @@ -0,0 +1,107 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging +import logging.config +import os + +from datasets import load_dataset +from google.cloud import storage + +# --- LOGGING CONFIGURATION --- +logging.config.fileConfig("logging.conf", disable_existing_loggers=True) +LOG = logging.getLogger(__name__) + +# --- Configuration --- +DATASET_BUCKET_NAME = os.getenv("DATASET_BUCKET_NAME") +GCS_PREFIX = "gsm8k" +OUTPUT_FILENAME = "gsm8k_full.json" + + +def validate_config() -> None: + """Validates that required environment variables are set. + + Raises: + ValueError: If the DATASET_BUCKET_NAME environment variable is missing or empty. + """ + if not DATASET_BUCKET_NAME: + LOG.error("❌ Error: Environment variable 'DATASET_BUCKET_NAME' is not set.") + raise ValueError("DATASET_BUCKET_NAME environment variable is required.") + + +def prepare_and_upload_dataset() -> None: + """Downloads the GSM8K dataset from Hugging Face and uploads it to Google Cloud Storage. + + This function initializes a GCS client, attempts to fetch the GSM8K dataset + from the Hugging Face hub, converts the records into a single JSON string, + and uploads the resulting file to the configured GCS bucket. + + Raises: + ValueError: If the specified GCS bucket does not exist or is inaccessible. + Exception: If an error occurs during GCS client initialization, dataset + download, or the final upload process. + """ + validate_config() + + # 1. Initialize GCS Client + try: + storage_client = storage.Client() + bucket = storage_client.bucket(DATASET_BUCKET_NAME) + if not bucket.exists(): + LOG.error(f"❌ Error: Bucket '{DATASET_BUCKET_NAME}' is not accessible.") + raise ValueError(f"Bucket '{DATASET_BUCKET_NAME}' is not accessible.") + except Exception as e: + LOG.error(f"❌ Error connecting to GCS: {e}") + raise e + + # 2. 
Load Dataset (GSM8K from Hugging Face) + LOG.info("⬇️ Downloading dataset from Hugging Face...") + try: + # Loading the full 'main' split + dataset = load_dataset("openai/gsm8k", "main", split="train") + except Exception as e: + LOG.info("Attempting alternative split loading...") + dataset = load_dataset("openai/gsm8k", split="train") + + total_records = len(dataset) + LOG.info(f"✅ Dataset loaded. Total records: {total_records}") + + # 3. Convert to List and Upload + LOG.info( + f"🚀 Uploading to gs://{DATASET_BUCKET_NAME}/{GCS_PREFIX}/{OUTPUT_FILENAME} ..." + ) + + try: + # Convert the entire dataset to a list of dicts + dataset_list = list(dataset) + + # Serialize to JSON + json_data = json.dumps(dataset_list, indent=2) + + # Define GCS path + blob_name = f"{GCS_PREFIX}/{OUTPUT_FILENAME}" + blob = bucket.blob(blob_name) + + # Upload string directly to GCS + blob.upload_from_string(data=json_data, content_type="application/json") + LOG.info(f"✨ Successfully uploaded {total_records} records to {blob_name}") + + except Exception as e: + LOG.error(f"❌ Failed to process or upload dataset: {e}") + raise e + + +if __name__ == "__main__": + prepare_and_upload_dataset() diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/logging.conf b/container-images/cpu/reinforcement-learning-dataset-downloader/src/logging.conf new file mode 100644 index 000000000..2c8872eeb --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/logging.conf @@ -0,0 +1,35 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +[loggers] +keys=root + +[handlers] +keys=defaultHandler + +[formatters] +keys=standardFormatter + +[logger_root] +level=INFO +handlers=defaultHandler + +[handler_defaultHandler] +class=StreamHandler +level=INFO +formatter=standardFormatter +args=(sys.stdout,) + +[formatter_standardFormatter] +format=%(asctime)s [%(levelname)s] %(name)s: %(message)s diff --git a/container-images/cpu/reinforcement-learning-dataset-downloader/src/requirements.txt b/container-images/cpu/reinforcement-learning-dataset-downloader/src/requirements.txt new file mode 100644 index 000000000..17c983e20 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-dataset-downloader/src/requirements.txt @@ -0,0 +1,2 @@ +datasets==4.5.0 +google-cloud-storage==3.8.0 diff --git a/container-images/cpu/reinforcement-learning-model-converter/Dockerfile b/container-images/cpu/reinforcement-learning-model-converter/Dockerfile new file mode 100644 index 000000000..dd898d3c8 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-model-converter/Dockerfile @@ -0,0 +1,74 @@ +# syntax=docker.io/docker/dockerfile:1.17.1 + +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# ========================================== +# Stage 1: Builder +# ========================================== +FROM python:3.12.13-slim-trixie AS builder + +# Install build dependencies and git +RUN apt-get update && apt-get install -y \ + git \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +# Clone the MaxText repository and immediately remove the hidden .git folder +RUN git clone https://github.com/AI-Hypercomputer/maxtext.git . \ + && rm -rf .git + +# Create a virtual environment to isolate dependencies +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install 'uv' (the fast Python package installer) +RUN pip install --no-cache-dir uv + +# Install MaxText with the [tpu] dependency group +# (This includes absl-py, jax, etc., which are needed for conversion) +RUN uv pip install .[tpu] --resolution=lowest + +# MaxText uses a custom CLI command to install unreleased dependencies from GitHub +RUN install_maxtext_tpu_github_deps + +# Explicitly install CPU-only PyTorch (Required to read HF Checkpoints) +RUN uv pip install torch --index-url https://download.pytorch.org/whl/cpu + +# ========================================== +# Stage 2: Final Runtime +# ========================================== +FROM python:3.12.13-slim-trixie + +WORKDIR /workspace + +# Copy the pre-built virtual environment from the builder stage +COPY --from=builder /opt/venv /opt/venv + +# Copy the necessary source code from the builder stage +COPY --from=builder /workspace /workspace + +# Activate the virtual environment by default +ENV PATH="/opt/venv/bin:$PATH" + +# Set PYTHONPATH so Python can locate the maxtext modules +ENV PYTHONPATH="/workspace/src:${PYTHONPATH}" + +# Set the entrypoint to the checkpoint conversion script +ENTRYPOINT ["python", "src/maxtext/checkpoint_conversion/to_maxtext.py"] + +# Fallback command to display help flags +CMD ["--help"] diff --git a/container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml 
b/container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml new file mode 100644 index 000000000..0f0e71978 --- /dev/null +++ b/container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml @@ -0,0 +1,30 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +images: + - ${_DESTINATION} + +options: + logging: CLOUD_LOGGING_ONLY + machineType: E2_HIGHCPU_8 + +steps: + - args: + - build + - --file=container-images/cpu/reinforcement-learning-model-converter/Dockerfile + - --tag=${_DESTINATION} + - . + id: "Build Reinforcement Learning Model Converter image" + name: "docker.io/docker:28.3.3-dind-alpine3.22" + waitFor: ["-"] diff --git a/container-images/tpu/reinforcement-learning-on-tpu/Dockerfile b/container-images/tpu/reinforcement-learning-on-tpu/Dockerfile new file mode 100644 index 000000000..9f3177b68 --- /dev/null +++ b/container-images/tpu/reinforcement-learning-on-tpu/Dockerfile @@ -0,0 +1,56 @@ +# syntax=docker.io/docker/dockerfile:1.17.1 + +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.12-slim + +WORKDIR /workspace + +# Install system dependencies +RUN apt-get update && apt-get install -y wget curl build-essential git && rm -rf /var/lib/apt/lists/* + +# Upgrade pip and install the incredibly fast 'uv' package manager +RUN pip install --upgrade pip uv + +# 1. Install standard Torch CPU and specific JAX TPU drivers first (Our safety net!) +RUN uv pip install --system torch torchvision --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --system "jax[tpu]==0.4.25" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html --prerelease=allow + +# 2. Clone the repo +RUN git clone https://github.com/google/maxtext.git /workspace/maxtext + +# Shift Docker's working directory INSIDE the repo +WORKDIR /workspace/maxtext + +# 3. Apply the lowest flag (no 'cd' needed anymore!) +RUN uv pip install --system -e ".[tpu-post-train]" --resolution=lowest + +# 4. Run the script (it will natively find the 'src' folder now!) +RUN install_maxtext_tpu_post_train_extra_deps + +# Shift back to your main workspace for your custom files +WORKDIR /workspace + +# 5. Install our specific MLOps tracking tools +RUN uv pip install --system mlflow huggingface_hub math_verify + +# 6. Download the chat template directly +RUN wget https://raw.githubusercontent.com/google/maxtext/main/src/maxtext/examples/chat_templates/gsm8k_rl.json -O /workspace/gsm8k_rl.json + +# Copy our training script +COPY --from=primary app.py . 
+ +# Execute the script +CMD ["python3", "app.py"] diff --git a/container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml b/container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml new file mode 100644 index 000000000..90a458d31 --- /dev/null +++ b/container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +images: + - ${_DESTINATION} + +options: + logging: CLOUD_LOGGING_ONLY + machineType: E2_HIGHCPU_8 + +steps: + - args: + - build + - --build-context=primary=container-images/tpu/rl-on-tpu/src + - --file=container-images/tpu/rl-on-tpu/Dockerfile + - --tag=${_DESTINATION} + - . + id: "Build RL on TPU image" + name: "docker.io/docker:28.3.3-dind-alpine3.22" + waitFor: ["-"] diff --git a/container-images/tpu/reinforcement-learning-on-tpu/src/app.py b/container-images/tpu/reinforcement-learning-on-tpu/src/app.py new file mode 100644 index 000000000..7ea803688 --- /dev/null +++ b/container-images/tpu/reinforcement-learning-on-tpu/src/app.py @@ -0,0 +1,219 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +# --- SYSTEM SHIELDS (Must be at the very top!) --- +os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +import datetime +import subprocess +import sys + +import jax +import jax.numpy as jnp +import mlflow +from huggingface_hub import login + +# --- Replace the old tunix import with this --- +from maxtext.inference.vllm_decode import VllmRollout + +# 1. Save the original method +original_get_logps = VllmRollout.get_per_token_logps + + +def patched_get_per_token_logps(self, *args, **kwargs): + # Fix A: Intercept the mask to use as a blueprint + completion_mask = kwargs.pop("completion_mask", None) + + # Call the actual vLLM execution + results = original_get_logps(self, *args, **kwargs) + + # Extract target length (defaults to 768 if mask is missing) + target_len = completion_mask.shape[-1] if completion_mask is not None else 768 + + def pad_sequence(seq): + seq_arr = jnp.array(seq) + + # If vLLM returned an empty array, return a zeroed array of correct shape + if seq_arr.size == 0: + return jnp.zeros(target_len) + + # Pad with zeros if too short, or truncate if too long + pad_amount = target_len - seq_arr.shape[0] + if pad_amount > 0: + return jnp.pad(seq_arr, (0, pad_amount), constant_values=0.0) + elif pad_amount < 0: + return seq_arr[:target_len] + return seq_arr + + # Fix B: Process ragged lists and perfectly pad them into a rigid JAX block + if isinstance(results, list): + padded_results = [pad_sequence(seq) for seq in results] + return jnp.stack(padded_results) + + 
elif isinstance(results, dict): + return { + k: jnp.stack([pad_sequence(seq) for seq in v]) if isinstance(v, list) else v + for k, v in results.items() + } + + return results + + +# 2. Apply the patch +VllmRollout.get_per_token_logps = patched_get_per_token_logps + +print( + "🔧 Applied Monkey Patch v3: Intercepted kwargs and perfectly padded ragged JAX arrays." +) + +try: + import vllm + + print(f"✅ vLLM Version: {vllm.__version__}") + print(f"✅ JAX TPU Devices: {len(jax.devices())}") +except ImportError as e: + print(f"🚨 FATAL: vLLM is not installed correctly: {e}") + +# --- CORE IMPORTS --- +import maxtext +import maxtext.checkpoint_conversion.to_maxtext as to_maxtext_module +from etils import epath +from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices + +HF_TOKEN = os.environ.get("HF_TOKEN") +if not HF_TOKEN: + raise ValueError("HF_TOKEN environment variable not set.") +login(token=HF_TOKEN) + +# Replace it with the hardcoded absolute path where we cloned the repo: +MAXTEXT_PKG_DIR = "/workspace/maxtext/src/maxtext" + +MODEL_NAME = "llama3.1-8b" +TOKENIZER_PATH = "meta-llama/Llama-3.1-8B-Instruct" +RUN_NAME = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") +LOSS_ALGO = "grpo" + +# Paths are localized to the workspace +CHAT_TEMPLATE_PATH = "/workspace/gsm8k_rl.json" +MODEL_CHECKPOINT_PATH = "/workspace/llama_checkpoint" +OUTPUT_DIRECTORY = "/workspace/rl_llama3_output" + +mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")) +mlflow.set_experiment("MaxText-RL-GRPO") + +# --- CHECKPOINT CONVERSION (With FP32 casting patch) --- +target_checkpoint_items = f"{MODEL_CHECKPOINT_PATH}/0/items" + +if not epath.Path(target_checkpoint_items).exists(): + print(f"Downloading and converting Llama 3.1 to MaxText format...") + to_maxtext_path = to_maxtext_module.__file__ + + with open(to_maxtext_path, "r") as f: + script_content = f.read() + + if "v.numpy()" in script_content: + script_content = 
script_content.replace("v.numpy()", "v.float().numpy()") + + with open(to_maxtext_path, "w") as f: + f.write(script_content) + + conversion_command = ( + f"JAX_PLATFORMS=cpu python3 -m maxtext.checkpoint_conversion.to_maxtext " + f"{MAXTEXT_PKG_DIR}/configs/base.yml " + f"model_name={MODEL_NAME} " + f"base_output_directory={MODEL_CHECKPOINT_PATH} " + f"hf_access_token={HF_TOKEN} " + f"use_multimodal=false scan_layers=true skip_jax_distributed_system=True" + ) + + result = subprocess.run(conversion_command, shell=True, executable="/bin/bash") + if result.returncode != 0: + raise RuntimeError("Checkpoint conversion failed! Check the logs above.") +else: + print(f"✅ Found existing checkpoint at {target_checkpoint_items}") + +# --- MAXTEXT RL CONFIGURATION --- +config_argv = [ + "", + f"{MAXTEXT_PKG_DIR}/configs/post_train/rl.yml", + f"model_name={MODEL_NAME}", + f"tokenizer_path={TOKENIZER_PATH}", + f"run_name={RUN_NAME}", + f"chat_template_path={CHAT_TEMPLATE_PATH}", + f"load_parameters_path={MODEL_CHECKPOINT_PATH}/0/items", + # f"base_output_directory={OUTPUT_DIRECTORY}", + # --- DIRECT TO GCS ROUTING --- + f"base_output_directory=gs://accelerated-platforms-dev-trn-rl-gpu-hf-hub-models/my-grpo-checkpoints/rl_llama3_output/{RUN_NAME}", + f"hf_access_token={HF_TOKEN}", + "debug.rl=False", + f"rl.loss_algo={LOSS_ALGO}", + "rl.rollout_engine=vllm", + "use_pathways=False", + "rollout_expert_parallelism=1", + # --- THE MESH & MEMORY FIX --- + # 1. We keep vLLM sliced across all 8 chips + "rollout_tensor_parallelism=8", + # 2. DELETE the ici_tensor_parallelism line! We let MaxText default to FSDP. + # 3. Restrict vLLM to 40% memory so MaxText has room to train + "hbm_utilization_vllm=0.4", + # 4. 
Give vLLM the blueprint it needs to build its mesh scaffolding + f"vllm_hf_config_path={TOKENIZER_PATH}", + # --- SCALING UP FOR GRPO --- + # "per_device_batch_size=4", # Increased from 1 to test micro-batching + # "rl.num_generations=4", # Crucial for GRPO: Generates 4 reasoning chains per prompt + # --- THE SCALE UP (Real Training Run) --- + "per_device_batch_size=2", # Doubling the throughput (pushes your 60% HBM limit) + "num_batches=200", # Process 200 batches instead of the tiny default + "rl.num_generations=8", # GRPO magic: vLLM generates 8 different answers per prompt to compare + "rl.num_iterations=2", # Train the actor model for 2 iterations on those 8 answers + "learning_rate=1e-6", # A standard, safe learning rate for RL fine-tuning + "save_checkpoint_on_completion=True", # Ensure the final weights are saved! + "return_log_prob=True", # <-- The crucial GRPO math flag! + # --- THE MLFLOW FIX --- + "log_period=10", # Force TensorBoard to write metrics to disk every 10 steps + "checkpoint_period=50", + "profiler=True", + "profiler_steps=100,110", # Takes a massive hardware snapshot between step 100 and 110 +] + +trainer_config, sampler_config, trainer_devices, sampler_devices = ( + setup_configs_and_devices(config_argv) +) + +# --- EXECUTE TRAINING --- +with mlflow.start_run(run_name=f"Llama3.1-8B-{LOSS_ALGO}"): + mlflow.log_params( + { + "model_name": MODEL_NAME, + "loss_algo": LOSS_ALGO, + "tpu_devices": len(jax.devices()), + "rollout_engine": "vllm", + } + ) + + print(f"🚀 Starting {LOSS_ALGO} Training on {len(jax.devices())} TPUs...") + rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices) + + mlflow.log_artifacts( + trainer_config.tensorboard_dir, artifact_path="tensorboard_logs" + ) + print("✅ Training Completed and Logged to MLflow!") + mlflow.log_artifacts( + trainer_config.tensorboard_dir, artifact_path="tensorboard_logs" + ) + print("✅ Training Completed and Logged to MLflow!") diff --git 
a/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md b/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md new file mode 100644 index 000000000..46dd442f3 --- /dev/null +++ b/docs/platforms/gke/base/use-cases/reinforcement-larning/README.md @@ -0,0 +1 @@ +# Reinforcement Learning reference architecture diff --git a/docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu/README.md b/docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu/README.md new file mode 100644 index 000000000..578e219af --- /dev/null +++ b/docs/platforms/gke/base/use-cases/reinforcement-larning/rl-on-tpu/README.md @@ -0,0 +1,161 @@ +# Llama 3.1 8B GRPO Training on GKE (TPU v5e) + +This repository contains a production-ready, end-to-end Reinforcement Learning +(GRPO) pipeline for a single-node smoke test of Llama 3.1 8B on Google +Kubernetes Engine (GKE) using a TPU v5e-8 slice. + +It integrates **MaxText** (for FSDP model training), **vLLM** (for +high-throughput rollout generation), and **Tunix** (the RL bridge). + +_Note: This script is currently configured as a single-batch smoke test. Scaling +up `rl.num_generations` or `per_device_batch_size` for a full training run +triggers an upstream Tunix API mismatch that requires a custom vLLM Monkey +Patch._ + +## 🚀 Quick Start & Environment Setup + +### 1. Provision & Connect to the GKE Cluster + +This pipeline is designed to run on the Accelerated Platforms training reference +architecture, which comes pre-configured with CCC and all necessary topology +routing. 
+ +If you do not already have a cluster running, follow the official infrastructure +provisioning guide to spin up a TPU v5e cluster: 👉 +**[Accelerated Platforms GKE Training Architecture README](https://github.com/GoogleCloudPlatform/accelerated-platforms/blob/kr-rl/platforms/gke/base/use-cases/training-ref-arch/terraform/README.md)** + +Once your cluster is up and running, fetch your cluster credentials (replace +with your actual cluster name and region/zone): + +```bash +export PROJECT_ID="<PROJECT_ID>" +gcloud config set project $PROJECT_ID +gcloud container clusters get-credentials <CLUSTER_NAME> --location <LOCATION> +``` + +### 2. Configure the Hugging Face Secret + +You must have access to the Meta Llama 3.1 weights. The training job securely +pulls your token from a Kubernetes secret. Create it in your active namespace: + +```bash +kubectl create secret generic hf-secret --from-literal=token="<HF_TOKEN>" +``` + +### 3. Hardware & Storage Prerequisites + +- **Hardware:** This configuration is strictly tuned for a **TPU v5e-8** + topology. +- **Storage:** The container requires local ephemeral storage (or a mounted SSD) + at `/workspace` to handle the 16GB checkpoint conversions. + +--- + +## 🛠️ How to Deploy and Run + +### 1. Deploy the MLflow Tracking Server + +Before starting the training job, you must spin up the MLflow service so the +training pod has somewhere to send its metrics and artifacts. + +```bash +kubectl apply -f mlflow.yaml +``` + +_(Note: This uses a `ClusterIP` configuration, meaning the dashboard is kept +completely internal and secure inside our GKE cluster. The training pod will +automatically discover it at `mlflow-service:5000`)_. + +### 2. Build and Push the Training Image + +```bash +docker build -t your-registry/maxtext-grpo:latest . +docker push your-registry/maxtext-grpo:latest +``` + +### 3. Submit the GKE Training Job + +```bash +kubectl apply -f v5e-job.yaml +``` + +### 4. 
Tail the Logs + +```bash +kubectl logs -f job/maxtext-grpo-job-v5e +``` + +--- + +## 📊 Viewing Metrics (MLflow & TensorBoard) + +MaxText uses a custom C++ backend that logs directly to a local TensorBoard +folder. To make this visible to the team, the `app.py` script automatically +uploads this folder to **MLflow** as an artifact when the run +completes. + +### Accessing the MLflow UI + +Because MLflow is running securely inside the cluster, you need to port-forward +it to your local machine to view the dashboard: + +1. **Port-forward the MLflow Service:** + +```bash +kubectl port-forward svc/mlflow-service 5000:5000 +``` + +2. **Open your Browser:** Navigate to `http://localhost:5000` +3. **View the Run Data:** + +- Go to the `MaxText-RL-GRPO` experiment. +- Click on your specific run (e.g., `Llama3.1-8B-grpo`). +- Scroll down to the **Artifacts** section. You will see the `tensorboard_logs` + folder attached there. + +### Live Tracking (During Training) + +If you want to watch the loss curves in real-time _before_ the job finishes and +uploads to MLflow, you can port-forward TensorBoard directly from the running +pod: + +```bash +kubectl exec -it job/maxtext-grpo-job-v5e -- tensorboard --logdir /workspace/rl_llama3_output --host 0.0.0.0 --port 6006 +kubectl port-forward job/maxtext-grpo-job-v5e 6006:6006 +``` + +--- + +## ⚠️ Critical Architecture Notes & Patches (Do Not Remove) + +Because we are bridging experimental research frameworks (MaxText/Tunix) with +open-source inference (vLLM), several runtime patches are applied in `app.py` +and the `Dockerfile`. **If you modify this pipeline, keep these constraints in +mind:** + +### 1. The C++ Protobuf Shield + +vLLM uses `os.fork()` for its background workers, which fatally crashes the C++ +Protobuf engine loaded by JAX (`SIGABRT`). + +- **The Fix:** We force Python protobufs and `spawn` multiprocessing at the + absolute top of `app.py`. + +### 2. 
JAX Version Pinning (`0.4.25`) + +Newer versions of JAX strictly enforce `with_sharding_constraint` as an +assertion. Tunix currently violates this when mapping weights to vLLM, causing a +fatal mesh crash. + +- **The Fix:** The `Dockerfile` explicitly pins `jax[tpu]==0.4.25` using the + `--prerelease=allow` flag to grab the stable nightly drivers. + +### 3. Memory & Mesh Tuning + +To prevent vLLM from causing `RESOURCE_EXHAUSTED` (OOM) errors and starving +MaxText's FSDP optimizer: + +- `rollout_tensor_parallelism=8`: Maps vLLM across all 8 chips. +- `hbm_utilization_vllm=0.4`: Restricts vLLM to 40% of the TPU memory. +- _Note:_ The `ici_tensor_parallelism` flag is intentionally omitted so MaxText + defaults to FSDP for training. diff --git a/platforms/gke/base/_shared_config/workloads_variables.tf b/platforms/gke/base/_shared_config/workloads_variables.tf index 21962f00f..eb3b053d8 100644 --- a/platforms/gke/base/_shared_config/workloads_variables.tf +++ b/platforms/gke/base/_shared_config/workloads_variables.tf @@ -17,7 +17,7 @@ locals { } variable "custom_metrics_adapter_version" { - default = "0.16.2" + default = "0.16.5" description = "Version of Custom Metrics Adapter (https://github.com/GoogleCloudPlatform/k8s-stackdriver) to install." type = string } @@ -29,31 +29,37 @@ variable "inference_gateway_kubernetes_namespace" { } variable "inference_gateway_version" { - default = "1.1.0" + default = "1.4.0" description = "Version of Gateway API Inference Extension (https://github.com/kubernetes-sigs/gateway-api-inference-extension) to install." type = string } variable "jobset_version" { - default = "0.10.1" + default = "0.11.1" description = "Version of JobSet (https://github.com/kubernetes-sigs/jobset/) to install." type = string } variable "kuberay_version" { - default = "1.5.1" + default = "1.6.0" description = "Version of KubeRay (https://github.com/ray-project/kuberay) to install." 
type = string } variable "kueue_version" { - default = "0.14.4" + default = "0.16.4" description = "Version of Kueue (https://kueue.sigs.k8s.io/) to install." type = string } variable "lws_version" { - default = "0.7.0" + default = "0.8.0" description = "Version of LeaderWorkerSet (LWS) (https://github.com/kubernetes-sigs/lws/) to install." type = string } + +variable "pathways_version" { + default = "0.1.4" + description = "Version of Pathways (https://github.com/google/pathways-job) to install." + type = string +} diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml new file mode 100644 index 000000000..3c38b1cd3 --- /dev/null +++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/custom-compute-model-converter.yaml @@ -0,0 +1,38 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: cloud.google.com/v1 +kind: ComputeClass +metadata: + name: model-converter +spec: + activeMigration: + optimizeRulePriority: true + nodePoolConfig: + imageStreaming: + enabled: true + nodePoolAutoCreation: + enabled: true + priorities: + - machineType: c4-highmem-8-lssd + maxPodsPerNode: 32 + spot: false + storage: + localSSDCount: 1 + + - machineType: c3-highmem-8 + maxPodsPerNode: 32 + spot: false + storage: + localSSDCount: 1 diff --git a/platforms/gke/base/core/workloads/pathways/_cluster.auto.tfvars b/platforms/gke/base/core/workloads/pathways/_cluster.auto.tfvars new file mode 120000 index 000000000..4d9954e5a --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_cluster.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_cluster_variables.tf b/platforms/gke/base/core/workloads/pathways/_cluster_variables.tf new file mode 120000 index 000000000..3f2c29e19 --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_cluster_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_platform.auto.tfvars b/platforms/gke/base/core/workloads/pathways/_platform.auto.tfvars new file mode 120000 index 000000000..c3133e727 --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_platform_variables.tf b/platforms/gke/base/core/workloads/pathways/_platform_variables.tf new file mode 120000 index 000000000..c68738baa --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_workloads.auto.tfvars 
b/platforms/gke/base/core/workloads/pathways/_workloads.auto.tfvars new file mode 120000 index 000000000..b65551f53 --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_workloads.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/workloads.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/_workloads_variables.tf b/platforms/gke/base/core/workloads/pathways/_workloads_variables.tf new file mode 120000 index 000000000..fec5c48ce --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/_workloads_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/workloads_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/pathways/main.tf b/platforms/gke/base/core/workloads/pathways/main.tf new file mode 100644 index 000000000..f72efb6d5 --- /dev/null +++ b/platforms/gke/base/core/workloads/pathways/main.tf @@ -0,0 +1,129 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + kubeconfig_directory = "${path.module}/../../../kubernetes/kubeconfig" + kubeconfig_file = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}" + + manifests_directory = "${local.namespace_directory}/pathways-system" + namespace_directory = "${local.manifests_directory_root}/namespace" + version_manifests_directory = "${path.module}/manifests/pathways-${var.pathways_version}" # Pathways manifests are versioned by var.pathways_version +} + +data "local_file" "kubeconfig" { + filename = local.kubeconfig_file +} + +resource "terraform_data" "namespace" { + input = { + manifests_dir = local.namespace_directory + } + + provisioner "local-exec" { + command = </dev/null 2>&1 + pwd -P +)" + +if [[ ! -v HF_MODEL_ID ]]; then + echo "HF_MODEL_ID is not set, exiting!" + exit 1 +fi + +source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" + +secret_version_found=$(gcloud secrets versions list "${huggingface_hub_access_token_read_secret_manager_secret_name}" \ +--project="${huggingface_secret_manager_project_id}" 2>/dev/null | grep "enabled" | wc -l) + +if [[ ${secret_version_found} == 0 ]]; then + echo "Hugging Face Hub read token secret '${huggingface_hub_access_token_read_secret_manager_secret_name}' version is missing or not enabled! Please add the token to the secret, exiting." 
+ exit 1 +fi + +envsubst < "${MY_PATH}/huggingface/templates/downloader.tpl.env" | sponge "${MY_PATH}/huggingface/downloader.env" + +envsubst < "${MY_PATH}/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/huggingface/secretproviderclass-huggingface-tokens.yaml" + +cd "${MY_PATH}/huggingface" +kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/configmap-scripts.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/configmap-scripts.yaml new file mode 100644 index 000000000..f08d03c67 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/configmap-scripts.yaml @@ -0,0 +1,128 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: hf-model-to-gcs + namespace: replaced-by-kustomize +data: + download.sh: | + #!/usr/bin/env bash + set -o errexit + set -o nounset + set -o pipefail + + start_download=$(date +%s) + + echo "Starting download of ${MODEL_ID}..." 
+ hf download \ + --local-dir /local/hf/model \ + --max-workers ${HF_MAX_WORKERS:-"8"} \ + --repo-type model \ + ${MODEL_ID} + + end_download=$(date +%s) + runtime_download=$((end_download - start_download)) + echo "Download runtime: $(date -d@${runtime_download} -u +%H:%M:%S)" + + echo "Removing cache directory" + rm -rf /local/hf/model/.cache + install_packages.sh: | + #!/usr/bin/env bash + set -o errexit + set -o nounset + set -o pipefail + + echo "Installing required packages..." + + pip3 install \ + --break-system-packages \ + --root-user-action=ignore \ + --upgrade \ + huggingface_hub + run.sh: | + #!/usr/bin/env bash + set -o errexit + set -o nounset + set -o pipefail + MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P + )" + + start=$(date +%s) + + if [ -z "${MODEL_ID:-}" ]; then + echo "Error: MODEL_ID is not set." + exit 1 + fi + if [ -z "${MODEL_BUCKET_NAME:-}" ]; then + echo "Error: MODEL_BUCKET_NAME is not set." + exit 1 + fi + + export MODEL_ID=${MODEL_ID,,} + echo "Preparing to download '${MODEL_ID}' from Hugging Face to the '${MODEL_BUCKET_NAME}' Cloud Storage bucket" + + echo "Creating '${MODEL_ID}' model folder in '${MODEL_BUCKET_NAME}' bucket" + if [[ "${REPLACE_EXISTING:-false}" == "true" ]]; then + gcloud storage folders create --recursive "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/" || echo "Bucket already exists" + else + gcloud storage folders create --recursive "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/" + fi + + "${MY_PATH}/install_packages.sh" + + "${MY_PATH}/download.sh" + + "${MY_PATH}/transfer.sh" + + end=$(date +%s) + runtime=$((end - start)) + echo "Total runtime: $(date -d@${runtime} -u +%H:%M:%S)" + transfer.sh: | + #!/usr/bin/env bash + set -o errexit + set -o nounset + set -o pipefail + + start_transfer=$(date +%s) + + if [[ "${REPLACE_EXISTING:-false}" == "true" ]]; then + echo "Removing existing model files..." 
+ gcloud storage rm \ + --recursive \ + "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/*" || echo "No existing model files" + fi + + echo "Transferring model to the bucket" + gcloud config set storage/parallel_composite_upload_enabled True + gcloud config set storage/parallel_composite_upload_component_prefix parallel_composite_uploads + + gcloud storage cp \ + --gzip-in-flight-all \ + --recursive \ + /local/hf/model/* \ + "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/" + + echo "Removing temporary files" + gcloud storage rm \ + --recursive \ + "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/parallel_composite_uploads" \ + "gs://${MODEL_BUCKET_NAME}/${MODEL_ID}/**/parallel_composite_uploads" || echo "No temporary files to removes" + + end_transfer=$(date +%s) + runtime_transfer=$((end_transfer - start_transfer)) + echo "Transfer runtime: $(date -d@${runtime_transfer} -u +%H:%M:%S)" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/job.yaml new file mode 100644 index 000000000..319ec4270 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/job.yaml @@ -0,0 +1,80 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: batch/v1 +kind: Job +metadata: + name: hf-model-to-gcs + namespace: replaced-by-kustomize +spec: + backoffLimit: 0 + template: + metadata: + labels: + app: hf-model-to-gcs + spec: + containers: + - args: ["/scripts/run.sh"] + command: ["/bin/sh", "-c"] + env: + - name: HF_TOKEN_PATH + value: /var/run/secrets/huggingface.co/token + - name: HF_MAX_WORKERS + value: "2" + - name: HF_XET_CACHE + value: /local/hf/xet + - name: HF_XET_NUM_CONCURRENT_RANGE_GETS + value: "4" + - name: HF_XET_HIGH_PERFORMANCE + value: "0" + - name: MODEL_ID + valueFrom: + configMapKeyRef: + key: MODEL_ID + name: download + - name: MODEL_BUCKET_NAME + valueFrom: + configMapKeyRef: + key: MODEL_BUCKET_NAME + name: download + - name: REPLACE_EXISTING + value: "true" + image: gcr.io/google.com/cloudsdktool/cloud-sdk:slim + name: hf-model-to-gcs + resources: + requests: + cpu: 2000m + ephemeral-storage: 1Gi + memory: 10Gi + volumeMounts: + - mountPath: /scripts + name: scripts + - mountPath: /var/run/secrets/huggingface.co + name: huggingface-token + restartPolicy: OnFailure + securityContext: + fsGroup: 10000 + serviceAccountName: replaced-by-kustomize + terminationGracePeriodSeconds: 0 + volumes: + - configMap: + defaultMode: 0744 + name: hf-model-to-gcs + name: scripts + - csi: + driver: secrets-store-gke.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: huggingface-token-read + name: huggingface-token diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/kustomization.yaml new file mode 100644 index 000000000..73e8fd288 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/kustomization.yaml @@ -0,0 +1,69 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you 
may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - downloader.env + name: download + namespace: replaced-by-kustomize + +patches: + - path: set-compute-class.yaml + +replacements: + - source: + fieldPath: data.DOWNLOADER_KUBERNETES_NAMESPACE + kind: ConfigMap + name: download + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Job + - fieldPaths: + - metadata.namespace + select: + kind: SecretProviderClass + - source: + fieldPath: data.DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: download + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + name: hf-model-to-gcs + - source: + kind: SecretProviderClass + name: huggingface-token-read + fieldPath: metadata.name + targets: + - select: + kind: Job + name: hf-model-to-gcs + fieldPaths: + - spec.template.spec.volumes.[name=huggingface-token].csi.volumeAttributes.secretProviderClass + +resources: + - configmap-scripts.yaml + - job.yaml + - secretproviderclass-huggingface-tokens.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/set-compute-class.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/set-compute-class.yaml new file mode 100644 index 000000000..739d3164a --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/set-compute-class.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: hf-model-to-gcs + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: model-download diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/downloader.tpl.env b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/downloader.tpl.env new file mode 100644 index 000000000..969cf873f --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/downloader.tpl.env @@ -0,0 +1,6 @@ +DOWNLOADER_KUBERNETES_NAMESPACE=${huggingface_hub_downloader_kubernetes_namespace_name} +DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT=${huggingface_hub_downloader_kubernetes_service_account_name} +HUGGINGFACE_TOKEN_READ_SECRET_PROVIDER_CLASS_NAME=huggingface-token-read +HUGGINGFACE_TOKEN_WRITE_SECRET_PROVIDER_CLASS_NAME=huggingface-token-write +MODEL_BUCKET_NAME=${huggingface_hub_models_bucket_name} +MODEL_ID=${HF_MODEL_ID} diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml new file mode 100644 index 000000000..dd8db6665 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/huggingface/templates/secretproviderclass-huggingface-tokens.tpl.yaml @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: huggingface-token-read + namespace: replaced-by-kustomize +spec: + parameters: + secrets: | + - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_read_secret_manager_secret_name}/versions/latest" + path: "token" + provider: gke +--- +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: huggingface-token-write + namespace: replaced-by-kustomize +spec: + parameters: + secrets: | + - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_write_secret_manager_secret_name}/versions/latest" + path: "token" + provider: gke diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/job.yaml new file mode 100644 index 000000000..ddc728070 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/job.yaml @@ -0,0 +1,38 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: batch/v1 +kind: Job +metadata: + name: reinforcement-learning-dataset-downloader + namespace: replaced-by-kustomize +spec: + template: + metadata: + labels: + app: reinforcement-learning-dataset-downloader + spec: + restartPolicy: OnFailure + containers: + - env: + - name: DATASET_BUCKET_NAME + valueFrom: + configMapKeyRef: + key: DATASET_BUCKET_NAME + name: reinforcement-learning-dataset-downloader + image: replaced-by-kustomize + imagePullPolicy: Always + name: reinforcement-learning-dataset-downloader + resources: {} + serviceAccountName: replaced-by-kustomize diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/kustomization.yaml new file mode 100644 index 000000000..104e572b7 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/kustomization.yaml @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: +- envs: + - reinforcement-learning-dataset-downloader.env + name: reinforcement-learning-dataset-downloader + namespace: replaced-by-kustomize + +replacements: +- source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: reinforcement-learning-dataset-downloader + targets: + - fieldPaths: + - spec.template.spec.containers.[name=reinforcement-learning-dataset-downloader].image + select: + kind: Job +- source: + fieldPath: data.DATASET_DOWNLOADER_KUBERNETES_NAMESPACE + kind: ConfigMap + name: reinforcement-learning-dataset-downloader + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Job +- source: + fieldPath: data.DATASET_DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: reinforcement-learning-dataset-downloader + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + +patches: +- path: patch-nodeselector.yaml + +resources: +- job.yaml +namePrefix: 26854af7- diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml new file mode 100644 index 000000000..f66df7f6c --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: reinforcement-learning-dataset-downloader + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: cpu-e2-s-16-co diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env new file mode 100644 index 000000000..c40591a1d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/base/templates/reinforcement-learning-dataset-downloader.tpl.env @@ -0,0 +1,4 @@ +DATASET_DOWNLOADER_KUBERNETES_NAMESPACE=${rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name} +DATASET_DOWNLOADER_KUBERNETES_SERVICE_ACCOUNT=${rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name} +CONTAINER_IMAGE_URL=${rl_cpu_reinforcement_learning_dataset_downloader_image_url} +DATASET_BUCKET_NAME=${rl_dataset_bucket_name} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh new file mode 100755 index 
000000000..e8090bc79 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-dataset-downloader/configure_dataset_downloader.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -o errexit +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +RANDOM_HASH=$(openssl rand -hex 4) +echo "${RANDOM_HASH}" > job_random_hash.txt + +source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" + +envsubst <"${MY_PATH}/base/templates/reinforcement-learning-dataset-downloader.tpl.env" | sponge "${MY_PATH}/base/reinforcement-learning-dataset-downloader.env" + +cd "${MY_PATH}/base" +kustomize edit set nameprefix "${RANDOM_HASH}-" diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml new file mode 100644 index 000000000..9c14ff05e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/job.yaml @@ -0,0 +1,82 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: model-converter + namespace: replaced-by-kustomize +spec: + ttlSecondsAfterFinished: 3600 + template: + metadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + labels: + app: model-converter + spec: + restartPolicy: Never + serviceAccountName: replaced-by-kustomize + containers: + - name: model-converter + image: replaced-by-kustomize # Replace with your pushed model-converter image + command: + - python + - src/maxtext/checkpoint_conversion/to_maxtext.py + args: + - "--hf_model_path=/gcs/$(MODEL_ID)" + - "--lazy_load_tensors=True" + # Add output flags based on your specific MaxText version, e.g., + # - "--base_output_directory=/gcs/converted_models/" + env: + - name: MODEL_ID + valueFrom: + configMapKeyRef: + key: MODEL_ID + name: reinforcement-learning-model-converter + resources: + requests: + cpu: "4" + memory: "32Gi" + limits: + cpu: "8" + memory: "64Gi" + volumeMounts: + - mountPath: /dev/shm + name: dev-shm + - mountPath: /gcs + name: huggingface-hub-model-bucket + volumes: + - emptyDir: + medium: Memory + name: dev-shm + - csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: cloud-storage-bucket-name + mountOptions: 
metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,only-dir:replaced-by-kustomize + skipCSIBucketAccessCheck: "true" + name: huggingface-hub-model-bucket + - emptyDir: + medium: Memory + name: gke-gcsfuse-cache + - emptyDir: + medium: Memory + name: gke-gcsfuse-tmp + - emptyDir: + medium: Memory + name: gke-gcsfuse-buffer diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml new file mode 100644 index 000000000..77ef591ea --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/kustomization.yaml @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - model_converter.env + name: reinforcement-learning-model-converter + namespace: replaced-by-kustomize + +patches: + - path: set-compute-class.yaml + +replacements: + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: reinforcement-learning-model-converter + targets: + - fieldPaths: + - spec.template.spec.containers.[name=model-converter].image + select: + kind: Job + - source: + fieldPath: data.MODEL_CONVERTER_KUBERNETES_NAMESPACE + kind: ConfigMap + name: reinforcement-learning-model-converter + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Job + - source: + fieldPath: data.MODEL_CONVERTER_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: reinforcement-learning-model-converter + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + +resources: + - job.yaml diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml new file mode 100644 index 000000000..00b104052 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/set-compute-class.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: model-converter
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: model-converter
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/templates/model_converter.tpl.env b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/templates/model_converter.tpl.env
new file mode 100644
index 000000000..1551b7a22
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/base/templates/model_converter.tpl.env
@@ -0,0 +1,5 @@
+MODEL_CONVERTER_KUBERNETES_NAMESPACE=${rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name}
+MODEL_CONVERTER_KUBERNETES_SERVICE_ACCOUNT=${rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name}
+CONTAINER_IMAGE_URL=${rl_cpu_reinforcement_learning_model_converter_image_url}
+MODEL_BUCKET_NAME=${huggingface_hub_models_bucket_name}
+MODEL_ID=${HF_MODEL_ID}
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/configure_model_converter.sh b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/configure_model_converter.sh
new file mode 100755
index 000000000..d9c9e7938
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/reinforcement-learning-model-converter/configure_model_converter.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Renders base/model_converter.env from its template and sets the kustomize
+# name prefix for the model converter manifests. Requires HF_MODEL_ID.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+# Absolute, symlink-resolved directory that contains this script.
+MY_PATH="$(
+  cd "$(dirname "$0")" >/dev/null 2>&1
+  pwd -P
+)"
+
+if [[ ! -v HF_MODEL_ID ]]; then
+  echo "HF_MODEL_ID is not set, exiting!"
+  exit 1
+fi
+
+# shellcheck disable=SC1091
+source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh"
+
+# Write the env file that base/kustomization.yaml reads via configMapGenerator.
+envsubst < "${MY_PATH}/base/templates/model_converter.tpl.env" | sponge "${MY_PATH}/base/model_converter.env"
+
+cd "${MY_PATH}/base"
+kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-"
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/job.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/job.yaml
new file mode 100644
index 000000000..20904ba21
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/job.yaml
@@ -0,0 +1,43 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: reinforcement-learning-maxtext-grpo
+  namespace: replaced-by-kustomize
+spec:
+  backoffLimit: 0
+  template:
+    spec:
+      restartPolicy: Never
+      nodeSelector:
+        cloud.google.com/compute-class: "tpu-v5e-2x4"
+      containers:
+        - name: grpo-trainer
+          image: replaced-by-kustomize
+          resources:
+            requests:
+              google.com/tpu: 8
+            limits:
+              google.com/tpu: 8
+          env:
+            - name: MLFLOW_TRACKING_URI
+              value: "http://mlflow-service:5000"
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-secret
+                  key: token
+      serviceAccountName: replaced-by-kustomize
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/kustomization.yaml
new file mode 100644
index 000000000..48a2ea4da
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/base/kustomization.yaml
@@ -0,0 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - job.yaml
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml
new file mode 100644
index 000000000..b7ebbf4b3
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/kustomization.yaml
@@ -0,0 +1,67 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: "-v5e-2x4-llama-3-1-8b-instruct"
+
+patches:
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+# NOTE(review): the replacements below assume runtime.env defines
+# CONTAINER_IMAGE_URL, RL_ON_TPU_KUBERNETES_NAMESPACE and
+# RL_ON_TPU_KUBERNETES_SERVICE_ACCOUNT -- confirm against the env template.
+replacements:
+  - source:
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=grpo-trainer].image
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.RL_ON_TPU_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.RL_ON_TPU_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Job
+
+resources:
+  - ../base
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml
new file mode 100644
index 000000000..832e2fceb
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: reinforcement-learning-maxtext-grpo
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: tpu-v5e-2x4
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml
new file mode 100644
index 000000000..b3371ea99
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/rl-on-tpu/v5e-2x4-llama-3-1-8b-instruct/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: reinforcement-learning-maxtext-grpo
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: grpo-trainer
+          resources:
+            limits:
+              google.com/tpu: "8"
+            requests:
+              google.com/tpu: "8"
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild.auto.tfvars
new file mode 120000
index 000000000..c730c32e8
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild.auto.tfvars
@@ -0,0 +1 @@
+../../../../_shared_config/cloudbuild.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild_variables.tf
new file mode 120000
index 000000000..5a143590a
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cloudbuild_variables.tf
@@ -0,0 +1 @@
+../../../../_shared_config/cloudbuild_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster.auto.tfvars
new file mode 120000
index 000000000..98a694db9
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster.auto.tfvars
@@ -0,0 +1 @@
+../../../../_shared_config/cluster.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster_variables.tf
new file mode 120000
index 000000000..00625515b
--- /dev/null
+++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_cluster_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface.auto.tfvars new file mode 120000 index 000000000..276530b81 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/huggingface.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface_variables.tf new file mode 120000 index 000000000..f384bc7e1 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_huggingface_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/huggingface_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking.auto.tfvars new file mode 120000 index 000000000..9cbd92baf --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/networking.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking_variables.tf new file mode 120000 index 000000000..1e170e71d --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_networking_variables.tf
@@ -0,0 +1 @@
+../../../../_shared_config/networking_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform.auto.tfvars
new file mode 120000
index 000000000..125a652cf
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform.auto.tfvars
@@ -0,0 +1 @@
+../../../../_shared_config/platform.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform_variables.tf
new file mode 120000
index 000000000..486b3eaef
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/_platform_variables.tf
@@ -0,0 +1 @@
+../../../../_shared_config/platform_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf
new file mode 100644
index 000000000..7668e9748
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/outputs.tf
@@ -0,0 +1,69 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+output "rl_cpu_reinforcement_learning_dataset_downloader_image_url" {
+  value = local.rl_cpu_reinforcement_learning_dataset_downloader_image_url
+}
+
+output "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name" {
+  value = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name
+}
+
+output "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name" {
+  value = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name
+}
+
+output "rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name" {
+  value = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name
+}
+
+output "rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name" {
+  value = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name
+}
+
+output "rl_cpu_reinforcement_learning_model_converter_image_url" {
+  value = local.rl_cpu_reinforcement_learning_model_converter_image_url
+}
+
+output "rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name" {
+  value = local.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name
+}
+
+output "rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name" {
+  value = local.rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name
+}
+
+output "rl_dataset_bucket_name" {
+  value = local.rl_dataset_bucket_name
+}
+
+output "rl_mlflow_data_bucket_name" {
+  value = local.rl_mlflow_data_bucket_name
+}
+
+output "rl_project_id" {
+  value = local.rl_project_id
+}
+
+output "rl_tpu_reinforcement_learning_on_tpu_image_url" {
+  value = local.rl_tpu_reinforcement_learning_on_tpu_image_url
+}
+
+output "rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name" {
+  value = local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name
+}
+
+output "rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name" {
+  value = local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name
+}
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning.auto.tfvars
new file mode 100644
index 000000000..e69de29bb
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf
new file mode 100644
index 000000000..f89fae510
--- /dev/null
+++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/reinforcement_learning_variables.tf
@@ -0,0 +1,118 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+locals {
+  rl_cpu_reinforcement_learning_dataset_downloader_image_url                       = var.rl_cpu_reinforcement_learning_dataset_downloader_image_url != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-dataset-downloader:latest"
+  rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name       = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-dataset-downloader"
+  rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name = var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name != null ? var.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-dataset-downloader-sa"
+
+  rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name       = var.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name != null ? var.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-mlflow"
+  rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name = var.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name != null ? var.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-mlflow-sa"
+
+  rl_cpu_reinforcement_learning_model_converter_image_url                       = var.rl_cpu_reinforcement_learning_model_converter_image_url != null ? var.rl_cpu_reinforcement_learning_model_converter_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-model-converter:latest"
+  rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name       = var.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name != null ? var.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-model-converter"
+  rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name = var.rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name != null ? var.rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-model-converter-sa"
+
+  rl_dataset_bucket_name     = var.rl_dataset_bucket_name != null ? var.rl_dataset_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-dataset"
+  rl_mlflow_data_bucket_name = var.rl_mlflow_data_bucket_name != null ? var.rl_mlflow_data_bucket_name : "${local.rl_project_id}-${local.unique_identifier_prefix}-mlflow-data"
+  rl_project_id              = var.rl_project_id != null ? var.rl_project_id : var.platform_default_project_id
+
+  rl_tpu_reinforcement_learning_on_tpu_image_url                       = var.rl_tpu_reinforcement_learning_on_tpu_image_url != null ? var.rl_tpu_reinforcement_learning_on_tpu_image_url : "${local.cloudbuild_ar_image_repository_url}/reinforcement-learning/rl-on-tpu:latest"
+  rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name       = var.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name != null ? var.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name : "${local.unique_identifier_prefix}-rl-on-tpu"
+  rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name = var.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name != null ? var.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name : "${local.unique_identifier_prefix}-rl-on-tpu-sa"
+}
+
+variable "rl_cpu_reinforcement_learning_dataset_downloader_image_url" {
+  description = "The URL for the RL dataset downloader container image."
+  type        = string
+  default     = null
+}
+
+variable "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name" {
+  description = "The Kubernetes namespace name for the RL dataset downloader."
+  type        = string
+  default     = null
+}
+
+variable "rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name" {
+  description = "The Kubernetes service account name for the RL dataset downloader."
+  type        = string
+  default     = null
+}
+
+variable "rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name" {
+  description = "The Kubernetes namespace name for the RL MLflow deployment."
+  type        = string
+  default     = null
+}
+
+variable "rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name" {
+  description = "The Kubernetes service account name for the RL MLflow deployment."
+  type        = string
+  default     = null
+}
+
+variable "rl_cpu_reinforcement_learning_model_converter_image_url" {
+  description = "The URL for the RL model converter container image."
+  type        = string
+  default     = null
+}
+
+variable "rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name" {
+  description = "The Kubernetes namespace name for the RL model converter."
+  type        = string
+  default     = null
+}
+
+variable "rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name" {
+  description = "The Kubernetes service account name for the RL model converter."
+  type        = string
+  default     = null
+}
+
+variable "rl_dataset_bucket_name" {
+  description = "The GCP bucket name for the RL dataset."
+  type        = string
+  default     = null
+}
+
+variable "rl_mlflow_data_bucket_name" {
+  description = "The GCP bucket name for the MLflow data."
+  type        = string
+  default     = null
+}
+
+variable "rl_project_id" {
+  description = "The GCP project ID for the RL on TPU resources."
+  type        = string
+  default     = null
+}
+
+variable "rl_tpu_reinforcement_learning_on_tpu_image_url" {
+  description = "The URL for the RL on TPU container image."
+  type        = string
+  default     = null
+}
+
+variable "rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name" {
+  description = "The Kubernetes namespace name for the RL on TPU deployment."
+  type        = string
+  default     = null
+}
+
+variable "rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name" {
+  description = "The Kubernetes service account name for the RL on TPU deployment."
+  type        = string
+  default     = null
+}
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh
new file mode 100755
index 000000000..2d397ec39
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+#
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+MY_PATH_IRA_ENV="$(
+  cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1
+  pwd -P
+)"
+
+ACP_REPO_DIR="$(realpath "${MY_PATH_IRA_ENV}/../../../../../../../../")" # quoted so a checkout path containing spaces is not word-split
+ACP_PLATFORM_BASE_DIR="${ACP_REPO_DIR}/platforms/gke/base"
+ACP_PLATFORM_USE_CASE_DIR="${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning"
+
+declare -a SHARED_CONFIG_PATHS=(
+  "${ACP_PLATFORM_BASE_DIR}/_shared_config"
+  "${ACP_PLATFORM_USE_CASE_DIR}/terraform/_shared_config"
+)
+export SHARED_CONFIG_PATHS
+
+source "${ACP_PLATFORM_BASE_DIR}/_shared_config/scripts/set_environment_variables.sh"
+
+if [[ -v HF_MODEL_ID ]]; then
+  HF_MODEL_ID_HASH=$(echo "${HF_MODEL_ID}" | md5sum | cut -c1-8) # first 8 hex chars of the md5 of the ID plus echo's trailing newline
+  export HF_MODEL_ID_HASH
+
+  HF_MODEL_NAME="${HF_MODEL_ID##*/}"     # keep only the part after the last "/"
+  HF_MODEL_NAME="${HF_MODEL_NAME//./-}"  # replace every "." with "-"
+  HF_MODEL_NAME="${HF_MODEL_NAME,,}"     # lowercase
+  export HF_MODEL_NAME
+fi
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh
new file mode 100755
index 000000000..9ff8fe538
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -o errexit
+set -o nounset
+
+start_timestamp=$(date +%s)
+
+MY_PATH="$(
+  cd "$(dirname "$0")" >/dev/null 2>&1
+  pwd -P
+)"
+
+# Set repository values (the path is quoted so directories with spaces survive)
+export ACP_REPO_DIR="$(realpath "${MY_PATH}/../../../../../../")"
+export ACP_PLATFORM_BASE_DIR="${ACP_REPO_DIR}/platforms/gke/base"
+export ACP_PLATFORM_CORE_DIR="${ACP_PLATFORM_BASE_DIR}/core"
+export ACP_PLATFORM_USE_CASE_DIR="${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning"
+
+# Enable Terraform plugin caching and specify the location of the plugin cache directory
+export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+
+# Set use-case specific values
+export TF_VAR_initialize_backend_use_case_name="reinforcement-learning/terraform"
+export TF_VAR_resource_name_prefix="${TF_VAR_resource_name_prefix:-rl}"
+
+declare -a CORE_TERRASERVICES_APPLY=(
+  "networking"
+  "container_cluster"
+  "workloads/cluster_credentials"
+  "cloudbuild/initialize"
+  "huggingface/initialize"
+  "huggingface/hub_downloader"
+  "custom_compute_class"
+  "workloads/auto_monitoring"
+  "workloads/custom_metrics_adapter"
+  "workloads/inference_gateway"
+  "workloads/jobset"
+  "workloads/lws"
+  "workloads/priority_class"
+  "workloads/kueue"
+  "workloads/pathways"
+)
+CORE_TERRASERVICES_APPLY="${CORE_TERRASERVICES_APPLY[*]}" "${ACP_PLATFORM_CORE_DIR}/deploy.sh"
+
+# shellcheck disable=SC1091
+source "${ACP_PLATFORM_USE_CASE_DIR}/terraform/_shared_config/scripts/set_environment_variables.sh"
+
+declare -a use_case_terraservices=(
+  "initialize"
+)
+for terraservice in "${use_case_terraservices[@]}"; do
+  cd "${ACP_PLATFORM_USE_CASE_DIR}/terraform/${terraservice}" &&
+    echo "Current directory: $(pwd)" &&
+    rm -rf .terraform/ &&
+    terraform init &&
+    terraform plan -input=false -out=tfplan &&
+    terraform apply -input=false tfplan || exit 1
+  rm tfplan
+done
+
+# shellcheck disable=SC2154
+gcloud container clusters get-credentials "${cluster_name}" \
+  --region "${cluster_region}" \
+  --project "${cluster_project_id}" \
+  --dns-endpoint
+
+end_timestamp=$(date +%s)
+total_runtime_value=$((end_timestamp - start_timestamp))
+echo "reinforcement-learning deploy total runtime: $(date -d"@${total_runtime_value}" -u +%H:%M:%S)"
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild.auto.tfvars
new file mode 120000
index 000000000..238bf8e95
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild.auto.tfvars
@@ -0,0 +1 @@
+../../../_shared_config/_cloudbuild.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild_variables.tf
new file mode 120000
index 000000000..8fade6147
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_cloudbuild_variables.tf
@@ -0,0 +1 @@
+../../../_shared_config/_cloudbuild_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform.auto.tfvars
new file mode 120000
index 000000000..c9c406bba
--- /dev/null
+++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform.auto.tfvars
@@ -0,0 +1 @@
+../../../_shared_config/_platform.auto.tfvars
\ No newline at end of file
diff --git
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform_variables.tf new file mode 120000 index 000000000..7ec64070d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..171a27a35 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..79960dd37 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.tf 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.tf new file mode 100644 index 000000000..6267d950e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.tf @@ -0,0 +1,47 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + image_destination = local.rl_cpu_reinforcement_learning_dataset_downloader_image_url +} + +resource "terraform_data" "submit_docker_build" { + input = { + acp_root = local.acp_root + cloudbuild_project_id = local.cloudbuild_project_id + cloudbuild_service_account_id = local.cloudbuild_service_account_id + cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name + image_destination = local.image_destination + } + + provisioner "local-exec" { + command = <<-EOT +gcloud builds submit \ +--config="container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml" \ +--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ +--project="${self.input.cloudbuild_project_id}" \ +--quiet \ +--service-account="${self.input.cloudbuild_service_account_id}" \ +--substitutions=_DESTINATION="${self.input.image_destination}" +EOT + interpreter = ["bash", "-c"] + working_dir = self.input.acp_root + } + + triggers_replace = { + cloudbuild_yaml_hash = 
filebase64sha256("${local.acp_root}/container-images/cpu/reinforcement-learning-dataset-downloader/cloudbuild.yaml") + dockerfile_hash = filebase64sha256("${local.acp_root}/container-images/cpu/reinforcement-learning-dataset-downloader/Dockerfile") + source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/cpu/reinforcement-learning-dataset-downloader/src", "**") : filesha256("${local.acp_root}/container-images/cpu/reinforcement-learning-dataset-downloader/src/${file}")])) + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/local_file.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/local_file.tf new file mode 100644 index 000000000..2635bb2b3 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/local_file.tf @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + acp_root = "${path.module}/../../../../../../../../.." 
+} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/versions.tf new file mode 100644 index 000000000..35e8c5d4d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-dataset-downloader/versions.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "6.49.2" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/acp_rl_images_cpu_reinforcement_learning_dataset_downloader_deploy-v1" + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..238bf8e95 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild_variables.tf new file mode 120000 index 000000000..8fade6147 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform.auto.tfvars new file mode 120000 index 000000000..c9c406bba --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform_variables.tf new file mode 120000 index 000000000..7ec64070d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..171a27a35 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..79960dd37 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ 
+../../../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/cloudbuild.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/cloudbuild.tf new file mode 100644 index 000000000..81c2c2ded --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/cloudbuild.tf @@ -0,0 +1,46 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + image_destination = local.rl_cpu_reinforcement_learning_model_converter_image_url +} + +resource "terraform_data" "submit_docker_build" { + input = { + acp_root = local.acp_root + cloudbuild_project_id = local.cloudbuild_project_id + cloudbuild_service_account_id = local.cloudbuild_service_account_id + cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name + image_destination = local.image_destination + } + + provisioner "local-exec" { + command = <<-EOT +gcloud builds submit \ +--config="container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml" \ +--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ +--project="${self.input.cloudbuild_project_id}" \ +--quiet \ +--service-account="${self.input.cloudbuild_service_account_id}" \ +--substitutions=_DESTINATION="${self.input.image_destination}" +EOT + interpreter = ["bash", "-c"] + working_dir = self.input.acp_root + } + + triggers_replace = { + cloudbuild_yaml_hash = filebase64sha256("${local.acp_root}/container-images/cpu/reinforcement-learning-model-converter/cloudbuild.yaml") + dockerfile_hash = filebase64sha256("${local.acp_root}/container-images/cpu/reinforcement-learning-model-converter/Dockerfile") + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/local_file.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/local_file.tf new file mode 100644 index 000000000..2635bb2b3 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/local_file.tf @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + acp_root = "${path.module}/../../../../../../../../.." +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/versions.tf new file mode 100644 index 000000000..178937b81 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/cpu/reinforcement-learning-model-converter/versions.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "6.49.2" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/acp_rl_images_cpu_reinforcement_learning_model_converter_deploy-v1" + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..238bf8e95 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild_variables.tf new file mode 120000 index 000000000..8fade6147 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform.auto.tfvars new file mode 120000 index 000000000..c9c406bba --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_platform.auto.tfvars \ No newline at end 
of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform_variables.tf new file mode 120000 index 000000000..7ec64070d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..171a27a35 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..79960dd37 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/cloudbuild.tf 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/cloudbuild.tf new file mode 100644 index 000000000..23c7661c5 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/cloudbuild.tf @@ -0,0 +1,47 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + image_destination = local.rl_tpu_reinforcement_learning_on_tpu_image_url +} + +resource "terraform_data" "submit_docker_build" { + input = { + acp_root = local.acp_root + cloudbuild_project_id = local.cloudbuild_project_id + cloudbuild_service_account_id = local.cloudbuild_service_account_id + cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name + image_destination = local.image_destination + } + + provisioner "local-exec" { + command = <<-EOT +gcloud builds submit \ +--config="container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml" \ +--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ +--project="${self.input.cloudbuild_project_id}" \ +--quiet \ +--service-account="${self.input.cloudbuild_service_account_id}" \ +--substitutions=_DESTINATION="${self.input.image_destination}" +EOT + interpreter = ["bash", "-c"] + working_dir = self.input.acp_root + } + + triggers_replace = { + cloudbuild_yaml_hash = filebase64sha256("${local.acp_root}/container-images/tpu/reinforcement-learning-on-tpu/cloudbuild.yaml") + 
dockerfile_hash = filebase64sha256("${local.acp_root}/container-images/tpu/reinforcement-learning-on-tpu/Dockerfile") + source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/tpu/reinforcement-learning-on-tpu/src", "**") : filesha256("${local.acp_root}/container-images/tpu/reinforcement-learning-on-tpu/src/${file}")])) + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/local_file.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/local_file.tf new file mode 100644 index 000000000..2635bb2b3 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/local_file.tf @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + acp_root = "${path.module}/../../../../../../../../.." 
+} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/versions.tf new file mode 100644 index 000000000..971a10c8e --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/images/tpu/reinforcement_learning_on_tpu/versions.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "6.49.2" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/acp_rl_images_tpu_rl_on_tpu_deploy-v1" + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..c730c32e8 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild_variables.tf new file mode 120000 index 000000000..5a143590a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster.auto.tfvars new file mode 120000 index 000000000..98a694db9 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster_variables.tf new file mode 120000 index 000000000..00625515b --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_cluster_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface.auto.tfvars new file mode 120000 index 000000000..276530b81 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/huggingface.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface_variables.tf new file mode 120000 index 000000000..f384bc7e1 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_huggingface_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/huggingface_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking.auto.tfvars new file mode 120000 index 000000000..9cbd92baf --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/networking.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking_variables.tf new file mode 120000 index 000000000..1e170e71d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_networking_variables.tf @@ -0,0 +1 @@ 
+../../../../_shared_config/networking_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform.auto.tfvars new file mode 120000 index 000000000..125a652cf --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform_variables.tf new file mode 120000 index 000000000..486b3eaef --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_platform_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..f56697856 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..f7d4bb73a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../_shared_config/reinforcement_learning_variables.tf \ No 
newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/versions.tf new file mode 100644 index 000000000..efe70345a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/initialize/versions.tf @@ -0,0 +1,28 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "7.6.0" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/rl_initialize_deploy-v1" + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..2af7bbaaa --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/_cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild_variables.tf new file mode 120000 index 000000000..dd199215c --- /dev/null +++ 
b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../_shared_config/_cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster.auto.tfvars new file mode 120000 index 000000000..04c4ae417 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/_cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster_variables.tf new file mode 120000 index 000000000..6713167a1 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_cluster_variables.tf @@ -0,0 +1 @@ +../_shared_config/_cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface.auto.tfvars new file mode 120000 index 000000000..488145ca9 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/_huggingface.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface_variables.tf new file mode 120000 index 000000000..91b00dc64 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_huggingface_variables.tf @@ -0,0 +1 @@ +../_shared_config/_huggingface_variables.tf \ No newline at end of 
file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform.auto.tfvars new file mode 120000 index 000000000..f898b3b5a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/_platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform_variables.tf new file mode 120000 index 000000000..f928d86dd --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_platform_variables.tf @@ -0,0 +1 @@ +../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning.auto.tfvars b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning.auto.tfvars new file mode 120000 index 000000000..f56697856 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/reinforcement_learning.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning_variables.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning_variables.tf new file mode 120000 index 000000000..f7d4bb73a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/_reinforcement_learning_variables.tf @@ -0,0 +1 @@ +../_shared_config/reinforcement_learning_variables.tf \ No newline at end of file diff --git 
a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf new file mode 100644 index 000000000..06514827a --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/iam.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + cluster_wi_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject" + rl_on_tpu_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name}/sa/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name}" + rl_mlflow_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name}/sa/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name}" + rl_dataset_downloader_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name}/sa/${local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name}" +} + +resource "google_storage_bucket_iam_member" "hub_models_rl_on_tpu_ksa" { + bucket = data.google_storage_bucket.hub_models.name + member = 
local.rl_on_tpu_ksa_member + role = local.cluster_gcsfuse_user_role +} + +resource "google_project_iam_member" "gcsfuse_user_member_rl_cpu_dataset_downloader_ksa" { + project = data.google_project.cluster.project_id + member = local.rl_dataset_downloader_ksa_member + role = local.cluster_gcsfuse_user_role +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf new file mode 100644 index 000000000..7f61cac7d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/kubernetes.tf @@ -0,0 +1,102 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + kubeconfig_directory = "${path.module}/../../../../kubernetes/kubeconfig/" + kubeconfig_file = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}" + + workloads = { + rl_reinforcement_learning_mlflow = { + directory = "${local.namespaces_directory}/${local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name}" + namespace = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name + service_account = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name + } + rl_on_tpu = { + directory = "${local.namespaces_directory}/${local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name}" + namespace = local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_namespace_name + service_account = local.rl_tpu_reinforcement_learning_on_tpu_kubernetes_service_account_name + } + rl_reinforcement_learning_dataset_downloader = { + directory = "${local.namespaces_directory}/${local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name}" + namespace = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_namespace_name + service_account = local.rl_cpu_reinforcement_learning_dataset_downloader_kubernetes_service_account_name + } + rl_reinforcement_learning_model_converter = { + directory = "${local.namespaces_directory}/${local.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name}" + namespace = local.rl_cpu_reinforcement_learning_model_converter_kubernetes_namespace_name + service_account = local.rl_cpu_reinforcement_learning_model_converter_kubernetes_service_account_name + } + } + + manifests_directory_root = "${path.module}/../../../../kubernetes/manifests" + namespaces_directory = "${local.manifests_directory_root}/namespace" +} + +data "local_file" "kubeconfig" { + filename = local.kubeconfig_file +} + +resource "local_file" "namespace_yaml" { + for_each = local.workloads + content = templatefile( + "${path.module}/templates/kubernetes/namespace.tftpl.yaml", 
+ { + name = each.value.namespace + } + ) + filename = "${local.namespaces_directory}/namespace-${each.value.namespace}.yaml" +} + +module "kubectl_apply_namespace" { + for_each = local.workloads + depends_on = [ + local_file.namespace_yaml, + ] + + source = "../../../../modules/kubectl_apply" + + apply_server_side = true + delete_timeout = "60s" + error_on_delete_failure = false + kubeconfig_file = data.local_file.kubeconfig.filename + manifest = "${local.namespaces_directory}/namespace-${each.value.namespace}.yaml" + manifest_includes_namespace = true +} + +resource "local_file" "serviceaccount_yaml" { + for_each = local.workloads + content = templatefile( + "${path.module}/templates/kubernetes/serviceaccount.tftpl.yaml", + { + name = each.value.service_account + namespace = each.value.namespace + } + ) + filename = "${each.value.directory}/serviceaccount-${each.value.service_account}.yaml" +} + +module "kubectl_apply_service_account" { + for_each = local.workloads + depends_on = [ + local_file.serviceaccount_yaml, + module.kubectl_apply_namespace, + ] + + source = "../../../../modules/kubectl_apply" + + apply_server_side = true + kubeconfig_file = data.local_file.kubeconfig.filename + manifest = "${each.value.directory}/serviceaccount-${each.value.service_account}.yaml" + manifest_includes_namespace = true +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf new file mode 100644 index 000000000..2fe2a6484 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/mlflow.tf @@ -0,0 +1,44 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "google_storage_bucket_iam_member" "data_bucket_mlflow_storage_object_admin" { + bucket = google_storage_bucket.mlflow_data.name + member = local.rl_mlflow_ksa_member + role = "roles/storage.objectAdmin" +} + +resource "local_file" "mlflow_manifest" { + content = templatefile( + "${path.module}/templates/mlflow/manifests.tftpl.yaml", + { + bucket_name = google_storage_bucket.mlflow_data.name, + service_account_name = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_service_account_name, + namespace = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name, + } + ) + filename = "${local.namespaces_directory}/mlflow.yaml" +} + +module "kubectl_apply_mlflow_manifest" { + depends_on = [ + module.kubectl_apply_namespace, + ] + + source = "../../../../modules/kubectl_apply" + + kubeconfig_file = data.local_file.kubeconfig.filename + manifest = local_file.mlflow_manifest.filename + manifest_includes_namespace = false + namespace = local.rl_cpu_reinforcement_learning_mlflow_kubernetes_namespace_name +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/project.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/project.tf new file mode 100644 index 000000000..4c878f945 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/project.tf @@ -0,0 +1,17 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +data "google_project" "cluster" { + project_id = local.cluster_project_id +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf new file mode 100644 index 000000000..8a147be2b --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/storage.tf @@ -0,0 +1,34 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +data "google_storage_bucket" "hub_models" { + name = local.huggingface_hub_models_bucket_name + project = local.huggingface_hub_models_bucket_project_id +} + +resource "google_storage_bucket" "dataset" { + name = local.rl_dataset_bucket_name + project = local.rl_project_id + location = local.cluster_region + + uniform_bucket_level_access = true +} + +resource "google_storage_bucket" "mlflow_data" { + name = local.rl_mlflow_data_bucket_name + project = local.rl_project_id + location = local.cluster_region + + uniform_bucket_level_access = true +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/namespace.tftpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/namespace.tftpl.yaml new file mode 100644 index 000000000..e7dff839d --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/namespace.tftpl.yaml @@ -0,0 +1,18 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: v1 +kind: Namespace +metadata: + name: ${name} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/serviceaccount.tftpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/serviceaccount.tftpl.yaml new file mode 100644 index 000000000..a0f63c9dc --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/kubernetes/serviceaccount.tftpl.yaml @@ -0,0 +1,19 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ${name} + namespace: ${namespace} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml new file mode 100644 index 000000000..d604075ba --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/templates/mlflow/manifests.tftpl.yaml @@ -0,0 +1,77 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mlflow-tracking + namespace: ${namespace} +spec: + replicas: 1 + selector: + matchLabels: + app: mlflow-tracking + strategy: + type: RollingUpdate + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + labels: + app: mlflow-tracking + spec: + containers: + - args: + - | + mlflow server --host 0.0.0.0 --port 5000 --backend-store-uri sqlite:///mlruns/mlflow.db + command: ["sh", "-c"] + image: ghcr.io/mlflow/mlflow:v3.10.1-full + name: mlflow + resources: + limits: + cpu: "2" + memory: 10Gi + requests: + cpu: "2" + memory: 10Gi + volumeMounts: + - mountPath: /mlruns + name: gcs-fuse-csi-ephemeral + serviceAccountName: ${service_account_name} + tolerations: + - effect: NoSchedule + key: components.gke.io/gke-managed-components + operator: Equal + value: "true" + volumes: + - csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: ${bucket_name} + gcsfuseLoggingSeverity: warning + mountOptions: implicit-dirs + name: gcs-fuse-csi-ephemeral +--- +apiVersion: v1 +kind: Service +metadata: + name: mlflow-tracking-svc + namespace: ${namespace} +spec: + ports: + - port: 5000 + protocol: TCP + targetPort: 5000 + selector: + app: mlflow-tracking diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/versions.tf b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/versions.tf new file mode 100644 index 000000000..7da69f06b --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/rl_on_tpu/versions.tf 
@@ -0,0 +1,32 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "7.6.0" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/rl_on_tpu_deploy-v1" + } +} diff --git a/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh b/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh new file mode 100755 index 000000000..d327ca761 --- /dev/null +++ b/platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o errexit +set -o nounset + +start_timestamp=$(date +%s) + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +# Set repository values +export ACP_REPO_DIR="$(realpath ${MY_PATH}/../../../../../../)" +export ACP_PLATFORM_BASE_DIR="${ACP_REPO_DIR}/platforms/gke/base" +export ACP_PLATFORM_CORE_DIR="${ACP_PLATFORM_BASE_DIR}/core" +export ACP_PLATFORM_USE_CASE_DIR="${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning" + +# Enable Terraform plugin caching and specifies location of the plugin cache directory +export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + +# Set use-case specific values +export TF_VAR_initialize_backend_use_case_name="reinforcement-learning/terraform" +export TF_VAR_resource_name_prefix="${TF_VAR_resource_name_prefix:-rl}" + +# Set execution specific values +export ACP_TEARDOWN_CORE_PLATFORM=${ACP_TEARDOWN_CORE_PLATFORM:-"true"} + +# shellcheck disable=SC1091 +source "${ACP_PLATFORM_USE_CASE_DIR}/terraform/_shared_config/scripts/set_environment_variables.sh" + +# shellcheck disable=SC2154 +cd "${ACP_PLATFORM_CORE_DIR}/initialize" && + echo "Current directory: $(pwd)" && + sed -i "s/^\([[:blank:]]*bucket[[:blank:]]*=\).*$/\1 \"${terraform_bucket_name}\"/" "${ACP_PLATFORM_CORE_DIR}/initialize/backend.tf.bucket" && + cp backend.tf.bucket backend.tf && + rm -rf .terraform/ && + terraform init && + terraform plan -input=false -out=tfplan && + terraform apply -input=false tfplan || exit 1 +rm tfplan + +declare -a use_case_terraservices=( + "initialize" +) +for terraservice in "${use_case_terraservices[@]}"; do + cd "${ACP_PLATFORM_USE_CASE_DIR}/terraform/${terraservice}" && + echo "Current directory: $(pwd)" && + rm -rf .terraform/ && + terraform init && + terraform destroy -auto-approve || exit 1 + rm -rf .terraform/ \ + "terraform.tfstate"* +done + +if [ "${ACP_TEARDOWN_CORE_PLATFORM}" = "true" ]; then + declare -a CORE_TERRASERVICES_DESTROY=( + "workloads/pathways" + "workloads/kueue" + 
"workloads/priority_class" + "workloads/lws" + "workloads/jobset" + "workloads/inference_gateway" + "workloads/custom_metrics_adapter" + "workloads/auto_monitoring" + "custom_compute_class" + "huggingface/hub_downloader" + "huggingface/initialize" + "cloudbuild/initialize" + "workloads/cluster_credentials" + "container_cluster" + "networking" + "initialize" + ) + CORE_TERRASERVICES_DESTROY="${CORE_TERRASERVICES_DESTROY[*]}" "${ACP_PLATFORM_CORE_DIR}/teardown.sh" +else + echo "Skipping core platform teardown." +fi + +rm -rf \ + "${ACP_PLATFORM_USE_CASE_DIR}/kubernetes-manifests/model-download/huggingface/downloader.env" \ + "${ACP_PLATFORM_USE_CASE_DIR}/kubernetes-manifests/model-download/huggingface/secretproviderclass-huggingface-tokens.yaml" \ + "${ACP_PLATFORM_USE_CASE_DIR}/kubernetes-manifests/online-inference-gpu/base/deployment.env" \ + "${ACP_PLATFORM_USE_CASE_DIR}/kubernetes-manifests/online-inference-tpu/base/deployment.env" + +end_timestamp=$(date +%s) +total_runtime_value=$((end_timestamp - start_timestamp)) +echo "reinforcement-learning teardown total runtime: $(date -d@${total_runtime_value} -u +%H:%M:%S)" diff --git a/test/ci-cd/cloudbuild/platforms/gke/base/use-cases/reinforcement-learning/standard-scripts.yaml b/test/ci-cd/cloudbuild/platforms/gke/base/use-cases/reinforcement-learning/standard-scripts.yaml new file mode 100644 index 000000000..d849b9d00 --- /dev/null +++ b/test/ci-cd/cloudbuild/platforms/gke/base/use-cases/reinforcement-learning/standard-scripts.yaml @@ -0,0 +1,153 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +options: + automapSubstitutions: true + logging: CLOUD_LOGGING_ONLY + +steps: + - args: + - "${_WAIT_FOR_TRIGGER}" + entrypoint: "test/ci-cd/scripts/cloudbuild/wait_for_trigger.sh" + env: + - "LOCATION=${LOCATION}" + - "PROJECT_ID=${PROJECT_ID}" + id: "Check triggers" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: ["-"] + + - args: + - DEBUG=${_DEBUG} + - TF_VAR_platform_default_project_id="${PROJECT_ID}-$${PROJECT_SUFFIX}" + - TF_VAR_platform_name="ch${SHORT_SHA}" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/configure_build_environment.sh" + env: + - ACP_PLATFORM_DIR="$${ACP_REPO_DIR}/platforms/gke/base" + - BUILD_ID=${BUILD_ID} + - DEBUG=${_DEBUG} + - PROJECT_ID=${PROJECT_ID} + - SHORT_SHA=${SHORT_SHA} + id: "Configure the build environment" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Check triggers" + + - args: + - "Deploy platforms/gke/base/use-cases/reinforcement-learning Standard" + - "platforms/gke/base/use-cases/reinforcement-learning/terraform/deploy-standard.sh" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/run_deploy_script.sh" + id: "Deploy platforms/gke/base/use-cases/reinforcement-learning Standard" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Configure the build environment" + + - args: + - "Populate platforms/gke/base/use-cases/reinforcement-learning Hugging Face token secrets" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh" + id: 
"Populate platforms/gke/base/use-cases/reinforcement-learning Hugging Face token secrets" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Deploy platforms/gke/base/use-cases/reinforcement-learning Standard" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_gpu + entrypoint: "test/ci-cd/scripts/terraservice/apply.sh" + id: "Apply reinforcement-learning Terraservice 'online_gpu'" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Populate platforms/gke/base/use-cases/reinforcement-learning Hugging Face token secrets" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_tpu + entrypoint: "test/ci-cd/scripts/terraservice/apply.sh" + id: "Apply reinforcement-learning Terraservice 'online_tpu'" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Populate platforms/gke/base/use-cases/reinforcement-learning Hugging Face token secrets" + + - args: + - "Validate Kustomize" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh" + id: "Validate Kustomize" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Apply reinforcement-learning Terraservice 'online_gpu'" + - "Apply reinforcement-learning Terraservice 'online_tpu'" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_gpu + entrypoint: "test/ci-cd/scripts/terraservice/plan.sh" + id: "reinforcement-learning Terraservice 'online_gpu' check for changes" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Validate Kustomize" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_tpu + entrypoint: "test/ci-cd/scripts/terraservice/plan.sh" + id: "reinforcement-learning Terraservice 'online_tpu' check for changes" + name: 
"${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Validate Kustomize" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_gpu + entrypoint: "test/ci-cd/scripts/terraservice/destroy.sh" + id: "Destroy reinforcement-learning Terraservice 'online_gpu'" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "reinforcement-learning Terraservice 'online_gpu' check for changes" + + - args: + - platforms/gke/base/use-cases/reinforcement-learning/terraform + - online_tpu + entrypoint: "test/ci-cd/scripts/terraservice/destroy.sh" + id: "Destroy reinforcement-learning Terraservice 'online_tpu'" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "reinforcement-learning Terraservice 'online_tpu' check for changes" + + - args: + - "Teardown platforms/gke/base/use-cases/reinforcement-learning Standard" + - "platforms/gke/base/use-cases/reinforcement-learning/terraform/teardown-standard.sh" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/run_teardown_script.sh" + id: "Teardown platforms/gke/base/use-cases/reinforcement-learning Standard" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Destroy reinforcement-learning Terraservice 'online_gpu'" + - "Destroy reinforcement-learning Terraservice 'online_tpu'" + + - args: + - "Cleanup the build environment" + entrypoint: "test/ci-cd/scripts/platforms/gke/base/cleanup_build_environment.sh" + id: "Cleanup the build environment" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Teardown platforms/gke/base/use-cases/reinforcement-learning Standard" + + - entrypoint: "test/ci-cd/scripts/platforms/gke/base/set_build_status.sh" + id: "Set the build status" + name: "${LOCATION}-docker.pkg.dev/${PROJECT_ID}/ci-cd/runner:latest" + waitFor: + - "Cleanup the build environment" + +substitutions: + _DEBUG: "false" + +timeout: 90m diff --git 
a/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh
new file mode 100755
index 000000000..1492fe390
--- /dev/null
+++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/populate_huggingface_token_secrets.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Adds a new version to the Hugging Face read-token secret in Secret Manager
+# with "gcloud secrets versions add".
+#
+# Usage: populate_huggingface_token_secrets.sh STEP_ID
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+source /workspace/build.env
+if [ "${DEBUG,,}" == "true" ]; then
+  set -o xtrace
+fi
+
+STEP_ID=${1}
+
+# EXIT trap: on a non-zero exit status, append STEP_ID to the build-failed lock
+# file, then always exit 0 so the failure is reported only through that file.
+exit_handler() {
+  exit_code=$?
+
+  if [ "${exit_code}" -ne 0 ]; then
+    echo "${STEP_ID}" >>/workspace/build-failed.lock
+  fi
+
+  exit 0
+}
+trap exit_handler EXIT
+
+# Drop this script's positional parameters before sourcing the shared script.
+set --
+
+source "${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh"
+
+# NOTE(review): this stores the literal text "HF_TOKEN_READ" (plus the newline
+# echo appends), not the contents of a variable; presumably a deliberate CI
+# placeholder; confirm. Expansions are quoted to prevent word splitting.
+echo "HF_TOKEN_READ" | gcloud secrets versions add "${huggingface_hub_access_token_read_secret_manager_secret_name}" \
+  --data-file=- \
+  --project="${huggingface_secret_manager_project_id}"
diff --git a/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh
new file mode 100755
index 000000000..f818b409f
--- /dev/null
+++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/reinforcement-learning/validate_kustomize.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Runs the reinforcement-learning configure scripts, then renders every
+# kustomization.yaml under the use case's kubernetes-manifests directory with
+# "kubectl kustomize" and checks the output with "kubectl validate".
+#
+# Usage: validate_kustomize.sh STEP_ID
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+source /workspace/build.env
+if [ "${DEBUG,,}" == "true" ]; then
+  set -o xtrace
+fi
+
+STEP_ID=${1}
+
+# EXIT trap: on a non-zero exit status, append STEP_ID to the build-failed lock
+# file, then always exit 0 so the failure is reported only through that file.
+exit_handler() {
+  exit_code=$?
+
+  if [ "${exit_code}" -ne 0 ]; then
+    echo "${STEP_ID}" >>/workspace/build-failed.lock
+  fi
+
+  exit 0
+}
+trap exit_handler EXIT
+
+# Drop this script's positional parameters before sourcing the shared script.
+set --
+
+export HF_MODEL_ID="google/gemma-3-27b-it"
+
+source "${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh"
+
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/model-download/configure_huggingface.sh"
+
+export ACCELERATOR_TYPE="l4"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/batch-inference-gpu/batch-load-generator/configure_load_generator.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/batch-inference-gpu/batch-pubsub-subscriber/configure_pubsub_subscriber.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/batch-inference-gpu/vllm/configure_vllm.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-gpu/vllm/configure_vllm.sh"
+
+export ACCELERATOR_TYPE="v5e"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-tpu/max-diffusion/configure_max_diffusion.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-tpu/vllm/configure_vllm.sh"
+
+# Validate inference-perf kustomize
+export ACCELERATOR_TYPE="rtx-pro-6000"
+export ACCELERATOR="GPU"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/inference-perf-bench/configure_benchmark.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/configure_vllm_spec_decoding.sh"
+
+# Validate offline-batch-inference-gpu kustomize
+export ACCELERATOR_TYPE="rtx-pro-6000"
+export HF_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct"
+source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/terraform/_shared_config/scripts/set_environment_variables.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/offline-batch-inference-gpu/configure_jobset.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-dataset-downloader/configure_dataset_downloader.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/reinforcement-learning/kubernetes-manifests/offline-batch-inference-gpu/offline-batch-worker/configure_worker.sh"
+
+# Scratch file for the rendered manifests; overwritten on every iteration.
+rendered_kubernetes_manifests_file_path="/tmp/rendered-kustomize.yaml"
+
+# IFS= and -r keep leading/trailing whitespace and backslashes in the
+# NUL-delimited paths emitted by find intact.
+find "${ACP_PLATFORM_BASE_DIR}/use-cases/reinforcement-learning/kubernetes-manifests" -name "kustomization.yaml" -print0 | while IFS= read -r -d '' file; do
+  kustomize_directory_path="$(dirname "${file}")"
+
+  # Basic validation:
+  # - Render manifests with Kustomize
+  # - Validate manifests with kubectl-validate
+  kubectl kustomize "${kustomize_directory_path}" | tee "${rendered_kubernetes_manifests_file_path}"
+  kubectl validate "${rendered_kubernetes_manifests_file_path}"
+done