diff --git a/.dev-tools/.gitignore b/.dev-tools/.gitignore index b93b0f69e..12262726f 100644 --- a/.dev-tools/.gitignore +++ b/.dev-tools/.gitignore @@ -8,6 +8,7 @@ platforms/gke/base/core/workloads/inference_gateway/manifests/* platforms/gke/base/core/workloads/jobset/manifests/* platforms/gke/base/core/workloads/kueue/manifests/* platforms/gke/base/core/workloads/lws/manifests/* +platforms/gke/base/core/workloads/nri_device_injector/manifests/* platforms/gke/base/core/workloads/nvidia_nim/* platforms/gke/base/core/workloads/priority_class/manifests/* platforms/gke/base/kubernetes/* diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 5589878f0..bb7aef62d 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM hashicorp/terraform:1.5.7 AS terraform -FROM koalaman/shellcheck:v0.10.0 AS shellcheck -FROM mvdan/shfmt:v3.10.0 AS shfmt +FROM hashicorp/terraform:1.14.8 AS terraform +FROM koalaman/shellcheck:v0.11.0 AS shellcheck +FROM mvdan/shfmt:v3.13.1 AS shfmt FROM python:3.13-bookworm AS python-builder diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e76a7e0c6..2405c019c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ { "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json", - "name": "Cloud Solutions devcontainer", + "name": "Accelerated Platforms devcontainer", "build": { "dockerfile": "Dockerfile" }, @@ -13,7 +13,9 @@ "editor.wordWrap": "off", "files.insertFinalNewline": true, "files.trimFinalNewlines": true, + "geminicodeassist.displayInlineContextHint": false, "prettier.resolveGlobalModules": true, + "python.defaultInterpreterPath": "/venv/bin/python", "redhat.telemetry.enabled": false, "telemetry.telemetryLevel": "off", "[css]": { @@ -78,6 +80,7 @@ 
"ms-azuretools.vscode-containers", "ms-python.black-formatter", "ms-python.isort", + "ms-python.python", "streetsidesoftware.code-spell-checker", "timonwong.shellcheck" ] diff --git a/.github/workflows/dictionary/python.txt b/.github/workflows/dictionary/python.txt index 9d8cbc9f3..45655b0ba 100644 --- a/.github/workflows/dictionary/python.txt +++ b/.github/workflows/dictionary/python.txt @@ -3,10 +3,16 @@ aiohttp aqtp asctime asgi +asynccontextmanager asyncio +certifi +cffi classmethod configparser +contextlib coveragerc +dataclass +dataclasses dataframe dbapi dbcommands @@ -17,6 +23,7 @@ fastapi fillna fromarray frombuffer +fromisoformat fsspec ftfy functools @@ -29,11 +36,13 @@ getframerate getnchannels getnframes getsampwidth +grpcio gunicorn hasattr hashlib hexdigest httpx +idna iloc imgf inplace @@ -59,7 +68,10 @@ pgvector pipreqs pmap prng +protos +pyasn pycache +pycparser pydantic pyenv pylint @@ -69,8 +81,10 @@ pythondontwritebytecode pythonpath pythonunbuffered qualname +quantiles readframes removesuffix +reqs rerank reranked retryable @@ -83,13 +97,16 @@ shutil spacy splitlines sqlalchemy +strftime tensorboard tensorboardx thejsonlogger tqdm unittests urllib +urlopen urlretrieve uvicorn venv writerow +writestr diff --git a/.github/workflows/dictionary/sglang.txt b/.github/workflows/dictionary/sglang.txt new file mode 100644 index 000000000..b14275eba --- /dev/null +++ b/.github/workflows/dictionary/sglang.txt @@ -0,0 +1,4 @@ +lmsysorg +musa +nvls +sglang diff --git a/.github/workflows/dictionary/shell.txt b/.github/workflows/dictionary/shell.txt index 637872058..7b1b5b8e3 100644 --- a/.github/workflows/dictionary/shell.txt +++ b/.github/workflows/dictionary/shell.txt @@ -16,6 +16,7 @@ nslookup pipefail pkill shuf +subshell syscall xtrace zxvf diff --git a/.gitignore b/.gitignore index 3d84f34c6..ba12e5b8f 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,10 @@ terraform.tfstate* # Test test/log/*.log test/scripts/environment_files/* + +# Generated 
outputs +*.log +k6-*.txt +k6-*.csv +k6-*.jsonl +k6-report.md diff --git a/README.md b/README.md index c6533e770..183ec12e5 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ the primary runtime. - [LLM Inference Optimization: Achieving faster Pod Startup with Google Cloud Storage](/use-cases/inferencing/cost-optimization/gcsfuse/AchievingFasterPodStartup.md) - [Optimizing GKE Workloads with Custom Compute Classes](/docs/guides/optimizing-gke-workloads-with-custom-compute-classes/README.md) -### [Deprecated] Playground AI/ML Platform on GKE +### \[Deprecated\] Playground AI/ML Platform on GKE The [Playground AI/ML Platform on GKE](/platforms/gke-aiml/playground/README.md) is a quick-start implementation of the platform that can be used to familiarize diff --git a/container-images/cpu/k6-benchmark/Dockerfile b/container-images/cpu/k6-benchmark/Dockerfile new file mode 100644 index 000000000..ae4beebe2 --- /dev/null +++ b/container-images/cpu/k6-benchmark/Dockerfile @@ -0,0 +1,31 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
FROM grafana/k6:1.7.1

# Root is only needed for the directory setup below; the image drops back to
# the unprivileged k6 user before the entrypoint is declared.
USER root

WORKDIR /app

# Create the /output directory and ensure k6 owns it, along with /app, so the
# unprivileged user can write the metrics file produced by entrypoint.sh.
RUN mkdir -p /output && chown -R k6:k6 /app /output

COPY --chown=k6:k6 scripts /app/scripts

# Octal mode instead of the symbolic "a+x": octal is accepted by every
# BuildKit Dockerfile frontend, while symbolic modes presumably need a newer
# frontend (NOTE(review): confirm the minimum version against the Docker docs).
# entrypoint.sh is committed as mode 100755, so the resulting mode is the same.
COPY --chmod=0755 --chown=k6:k6 entrypoint.sh /app/entrypoint.sh

# Switch back to the unprivileged k6 user
USER k6

# entrypoint.sh wraps k6; CMD supplies its default argument.
ENTRYPOINT ["/app/entrypoint.sh"]

CMD ["--help"]
+- `BATCH_SIZE`: The batch size to request in the payload (default: `1`). +- `VUS`: The number of concurrent Virtual Users to simulate (default: `1`). + +### Running via Docker + +Set the k6 script to run by setting the `CMD` to point to the script path when +starting the container: + +```bash +# Example: running a different script mounted into the container +docker run --rm \ + -e ACCELERATOR_NAME="custom" \ + -v $(pwd)/custom-script.js:/app/custom-script.js \ + -v $(pwd)/output:/output \ + k6-benchmark:latest /app/custom-script.js +``` + +The k6 output will be saved in the mapped `/output` directory on your host. The +filename will be dynamically generated in the format: +`<script-name>-<accelerator-name>-<utc-timestamp>.jsonl`. +For example: `k6-diffusers-flux-2-klein-4b-l4-20260417T120000Z.jsonl`. + +#### Supported Benchmarks + +The following benchmark scripts are included: + +- **`/app/k6-diffusers-flux-2-klein-4b.js`**: Benchmark the FLUX.2-klein-4B + image generation model. + +## Metrics Extraction + +The extraction script (`extract_metrics.py`) can be run manually after the +benchmark finishes to generate a price/performance report. + +The extraction script calculates throughput (Images/sec) and latencies (p50, +p95, p99) strictly from the `benchmark` scenario, and automatically fetches +corresponding on-node telemetry (Peak VRAM, Avg GPU Utilization) from Google +Cloud Monitoring if the dependencies are installed and it is running on Google +Cloud. + +To ensure accurate hardware metrics when multiple deployments are running in the +same project, the script can filter by pod, namespace, or node. If the `--pod` +argument is omitted, the script automatically uses the `deployment_name` +(extracted from the `TARGET_URL` hostname) as a prefix to filter for relevant +pods. + +### Script Arguments + +- `--file`: Path to the k6 `.jsonl` output file (Required). +- `--output-csv`: Path to the output CSV file where aggregated results are + stored (Optional, default: `k6-benchmark.csv`). 
+- `--hourly-cost`: The hourly cost of the underlying GKE node in USD. If set to + `0.0`, a warning is emitted and cost metrics will be `0.0` (Optional, default: + `0.0`). +- `--project-id`: Google Cloud Project ID to query DCGM metrics via Cloud + Monitoring. If omitted, the script dynamically fetches the project ID from the + Google Cloud Metadata server (Optional). +- `--pod`: Filter metrics by a specific pod name. If omitted, the script + automatically uses the `deployment_name` (derived from the `TARGET_URL` + hostname) as a prefix filter to match all relevant pods in the deployment + (Optional). +- `--namespace`: Filter metrics by a specific namespace (Optional). +- `--node`: Filter metrics by a specific node name (Optional). +- `--vram-metric`: The Prometheus metric string for VRAM usage (Default: + `prometheus.googleapis.com/DCGM_FI_DEV_FB_USED/gauge`). +- `--util-metric`: The Prometheus metric string for GPU utilization (Default: + `prometheus.googleapis.com/DCGM_FI_DEV_GPU_UTIL/gauge`). diff --git a/container-images/cpu/k6-benchmark/cloudbuild.yaml b/container-images/cpu/k6-benchmark/cloudbuild.yaml new file mode 100644 index 000000000..7d2515a93 --- /dev/null +++ b/container-images/cpu/k6-benchmark/cloudbuild.yaml @@ -0,0 +1,28 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +images: + - ${_DESTINATION} + +options: + logging: CLOUD_LOGGING_ONLY + +steps: + - args: + - build + - --tag=${_DESTINATION} + - . 
#!/bin/sh
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Wrapper around k6: derives a metrics output filename from the script name,
# the accelerator name and the current UTC time, then runs
# `k6 run --out json=/output/<filename>` with the arguments it was given.
# When the only argument is `--help`, it shows the k6 help text instead.

set -o errexit
set -o nounset

# Help requests write no metrics file, so handle them before announcing an
# output path. `exec` replaces this shell with k6 so that k6 receives signals
# (for example SIGTERM on container stop) directly.
if [ "$*" = "--help" ]; then
  exec k6 --help
fi

# Default accelerator name
ACCELERATOR="${ACCELERATOR_NAME:-accelerator-not-set}"

# Find the script name from the arguments. Every argument ending in .js
# overwrites the previous match, so the last one wins.
SCRIPT_PATH=""
for arg in "$@"; do
  case "$arg" in
    *.js)
      SCRIPT_PATH="$arg"
      ;;
  esac
done

if [ -n "${SCRIPT_PATH:-}" ]; then
  SCRIPT_NAME=$(basename "$SCRIPT_PATH" .js)
else
  SCRIPT_NAME="unknown-script"
fi

TIMESTAMP=$(date -u +"%Y%m%dT%H%M%SZ")
FILENAME="${SCRIPT_NAME}-${ACCELERATOR}-${TIMESTAMP}.jsonl"
OUTPUT_FILE_PATH="/output/${FILENAME}"
echo "Configured metrics output file: ${OUTPUT_FILE_PATH}"

# Replace the wrapper shell with k6 (see the note on `exec` above).
exec k6 run \
  --out "json=${OUTPUT_FILE_PATH}" \
  "$@"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Turn a k6 JSONL metrics file into a text report and an aggregated CSV.

For every benchmark scenario found in the k6 output the script computes
throughput and latency percentiles, optionally fetches GPU telemetry from
Google Cloud Monitoring, and derives a cost per 1k images.
"""

import argparse
import csv
import json
import logging
import os
import re
import statistics
import sys
import urllib.request
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Optional: Google Cloud Monitoring
try:
    from google.cloud import monitoring_v3

    HAS_GCP = True
except ImportError:
    HAS_GCP = False


@dataclass
class ScenarioResult:
    """Aggregated data points of one k6 scenario."""

    name: str
    # Values of the scenario's `http_req_duration` points, in file order.
    durations: List[float]
    # Earliest / latest timestamp of any point tagged with the scenario.
    start_time: datetime
    end_time: datetime
    # Normalised request tags captured from the scenario's first point.
    tags: Dict[str, Any]
    # Number of `http_reqs` points, and how many had expected_response == "true".
    total_requests: int
    successful_requests: int
    # Virtual users: taken from the scenario name or from `vus` points.
    vus: int


# <date>T<time>[.<fraction>][Z|+hh:mm|-hh:mm]
_TIMESTAMP_RE = re.compile(
    r"^(?P<base>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})"
    r"(?:\.(?P<frac>\d+))?"
    r"(?P<tz>Z|[+-]\d{2}:\d{2})?$"
)

# Scenario names may carry the VU count, e.g. "bench_v8_run" -> 8.
_VUS_IN_NAME_RE = re.compile(r"_v(\d+)_")


def _parse_k6_time(raw: Any) -> Optional[datetime]:
    """Parse a k6 point timestamp, or return None when it is malformed.

    The fraction is cut to microseconds (or zero-padded when shorter) and the
    UTC offset, either "Z" or "+hh:mm"/"-hh:mm", is preserved.
    """
    if not isinstance(raw, str):
        return None
    try:
        match = _TIMESTAMP_RE.match(raw.strip())
        if match is None:
            # Unknown layout: let the standard library have a try.
            return datetime.fromisoformat(raw)
        moment = datetime.fromisoformat(match.group("base"))
        frac = match.group("frac")
        if frac:
            moment = moment.replace(microsecond=int(frac[:6].ljust(6, "0")))
        tz = match.group("tz")
        if tz == "Z":
            moment = moment.replace(tzinfo=timezone.utc)
        elif tz:
            sign = -1 if tz[0] == "-" else 1
            offset = timedelta(hours=int(tz[1:3]), minutes=int(tz[4:6]))
            moment = moment.replace(tzinfo=timezone(sign * offset))
        return moment
    except ValueError:
        return None


def parse_k6_output(filepath: str) -> List[ScenarioResult]:
    """Parses k6 JSONL and extracts data for all discovered scenarios.

    Only records of type "Point" are used. Lines that are blank, not valid
    JSON, or that lack a parseable timestamp are skipped. Scenarios whose name
    does not start with "bench", or that have no `http_req_duration` points,
    are left out of the result.

    Args:
        filepath: Path of the k6 JSONL output file.

    Returns:
        One ScenarioResult per benchmark scenario, ordered by start time.
    """
    scenarios_data: Dict[str, Dict[str, Any]] = {}
    vus_points: List[Tuple[datetime, int]] = []

    logging.info(f"Parsing k6 output file: {filepath}")
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict) or record.get("type") != "Point":
                continue

            metric_name = record.get("metric")
            data = record.get("data") or {}
            req_tags = data.get("tags") or {}
            value = data.get("value")

            dt = _parse_k6_time(data.get("time"))
            if dt is None:
                continue

            if metric_name == "vus" and value is not None:
                vus_points.append((dt, int(value)))

            scenario_name = req_tags.get("scenario")
            if not scenario_name:
                continue

            s_entry = scenarios_data.setdefault(
                scenario_name,
                {
                    "durations": [],
                    "total_requests": 0,
                    "successful_requests": 0,
                    "start_time": dt,
                    "end_time": dt,
                    "tags": {},
                },
            )
            if dt < s_entry["start_time"]:
                s_entry["start_time"] = dt
            if dt > s_entry["end_time"]:
                s_entry["end_time"] = dt
            if metric_name == "http_reqs":
                s_entry["total_requests"] += 1
                if req_tags.get("expected_response") == "true":
                    s_entry["successful_requests"] += 1
            if metric_name == "http_req_duration" and value is not None:
                s_entry["durations"].append(value)
            if not s_entry["tags"]:
                # Captured once, from the first point seen for the scenario.
                s_entry["tags"] = {
                    "model": req_tags.get("model", "unknown"),
                    "accelerator": req_tags.get("accelerator", "unknown"),
                    "inference_server": req_tags.get("inference_server", "unknown"),
                    "width": int(req_tags.get("width", 1024)),
                    "height": int(req_tags.get("height", 1024)),
                    "steps": int(req_tags.get("num_inference_steps", 20)),
                    "seed": req_tags.get("seed", "unknown"),
                    "batch_size": int(req_tags.get("batch_size", 1)),
                    "target_url": req_tags.get("target_url", "unknown"),
                    "deployment_name": req_tags.get("deployment_name", "unknown"),
                }

    results = []
    for name, data in scenarios_data.items():
        # "benchmark" also starts with "bench", so one check covers both.
        if not name.startswith("bench") or not data["durations"]:
            continue
        name_match = _VUS_IN_NAME_RE.search(name)
        if name_match:
            max_vus = int(name_match.group(1))
        else:
            # Highest `vus` sample recorded inside the scenario's time window.
            max_vus = max(
                (
                    v_val
                    for v_dt, v_val in vus_points
                    if data["start_time"] <= v_dt <= data["end_time"]
                ),
                default=0,
            )
        results.append(
            ScenarioResult(
                name=name,
                durations=data["durations"],
                start_time=data["start_time"],
                end_time=data["end_time"],
                tags=data["tags"],
                total_requests=data["total_requests"],
                successful_requests=data["successful_requests"],
                vus=max_vus if max_vus > 0 else 1,
            )
        )
    results.sort(key=lambda x: x.start_time)
    return results


def get_typed_value(point_value):
    """Return the numeric payload of a time-series point value.

    The populated field is looked up through `WhichOneof("value")`, on the
    object's `_pb` attribute when present, otherwise on the object itself.
    When that gives neither "double_value" nor "int64_value", the first
    non-zero of `double_value` / `int64_value` is returned, else 0.0.
    """
    if hasattr(point_value, "_pb"):
        value_type = point_value._pb.WhichOneof("value")
    elif hasattr(point_value, "WhichOneof"):
        value_type = point_value.WhichOneof("value")
    else:
        value_type = None

    if value_type == "double_value":
        return point_value.double_value
    if value_type == "int64_value":
        return point_value.int64_value
    if getattr(point_value, "double_value", 0.0) != 0.0:
        return point_value.double_value
    if getattr(point_value, "int64_value", 0) != 0:
        return point_value.int64_value
    return 0.0


def fetch_dcgm_metrics(
    project_id,
    start_time,
    end_time,
    vram_metric,
    util_metric,
    power_metric,
    pod=None,
    pod_is_prefix=False,
    namespace=None,
    node=None,
):
    """Fetch VRAM, GPU utilisation and power series from Cloud Monitoring.

    Args:
        project_id: Google Cloud project to query; falsy disables the lookup.
        start_time: Start of the query interval.
        end_time: End of the query interval.
        vram_metric: Metric type whose per-GPU maximum is reported.
        util_metric: Metric type whose per-GPU average is reported.
        power_metric: Metric type whose per-GPU average is reported.
        pod: Optional value for the `metric.labels.pod` filter.
        pod_is_prefix: Match `pod` with `starts_with` instead of equality.
        namespace: Accepted for interface compatibility; not applied yet.
        node: Optional prefix for the `resource.labels.instance` filter.

    Returns:
        A dict of formatted strings plus "raw_total_vram_mib". An empty dict is
        returned when the client library or project ID is missing, or when the
        lookup fails, so callers can always use `.get()` with a default.
    """
    if not HAS_GCP or not project_id:
        logging.debug(
            "Skipping DCGM metrics: HAS_GCP=%s, project_id=%r", HAS_GCP, project_id
        )
        # An empty mapping (not a tuple): main() reads the result with .get().
        return {}
    if namespace:
        # TODO: add a namespace clause once the right label name is confirmed.
        logging.warning(
            "Namespace filter %r is not applied to Cloud Monitoring queries; "
            "ignoring it.",
            namespace,
        )
    try:
        client = monitoring_v3.MetricServiceClient()
        project_name = f"projects/{project_id}"
        interval = monitoring_v3.TimeInterval(
            {"start_time": start_time, "end_time": end_time}
        )
        base_filter = ' AND resource.type = "prometheus_target"'
        if pod:
            if pod_is_prefix:
                base_filter += f' AND metric.labels.pod = starts_with("{pod}")'
            else:
                base_filter += f' AND metric.labels.pod = "{pod}"'
        if node:
            base_filter += f' AND resource.labels.instance = starts_with("{node}")'

        def fetch(m_type):
            """Run one query and return its time series as a list."""
            full_filter = f'metric.type = "{m_type}"{base_filter}'
            logging.debug("Querying time series with filter: %s", full_filter)
            try:
                # Materialise once; the result is both counted and consumed.
                series = list(
                    client.list_time_series(
                        request={
                            "name": project_name,
                            "filter": full_filter,
                            "interval": interval,
                        }
                    )
                )
            except Exception as e:
                logging.warning(f"Query for {m_type} failed: {e}")
                return []
            logging.debug("Found %d time series.", len(series))
            return series

        def samples_per_gpu(m_type):
            """Group every point value of a metric by its `gpu` label."""
            grouped = {}
            for result in fetch(m_type):
                gpu_idx = result.metric.labels.get("gpu", "0")
                grouped.setdefault(gpu_idx, []).extend(
                    get_typed_value(point.value) for point in result.points
                )
            return grouped

        # Peak per GPU, never below 0 (matches a GPU that reported no points).
        vram_per_gpu = {
            g: max([0, *vals]) for g, vals in samples_per_gpu(vram_metric).items()
        }
        avg_compute_per_gpu = {
            g: sum(vals) / len(vals)
            for g, vals in samples_per_gpu(util_metric).items()
            if vals
        }
        avg_power_per_gpu = {
            g: sum(vals) / len(vals)
            for g, vals in samples_per_gpu(power_metric).items()
            if vals
        }

        total_vram = sum(vram_per_gpu.values())
        total_compute = sum(avg_compute_per_gpu.values())
        total_power = sum(avg_power_per_gpu.values())

        avg_compute = (
            total_compute / len(avg_compute_per_gpu) if avg_compute_per_gpu else 0
        )
        avg_power = total_power / len(avg_power_per_gpu) if avg_power_per_gpu else 0

        return {
            "vram_total": f"{total_vram} MiB" if vram_per_gpu else "N/A",
            "vram_per_gpu": (
                json.dumps({g: f"{v} MiB" for g, v in sorted(vram_per_gpu.items())})
                if vram_per_gpu
                else "N/A"
            ),
            "compute_total": f"{total_compute:.2f}%" if avg_compute_per_gpu else "N/A",
            "compute_avg": f"{avg_compute:.2f}%" if avg_compute_per_gpu else "N/A",
            "compute_per_gpu": (
                json.dumps(
                    {g: f"{v:.2f}%" for g, v in sorted(avg_compute_per_gpu.items())}
                )
                if avg_compute_per_gpu
                else "N/A"
            ),
            "power_total": f"{total_power:.2f} W" if avg_power_per_gpu else "N/A",
            "power_avg": f"{avg_power:.2f} W" if avg_power_per_gpu else "N/A",
            "power_per_gpu": (
                json.dumps(
                    {g: f"{v:.2f} W" for g, v in sorted(avg_power_per_gpu.items())}
                )
                if avg_power_per_gpu
                else "N/A"
            ),
            "raw_total_vram_mib": total_vram,
        }
    except Exception as e:
        logging.error(f"Failed to fetch metrics: {e}")
        return {}


EXPECTED_CSV_HEADER = [
    "Source File",
    "Deployment Name",
    "Target URL",
    "Model",
    "Inference Server",
    "Accelerator",
    "Resolution",
    "Inference Steps",
    "Batch Size",
    "Virtual Users (VUs)",
    "Start Time (UTC)",
    "End Time (UTC)",
    "Total Time (s)",
    "Total Requests",
    "Success Rate (%)",
    "Throughput (Images/s)",
    "Request Throughput (RPS)",
    "Request Latency p50 (s)",
    "Request Latency p95 (s)",
    "Request Latency p99 (s)",
    "Image Latency p50 (s)",
    "Image Latency p95 (s)",
    "Image Latency p99 (s)",
    "Peak VRAM (Total)",
    "Peak VRAM (Per GPU)",
    "Peak VRAM Utilization (%)",
    "Compute (Total)",
    "Compute (Average)",
    "Compute (Per GPU)",
    "Power (Total)",
    "Power (Average)",
    "Power (Per GPU)",
    "Node Hourly Cost ($)",
    "Cost per 1k Images ($)",
]

# Total VRAM capacity in MiB, keyed by a substring of the accelerator tag.
# Order matters: the more specific "l4-x4"/"l4-x2" must be tried before "l4".
_VRAM_CAPACITY_MIB = (
    ("l4-x4", 22528 * 4),
    ("l4-x2", 22528 * 2),
    ("l4", 22528),
    ("6000", 98304),
)


def _vram_capacity_mib(accelerator) -> int:
    """Return the known VRAM capacity for an accelerator tag, or 0 if unknown."""
    label = str(accelerator or "").lower()
    for marker, capacity in _VRAM_CAPACITY_MIB:
        if marker in label:
            return capacity
    return 0


def _csv_row_key(row) -> Tuple[str, str, str]:
    """De-duplication key: source file, resolution and VU count, as strings.

    Rows read back from the CSV hold strings only, so new rows are stringified
    too; otherwise an int VU count never equals its stored form.
    """
    return (str(row[0]), str(row[6]), str(row[9]))


def _append_csv_rows(csv_out: Path, csv_rows) -> int:
    """Append rows not yet present in `csv_out`; return how many were written.

    A file whose header differs from EXPECTED_CSV_HEADER is renamed to
    "<stem>.mismatch.<UTC timestamp>.csv" and a fresh file is started.
    """
    write_h = True
    existing_rows = set()
    if csv_out.exists():
        with open(csv_out, "r", newline="") as f:
            reader = csv.reader(f)
            header_matches = next(reader, None) == EXPECTED_CSV_HEADER
            if header_matches:
                for row in reader:
                    if len(row) > 10:
                        existing_rows.add(_csv_row_key(row))
        # Rename only after the file has been closed.
        if header_matches:
            write_h = False
        else:
            ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            csv_out.rename(csv_out.with_name(f"{csv_out.stem}.mismatch.{ts}.csv"))

    appended = 0
    with open(csv_out, "a", newline="") as f:
        writer = csv.writer(f)
        if write_h:
            writer.writerow(EXPECTED_CSV_HEADER)
        for row in csv_rows:
            if _csv_row_key(row) not in existing_rows:
                writer.writerow(row)
                appended += 1
            else:
                logging.info(
                    f"Row for {row[0]} @ {row[6]} with {row[9]} VUs already exists. Skipping."
                )
    return appended


def get_gcp_project_id():
    """Ask the Google Cloud metadata server for the project ID.

    Returns:
        The project ID, or None when the request fails for any reason
        (best effort: the metadata server is not expected to be reachable
        everywhere).
    """
    try:
        url = "http://metadata.google.internal/computeMetadata/v1/project/project-id"
        req = urllib.request.Request(url, headers={"Metadata-Flavor": "Google"})
        with urllib.request.urlopen(req, timeout=2) as response:
            return response.read().decode("utf-8").strip()
    except Exception:
        return None


def main():
    """Parse arguments, then write the text report and update the CSV.

    Exits with status 1 when the input holds no benchmark scenario data.
    """
    parser = argparse.ArgumentParser(
        description="Extract metrics from multi-scenario k6 JSONL."
    )
    parser.add_argument("--file", required=True)
    parser.add_argument("--output-csv", default="k6-benchmark.csv")
    parser.add_argument("--hourly-cost", type=float, default=0.0)
    parser.add_argument("--project-id")
    parser.add_argument("--pod")
    parser.add_argument("--namespace")
    parser.add_argument("--node")
    parser.add_argument(
        "--vram-metric", default="prometheus.googleapis.com/DCGM_FI_DEV_FB_USED/gauge"
    )
    parser.add_argument(
        "--util-metric", default="prometheus.googleapis.com/DCGM_FI_DEV_GPU_UTIL/gauge"
    )
    parser.add_argument(
        "--power-metric",
        default="prometheus.googleapis.com/DCGM_FI_DEV_POWER_USAGE/gauge",
    )

    args = parser.parse_args()
    if not args.project_id:
        args.project_id = get_gcp_project_id()
    if args.hourly_cost == 0.0:
        logging.warning("--hourly-cost is 0.0; cost metrics will be reported as 0.0.")

    scenario_results = parse_k6_output(args.file)
    if not scenario_results:
        logging.error("No valid benchmark scenario data found.")
        sys.exit(1)

    csv_rows, report_sections = [], []
    input_path = Path(args.file)
    header = [
        "=" * 50,
        " GKE Price/Performance Benchmark Consolidated Report",
        f" Source: {input_path.name}",
        "=" * 50,
    ]

    summary_cols = [
        "Scenario",
        "Res",
        "B",
        "VU",
        "Steps",
        "Suc%",
        "Img/s",
        "RPS",
        "ReqP50",
        "ImgP50",
        "VRAM",
        "GPU%",
        "Cost/1k",
    ]
    summary_fmt = "{:<20} {:<10} {:<2} {:<2} {:<5} {:<4} {:<7} {:<6} {:<7} {:<7} {:<7} {:<6} {:<8}"
    summary_table = ["SUMMARY TABLE:", summary_fmt.format(*summary_cols), "-" * 105]

    for res in scenario_results:
        total_time = (res.end_time - res.start_time).total_seconds()
        # A zero or missing batch size would divide by zero further down.
        batch_size = res.tags.get("batch_size", 1) or 1
        resolution = f"{res.tags.get('width')}x{res.tags.get('height')}"
        throughput = (
            (res.successful_requests * batch_size) / total_time if total_time > 0 else 0
        )
        rps = res.successful_requests / total_time if total_time > 0 else 0
        success_rate = (
            (res.successful_requests / res.total_requests) * 100
            if res.total_requests > 0
            else 0
        )
        p50 = statistics.median(res.durations)
        if len(res.durations) > 1:
            # 99 cut points: index 94 is the 95th percentile, 98 the 99th.
            q = statistics.quantiles(res.durations, n=100, method="inclusive")
            p95, p99 = q[94], q[98]
        else:
            p95 = p99 = res.durations[0]
        img_p50, img_p95, img_p99 = p50 / batch_size, p95 / batch_size, p99 / batch_size
        cost_per_1k = (
            (args.hourly_cost / (throughput * 3600)) * 1000 if throughput > 0 else 0
        )

        dcgm_metrics = fetch_dcgm_metrics(
            args.project_id,
            res.start_time,
            res.end_time,
            args.vram_metric,
            args.util_metric,
            args.power_metric,
            pod=args.pod or res.tags.get("deployment_name"),
            pod_is_prefix=not args.pod,
            namespace=args.namespace,
            node=args.node,
        )

        vram_total = dcgm_metrics.get("vram_total", "N/A")
        vram_per_gpu = dcgm_metrics.get("vram_per_gpu", "N/A")
        comp_total = dcgm_metrics.get("compute_total", "N/A")
        comp_avg = dcgm_metrics.get("compute_avg", "N/A")
        comp_per_gpu = dcgm_metrics.get("compute_per_gpu", "N/A")
        pow_total = dcgm_metrics.get("power_total", "N/A")
        pow_avg = dcgm_metrics.get("power_avg", "N/A")
        pow_per_gpu = dcgm_metrics.get("power_per_gpu", "N/A")
        v_val_mib = dcgm_metrics.get("raw_total_vram_mib", 0)

        total_vram_max = _vram_capacity_mib(res.tags.get("accelerator"))
        if total_vram_max and v_val_mib > 0:
            vram_util = f"{(v_val_mib / total_vram_max) * 100:.2f}%"
        else:
            vram_util = "N/A"

        summary_table.append(
            summary_fmt.format(
                res.name[:20],
                resolution,
                batch_size,
                res.vus,
                res.tags.get("steps", 20),
                f"{success_rate:.0f}",
                f"{throughput:.2f}",
                f"{rps:.2f}",
                f"{p50/1000:.2f}",
                f"{img_p50/1000:.2f}",
                f"{v_val_mib/1024:.0f}G" if v_val_mib else "N/A",
                comp_avg.replace("%", ""),
                f"${cost_per_1k:.2f}",
            )
        )

        report_sections.extend(
            [
                "",
                "=" * 50,
                " GKE Price/Performance Benchmark Report",
                "=" * 50,
                f"Scenario: {res.name}",
                f"Model: {res.tags.get('model')}",
                f"Inference Server: {res.tags.get('inference_server')}",
                f"Accelerator: {res.tags.get('accelerator')}",
                f"Resolution: {resolution}",
                f"Inference Steps: {res.tags.get('steps')}",
                f"Batch Size: {batch_size}",
                f"Virtual Users (VUs): {res.vus}",
                f"Time Window: {res.start_time.strftime('%Y-%m-%d %H:%M:%S UTC')} to {res.end_time.strftime('%H:%M:%S UTC')} ({total_time:.2f}s)",
                "-" * 50,
                "UX Metrics (Off-Node):",
                f" Total Requests: {res.total_requests}",
                f" Success Rate: {success_rate:.2f}%",
                f" Throughput: {throughput:.4f} Images/Second",
                f" Request RPS: {rps:.4f} RPS",
                f" Request Latency p50: {p50/1000:.3f} s",
                f" Request Latency p95: {p95/1000:.3f} s",
                f" Request Latency p99: {p99/1000:.3f} s",
                f" Image Latency p50: {img_p50/1000:.3f} s",
                f" Image Latency p95: {img_p95/1000:.3f} s",
                f" Image Latency p99: {img_p99/1000:.3f} s",
                "-" * 50,
                "Hardware Metrics (On-Node DCGM):",
                f" Peak VRAM (Total): {vram_total}",
                f" Peak VRAM (Per GPU): {vram_per_gpu}",
                f" VRAM Utilization: {vram_util}",
                f" Compute (Total): {comp_total}",
                f" Compute (Average): {comp_avg}",
                f" Compute (Per GPU): {comp_per_gpu}",
                f" Power (Total): {pow_total}",
                f" Power (Average): {pow_avg}",
                f" Power (Per GPU): {pow_per_gpu}",
                "-" * 50,
                "Business Metrics:",
                f" Node Hourly Cost: ${args.hourly_cost:.4f}",
                f" Cost per 1k Images: ${cost_per_1k:.4f}",
                "=" * 50,
            ]
        )

        csv_rows.append(
            [
                input_path.name,
                res.tags.get("deployment_name"),
                res.tags.get("target_url"),
                res.tags.get("model"),
                res.tags.get("inference_server"),
                res.tags.get("accelerator"),
                resolution,
                res.tags.get("steps"),
                batch_size,
                res.vus,
                res.start_time.strftime("%Y-%m-%d %H:%M:%S"),
                res.end_time.strftime("%Y-%m-%d %H:%M:%S"),
                f"{total_time:.2f}",
                res.total_requests,
                f"{success_rate:.2f}",
                f"{throughput:.4f}",
                f"{rps:.4f}",
                f"{p50/1000:.3f}",
                f"{p95/1000:.3f}",
                f"{p99/1000:.3f}",
                f"{img_p50/1000:.3f}",
                f"{img_p95/1000:.3f}",
                f"{img_p99/1000:.3f}",
                vram_total,
                vram_per_gpu,
                vram_util,
                comp_total,
                comp_avg,
                comp_per_gpu,
                pow_total,
                pow_avg,
                pow_per_gpu,
                f"{args.hourly_cost:.4f}",
                f"{cost_per_1k:.4f}",
            ]
        )

    output_path = input_path.with_name(f"{input_path.stem}-report.txt")
    with open(output_path, "w") as f:
        f.write("\n".join(header + summary_table + report_sections) + "\n")

    appended = _append_csv_rows(Path(args.output_csv), csv_rows)
    logging.info(f"Consolidated report saved to {output_path}")
    logging.info(f"Appended {appended} new rows to {args.output_csv}")


if __name__ == "__main__":
    main()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +google-cloud-monitoring diff --git a/container-images/cpu/k6-benchmark/requirements.txt b/container-images/cpu/k6-benchmark/requirements.txt new file mode 100644 index 000000000..443405a24 --- /dev/null +++ b/container-images/cpu/k6-benchmark/requirements.txt @@ -0,0 +1,415 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in --generate-hashes -o requirements.txt +certifi==2026.2.25 \ + --hash=sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa \ + --hash=sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7 + # via requests +cffi==2.0.0 \ + --hash=sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb \ + --hash=sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b \ + --hash=sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f \ + --hash=sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9 \ + --hash=sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44 \ + --hash=sha256:0f6084a0ea23d05d20c3edcda20c3d006f9b6f3fefeac38f59262e10cef47ee2 \ + --hash=sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c \ + --hash=sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75 \ + --hash=sha256:1cd13c99ce269b3ed80b417dcd591415d3372bcac067009b6e0f59c7d4015e65 \ + --hash=sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e \ + --hash=sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a \ + 
--hash=sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e \ + --hash=sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25 \ + --hash=sha256:2081580ebb843f759b9f617314a24ed5738c51d2aee65d31e02f6f7a2b97707a \ + --hash=sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe \ + --hash=sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b \ + --hash=sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91 \ + --hash=sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592 \ + --hash=sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187 \ + --hash=sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c \ + --hash=sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1 \ + --hash=sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94 \ + --hash=sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba \ + --hash=sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb \ + --hash=sha256:3f4d46d8b35698056ec29bca21546e1551a205058ae1a181d871e278b0b28165 \ + --hash=sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529 \ + --hash=sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca \ + --hash=sha256:4647afc2f90d1ddd33441e5b0e85b16b12ddec4fca55f0d9671fef036ecca27c \ + --hash=sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6 \ + --hash=sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c \ + --hash=sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0 \ + --hash=sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743 \ + --hash=sha256:61d028e90346df14fedc3d1e5441df818d095f3b87d286825dfcbd6459b7ef63 \ + --hash=sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5 \ + --hash=sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5 \ + 
--hash=sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4 \ + --hash=sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d \ + --hash=sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b \ + --hash=sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93 \ + --hash=sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205 \ + --hash=sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27 \ + --hash=sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512 \ + --hash=sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d \ + --hash=sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c \ + --hash=sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037 \ + --hash=sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26 \ + --hash=sha256:89472c9762729b5ae1ad974b777416bfda4ac5642423fa93bd57a09204712322 \ + --hash=sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb \ + --hash=sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c \ + --hash=sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8 \ + --hash=sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4 \ + --hash=sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414 \ + --hash=sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9 \ + --hash=sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664 \ + --hash=sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9 \ + --hash=sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775 \ + --hash=sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739 \ + --hash=sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc \ + --hash=sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062 \ + 
--hash=sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe \ + --hash=sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9 \ + --hash=sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92 \ + --hash=sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5 \ + --hash=sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13 \ + --hash=sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d \ + --hash=sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26 \ + --hash=sha256:cb527a79772e5ef98fb1d700678fe031e353e765d1ca2d409c92263c6d43e09f \ + --hash=sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495 \ + --hash=sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b \ + --hash=sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6 \ + --hash=sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c \ + --hash=sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef \ + --hash=sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5 \ + --hash=sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18 \ + --hash=sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad \ + --hash=sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3 \ + --hash=sha256:de8dad4425a6ca6e4e5e297b27b5c824ecc7581910bf9aee86cb6835e6812aa7 \ + --hash=sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5 \ + --hash=sha256:e6e73b9e02893c764e7e8d5bb5ce277f1a009cd5243f8228f75f842bf937c534 \ + --hash=sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49 \ + --hash=sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2 \ + --hash=sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5 \ + --hash=sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453 \ + 
--hash=sha256:fe562eb1a64e67dd297ccc4f5addea2501664954f2692b69a76449ec7913ecbf + # via cryptography +charset-normalizer==3.4.7 \ + --hash=sha256:007d05ec7321d12a40227aae9e2bc6dca73f3cb21058999a1df9e193555a9dcc \ + --hash=sha256:03853ed82eeebbce3c2abfdbc98c96dc205f32a79627688ac9a27370ea61a49c \ + --hash=sha256:07d9e39b01743c3717745f4c530a6349eadbfa043c7577eef86c502c15df2c67 \ + --hash=sha256:08e721811161356f97b4059a9ba7bafb23ea5ee2255402c42881c214e173c6b4 \ + --hash=sha256:0c96c3b819b5c3e9e165495db84d41914d6894d55181d2d108cc1a69bfc9cce0 \ + --hash=sha256:0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c \ + --hash=sha256:0f7eb884681e3938906ed0434f20c63046eacd0111c4ba96f27b76084cd679f5 \ + --hash=sha256:12a6fff75f6bc66711b73a2f0addfc4c8c15a20e805146a02d147a318962c444 \ + --hash=sha256:12d8baf840cc7889b37c7c770f478adea7adce3dcb3944d02ec87508e2dcf153 \ + --hash=sha256:14265bfe1f09498b9d8ec91e9ec9fa52775edf90fcbde092b25f4a33d444fea9 \ + --hash=sha256:16d971e29578a5e97d7117866d15889a4a07befe0e87e703ed63cd90cb348c01 \ + --hash=sha256:177a0ba5f0211d488e295aaf82707237e331c24788d8d76c96c5a41594723217 \ + --hash=sha256:1a87ca9d5df6fe460483d9a5bbf2b18f620cbed41b432e2bddb686228282d10b \ + --hash=sha256:1c2a768fdd44ee4a9339a9b0b130049139b8ce3c01d2ce09f67f5a68048d477c \ + --hash=sha256:1c2aed2e5e41f24ea8ef1590b8e848a79b56f3a5564a65ceec43c9d692dc7d8a \ + --hash=sha256:1dc8b0ea451d6e69735094606991f32867807881400f808a106ee1d963c46a83 \ + --hash=sha256:1efde3cae86c8c273f1eb3b287be7d8499420cf2fe7585c41d370d3e790054a5 \ + --hash=sha256:202389074300232baeb53ae2569a60901f7efadd4245cf3a3bf0617d60b439d7 \ + --hash=sha256:203104ed3e428044fd943bc4bf45fa73c0730391f9621e37fe39ecf477b128cb \ + --hash=sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c \ + --hash=sha256:298930cec56029e05497a76988377cbd7457ba864beeea92ad7e844fe74cd1f1 \ + --hash=sha256:2cd4a60d0e2fb04537162c62bbbb4182f53541fe0ede35cdf270a1c1e723cc42 \ + 
--hash=sha256:2d6eb928e13016cea4f1f21d1e10c1cebd5a421bc57ddf5b1142ae3f86824fab \ + --hash=sha256:2fe249cb4651fd12605b7288b24751d8bfd46d35f12a20b1ba33dea122e690df \ + --hash=sha256:30b8d1d8c52a48c2c5690e152c169b673487a2a58de1ec7393196753063fcd5e \ + --hash=sha256:320ade88cfb846b8cd6b4ddf5ee9e80ee0c1f52401f2456b84ae1ae6a1a5f207 \ + --hash=sha256:3534e7dcbdcf757da6b85a0bbf5b6868786d5982dd959b065e65481644817a18 \ + --hash=sha256:36836d6ff945a00b88ba1e4572d721e60b5b8c98c155d465f56ad19d68f23734 \ + --hash=sha256:38c0109396c4cfc574d502df99742a45c72c08eff0a36158b6f04000043dbf38 \ + --hash=sha256:3946fa46a0cf3e4c8cb1cc52f56bb536310d34f25f01ca9b6c16afa767dab110 \ + --hash=sha256:3bec022aec2c514d9cf199522a802bd007cd588ab17ab2525f20f9c34d067c18 \ + --hash=sha256:3c9a494bc5ec77d43cea229c4f6db1e4d8fe7e1bbffa8b6f0f0032430ff8ab44 \ + --hash=sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d \ + --hash=sha256:3dedcc22d73ec993f42055eff4fcfed9318d1eeb9a6606c55892a26964964e48 \ + --hash=sha256:4042d5c8f957e15221d423ba781e85d553722fc4113f523f2feb7b188cc34c5e \ + --hash=sha256:481551899c856c704d58119b5025793fa6730adda3571971af568f66d2424bb5 \ + --hash=sha256:4dc1e73c36828f982bfe79fadf5919923f8a6f4df2860804db9a98c48824ce8d \ + --hash=sha256:4e5163c14bffd570ef2affbfdd77bba66383890797df43dc8b4cc7d6f500bf53 \ + --hash=sha256:511ef87c8aec0783e08ac18565a16d435372bc1ac25a91e6ac7f5ef2b0bff790 \ + --hash=sha256:532bc9bf33a68613fd7d65e4b1c71a6a38d7d42604ecf239c77392e9b4e8998c \ + --hash=sha256:54523e136b8948060c0fa0bc7b1b50c32c186f2fceee897a495406bb6e311d2b \ + --hash=sha256:5649fd1c7bade02f320a462fdefd0b4bd3ce036065836d4f42e0de958038e116 \ + --hash=sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d \ + --hash=sha256:5b77459df20e08151cd6f8b9ef8ef1f961ef73d85c21a555c7eed5b79410ec10 \ + --hash=sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6 \ + --hash=sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2 \ + 
--hash=sha256:6370e8686f662e6a3941ee48ed4742317cafbe5707e36406e9df792cdb535776 \ + --hash=sha256:64f02c6841d7d83f832cd97ccf8eb8a906d06eb95d5276069175c696b024b60a \ + --hash=sha256:65bcd23054beab4d166035cabbc868a09c1a49d1efe458fe8e4361215df40265 \ + --hash=sha256:66671f93accb62ed07da56613636f3641f1a12c13046ce91ffc923721f23c008 \ + --hash=sha256:6696b7688f54f5af4462118f0bfa7c1621eeb87154f77fa04b9295ce7a8f2943 \ + --hash=sha256:6785f414ae0f3c733c437e0f3929197934f526d19dfaa75e18fdb4f94c6fb374 \ + --hash=sha256:67f6279d125ca0046a7fd386d01b311c6363844deac3e5b069b514ba3e63c246 \ + --hash=sha256:6c114670c45346afedc0d947faf3c7f701051d2518b943679c8ff88befe14f8e \ + --hash=sha256:6e0d51f618228538a3e8f46bd246f87a6cd030565e015803691603f55e12afb5 \ + --hash=sha256:6ed74185b2db44f41ef35fd1617c5888e59792da9bbc9190d6c7300617182616 \ + --hash=sha256:708838739abf24b2ceb208d0e22403dd018faeef86ddac04319a62ae884c4f15 \ + --hash=sha256:715479b9a2802ecac752a3b0efa2b0b60285cf962ee38414211abdfccc233b41 \ + --hash=sha256:733784b6d6def852c814bce5f318d25da2ee65dd4839a0718641c696e09a2960 \ + --hash=sha256:750e02e074872a3fad7f233b47734166440af3cdea0add3e95163110816d6752 \ + --hash=sha256:752a45dc4a6934060b3b0dab47e04edc3326575f82be64bc4fc293914566503e \ + --hash=sha256:7579e913a5339fb8fa133f6bbcfd8e6749696206cf05acdbdca71a1b436d8e72 \ + --hash=sha256:7641bb8895e77f921102f72833904dcd9901df5d6d72a2ab8f31d04b7e51e4e7 \ + --hash=sha256:7804338df6fcc08105c7745f1502ba68d900f45fd770d5bdd5288ddccb8a42d8 \ + --hash=sha256:80d04837f55fc81da168b98de4f4b797ef007fc8a79ab71c6ec9bc4dd662b15b \ + --hash=sha256:813c0e0132266c08eb87469a642cb30aaff57c5f426255419572aaeceeaa7bf4 \ + --hash=sha256:82b271f5137d07749f7bf32f70b17ab6eaabedd297e75dce75081a24f76eb545 \ + --hash=sha256:84c018e49c3bf790f9c2771c45e9313a08c2c2a6342b162cd650258b57817706 \ + --hash=sha256:8751d2787c9131302398b11e6c8068053dcb55d5a8964e114b6e196cf16cb366 \ + --hash=sha256:8778f0c7a52e56f75d12dae53ae320fae900a8b9b4164b981b9c5ce059cd1fcb \ + 
--hash=sha256:87fad7d9ba98c86bcb41b2dc8dbb326619be2562af1f8ff50776a39e55721c5a \ + --hash=sha256:8d828b6667a32a728a1ad1d93957cdf37489c57b97ae6c4de2860fa749b8fc1e \ + --hash=sha256:8e385e4267ab76874ae30db04c627faaaf0b509e1ccc11a95b3fc3e83f855c00 \ + --hash=sha256:92a0a01ead5e668468e952e4238cccd7c537364eb7d851ab144ab6627dbbe12f \ + --hash=sha256:94e1885b270625a9a828c9793b4d52a64445299baa1fea5a173bf1d3dd9a1a5a \ + --hash=sha256:a180c5e59792af262bf263b21a3c49353f25945d8d9f70628e73de370d55e1e1 \ + --hash=sha256:a277ab8928b9f299723bc1a2dabb1265911b1a76341f90a510368ca44ad9ab66 \ + --hash=sha256:a5fe03b42827c13cdccd08e6c0247b6a6d4b5e3cdc53fd1749f5896adcdc2356 \ + --hash=sha256:a6c5863edfbe888d9eff9c8b8087354e27618d9da76425c119293f11712a6319 \ + --hash=sha256:a89c23ef8d2c6b27fd200a42aa4ac72786e7c60d40efdc76e6011260b6e949c4 \ + --hash=sha256:adb2597b428735679446b46c8badf467b4ca5f5056aae4d51a19f9570301b1ad \ + --hash=sha256:ae196f021b5e7c78e918242d217db021ed2a6ace2bc6ae94c0fc596221c7f58d \ + --hash=sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5 \ + --hash=sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7 \ + --hash=sha256:aef65cd602a6d0e0ff6f9930fcb1c8fec60dd2cfcb6facaf4bdb0e5873042db0 \ + --hash=sha256:af21eb4409a119e365397b2adbaca4c9ccab56543a65d5dbd9f920d6ac29f686 \ + --hash=sha256:b14b2d9dac08e28bb8046a1a0434b1750eb221c8f5b87a68f4fa11a6f97b5e34 \ + --hash=sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49 \ + --hash=sha256:bb8cc7534f51d9a017b93e3e85b260924f909601c3df002bcdb58ddb4dc41a5c \ + --hash=sha256:bc17a677b21b3502a21f66a8cc64f5bfad4df8a0b8434d661666f8ce90ac3af1 \ + --hash=sha256:bd6c2a1c7573c64738d716488d2cdd3c00e340e4835707d8fdb8dc1a66ef164e \ + --hash=sha256:bd9b23791fe793e4968dba0c447e12f78e425c59fc0e3b97f6450f4781f3ee60 \ + --hash=sha256:c03a41a8784091e67a39648f70c5f97b5b6a37f216896d44d2cdcb82615339a0 \ + --hash=sha256:c0f081d69a6e58272819b70288d3221a6ee64b98df852631c80f293514d3b274 \ + 
--hash=sha256:c35abb8bfff0185efac5878da64c45dafd2b37fb0383add1be155a763c1f083d \ + --hash=sha256:c36c333c39be2dbca264d7803333c896ab8fa7d4d6f0ab7edb7dfd7aea6e98c0 \ + --hash=sha256:c45e9440fb78f8ddabcf714b68f936737a121355bf59f3907f4e17721b9d1aae \ + --hash=sha256:c593052c465475e64bbfe5dbd81680f64a67fdc752c56d7a0ae205dc8aeefe0f \ + --hash=sha256:cdd68a1fb318e290a2077696b7eb7a21a49163c455979c639bf5a5dcdc46617d \ + --hash=sha256:ce3412fbe1e31eb81ea42f4169ed94861c56e643189e1e75f0041f3fe7020abe \ + --hash=sha256:cf1493cd8607bec4d8a7b9b004e699fcf8f9103a9284cc94962cb73d20f9d4a3 \ + --hash=sha256:cf29836da5119f3c8a8a70667b0ef5fdca3bb12f80fd06487cfa575b3909b393 \ + --hash=sha256:d4a48e5b3c2a489fae013b7589308a40146ee081f6f509e047e0e096084ceca1 \ + --hash=sha256:d560742f3c0d62afaccf9f41fe485ed69bd7661a241f86a3ef0f0fb8b1a397af \ + --hash=sha256:d6038d37043bced98a66e68d3aa2b6a35505dc01328cd65217cefe82f25def44 \ + --hash=sha256:d61f00a0869d77422d9b2aba989e2d24afa6ffd552af442e0e58de4f35ea6d00 \ + --hash=sha256:d635aab80466bc95771bb78d5370e74d36d1fe31467b6b29b8b57b2a3cd7d22c \ + --hash=sha256:dca4bbc466a95ba9c0234ef56d7dd9509f63da22274589ebd4ed7f1f4d4c54e3 \ + --hash=sha256:dd915403e231e6b1809fe9b6d9fc55cf8fb5e02765ac625d9cd623342a7905d7 \ + --hash=sha256:e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd \ + --hash=sha256:e060d01aec0a910bdccb8be71faf34e7799ce36950f8294c8bf612cba65a2c9e \ + --hash=sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b \ + --hash=sha256:e17b8d5d6a8c47c85e68ca8379def1303fd360c3e22093a807cd34a71cd082b8 \ + --hash=sha256:e5f4d355f0a2b1a31bc3edec6795b46324349c9cb25eed068049e4f472fb4259 \ + --hash=sha256:e712b419df8ba5e42b226c510472b37bd57b38e897d3eca5e8cfd410a29fa859 \ + --hash=sha256:e74327fb75de8986940def6e8dee4f127cc9752bee7355bb323cc5b2659b6d46 \ + --hash=sha256:e80c8378d8f3d83cd3164da1ad2df9e37a666cdde7b1cb2298ed0b558064be30 \ + --hash=sha256:e8ac484bf18ce6975760921bb6148041faa8fef0547200386ea0b52b5d27bf7b \ + 
--hash=sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46 \ + --hash=sha256:ed065083d0898c9d5b4bbec7b026fd755ff7454e6e8b73a67f8c744b13986e24 \ + --hash=sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a \ + --hash=sha256:effc3f449787117233702311a1b7d8f59cba9ced946ba727bdc329ec69028e24 \ + --hash=sha256:f22dec1690b584cea26fade98b2435c132c1b5f68e39f5a0b7627cd7ae31f1dc \ + --hash=sha256:f495a1652cf3fbab2eb0639776dad966c2fb874d79d87ca07f9d5f059b8bd215 \ + --hash=sha256:f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063 \ + --hash=sha256:f59099f9b66f0d7145115e6f80dd8b1d847176df89b234a5a6b3f00437aa0832 \ + --hash=sha256:f59ad4c0e8f6bba240a9bb85504faa1ab438237199d4cce5f622761507b8f6a6 \ + --hash=sha256:fbccdc05410c9ee21bbf16a35f4c1d16123dcdeb8a1d38f33654fa21d0234f79 \ + --hash=sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464 + # via requests +cryptography==46.0.7 \ + --hash=sha256:04959522f938493042d595a736e7dbdff6eb6cc2339c11465b3ff89343b65f65 \ + --hash=sha256:128c5edfe5e5938b86b03941e94fac9ee793a94452ad1365c9fc3f4f62216832 \ + --hash=sha256:1d25aee46d0c6f1a501adcddb2d2fee4b979381346a78558ed13e50aa8a59067 \ + --hash=sha256:24402210aa54baae71d99441d15bb5a1919c195398a87b563df84468160a65de \ + --hash=sha256:258514877e15963bd43b558917bc9f54cf7cf866c38aa576ebf47a77ddbc43a4 \ + --hash=sha256:35719dc79d4730d30f1c2b6474bd6acda36ae2dfae1e3c16f2051f215df33ce0 \ + --hash=sha256:397655da831414d165029da9bc483bed2fe0e75dde6a1523ec2fe63f3c46046b \ + --hash=sha256:3986ac1dee6def53797289999eabe84798ad7817f3e97779b5061a95b0ee4968 \ + --hash=sha256:420b1e4109cc95f0e5700eed79908cef9268265c773d3a66f7af1eef53d409ef \ + --hash=sha256:42a1e5f98abb6391717978baf9f90dc28a743b7d9be7f0751a6f56a75d14065b \ + --hash=sha256:462ad5cb1c148a22b2e3bcc5ad52504dff325d17daf5df8d88c17dda1f75f2a4 \ + --hash=sha256:506c4ff91eff4f82bdac7633318a526b1d1309fc07ca76a3ad182cb5b686d6d3 \ + 
--hash=sha256:5ad9ef796328c5e3c4ceed237a183f5d41d21150f972455a9d926593a1dcb308 \ + --hash=sha256:5d1c02a14ceb9148cc7816249f64f623fbfee39e8c03b3650d842ad3f34d637e \ + --hash=sha256:5e51be372b26ef4ba3de3c167cd3d1022934bc838ae9eaad7e644986d2a3d163 \ + --hash=sha256:60627cf07e0d9274338521205899337c5d18249db56865f943cbe753aa96f40f \ + --hash=sha256:65814c60f8cc400c63131584e3e1fad01235edba2614b61fbfbfa954082db0ee \ + --hash=sha256:73510b83623e080a2c35c62c15298096e2a5dc8d51c3b4e1740211839d0dea77 \ + --hash=sha256:7bbc6ccf49d05ac8f7d7b5e2e2c33830d4fe2061def88210a126d130d7f71a85 \ + --hash=sha256:80406c3065e2c55d7f49a9550fe0c49b3f12e5bfff5dedb727e319e1afb9bf99 \ + --hash=sha256:84d4cced91f0f159a7ddacad249cc077e63195c36aac40b4150e7a57e84fffe7 \ + --hash=sha256:8a469028a86f12eb7d2fe97162d0634026d92a21f3ae0ac87ed1c4a447886c83 \ + --hash=sha256:91bbcb08347344f810cbe49065914fe048949648f6bd5c2519f34619142bbe85 \ + --hash=sha256:935ce7e3cfdb53e3536119a542b839bb94ec1ad081013e9ab9b7cfd478b05006 \ + --hash=sha256:9694078c5d44c157ef3162e3bf3946510b857df5a3955458381d1c7cfc143ddb \ + --hash=sha256:a1529d614f44b863a7b480c6d000fe93b59acee9c82ffa027cfadc77521a9f5e \ + --hash=sha256:abad9dac36cbf55de6eb49badd4016806b3165d396f64925bf2999bcb67837ba \ + --hash=sha256:b36a4695e29fe69215d75960b22577197aca3f7a25b9cf9d165dcfe9d80bc325 \ + --hash=sha256:b7b412817be92117ec5ed95f880defe9cf18a832e8cafacf0a22337dc1981b4d \ + --hash=sha256:c5b1ccd1239f48b7151a65bc6dd54bcfcc15e028c8ac126d3fada09db0e07ef1 \ + --hash=sha256:cbd5fb06b62bd0721e1170273d3f4d5a277044c47ca27ee257025146c34cbdd1 \ + --hash=sha256:cdf1a610ef82abb396451862739e3fc93b071c844399e15b90726ef7470eeaf2 \ + --hash=sha256:cdfbe22376065ffcf8be74dc9a909f032df19bc58a699456a21712d6e5eabfd0 \ + --hash=sha256:d02c738dacda7dc2a74d1b2b3177042009d5cab7c7079db74afc19e56ca1b455 \ + --hash=sha256:d151173275e1728cf7839aaa80c34fe550c04ddb27b34f48c232193df8db5842 \ + --hash=sha256:d23c8ca48e44ee015cd0a54aeccdf9f09004eba9fc96f38c911011d9ff1bd457 \ + 
--hash=sha256:d3b99c535a9de0adced13d159c5a9cf65c325601aa30f4be08afd680643e9c15 \ + --hash=sha256:d5f7520159cd9c2154eb61eb67548ca05c5774d39e9c2c4339fd793fe7d097b2 \ + --hash=sha256:db0f493b9181c7820c8134437eb8b0b4792085d37dbb24da050476ccb664e59c \ + --hash=sha256:e06acf3c99be55aa3b516397fe42f5855597f430add9c17fa46bf2e0fb34c9bb \ + --hash=sha256:e4cfd68c5f3e0bfdad0d38e023239b96a2fe84146481852dffbcca442c245aa5 \ + --hash=sha256:ea42cbe97209df307fdc3b155f1b6fa2577c0defa8f1f7d3be7d31d189108ad4 \ + --hash=sha256:ebd6daf519b9f189f85c479427bbd6e9c9037862cf8fe89ee35503bd209ed902 \ + --hash=sha256:f247c8c1a1fb45e12586afbb436ef21ff1e80670b2861a90353d9b025583d246 \ + --hash=sha256:fbfd0e5f273877695cb93baf14b185f4878128b250cc9f8e617ea0c025dfb022 \ + --hash=sha256:fc9ab8856ae6cf7c9358430e49b368f3108f050031442eaeb6b9d87e4dcf4e4f \ + --hash=sha256:fcd8eac50d9138c1d7fc53a653ba60a2bee81a505f9f8850b6b2888555a45d0e \ + --hash=sha256:fdd1736fed309b4300346f88f74cd120c27c56852c3838cab416e7a166f67298 \ + --hash=sha256:ffca7aa1d00cf7d6469b988c581598f2259e46215e0140af408966a24cf086ce + # via google-auth +google-api-core==2.30.3 \ + --hash=sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8 \ + --hash=sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b + # via google-cloud-monitoring +google-auth==2.49.2 \ + --hash=sha256:c1ae38500e73065dcae57355adb6278cf8b5c8e391994ae9cbadbcb9631ab409 \ + --hash=sha256:c2720924dfc82dedb962c9f52cabb2ab16714fd0a6a707e40561d217574ed6d5 + # via + # google-api-core + # google-cloud-monitoring +google-cloud-monitoring==2.30.0 \ + --hash=sha256:2729f3b88a4798b7757b1d9d31b6cb562bb3544e8173765e4e5cd44d8685b1ed \ + --hash=sha256:a9530aa9aa246c490810dfa7be32d67e8340d19108acc99cbc02d1ed494fba76 + # via -r requirements.in +googleapis-common-protos==1.74.0 \ + --hash=sha256:57971e4eeeba6aad1163c1f0fc88543f965bb49129b8bb55b2b7b26ecab084f1 \ + --hash=sha256:702216f78610bb510e3f12ac3cafd281b7ac45cc5d86e90ad87e4d301a3426b5 + # via + # 
google-api-core + # grpcio-status +grpcio==1.80.0 \ + --hash=sha256:00168469238b022500e486c1c33916acf2f2a9b2c022202cf8a1885d2e3073c1 \ + --hash=sha256:02e64bb0bb2da14d947a49e6f120a75e947250aebe65f9629b62bb1f5c14e6e9 \ + --hash=sha256:05d55e1798756282cddd52d56c896b3e7d673e3a8798c2f1cd05ba249a3bb4de \ + --hash=sha256:09e5e478b3d14afd23f12e49e8b44c8684ac3c5f08561c43a5b9691c54d136ab \ + --hash=sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921 \ + --hash=sha256:1b97cd29a8eda100b559b455331c487a80915b6ea6bd91cf3e89836c4ee8d957 \ + --hash=sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f \ + --hash=sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257 \ + --hash=sha256:2bea16af2750fd0a899bf1abd9022244418b55d1f37da2202249ba4ba673838d \ + --hash=sha256:2dcc70e9f0ba987526e8e8603a610fb4f460e42899e74e7a518bf3c68fe1bf05 \ + --hash=sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd \ + --hash=sha256:31b9ac4ad1aa28ffee5503821fafd09e4da0a261ce1c1281c6c8da0423c83b6e \ + --hash=sha256:33eb763f18f006dc7fee1e69831d38d23f5eccd15b2e0f92a13ee1d9242e5e02 \ + --hash=sha256:367ce30ba67d05e0592470428f0ec1c31714cab9ef19b8f2e37be1f4c7d32fae \ + --hash=sha256:3b01e1f5464c583d2f567b2e46ff0d516ef979978f72091fd81f5ab7fa6e2e7f \ + --hash=sha256:3cb8130ba457d2aa09fa6b7c3ed6b6e4e6a2685fce63cb803d479576c4d80e21 \ + --hash=sha256:3d4147a97c8344d065d01bbf8b6acec2cf86fb0400d40696c8bdad34a64ffc0e \ + --hash=sha256:43168871f170d1e4ed16ae03d10cd21efa29f190e710a624cee7e5ae07da6f4f \ + --hash=sha256:448c884b668b868562b1bda833c5fce6272d26e1926ec46747cda05741d302c1 \ + --hash=sha256:4560cf0e86514595dbbd330cd65b7afad4b5c4b8c4905c041cfffa138d45e6fd \ + --hash=sha256:46c2390b59d67f84e882694d489f5b45707c657832d7934859ceb8c33f467069 \ + --hash=sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411 \ + --hash=sha256:4ed39fbdcf9b87370f6e8df4e39ca7b38b3e5e9d1b0013c7b6be9639d6578d14 \ + 
--hash=sha256:50a9871536d71c4fba24ee856abc03a87764570f0c457dd8db0b4018f379fed9 \ + --hash=sha256:51b4a7189b0bef2aa30adce3c78f09c83526cf3dddb24c6a96555e3b97340440 \ + --hash=sha256:52d143637e3872633fc7dd7c3c6a1c84e396b359f3a72e215f8bf69fd82084fc \ + --hash=sha256:5c07e82e822e1161354e32da2662f741a4944ea955f9f580ec8fb409dd6f6060 \ + --hash=sha256:627fb7312171cdc52828bd6fac8d7028ff2a64b89f1957b6f3416caa2218d141 \ + --hash=sha256:68e5851ac4b9afe07e7f84483803ad167852570d65326b34d54ca560bfa53fb6 \ + --hash=sha256:7b641fc3f1dc647bfd80bd713addc68f6d145956f64677e56d9ebafc0bd72388 \ + --hash=sha256:8502122a3cc1714038e39a0b071acb1207ca7844208d5ea0d091317555ee7106 \ + --hash=sha256:873ff5d17d68992ef6605330127425d2fc4e77e612fa3c3e0ed4e668685e3140 \ + --hash=sha256:886457a7768e408cdce226ad1ca67d2958917d306523a0e21e1a2fdaa75c9c9c \ + --hash=sha256:8ac393b58aa16991a2f1144ec578084d544038c12242da3a215966b512904d0f \ + --hash=sha256:8eb613f02d34721f1acf3626dfdb3545bd3c8505b0e52bf8b5710a28d02e8aa7 \ + --hash=sha256:92d787312e613754d4d8b9ca6d3297e69994a7912a32fa38c4c4e01c272974b0 \ + --hash=sha256:93b6f823810720912fd131f561f91f5fed0fda372b6b7028a2681b8194d5d294 \ + --hash=sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f \ + --hash=sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff \ + --hash=sha256:a1dc80fe55685b4a543555e6eef975303b36c8db1023b1599b094b92aa77965f \ + --hash=sha256:a361c20ec1ccd3c3953d20fb6d7b4125093bdd10dff44c5e2bbb39e58917cedc \ + --hash=sha256:a72d84ad0514db063e21887fbacd1fd7acb4d494a564cae22227cd45c7fbf199 \ + --hash=sha256:aacdfb4ed3eb919ca997504d27e03d5dba403c85130b8ed450308590a738f7a4 \ + --hash=sha256:ba0915d51fd4ced2db5ff719f84e270afe0e2d4c45a7bdb1e8d036e4502928c2 \ + --hash=sha256:ba0db34f7e1d803a878284cd70e4c63cb6ae2510ba51937bf8f45ba997cefcf7 \ + --hash=sha256:bac1d573dfa84ce59a5547073e28fa7326d53352adda6912e362da0b917fcef4 \ + --hash=sha256:c51bf8ac4575af2e0678bccfb07e47321fc7acb5049b4482832c5c195e04e13a \ + 
--hash=sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0 \ + --hash=sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193 \ + --hash=sha256:ce1794f4ea6cc3ca29463f42d665c32ba1b964b48958a66497917fe9069f26e6 \ + --hash=sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de \ + --hash=sha256:d8e11f167935b3eb089ac9038e1a063e6d7dbe995c0bb4a661e614583352e76f \ + --hash=sha256:dc053420fc75749c961e2a4c906398d7c15725d36ccc04ae6d16093167223b58 \ + --hash=sha256:deb10a1528473c11f72a0939eed36d83e847d7cbb63e8cc5611fb7a912d38614 \ + --hash=sha256:dfab85db094068ff42e2a3563f60ab3dddcc9d6488a35abf0132daec13209c8a \ + --hash=sha256:e172cf795a3ba5246d3529e4d34c53db70e888fa582a8ffebd2e6e48bc0cba50 \ + --hash=sha256:e9e408fc016dffd20661f0126c53d8a31c2821b5c13c5d67a0f5ed5de93319ad \ + --hash=sha256:ec0a592e926071b4abad50c1495cd0d0d513324b3ff5e7267067c33ba27506e4 \ + --hash=sha256:f14b618fc30de822681ee986cfdcc2d9327229dc4c98aed16896761cacd468b9 \ + --hash=sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2 \ + --hash=sha256:f7691a6788ad9196872f95716df5bc643ebba13c97140b7a5ee5c8e75d1dea81 + # via + # google-api-core + # google-cloud-monitoring + # grpcio-status +grpcio-status==1.80.0 \ + --hash=sha256:4b56990363af50dbf2c2ebb80f1967185c07d87aa25aa2bea45ddb75fc181dbe \ + --hash=sha256:df73802a4c89a3ea88aa2aff971e886fccce162bc2e6511408b3d67a144381cd + # via google-api-core +idna==3.11 \ + --hash=sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea \ + --hash=sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902 + # via requests +proto-plus==1.27.2 \ + --hash=sha256:6432f75893d3b9e70b9c412f1d2f03f65b11fb164b793d14ae2ca01821d22718 \ + --hash=sha256:b2adde53adadf75737c44d3dcb0104fde65250dfc83ad59168b4aa3e574b6a24 + # via + # google-api-core + # google-cloud-monitoring +protobuf==6.33.6 \ + --hash=sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326 \ + 
--hash=sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901 \ + --hash=sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3 \ + --hash=sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a \ + --hash=sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135 \ + --hash=sha256:bd56799fb262994b2c2faa1799693c95cc2e22c62f56fb43af311cae45d26f0e \ + --hash=sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3 \ + --hash=sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2 \ + --hash=sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593 \ + --hash=sha256:f443a394af5ed23672bc6c486be138628fbe5c651ccbc536873d7da23d1868cf + # via + # google-api-core + # google-cloud-monitoring + # googleapis-common-protos + # grpcio-status + # proto-plus +pyasn1==0.6.3 \ + --hash=sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf \ + --hash=sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde + # via pyasn1-modules +pyasn1-modules==0.4.2 \ + --hash=sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a \ + --hash=sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6 + # via google-auth +pycparser==3.0 \ + --hash=sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29 \ + --hash=sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992 + # via cffi +requests==2.33.1 \ + --hash=sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517 \ + --hash=sha256:4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a + # via google-api-core +typing-extensions==4.15.0 \ + --hash=sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466 \ + --hash=sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548 + # via grpcio +urllib3==2.6.3 \ + --hash=sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed \ + 
// Copyright 2026 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * k6 load test for diffusion image-generation servers.
 *
 * Environment variables:
 *   - SCENARIOS_JSON (required): JSON array of scenario objects. Every object
 *     must define `batch`, `vus`, `width`, `height` and `steps`; `duration`
 *     (k6 duration string, default "10m") and `model_id` are optional.
 *   - TARGET_URL: base URL of the inference server.
 *   - ACCELERATOR_NAME: used for the `accelerator` tag and scenario names.
 *   - INFERENCE_SERVER_TYPE: "sglang" switches to the
 *     `/v1/images/generations` request format; any other value uses the
 *     plain `TARGET_URL` request format.
 *
 * Schedule: a 5-minute warmup built from the first scenario, followed by each
 * configured scenario in order, with a 30-second cool-down added per scenario
 * index.
 */

import http from "k6/http";
import { check, sleep } from "k6";
import exec from "k6/execution";

const TARGET_URL = __ENV.TARGET_URL || "http://localhost:8000/generate";
const ACCELERATOR_NAME = __ENV.ACCELERATOR_NAME || "unknown";
const INFERENCE_SERVER_TYPE = __ENV.INFERENCE_SERVER_TYPE || "unknown";

// Extract hostname for deployment_name tag
const urlMatch = TARGET_URL.match(/https?:\/\/([^\/:]+)/);
const DEPLOYMENT_NAME = urlMatch ? urlMatch[1] : "unknown";

const SEED = 42;
const WARMUP_DURATION = "5m";
const WARMUP_SECONDS = 300; // Must stay in sync with WARMUP_DURATION.
const COOL_DOWN_SECONDS = 30;
const DEFAULT_SCENARIO_DURATION = "10m";
const REQUIRED_SCENARIO_FIELDS = ["batch", "vus", "width", "height", "steps"];
const DURATION_UNIT_SECONDS = { d: 86400, h: 3600, m: 60, s: 1, ms: 0.001 };

// Loop-invariant: accelerator name normalized for use inside scenario names.
const ACCELERATOR_TAG = ACCELERATOR_NAME.toLowerCase().replace(/_/g, "-");

/**
 * Converts a k6 duration into whole seconds (rounded up).
 *
 * Accepts compound duration strings such as "10m", "300s", "1m30s", "1h" or
 * "500ms". Bare numbers are read as milliseconds, which is how k6 itself
 * interprets numeric durations.
 *
 * @param {string|number} duration - Duration exactly as it is handed to k6.
 * @param {number} index - Scenario index, used only for the error message.
 * @returns {number} Duration in whole seconds (always >= 1).
 * @throws {Error} If the value cannot be interpreted as a positive duration.
 */
function parseDurationSeconds(duration, index) {
  if (typeof duration === "number" && Number.isFinite(duration) && duration > 0) {
    return Math.ceil(duration / 1000);
  }

  if (typeof duration === "string") {
    // "ms" is listed first so that "500ms" is not read as "500m" + "s".
    const component = /(\d+(?:\.\d+)?)(ms|d|h|m|s)/g;
    let totalSeconds = 0;
    let consumed = 0;
    let match = component.exec(duration);
    // Components must be contiguous; any gap means unsupported characters.
    while (match !== null && match.index === consumed) {
      totalSeconds += parseFloat(match[1]) * DURATION_UNIT_SECONDS[match[2]];
      consumed = component.lastIndex;
      match = component.exec(duration);
    }
    if (consumed === duration.length && totalSeconds > 0) {
      return Math.ceil(totalSeconds);
    }
  }

  throw new Error(
    `Scenario ${index} has an unsupported duration: ${JSON.stringify(duration)}. ` +
      `Use a k6 duration string such as "10m", "300s" or "1m30s".`,
  );
}

/**
 * Parses and validates SCENARIOS_JSON.
 *
 * @param {string|undefined} raw - Raw value of the SCENARIOS_JSON variable.
 * @returns {Array<object>} Non-empty array of validated scenario objects.
 * @throws {Error} If the variable is missing, is not valid JSON, is not a
 *   non-empty array, or any scenario lacks a required field.
 */
function loadScenarios(raw) {
  if (!raw) {
    throw new Error("SCENARIOS_JSON environment variable is required.");
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    throw new Error(`Failed to parse SCENARIOS_JSON: ${e.message}`);
  }

  if (!Array.isArray(parsed) || parsed.length === 0) {
    throw new Error(
      "SCENARIOS_JSON must be a non-empty JSON array of scenario objects.",
    );
  }

  parsed.forEach((scenario, index) => {
    const isObject = scenario !== null && typeof scenario === "object";
    const missing = isObject
      ? REQUIRED_SCENARIO_FIELDS.filter((field) => !scenario[field])
      : REQUIRED_SCENARIO_FIELDS;
    if (missing.length > 0) {
      throw new Error(
        `Scenario ${index} is missing required fields: ${missing.join(", ")}.`,
      );
    }
  });

  return parsed;
}

/** Builds the per-scenario k6 tags attached to every emitted metric. */
function buildScenarioTags(name, s) {
  return {
    scenario: name,
    batch_size: s.batch.toString(),
    vus: s.vus.toString(),
    num_inference_steps: s.steps.toString(),
    width: s.width.toString(),
    height: s.height.toString(),
    inference_server: INFERENCE_SERVER_TYPE,
  };
}

/** Builds the request parameters that generate() looks up by scenario name. */
function buildScenarioConfig(s, defaultModelId) {
  return {
    batch: s.batch,
    vus: s.vus,
    steps: s.steps,
    width: s.width,
    height: s.height,
    model_id: s.model_id || defaultModelId,
  };
}

// Parse dynamic scenarios
const configScenarios = loadScenarios(__ENV.SCENARIOS_JSON);
const warmupScenario = configScenarios[0];
const MODEL_ID = warmupScenario.model_id || "unknown";

console.log(
  `Loaded ${configScenarios.length} benchmark scenarios for model ${MODEL_ID}: ${JSON.stringify(configScenarios)}`,
);

// Lookup table for scenario configurations
const SCENARIO_CONFIGS = {
  warmup: buildScenarioConfig(warmupScenario, MODEL_ID),
};

// Build k6 scenarios object
const scenarios = {
  warmup: {
    executor: "constant-vus",
    vus: warmupScenario.vus,
    duration: WARMUP_DURATION,
    exec: "generate",
    tags: buildScenarioTags("warmup", warmupScenario),
  },
};

// Running total of the warmup plus all previously scheduled scenario durations.
let currentTimeOffsetSeconds = WARMUP_SECONDS;

configScenarios.forEach((s, index) => {
  const scenarioName = `bench_${ACCELERATOR_TAG}_b${s.batch}_v${s.vus}_s${s.steps}_r${s.width}x${s.height}`;

  // Identical parameters produce identical names; without this guard the later
  // entry would silently replace the earlier one and leave its time slot idle.
  if (scenarioName in scenarios) {
    throw new Error(
      `Scenario ${index} duplicates an earlier scenario (${scenarioName}).`,
    );
  }

  const duration = s.duration || DEFAULT_SCENARIO_DURATION;
  const durationSeconds = parseDurationSeconds(duration, index);
  const startTime = currentTimeOffsetSeconds + index * COOL_DOWN_SECONDS;

  scenarios[scenarioName] = {
    executor: "constant-vus",
    vus: s.vus,
    duration: duration,
    startTime: `${startTime}s`,
    exec: "generate",
    tags: buildScenarioTags(scenarioName, s),
  };

  SCENARIO_CONFIGS[scenarioName] = buildScenarioConfig(s, MODEL_ID);

  currentTimeOffsetSeconds += durationSeconds;
});

export const options = {
  tags: {
    model: MODEL_ID,
    accelerator: ACCELERATOR_NAME,
    seed: SEED.toString(),
    target_url: TARGET_URL,
    deployment_name: DEPLOYMENT_NAME,
  },
  discardResponseBodies: false, // Need body for validation and error reporting
  scenarios: scenarios,
  thresholds: {
    http_req_failed: ["rate<0.05"],
  },
};

const params = {
  headers: {
    "Content-Type": "application/json",
  },
  timeout: "120s",
};

/** Logs the test plan once before any scenario starts. */
export function setup() {
  console.log(`Starting dynamic k6 load test against: ${TARGET_URL}`);
  console.log(
    `Running ${Object.keys(scenarios).length} scenarios (including warmup)`,
  );
}

// Per-VU state: k6 runs each VU in its own JavaScript runtime, so these
// module-level variables are not shared between VUs.
let lastScenario = "";
let consecutiveFailures = 0;
let abortCurrentScenario = false;
let requestLogged = false;

/**
 * Scenario entry point: sends one image-generation request built from the
 * configuration of the scenario that is currently executing.
 *
 * After three consecutive failed requests the VU stops sending requests for
 * the remainder of that scenario and only sleeps.
 *
 * @throws {Error} If the running scenario has no entry in SCENARIO_CONFIGS.
 */
export function generate() {
  const scenarioName = exec.scenario.name;
  const config = SCENARIO_CONFIGS[scenarioName];

  if (!config) {
    throw new Error(`No configuration found for scenario: ${scenarioName}`);
  }

  // First iteration of a new scenario on this VU: reset the failure tracking.
  if (scenarioName !== lastScenario) {
    console.log(
      `VU ${exec.vu.idInTest} starting scenario: ${scenarioName} (Batch: ${config.batch}, VUs: ${config.vus})`,
    );
    lastScenario = scenarioName;
    consecutiveFailures = 0;
    abortCurrentScenario = false;
    requestLogged = false;
  }

  if (abortCurrentScenario) {
    sleep(1);
    return;
  }

  let payload;
  let endpoint = TARGET_URL;

  if (INFERENCE_SERVER_TYPE === "sglang") {
    // Strip trailing slashes so the joined path never contains "//".
    endpoint = `${TARGET_URL.replace(/\/+$/, "")}/v1/images/generations`;
    payload = JSON.stringify({
      model: `/gcs/${config.model_id}`,
      prompt:
        "A highly detailed, cinematic photograph of a futuristic city skyline at sunset, neon lights, 8k resolution, photorealistic",
      n: config.batch,
      size: `${config.width}x${config.height}`,
      num_inference_steps: config.steps,
      seed: SEED,
      response_format: "b64_json",
    });
  } else {
    payload = JSON.stringify({
      prompt:
        "A highly detailed, cinematic photograph of a futuristic city skyline at sunset, neon lights, 8k resolution, photorealistic",
      width: config.width,
      height: config.height,
      num_inference_steps: config.steps,
      seed: SEED,
      batch_size: config.batch,
    });
  }

  // Log the request shape once per scenario (per VU) rather than on every
  // successful iteration, which would flood the log during a load test.
  if (!requestLogged) {
    console.log(`Endpoint: ${endpoint}`);
    console.log(`Payload: ${payload}`);
    requestLogged = true;
  }

  const res = http.post(endpoint, payload, params);

  const success = check(res, {
    "is status 200": (r) => r.status === 200,
    "has body": (r) => r.body && r.body.length > 0,
  });

  if (!success) {
    consecutiveFailures++;
    // Only the first failure of a streak is logged in full.
    if (consecutiveFailures === 1) {
      console.error(
        `Request failed! Status: ${res.status}. Body: ${res.body || "empty"}`,
      );
    }
    if (consecutiveFailures >= 3) {
      console.error(`Scenario ${scenarioName} aborted due to 3 consecutive failures.`);
      abortCurrentScenario = true;
    }
    sleep(1);
  } else {
    consecutiveFailures = 0;
  }

  sleep(0.01);
}
# Serving image for SGLang with the diffusion (image generation) extra installed.
FROM lmsysorg/sglang:v0.5.9-cu129-amd64-runtime

# Install the "diffusion" extra of the package found in the "python" directory,
# resolved relative to the base image's working directory (presumably the
# SGLang source checkout shipped in the base image; TODO confirm).
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -e "python[diffusion]"

# Documentation only: EXPOSE does not publish the port.
# NOTE(review): 30000 looks like the port the SGLang server listens on; confirm.
EXPOSE 30000

# The image behaves like the `sglang` CLI: arguments given at run time replace
# the default "--help".
ENTRYPOINT [ "sglang" ]
CMD [ "--help" ]
+ id: "Build SGLang-Diffusers image" + name: "docker.io/docker:28.3.3-dind-alpine3.22" + waitFor: ["-"] diff --git a/cspell.json b/cspell.json index d4bfb93ef..096367f74 100644 --- a/cspell.json +++ b/cspell.json @@ -74,6 +74,10 @@ "name": "ray", "path": ".github/workflows/dictionary/ray.txt" }, + { + "name": "sglang", + "path": ".github/workflows/dictionary/sglang.txt" + }, { "name": "shell", "path": ".github/workflows/dictionary/shell.txt" @@ -113,6 +117,7 @@ "nvidia", "python", "ray", + "sglang", "shell", "svg", "terraform", diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-k6-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-k6-hf-model.md new file mode 100644 index 000000000..567cdbb25 --- /dev/null +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-k6-hf-model.md @@ -0,0 +1,423 @@ +# GKE Inference Benchmarking with k6 + +This example sets up a benchmarking job on Google Kubernetes Engine (GKE), +leveraging the Inference reference-architecture for model deployment and the k6 +open-source tool for scalable benchmarking. + +This implementation deploys the k6 as a Kubernetes Job and can be customized +with different load scenarios and datasets. + +This example is built on top of the +[GKE Inference reference architecture](/docs/platforms/gke/base/use-cases/inference-ref-arch/README.md). + +## Before you begin + +1. Deploy and configure the + [GKE Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md). + +### Requirements + +This guide was designed to be run from +[Cloud Shell](https://cloud.google.com/shell) in the Google Cloud console. 
Cloud +Shell has the following tools installed: + +- [Google Cloud Command Line Interface (`gcloud` CLI)](https://cloud.google.com/cli) +- `curl` +- `envsubst` +- `jq` +- `kubectl` +- `sponge` + +## Create and configure the Google Cloud resources + +1. Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` + +1. Update terraform environment variables depending on the accelerators being + used (GPU/TPU/BOTH). Example: + + ```shell + export TF_VAR_enable_gpu=true + export TF_VAR_enable_tpu=false + ``` + +1. Deploy the benchmark infrastructure: + + ```shell + export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + rm -rf "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench/.terraform/terraform.tfstate" && \ + terraform -chdir="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench" init && \ + terraform -chdir="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench" plan -input=false -out=tfplan && \ + terraform -chdir="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench" apply -input=false tfplan && \ + rm "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench/tfplan" + ``` + +## Define the Benchmarking Configuration + +1. Choose a model: + + - [**FLUX.2-klein-4B**](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B): + + ```shell + export HF_MODEL_ID="black-forest-labs/flux.2-klein-4b" + ``` + +1. 
Select an accelerator: + + | Model | l4 | RTX Pro 6000 | + | --------------- | --- | ------------ | + | flux.2-klein-4b | ✅ | ✅ | + + - 1x **NVIDIA Tesla L4 24GB**, running on a `g2-standard-16` Google + Kubernetes Engine node: + + ```shell + export ACCELERATOR_TYPE="l4" + ``` + + - **NVIDIA RTX Pro 6000**: + + - 1x **NVIDIA RTX Pro 6000**: + + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000" + ``` + + - 1/2 (half) of an **NVIDIA RTX Pro 6000**: + + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000-1-2" + ``` + + - 1/4 (one fourth) of an **NVIDIA RTX Pro 6000**: + + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000-1-4" + ``` + + - 1/8 (one eighth) of an **NVIDIA RTX Pro 6000**: + + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000-1-8" + ``` + + Ensure that you have enough quota in your project to provision the selected + accelerator type. For more information about viewing GPU quotas, see + [Allocation quotas: GPU quota](https://cloud.google.com/compute/resource-usage#gpu_quota). + +1. Configure sequential benchmarking scenarios using the `K6_SCENARIOS_JSON` + variable. This variable accepts a JSON array of objects, where each object + represents a specific load configuration to be tested sequentially. + + ```shell + export K6_SCENARIOS_JSON='[{"batch": 1, "vus": 1, "width": 512, "height": 512, "steps": 20}, {"batch": 2, "vus": 4, "width": 512, "height": 512, "steps": 20}, {"batch": 4, "vus": 4, "width": 512, "height": 512, "steps": 20}]' + ``` + + **JSON Attribute Definitions:** + + - **`batch`**: The number of prompts sent in a single inference request. + Larger batch sizes generally increase GPU utilization but also increase + request latency. + - **`vus`**: Virtual Users. The number of concurrent worker threads sending + requests to the server. Increasing VUs helps saturate the GPU by filling + compute gaps between individual requests. + - **`width`**, **`height`**: The resolution, in pixels, of the images to + generate. + - **`steps`**: The number of inference steps to run for each image. + - **`duration`** (Optional): The length of time to run this specific scenario + (e.g., `"10m"`, `"300s"`). Defaults to `10m` if not specified. + + The `batch`, `vus`, `width`, `height`, and `steps` attributes are required: + the k6 script rejects any scenario that omits one of them.
+ + **Execution Workflow:** The k6 script automatically performs a **5-minute + warmup** using the first configured scenario's VU count to ensure the model + is loaded and compiled. Between each subsequent scenario, the script enforces + a **30-second cool-down period** to allow hardware metrics to return to + baseline for clean analysis. + +1. Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` + +### Destructive Testing & Stability + +When testing "frontier" configurations (e.g., Batch Size > 18 on RTX 6000), +there is a high risk of triggering a **CUDA Out-of-Memory (OOM)** error. This +will cause the inference server Pod to crash and restart. + +To ensure benchmark integrity, follow these stability guidelines: + +1. **Order Destructive Tests Last**: Always place scenarios that are likely to + crash the server at the very end of your `K6_SCENARIOS_JSON` array. This + prevents a crash from polluting the results of subsequent tests. +1. **The Re-compilation Penalty**: The inference server uses `torch.compile` + with CUDA Graphs. If the server restarts due to an OOM, it requires **3-5 + minutes** to re-warm and re-compile before it can achieve peak performance. +1. **Recovery Warmups**: If you must run more tests after a potentially + destructive scenario in the same Job, insert a "Recovery Warmup" scenario + immediately after the risky one. A recovery scenario should use a small load + (e.g., `{"batch": 1, "vus": 1, "duration": "5m"}`) to give the server time to + restart and re-compile while k6 records the errors during the transition. + +### Tested Configurations Summary + +The following table illustrates the configurations that we tested serving +`flux.2-klein-4b`, and which ones run the benchmark suite to completion, +assuming no other load on the inference server. 
+ +| Accelerator | Backend | Resolution | Batch Size | VUs | Steps | Status | +| -------------------- | ------- | ---------- | ---------- | --- | ----- | -------- | +| **NVIDIA L4** | SGLang | 1024x1024 | 1 | 1-4 | 20 | ✅ | +| **NVIDIA L4** | SGLang | 1024x1024 | 2+ | 1 | 20 | ❌ (OOM) | +| **NVIDIA L4** | SGLang | 768x768 | 1 | 1 | 20 | ✅ | +| **NVIDIA L4** | SGLang | 512x512 | 1-4 | 1-4 | 10-20 | ✅ | +| **NVIDIA L4 x2** | SGLang | 1024x1024 | 1 | 1-4 | 20 | ✅ | +| **NVIDIA L4 x2** | SGLang | 1024x1024 | 2+ | 1 | 20 | ❌ (OOM) | +| **NVIDIA L4 x2** | SGLang | 512x512 | 1 | 1 | 20 | ✅ | +| **NVIDIA L4 x4** | SGLang | 1024x1024 | 1 | 1-2 | 20 | ✅ | +| **NVIDIA L4 x4** | SGLang | 1024x1024 | 2+ | 1 | 20 | ❌ (OOM) | +| **NVIDIA L4 x4** | SGLang | 512x512 | 1 | 1 | 20 | ✅ | +| **RTX Pro 6000** | SGLang | 1024x1024 | 1-24 | 1-8 | 10-50 | ✅ | +| **RTX Pro 6000** | SGLang | 512x512 | 1-4 | 1-4 | 10-20 | ✅ | +| **RTX Pro 6000** | SGLang | 768x768 | 1 | 1 | 20 | ✅ | +| **RTX Pro 6000 1/2** | SGLang | 1024x1024 | 1-24 | 1-8 | 10-50 | ✅ | +| **RTX Pro 6000 1/2** | SGLang | 512x512 | 1-4 | 1-4 | 10-20 | ✅ | +| **RTX Pro 6000 1/2** | SGLang | 768x768 | 1 | 1 | 20 | ✅ | +| **RTX Pro 6000 1/4** | SGLang | 1024x1024 | 1-3 | 1-4 | 10-20 | ✅ | +| **RTX Pro 6000 1/4** | SGLang | 1024x1024 | 4+ | 1-8 | 10-20 | ❌ (OOM) | +| **RTX Pro 6000 1/4** | SGLang | 512x512 | 1-4 | 1-4 | 10-20 | ✅ | +| **RTX Pro 6000 1/4** | SGLang | 768x768 | 1 | 1 | 20 | ✅ | +| **RTX Pro 6000 1/8** | SGLang | All | N/A | N/A | N/A | ❌ (OOM) | + +## Automated Execution (Recommended) + +You can use the provided orchestrator script to automate the entire lifecycle +(build, deploy, monitor, and analyze) in a single command using the environment +variables defined in the previous steps. 
The script supports running benchmarks +sequentially across multiple accelerators by providing a comma-separated list: + +```shell +# Example: Testing multiple resolutions and batch sizes in one run +export ACCELERATOR_TYPE="l4,rtx-pro-6000" +export K6_SCENARIOS_JSON='[ + {"batch": 1, "vus": 1, "width": 512, "height": 512, "steps": 10}, + {"batch": 4, "vus": 4, "width": 768, "height": 768, "steps": 20}, + {"batch": 16, "vus": 1, "width": 1024, "height": 1024, "steps": 50} +]' + +./platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/run_benchmark.sh \ + --accelerator "${ACCELERATOR_TYPE}" \ + --model "${HF_MODEL_ID}" \ + --scenarios "${K6_SCENARIOS_JSON}" \ + --build +``` + +**Script Flags:** + +- `--accelerator`: The accelerator type(s). Supports a single value (e.g., `l4`) + or a comma-separated list for sequential runs (e.g., `l4,rtx-pro-6000`). +- `--model`: The Hugging Face model ID. +- `--scenarios`: The JSON array of benchmark scenarios. Each scenario MUST + specify `batch`, `vus`, `width`, `height`, and `steps`. +- `--build`: (Optional) Rebuild and push the k6 benchmark container image once + before starting the runs. +- `--sync-only`: (Optional) Skip executing the benchmark workload on the + cluster, and jump straight to downloading the latest results from GCS and + running the data aggregation pipeline. +- `--manual-cost`: (Optional) Override the default on-demand hourly price. + +## Manual Execution + +If you prefer to run the benchmarking steps individually, follow the +instructions below. + +### Build the benchmark container image + +1. Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` + +1. Build the container image for the k6 benchmark.
+ + ```shell + export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + rm -rf ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/.terraform/ terraform.tfstate* && \ + terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark init && \ + terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark plan -input=false -out=tfplan && \ + terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark apply -input=false tfplan && \ + rm ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/tfplan + ``` + + The build usually takes about 1 minute. + +### Deploy the benchmark workload + +1. Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` + +1. Set the benchmark parameters: + + ```shell + export K6_SCENARIOS_JSON='[{"batch": 1, "vus": 1, "width": 1024, "height": 1024, "steps": 20}]' + ``` + +1. Configure the deployment: + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh" + ``` + +1. Deploy the benchmark workload. + + ```shell + kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/${HF_MODEL_NAME}" + ``` + +1. Watch the deployment until it is ready. 
+ + ```shell + watch --color --interval 5 --no-title \ + "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/k6-benchmark-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' + echo '\nLogs(last 10 lines):' + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/k6-benchmark-${HF_MODEL_NAME} --all-containers --tail 10" + ``` + + When the deployment is ready, you will see the following: + + ```text + NAME READY UP-TO-DATE AVAILABLE AGE + k6-benchmark- 1/1 1 1 ### + ``` + + You can press `CTRL`+`c` to terminate the watch. + +### Analyze and Interpret Results + +1. Download the files where the benchmarker collected data points: + + ```shell + gcloud storage cp -r gs://${hub_models_bucket_bench_results_name}/ . + ``` + +1. Set up the environment to run the metrics summarization script: + + ```shell + # Create and activate a Python virtual environment + python3 -m venv .venv + . .venv/bin/activate + + # Install dependencies + pip install --require-hashes -r "${ACP_REPO_DIR}/container-images/cpu/k6-benchmark/requirements.txt" + ``` + +1. Set the hourly cost in USD for the Compute Engine machine you're using to run + the model by initializing the `MODEL_MACHINE_HOURLY_COST_USD` variable. For + example, if a machine costs `1.147208384` USD per hour, you initialize + `MODEL_MACHINE_HOURLY_COST_USD` as follows: + + ```shell + export MODEL_MACHINE_HOURLY_COST_USD="1.147208384" + ``` + + For more information about machine pricing, see: + + - [Accelerator-optimized pricing](https://cloud.google.com/products/compute/pricing/accelerator-optimized) + +1. Run the metrics aggregation and reporting script: + + ```shell + for f in "${hub_models_bucket_bench_results_name}"/*"${HF_MODEL_NAME}"*"${ACCELERATOR_TYPE}"*.jsonl; do + echo "Processing $f..." 
+ python3 "${ACP_REPO_DIR}/container-images/cpu/k6-benchmark/extract_metrics.py" \ + --file "$f" \ + --hourly-cost "${MODEL_MACHINE_HOURLY_COST_USD}" \ + --project-id "${cluster_project_id}" \ + --output-csv k6-benchmark.csv + done + ``` + +1. Review aggregated results for each run by examining the contents of the + aggregated results files: + + ```shell + for f in "${hub_models_bucket_bench_results_name}"/*report.txt; do + echo "Visualizing $f contents:" + cat "$f" + done + ``` + + The output is similar to the following: + + ```text + ================================================== + GKE Performance Consolidated Report + Source: k6-diffusers-flux-2-klein-4b-rtx-pro-6000-20260422T123505Z.jsonl + ================================================== + SUMMARY TABLE: + Scenario Img/s Lat p50 GPU % Cost/1k + ------------------------------------------------------------ + bench_b1_v1 0.3779 2.648 84.95% $3.3074 + bench_b2_v4 0.4497 9.087 99.89% $2.7798 + + -------------------------------------------------- + SCENARIO: bench_b2_v4 (Batch: 2, VUs: 4) + -------------------------------------------------- + UX Metrics: 0.4497 Img/s, 0.2248 RPS, Success: 100.00% + Latency (Req): p50=18.174s, p95=18.196s, p99=30.436s + Latency (Img): p50=9.087s, p95=9.098s, p99=15.218s + Hardware: VRAM=25984.0 MiB (26.43%), Compute=99.89%, Power=591.53 W + Economics: Cost/1k Images = $2.7798 + ``` + +1. Review the aggregated results across all runs: + + ```shell + column -s, -t < k6-benchmark.csv | less -S + ``` + + The output is similar to the following: + + ```text + Source File Deployment Name Target URL Model Accelerator Resolution Inference Steps Batch Size VUs Start Time (UTC) End Time (UTC) Total Time (s) Total Requests Throughput (Images/s) Request Latency p50 (s) Peak VRAM Average Compute Cost per 1k Images ($) + k6-diffusers-flux-2-klein-4b-rtx-pro-6000-20260421.jsonl diffusers-rtx-pro-6000-flux http://... 
flux-2-klein-4b rtx-pro-6000 1024x1024 20 2 4 2026-04-22 12:40:28 2026-04-22 12:50:10 582.66 131 0.4497 18.174 25984.0 MiB 99.89% 2.7798 + ``` + +## Key LLM Performance Metrics + +- **_Time-to-First-Token (TTFT)_**: Latency from request start to the first + output token. Crucial for perceived responsiveness in chatbots. + +- **_Time-per-Output-Token (TPOT)_**: Average time to generate subsequent + tokens. Key measure of generation speed and sustained throughput. + +- **_Total Latency (P95/P99)_**: End-to-end time for the entire response. + Represents the experience of users with the slowest responses. + +- **_Throughput (Tokens/s)_**: Total tokens generated per second under load. + Measure of infrastructure efficiency and capacity. + +## Clean up + +1. Delete the benchmarking job. + + ```shell + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/${HF_MODEL_NAME}" + ``` + +1. Destroy the benchmarking resources.
+ + > Note: This destroys your benchmarking results GCS bucket only if + > it's empty. + + ```shell + export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench && \ + rm -rf .terraform/ terraform.tfstate* && \ + terraform init && + terraform destroy -auto-approve + ``` diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md index 475f60081..a34a47766 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md @@ -8,233 +8,285 @@ This example is built on top of the ## Before you begin -- The - [GKE Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md) - is deployed and configured. +1. Deploy and configure the + [GKE Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md). -- Get access to the models. +1. Get access to the models. - - For FLUX.1-schnell: + - For FLUX.1-schnell: - - Accept the conditions to access its files and content on the Hugging Face - model page. - - [**black-forest-labs/FLUX.1-schnell**](https://huggingface.co/black-forest-labs/FLUX.1-schnell) + - Accept the conditions to access its files and content on the Hugging Face + model page. + - [**black-forest-labs/FLUX.1-schnell**](https://huggingface.co/black-forest-labs/FLUX.1-schnell) -- Ensure your - [Hugging Face Hub **Read** access token](/platforms/gke/base/core/huggingface/initialize/README.md) - has been added to Secret Manager. + - For FLUX.2-klein-4B: The model is not gated, so there's no license check. + +1.
Ensure your + [Hugging Face Hub **Read** access token](/platforms/gke/base/core/huggingface/initialize/README.md) + has been added to Secret Manager. ## Create and configure the Google Cloud resources -- Deploy the online GPU resources. +1. Deploy the online GPU resources. - ```shell - export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" - cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/online_gpu && \ - rm -rf .terraform/ terraform.tfstate* && \ - terraform init && \ - terraform plan -input=false -out=tfplan && \ - terraform apply -input=false tfplan && \ - rm tfplan - ``` + ```shell + export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/online_gpu && \ + rm -rf .terraform/ terraform.tfstate* && \ + terraform init && \ + terraform plan -input=false -out=tfplan && \ + terraform apply -input=false tfplan && \ + rm tfplan + ``` ## Download the model to Cloud Storage -- Choose the model. +1. Choose the model. + + - [**FLUX.1-schnell**](https://huggingface.co/black-forest-labs/FLUX.1-schnell): + + ```shell + export HF_MODEL_ID="black-forest-labs/flux.1-schnell" + ``` - - **FLUX.1-Schnell**: + - [**FLUX.2-klein-4B**](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B): - ```shell - export HF_MODEL_ID="black-forest-labs/flux.1-schnell" - ``` + ```shell + export HF_MODEL_ID="black-forest-labs/flux.2-klein-4b" + ``` -- Source the environment configuration. +1. Source the environment configuration. - ```shell - source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" - ``` + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` -- Configure the model download job. +1. Configure the model download job. 
- ```shell - "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/configure_huggingface.sh" - ``` + ```shell + "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/configure_huggingface.sh" + ``` -- Deploy the model download job. +1. Deploy the model download job. - ```shell - kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface" - ``` + ```shell + kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface" + ``` -- Watch the model download job until it is complete. +1. Watch the model download job until it is complete. - ```shell - watch --color --interval 5 --no-title \ - "kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-hf-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete' - echo '\nLogs(last 10 lines):' - kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-hf-model-to-gcs --all-containers --tail 10" - ``` + ```shell + watch --color --interval 5 --no-title \ + "kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-hf-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete' + echo '\nLogs(last 10 lines):' + kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-hf-model-to-gcs --all-containers --tail 10" + ``` - When the job is complete, you will see the following: + When the job is complete, you will see the following: - ```text - NAME STATUS COMPLETIONS DURATION AGE - XXXXXXXX-hf-model-to-gcs Complete 1/1 ### ### - ``` + ```text + NAME STATUS COMPLETIONS DURATION AGE + XXXXXXXX-hf-model-to-gcs Complete 1/1 ### ### + ``` - You can press `CTRL`+`c` to 
terminate the watch. + You can press `CTRL`+`c` to terminate the watch. -- Delete the model download job. +1. Delete the model download job. - ```shell - kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface" - ``` + ```shell + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface" + ``` ## Build the container image -- Source the environment configuration. +1. Source the environment configuration. - ```shell - source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" - ``` + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` -- Build the container image for the Diffusers inference server. +1. Build the container image for the Diffusers inference server. 
- ```shell - export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" - cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux && \ - rm -rf .terraform/ terraform.tfstate* && \ - terraform init && \ - terraform plan -input=false -out=tfplan && \ - terraform apply -input=false tfplan && \ - rm tfplan - ``` + ```shell + export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + rm -rf ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/.terraform/ terraform.tfstate* && \ + terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux init && \ + terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux plan -input=false -out=tfplan && \ + terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux apply -input=false tfplan && \ + rm ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/tfplan + ``` - > The build usually takes 10 to 15 minutes. + The build usually takes about 25 minutes. ## Deploy the inference workload -- Source the environment configuration. +1. Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` + +1. Check the model name. + + ```shell + echo "HF_MODEL_NAME=${HF_MODEL_NAME}" + ``` + + > If the `HF_MODEL_NAME` variable is not set, ensure that `HF_MODEL_ID` is + > set and source the `set_environment_variables.sh` script: + > + > ```shell + > source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + > ``` + + 1. Select an accelerator. 
+ + | Model | NVIDIA L4 | 2x NVIDIA L4 | 4x NVIDIA L4 | NVIDIA H100 | NVIDIA RTX Pro 6000 | 1/2 NVIDIA RTX Pro 6000 | 1/4 NVIDIA RTX Pro 6000 | 1/8 NVIDIA RTX Pro 6000 | + | --------------- | --------- | ------------ | ------------ | ----------- | ------------------- | ----------------------- | ----------------------- | ----------------------- | + | flux.1-schnell | ✅ | Not tested | Not tested | ✅ | Not tested | Not tested | Not tested | Not tested | + | flux.2-klein-4B | ✅ | ✅ | ✅ | Not tested | ✅ | ✅ | ✅ | ❌ | + + > When using fractional GPUs (1/2, 1/4, 1/8), you might see a warning in the + > logs of the `inference-server` container: + > `No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'`. You can + > ignore this warning. It's due to the GPU virtualization layer masking + > hardware probes from the PyTorch JIT compiler. It does not affect inference + > performance or stability. + + - **NVIDIA Tesla L4 24GB**: + + - 1x **NVIDIA Tesla L4**: + + ```shell + export ACCELERATOR_TYPE="l4" + ``` + + - 2x **NVIDIA Tesla L4**: + + ```shell + export ACCELERATOR_TYPE="l4-x2" + ``` + + - 4x **NVIDIA Tesla L4**: + + ```shell + export ACCELERATOR_TYPE="l4-x4" + ``` - ```shell - source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" - ``` + - 1x **NVIDIA H100 80GB**: -- Set the environment variables for the workload. + ```shell + export ACCELERATOR_TYPE="h100" + ``` - - Check the model name. + - **NVIDIA RTX Pro 6000**: - ```shell - echo "HF_MODEL_NAME=${HF_MODEL_NAME}" - ``` + - 1x **NVIDIA RTX Pro 6000**: - > If the `HF_MODEL_NAME` variable is not set, ensure that `HF_MODEL_ID` is - > set and source the `set_environment_variables.sh` script: - > - > ```shell - > source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"` - > ``` + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000" + ``` - - Select an accelerator. 
+ - 1/2x (half) of an **NVIDIA RTX Pro 6000**: - | Model | l4 | h100 | - | -------------- | --- | ---- | - | flux.1-schnell | ✅ | ✅ | + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000-1-2" + ``` - - **NVIDIA Tesla L4 24GB**: + - 1/4x (one fourth) of an **NVIDIA RTX Pro 6000**: - ```shell - export ACCELERATOR_TYPE="l4" - ``` + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000-1-4" + ``` - - **NVIDIA H100 80GB**: + - 1/8x (one eighth) of an **NVIDIA RTX Pro 6000**: - ```shell - export ACCELERATOR_TYPE="h100" - ``` + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000-1-8" + ``` - Ensure that you have enough quota in your project to provision the selected - accelerator type. For more information, see about viewing GPU quotas, see - [Allocation quotas: GPU quota](https://cloud.google.com/compute/resource-usage#gpu_quota). + Ensure that you have enough quota in your project to provision the selected + accelerator type. For more information about viewing GPU quotas, see + [Allocation quotas: GPU quota](https://cloud.google.com/compute/resource-usage#gpu_quota). -- Configure the deployment. +1. Configure the deployment. - ```shell - "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh" - ``` + ```shell + "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh" + ``` -- Deploy the inference workload. +1. Deploy the inference workload. - ```shell - kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" - ``` + ```shell + kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" + ``` -- Watch the deployment until it is ready. +1.
Watch the deployment until it is ready. - ```shell - watch --color --interval 5 --no-title \ - "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get deployment/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' - echo '\nLogs(last 10 lines):' - kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs deployment/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} --all-containers --tail 10" - ``` + ```shell + watch --color --interval 5 --no-title \ + "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get deployment/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' + echo '\nLogs(last 10 lines):' + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs deployment/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} --all-containers --tail 10" + ``` - When the deployment is ready, you will see the following: + When the deployment is ready, you will see the following: - ```text - NAME READY UP-TO-DATE AVAILABLE AGE - diffusers-- 1/1 1 1 ### - ``` + ```text + NAME READY UP-TO-DATE AVAILABLE AGE + diffusers-- 1/1 1 1 ### + ``` - You can press `CTRL`+`c` to terminate the watch. + You can press `CTRL`+`c` to terminate the watch. -- Send a test request. +1. Send a test request. - ```shell - kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} port-forward service/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} 8000:8000 >/dev/null & - PF_PID=$! - while ! 
echo -e '\x1dclose\x0d' | telnet localhost 8000 >/dev/null 2>&1; do - sleep 0.1 - done - curl http://localhost:8000/generate \ - --data '{ - "height": 512, - "num_inference_steps": 4, - "prompt": "A photo of a dog playing fetch in a park.", - "width": 512 - }' \ - --header "Content-Type: application/json" \ - --output ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/images/${HF_MODEL_NAME}_${ACCELERATOR_TYPE}_image.png \ - --request POST \ - --show-error \ - --silent - ls -alh ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/images/${HF_MODEL_NAME}_${ACCELERATOR_TYPE}_image.png - kill -9 ${PF_PID} - ``` + ```shell + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} port-forward service/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} 8000:8000 >/dev/null & + PF_PID=$! + while ! echo -e '\x1dclose\x0d' | telnet localhost 8000 >/dev/null 2>&1; do + sleep 0.1 + done + curl http://localhost:8000/generate \ + --data '{ + "height": 512, + "num_inference_steps": 4, + "prompt": "A photo of a dog playing fetch in a park.", + "width": 512 + }' \ + --header "Content-Type: application/json" \ + --output ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/images/${HF_MODEL_NAME}_${ACCELERATOR_TYPE}_image.png \ + --request POST \ + --show-error \ + --silent + ls -alh ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/images/${HF_MODEL_NAME}_${ACCELERATOR_TYPE}_image.png + kill -9 ${PF_PID} + ``` -- Delete the workload. +1. Delete the workload. 
- ```shell - kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" - ``` + ```shell + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" + ``` ## Clean up -- Destroy the container image. - - ```shell - export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" - cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux && \ - rm -rf .terraform/ terraform.tfstate* && \ - terraform init && - terraform destroy -auto-approve - ``` - -- Destroy the online GPU resources. - - ```shell - export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" - cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/online_gpu && \ - rm -rf .terraform/ terraform.tfstate* && \ - terraform init && - terraform destroy -auto-approve - ``` +1. Destroy the container image. + + ```shell + export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux && \ + rm -rf .terraform/ terraform.tfstate* && \ + terraform init && + terraform destroy -auto-approve + ``` + +1. Destroy the online GPU resources. 
+ + ```shell + export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/online_gpu && \ + rm -rf .terraform/ terraform.tfstate* && \ + terraform init && + terraform destroy -auto-approve + ``` diff --git a/platforms/gke/base/_shared_config/scripts/set_environment_variables.sh b/platforms/gke/base/_shared_config/scripts/set_environment_variables.sh index 8f18078f9..9a0502324 100755 --- a/platforms/gke/base/_shared_config/scripts/set_environment_variables.sh +++ b/platforms/gke/base/_shared_config/scripts/set_environment_variables.sh @@ -15,7 +15,14 @@ # limitations under the License. BASH_SOURCE_MY_PATH="$( - cd "$(dirname "${BASH_SOURCE}")" >/dev/null 2>&1 + SCRIPT_SOURCE="${BASH_SOURCE[0]:-}" + if [[ -z "${SCRIPT_SOURCE:-}" ]]; then + # Fallback in case BASH_SOURCE is not defined, such as when sourcing this + # script from a non-Bash shell + SCRIPT_SOURCE="$0" + fi + + cd "$(dirname "${SCRIPT_SOURCE}")" >/dev/null 2>&1 || return 1 pwd -P )" diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/n4/custom-compute-cpu-n4-8.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/n4/custom-compute-cpu-n4-8.yaml new file mode 100644 index 000000000..034e9f7f5 --- /dev/null +++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/n4/custom-compute-cpu-n4-8.yaml @@ -0,0 +1,38 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: cloud.google.com/v1 +kind: ComputeClass +metadata: + name: cpu-n4-8 +spec: + activeMigration: + optimizeRulePriority: true + nodePoolConfig: + imageStreaming: + enabled: true + nodePoolAutoCreation: + enabled: true + priorities: + # Use reservations if available + - machineType: n4-standard-8 + maxPodsPerNode: 64 + reservations: + affinity: AnyBestEffort + spot: false + + # Use on-demand + - machineType: n4-standard-8 + maxPodsPerNode: 64 + spot: false diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-2.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-2.yaml new file mode 100644 index 000000000..a5d3c9f0e --- /dev/null +++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-2.yaml @@ -0,0 +1,96 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: cloud.google.com/v1 +kind: ComputeClass +metadata: + name: gpu-rtx-pro-6000-96gb-x1-2 +spec: + activeMigration: + optimizeRulePriority: true + nodePoolConfig: + imageStreaming: + enabled: true + nodePoolAutoCreation: + enabled: true + priorities: + # Use a specific reservation + # - gpu: + # count: 1 + # driverVersion: latest + # type: nvidia-rtx-pro-6000 + # machineType: g4-standard-24 + # maxPodsPerNode: 32 + # reservations: + # affinity: Specific + # specific: + # - name: nvidia-rtx-pro-6000-specific + # reservationBlock: + # name: + # spot: false + + # Use any reservation + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-24 + maxPodsPerNode: 32 + reservations: + affinity: AnyBestEffort + spot: false + + # Use on-demand + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-24 + maxPodsPerNode: 32 + spot: false + + # Use DWS FlexStart with 7 day limit + - flexStart: + enabled: true + nodeRecycling: + leadTimeSeconds: 3600 + gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-24 + maxPodsPerNode: 32 + maxRunDurationSeconds: 604800 + + # Use DWS FlexStart with 1 day limit + - flexStart: + enabled: true + nodeRecycling: + leadTimeSeconds: 3600 + gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-24 + maxPodsPerNode: 32 + maxRunDurationSeconds: 86400 + + # Use spot + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-24 + maxPodsPerNode: 32 + spot: true diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-4.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-4.yaml new file mode 100644 index 000000000..2fd214151 --- /dev/null +++ 
b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-4.yaml @@ -0,0 +1,96 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: cloud.google.com/v1 +kind: ComputeClass +metadata: + name: gpu-rtx-pro-6000-96gb-x1-4 +spec: + activeMigration: + optimizeRulePriority: true + nodePoolConfig: + imageStreaming: + enabled: true + nodePoolAutoCreation: + enabled: true + priorities: + # Use a specific reservation + # - gpu: + # count: 1 + # driverVersion: latest + # type: nvidia-rtx-pro-6000 + # machineType: g4-standard-12 + # maxPodsPerNode: 32 + # reservations: + # affinity: Specific + # specific: + # - name: nvidia-rtx-pro-6000-specific + # reservationBlock: + # name: + # spot: false + + # Use any reservation + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-12 + maxPodsPerNode: 32 + reservations: + affinity: AnyBestEffort + spot: false + + # Use on-demand + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-12 + maxPodsPerNode: 32 + spot: false + + # Use DWS FlexStart with 7 day limit + - flexStart: + enabled: true + nodeRecycling: + leadTimeSeconds: 3600 + gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-12 + maxPodsPerNode: 32 + maxRunDurationSeconds: 604800 + + # Use DWS FlexStart with 1 day limit + - flexStart: + 
enabled: true + nodeRecycling: + leadTimeSeconds: 3600 + gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-12 + maxPodsPerNode: 32 + maxRunDurationSeconds: 86400 + + # Use spot + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-12 + maxPodsPerNode: 32 + spot: true diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-8.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-8.yaml new file mode 100644 index 000000000..df6c0baa1 --- /dev/null +++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-8.yaml @@ -0,0 +1,96 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: cloud.google.com/v1 +kind: ComputeClass +metadata: + name: gpu-rtx-pro-6000-96gb-x1-8 +spec: + activeMigration: + optimizeRulePriority: true + nodePoolConfig: + imageStreaming: + enabled: true + nodePoolAutoCreation: + enabled: true + priorities: + # Use a specific reservation + # - gpu: + # count: 1 + # driverVersion: latest + # type: nvidia-rtx-pro-6000 + # machineType: g4-standard-6 + # maxPodsPerNode: 32 + # reservations: + # affinity: Specific + # specific: + # - name: nvidia-rtx-pro-6000-specific + # reservationBlock: + # name: + # spot: false + + # Use any reservation + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-6 + maxPodsPerNode: 32 + reservations: + affinity: AnyBestEffort + spot: false + + # Use on-demand + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-6 + maxPodsPerNode: 32 + spot: false + + # Use DWS FlexStart with 7 day limit + - flexStart: + enabled: true + nodeRecycling: + leadTimeSeconds: 3600 + gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-6 + maxPodsPerNode: 32 + maxRunDurationSeconds: 604800 + + # Use DWS FlexStart with 1 day limit + - flexStart: + enabled: true + nodeRecycling: + leadTimeSeconds: 3600 + gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-6 + maxPodsPerNode: 32 + maxRunDurationSeconds: 86400 + + # Use spot + - gpu: + count: 1 + driverVersion: latest + type: nvidia-rtx-pro-6000 + machineType: g4-standard-6 + maxPodsPerNode: 32 + spot: true diff --git a/platforms/gke/base/core/nvidia/initialize/README.md b/platforms/gke/base/core/nvidia/initialize/README.md index 38eb27567..ae31bfde4 100644 --- a/platforms/gke/base/core/nvidia/initialize/README.md +++ b/platforms/gke/base/core/nvidia/initialize/README.md @@ -1,4 +1,4 @@ -# NVIDIA initialize +# NVIDIA NGC initialization - Set environment variables. 
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/.terraform.lock.hcl b/platforms/gke/base/core/workloads/nri_device_injector/.terraform.lock.hcl new file mode 100644 index 000000000..27d625960 --- /dev/null +++ b/platforms/gke/base/core/workloads/nri_device_injector/.terraform.lock.hcl @@ -0,0 +1,42 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/google" { + version = "7.6.0" + constraints = "7.6.0" + hashes = [ + "h1:JYsO3fV5OtaNuRTdjGZC1Z3Ku1ZIrRJGwXwsBjtWudk=", + "zh:0c70c768b0a34d7a61de70d0e85cf0057820556647bbce2384972a45d7092e4e", + "zh:0cb7aab89cd435c5c8dab9231ea176d64fdf1df1125db15a6b9ead978a93c0b2", + "zh:32f25c42214bb356bb67cef6057c9904f2878cd053a7760e5ee3737619f28638", + "zh:38b05b1171ab086c88b95d379120fb6c28c9e895ae924557c11c35e138319119", + "zh:39d8206d453a614fa0be3aeac8ea3921fb3ab7ed122205cbbcc2a41ca6176cb5", + "zh:58d9059aa6b4aab5ede4fc173dcdc7b4d042d0b1a1ab55407dd345931d7f4815", + "zh:a4bc001c8ac7700d0107155296250c3b8969511e1a488f3b318f3db62362eef2", + "zh:cc75e25db4bb672ebc200a89d6cff9ff0b9911e14e188d1b4429bb3511d2b35f", + "zh:d7f7639930735f17b2b4f73814204a9a050186ea7e1c2671a52e0fa7ddf7a001", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:ff1190ae618dae9243de59caf4149abb4a9b775cb6439f119cd32a30f1a21820", + "zh:ff15b7b86787f6fd186211e7c37a72f2cc70374b284aaf063e1f989717441161", + ] +} + +provider "registry.terraform.io/hashicorp/local" { + version = "2.5.3" + constraints = "2.5.3" + hashes = [ + "h1:1Nkh16jQJMp0EuDmvP/96f5Unnir0z12WyDuoR6HjMo=", + "zh:284d4b5b572eacd456e605e94372f740f6de27b71b4e1fd49b63745d8ecd4927", + "zh:40d9dfc9c549e406b5aab73c023aa485633c1b6b730c933d7bcc2fa67fd1ae6e", + "zh:6243509bb208656eb9dc17d3c525c89acdd27f08def427a0dce22d5db90a4c8b", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + 
"zh:885d85869f927853b6fe330e235cd03c337ac3b933b0d9ae827ec32fa1fdcdbf", + "zh:bab66af51039bdfcccf85b25fe562cbba2f54f6b3812202f4873ade834ec201d", + "zh:c505ff1bf9442a889ac7dca3ac05a8ee6f852e0118dd9a61796a2f6ff4837f09", + "zh:d36c0b5770841ddb6eaf0499ba3de48e5d4fc99f4829b6ab66b0fab59b1aaf4f", + "zh:ddb6a407c7f3ec63efb4dad5f948b54f7f4434ee1a2607a49680d494b1776fe1", + "zh:e0dafdd4500bec23d3ff221e3a9b60621c5273e5df867bc59ef6b7e41f5c91f6", + "zh:ece8742fd2882a8fc9d6efd20e2590010d43db386b920b2a9c220cfecc18de47", + "zh:f4c6b3eb8f39105004cf720e202f04f57e3578441cfb76ca27611139bc116a82", + ] +} diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_cluster.auto.tfvars b/platforms/gke/base/core/workloads/nri_device_injector/_cluster.auto.tfvars new file mode 120000 index 000000000..4d9954e5a --- /dev/null +++ b/platforms/gke/base/core/workloads/nri_device_injector/_cluster.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_cluster_variables.tf b/platforms/gke/base/core/workloads/nri_device_injector/_cluster_variables.tf new file mode 120000 index 000000000..3f2c29e19 --- /dev/null +++ b/platforms/gke/base/core/workloads/nri_device_injector/_cluster_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_platform.auto.tfvars b/platforms/gke/base/core/workloads/nri_device_injector/_platform.auto.tfvars new file mode 120000 index 000000000..c3133e727 --- /dev/null +++ b/platforms/gke/base/core/workloads/nri_device_injector/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_platform_variables.tf b/platforms/gke/base/core/workloads/nri_device_injector/_platform_variables.tf new file mode 120000 index 000000000..c68738baa 
--- /dev/null +++ b/platforms/gke/base/core/workloads/nri_device_injector/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_workloads.auto.tfvars b/platforms/gke/base/core/workloads/nri_device_injector/_workloads.auto.tfvars new file mode 120000 index 000000000..b65551f53 --- /dev/null +++ b/platforms/gke/base/core/workloads/nri_device_injector/_workloads.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/workloads.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_workloads_variables.tf b/platforms/gke/base/core/workloads/nri_device_injector/_workloads_variables.tf new file mode 120000 index 000000000..fec5c48ce --- /dev/null +++ b/platforms/gke/base/core/workloads/nri_device_injector/_workloads_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/workloads_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/core/workloads/nri_device_injector/main.tf b/platforms/gke/base/core/workloads/nri_device_injector/main.tf new file mode 100644 index 000000000..ea8b5384c --- /dev/null +++ b/platforms/gke/base/core/workloads/nri_device_injector/main.tf @@ -0,0 +1,62 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + kubeconfig_directory = "${path.module}/../../../kubernetes/kubeconfig" + kubeconfig_file = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}" + + manifests_directory = "${local.namespace_directory}/kube-system" + namespace_directory = "${local.manifests_directory_root}/namespace" + version_manifests_directory = "${path.module}/manifests" +} + +data "local_file" "kubeconfig" { + filename = local.kubeconfig_file +} + +resource "terraform_data" "manifests" { + input = { + manifests_dir = local.manifests_directory + version_manifests_dir = local.version_manifests_directory + } + + provisioner "local-exec" { + command = </dev/null 2>&1 && pwd -P)" +if [[ -z "${ACP_REPO_DIR:-}" ]]; then + ACP_REPO_DIR="$(cd "${SCRIPT_DIR}/../../../../../../" >/dev/null 2>&1 && pwd -P)" + export ACP_REPO_DIR +fi + +ENV_SCRIPT="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" +if [[ -f "${ENV_SCRIPT}" ]]; then + echo "[INFO] Sourcing environment variables..." + source "${ENV_SCRIPT}" +else + echo "[ERROR] Could not find environment script at ${ENV_SCRIPT}" + exit 1 +fi + +# --- Default Price Mapping (On-Demand) --- +# Prices as of April 2026. 
Users should seek updated prices on: +# https://cloud.google.com/products/compute/pricing +function get_hourly_cost() { + case "$1" in + "l4") echo "1.147208384" ;; # g2-standard-16 + 1x L4 + "l4-x2") echo "2.000832696" ;; # g2-standard-24 + 2x L4 + "l4-x4") echo "4.001665392" ;; # g2-standard-48 + 4x L4 + "rtx-pro-6000") echo "4.4999" ;; # g4-standard-48 + 1x RTX 6000 (96GB) + "rtx-pro-6000-1-2") echo "2.5874425" ;; # g4-standard-24 + 1/2x RTX 6000 + "rtx-pro-6000-1-4") echo "1.29372125" ;; # g4-standard-12 + 1/4x RTX 6000 + "rtx-pro-6000-1-8") echo "0.646860625" ;; # g4-standard-6 + 1/8x RTX 6000 + *) echo "0.0" ;; + esac +} + +# --- CLI Arguments --- +BUILD_IMAGE=false +SYNC_ONLY=false +SCENARIOS_JSON="" +MANUAL_COST="" +ACCELERATORS_INPUT="" + +while [[ "$#" -gt 0 ]]; do + case $1 in + --build) BUILD_IMAGE=true ;; + --sync-only) SYNC_ONLY=true ;; + --scenarios) + SCENARIOS_JSON="$2" + shift + ;; + --accelerator) + ACCELERATORS_INPUT="$2" + shift + ;; + --model) + export HF_MODEL_ID="$2" + shift + ;; + --manual-cost) + MANUAL_COST="$2" + shift + ;; + *) + echo "Unknown parameter: $1" + exit 1 + ;; + esac + shift +done + +if [[ -z "${ACCELERATORS_INPUT}" ]]; then + echo "[ERROR] --accelerator is required (can be comma-separated list)" + exit 1 +fi +if [[ -z "${HF_MODEL_ID}" ]]; then + echo "[ERROR] --model is required" + exit 1 +fi + +# Minify scenarios JSON and inject model_id to prevent Kustomize parsing errors +if [[ "${SYNC_ONLY}" != "true" ]]; then + if [[ -z "${SCENARIOS_JSON}" ]]; then + SCENARIOS_JSON='[{"batch": 1, "vus": 1}]' + fi + export K6_SCENARIOS_JSON + K6_SCENARIOS_JSON=$(echo "${SCENARIOS_JSON}" | jq -c --arg m "${HF_MODEL_ID}" 'map(. + {model_id: $m})') +fi + +# --- Phase 1: Build (Once) --- +if [[ "${BUILD_IMAGE}" == "true" ]]; then + echo "[INFO] Building benchmark container image..." 
+ cd "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark" + terraform init -input=false && terraform apply -auto-approve -input=false +fi + +# --- Phase 2-5: Sequential Accelerator Loop --- +IFS=',' read -ra ADDR <<<"${ACCELERATORS_INPUT}" +for ACCEL in "${ADDR[@]}"; do + export ACCELERATOR_TYPE="${ACCEL}" + echo "" + echo "======================================================================" + echo " STARTING SUITE FOR ACCELERATOR: ${ACCELERATOR_TYPE}" + echo "======================================================================" + + if [[ "${SYNC_ONLY}" != "true" ]]; then + # Refresh deployment config for this accelerator + echo "[INFO] Configuring deployment for ${HF_MODEL_NAME} on ${ACCELERATOR_TYPE}..." + CONFIG_DIR="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark" + pushd "${CONFIG_DIR}" >/dev/null + source "./configure_deployment.sh" + popd >/dev/null + + echo "[INFO] Cleaning up existing benchmark jobs..." + kubectl delete --ignore-not-found --kustomize "${CONFIG_DIR}/${HF_MODEL_NAME}" + kubectl wait --for=delete pod -l job-name="k6-benchmark-${HF_MODEL_NAME}" -n "${ira_online_gpu_kubernetes_namespace_name}" --timeout=60s || true + + echo "[INFO] Launching benchmark Job..." + kubectl apply --kustomize "${CONFIG_DIR}/${HF_MODEL_NAME}" + + # --- Phase 4: Monitoring --- + echo "[INFO] Monitoring benchmark Job..." + + LOG_PID="" + + # Smart Monitoring Loop + (while true; do + # Get current Pod name and status + POD_INFO=$(kubectl get pods -n "${ira_online_gpu_kubernetes_namespace_name}" -l job-name="k6-benchmark-${HF_MODEL_NAME}" -o jsonpath='{.items[0].metadata.name} {.items[0].status.phase}' 2>/dev/null || echo "None None") + read -r POD_NAME POD_STATUS <<<"$POD_INFO" + + TIMESTAMP=$(date +"%T") + + if [[ "${POD_STATUS}" == "Running" ]]; then + # If Pod is running but we aren't tailing logs yet, start tailing + if [[ -z "${LOG_PID}" ]] || ! 
kill -0 "${LOG_PID}" 2>/dev/null; then + echo "[${TIMESTAMP}] Pod is Running. Starting log stream..." + kubectl logs -n "${ira_online_gpu_kubernetes_namespace_name}" "${POD_NAME}" -c k6-benchmark -f & + LOG_PID=$! + fi + echo "[HEARTBEAT] ${TIMESTAMP} | Pod: ${POD_NAME} | Status: ${POD_STATUS}" + elif [[ "${POD_STATUS}" == "Pending" ]]; then + # If pending, show the latest event to track scale-up/image pull + EVENT=$(kubectl get events -n "${ira_online_gpu_kubernetes_namespace_name}" --field-selector involvedObject.name="${POD_NAME}" --sort-by='.lastTimestamp' -o jsonpath='{.items[-1].message}' 2>/dev/null || echo "Waiting for events...") + echo "[HEARTBEAT] ${TIMESTAMP} | Status: Pending | Last Event: ${EVENT}" + elif [[ "${POD_STATUS}" == "None" ]]; then + echo "[HEARTBEAT] ${TIMESTAMP} | Waiting for Pod to be created..." + else + echo "[HEARTBEAT] ${TIMESTAMP} | Status: ${POD_STATUS}" + fi + + sleep 60 + done) & + MONITOR_PID=$! + + echo "[INFO] Waiting for Job completion (max 6h)..." + TIMEOUT=21600 # 6 hours + ELAPSED=0 + SLEEP_INTERVAL=10 + while true; do + # Check terminal state + STATUS=$(kubectl get job "k6-benchmark-${HF_MODEL_NAME}" -n "${ira_online_gpu_kubernetes_namespace_name}" -o jsonpath='{.status.conditions[?(@.status=="True")].type}' 2>/dev/null || echo "Unknown") + + if [[ "$STATUS" == *"Complete"* ]]; then + echo "[INFO] Job completed successfully." + break + elif [[ "$STATUS" == *"Failed"* ]]; then + echo "[ERROR] Job failed or was aborted by k6 thresholds. Check container logs for details." + break + elif [[ "$STATUS" == "Unknown" ]]; then + if ! kubectl get job "k6-benchmark-${HF_MODEL_NAME}" -n "${ira_online_gpu_kubernetes_namespace_name}" &>/dev/null; then + echo "[WARN] Job not found. Stopping wait." + break + fi + fi + + if [ "$ELAPSED" -ge "$TIMEOUT" ]; then + echo "[ERROR] Timeout reached waiting for Job completion." 
+ exit 1 + fi + + sleep $SLEEP_INTERVAL + ELAPSED=$((ELAPSED + SLEEP_INTERVAL)) + done + + kill $MONITOR_PID 2>/dev/null || true + # Ensure the background log process (inside the monitor subshell) is also cleaned up + # We kill the process group to be sure + pkill -P $MONITOR_PID 2>/dev/null || true + else + echo "[INFO] --sync-only flag detected. Skipping Job deployment and monitoring." + fi + + echo "[INFO] Syncing results from GCS..." + RESULTS_DIR="${ACP_REPO_DIR}/${hub_models_bucket_bench_results_name}" + mkdir -p "${RESULTS_DIR}" + gcloud storage cp -r "gs://${hub_models_bucket_bench_results_name}/*.jsonl" "${RESULTS_DIR}/" + + # Find the most recent file for this SPECIFIC accelerator run + LATEST_JSONL=$(ls -t "${RESULTS_DIR}"/*"${HF_MODEL_NAME}"*"${ACCELERATOR_TYPE}"*.jsonl | head -n 1) + COST="${MANUAL_COST:-$(get_hourly_cost "${ACCELERATOR_TYPE}")}" + + echo "[INFO] ----------------------------------------------------------------------" + echo "[INFO] Analyzing: ${LATEST_JSONL}" + echo "[INFO] Accelerator: ${ACCELERATOR_TYPE} | Cost: \$${COST}/hr" + echo "[INFO] DISCLAIMER: This rate represents public on-demand pricing." + echo "[INFO] It does NOT account for CUDs, SUDs, or custom private pricing." + echo "[INFO] For current and accurate pricing, visit:" + echo "[INFO] https://cloud.google.com/products/compute/pricing" + echo "[INFO] ----------------------------------------------------------------------" + + . "${ACP_REPO_DIR}/.venv/bin/activate" + python3 "${ACP_REPO_DIR}/container-images/cpu/k6-benchmark/extract_metrics.py" \ + --file "${LATEST_JSONL}" \ + --hourly-cost "${COST}" \ + --project-id "${cluster_project_id}" \ + --namespace "${ira_online_gpu_kubernetes_namespace_name}" \ + --output-csv "${ACP_REPO_DIR}/k6-benchmark.csv" + + if [[ "${SYNC_ONLY}" != "true" ]]; then + echo "[INFO] Cleaning up Job resources for ${ACCELERATOR_TYPE}..." 
+ kubectl delete --ignore-not-found --kustomize "${CONFIG_DIR}/${HF_MODEL_NAME}" + fi +done + +echo "" +echo "======================================================================" +echo " ALL BENCHMARK SUITES COMPLETE" +echo " Final Aggregated CSV: ${ACP_REPO_DIR}/k6-benchmark.csv" +echo "======================================================================" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/batch-pubsub-subscriber.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/async-pubsub-subscriber.tpl.env similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/batch-pubsub-subscriber.tpl.env rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/async-pubsub-subscriber.tpl.env diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh index 93ece036e..92fa0c342 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh @@ -13,10 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-set -o errexit -set -o nounset -set -o pipefail - MY_PATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 pwd -P diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/job.yaml new file mode 100644 index 000000000..6ee64692a --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/job.yaml @@ -0,0 +1,71 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: batch/v1 +kind: Job +metadata: + name: k6-benchmark + namespace: replaced-by-kustomize +spec: + backoffLimit: 0 + template: + metadata: + labels: + app: k6-benchmark + annotations: + gke-gcsfuse/volumes: "true" + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" + spec: + restartPolicy: OnFailure + containers: + - args: [] + command: [] + image: replaced-by-kustomize + imagePullPolicy: Always + name: k6-benchmark + resources: {} + volumeMounts: + - mountPath: /output + name: k6-benchmark-bucket-results + readOnly: false + env: + - name: TARGET_URL + valueFrom: + configMapKeyRef: + key: K6_TARGET_URL + name: deployment + - name: ACCELERATOR_NAME + valueFrom: + configMapKeyRef: + key: ACCELERATOR_NAME + name: deployment + - name: SCENARIOS_JSON + valueFrom: + configMapKeyRef: + key: K6_SCENARIOS_JSON + name: deployment + - name: INFERENCE_SERVER_TYPE + valueFrom: + configMapKeyRef: + key: INFERENCE_SERVER_TYPE + name: deployment + serviceAccountName: replaced-by-kustomize + terminationGracePeriodSeconds: 0 + volumes: + - csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: replaced-by-kustomize + mountOptions: "implicit-dirs,uid=12345,gid=12345" + name: k6-benchmark-bucket-results diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/kustomization.yaml new file mode 100644 index 000000000..7f988c7f4 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/kustomization.yaml @@ -0,0 +1,25 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - deployment.env + name: deployment + namespace: replaced-by-kustomize + +resources: + - job.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/templates/deployment.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/templates/deployment.tpl.env new file mode 100644 index 000000000..32ad48a33 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/templates/deployment.tpl.env @@ -0,0 +1,8 @@ +INFERENCE_KUBERNETES_NAMESPACE=${ira_online_gpu_kubernetes_namespace_name} +INFERENCE_KUBERNETES_SERVICE_ACCOUNT=${ira_inference_perf_bench_kubernetes_service_account_name} +CONTAINER_IMAGE_URL=${ira_cpu_k6_benchmark_image_url} +ACCELERATOR_NAME=${ACCELERATOR_TYPE} +BENCHMARK_RESULTS_BUCKET_NAME=${hub_models_bucket_bench_results_name} +K6_TARGET_URL=${K6_TARGET_URL} +K6_SCENARIOS_JSON=${K6_SCENARIOS_JSON} +INFERENCE_SERVER_TYPE=${K6_INFERENCE_SERVER_TYPE} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh new file mode 100755 index 000000000..f2e41dbc9 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" + +if [[ -z "${ACCELERATOR_TYPE:-}" ]]; then + echo "ACCELERATOR_TYPE is not set" + return 1 +fi + +if [[ -z "${HF_MODEL_NAME:-}" ]]; then + echo "HF_MODEL_NAME is not set" + echo "If the HF_MODEL_NAME variable is not set, ensure that HF_MODEL_ID is set and source the set_environment_variables.sh script:" + echo "source \"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh\"" + return 1 +fi + +if [[ -z "${K6_SCENARIOS_JSON:-}" ]]; then + echo "K6_SCENARIOS_JSON is not set." 
+ return 1 +fi +export K6_SCENARIOS_JSON + +echo "Configuring deployment for ${HF_MODEL_NAME} running on ${ACCELERATOR_TYPE}" + +if [[ "${HF_MODEL_NAME:-}" == "flux-2-klein-4b" ]]; then + K6_TARGET_URL="http://diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}:8000" + K6_INFERENCE_SERVER_TYPE="sglang" +elif [[ "${HF_MODEL_NAME:-}" == "flux-1-schnell" ]]; then + K6_TARGET_URL="http://diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}:8000/generate" + K6_INFERENCE_SERVER_TYPE="diffusers" +else + echo "Model not supported: ${HF_MODEL_NAME:-"HF_MODEL_NAME variable not set"}" + return 1 +fi + +export K6_TARGET_URL +export K6_INFERENCE_SERVER_TYPE + +envsubst <"${MY_PATH}/base/templates/deployment.tpl.env" | sponge "${MY_PATH}/base/deployment.env" + +echo "Deployment configuration:" +cat "${MY_PATH}/base/deployment.env" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/kustomization.yaml new file mode 100644 index 000000000..a859b82b4 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/kustomization.yaml @@ -0,0 +1,86 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -flux-2-klein-4b + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.app + select: + kind: Job + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.containers.[name=k6-benchmark].image + select: + kind: Job + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Job + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.BENCHMARK_RESULTS_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=k6-benchmark-bucket-results].csi.volumeAttributes.bucketName + select: + kind: Job + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-nodeselector.yaml new file mode 100644 index 000000000..832c00108 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may 
not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: k6-benchmark + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: cpu-n4-8 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-resources.yaml new file mode 100644 index 000000000..e81641a62 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-resources.yaml @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: batch/v1 +kind: Job +metadata: + name: k6-benchmark + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: k6-benchmark + args: + - "/app/scripts/k6-diffusers-flux-2-klein-4b.js" + resources: + limits: + cpu: "2" + memory: 1G + requests: + cpu: "2" + memory: 1G diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/runtime.env new file mode 100644 index 000000000..c8f0f2217 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/runtime.env @@ -0,0 +1 @@ +APP_LABEL=k6-benchmark-flux-2-klein-4b diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/kustomization.yaml new file mode 100644 index 000000000..353b1abc9 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/kustomization.yaml @@ -0,0 +1,29 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../base + +configMapGenerator: + - files: + - patch-entrypoint.sh + name: entrypoint-patch + namespace: replaced-by-kustomize + +patches: + - path: patch-server.yaml + - path: patch-gcsfuse.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-entrypoint.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-entrypoint.sh new file mode 100644 index 000000000..c7f27a2fe --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-entrypoint.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -o errexit + +# Dynamic python patch of parallel_state.py +python3 -c " +path = '/sgl-workspace/sglang/python/sglang/multimodal_gen/runtime/distributed/parallel_state.py' +with open(path, 'r') as f: + content = f.read() + +old_code = ''' extra_args = ( + {} + if ( + current_platform.is_mps() + or current_platform.is_musa() + or current_platform.is_npu() + ) + else dict(device_id=device_id) + )''' + +new_code = ''' extra_args = ( + {} + if ( + current_platform.is_mps() + or current_platform.is_musa() + or current_platform.is_npu() + or world_size == 1 + ) + else dict(device_id=device_id) + )''' + +if old_code in content: + content = content.replace(old_code, new_code) + with open(path, 'w') as f: + f.write(content) + print('[Patch] Successfully patched parallel_state.py!') +else: + print('[Patch] Target code not found or already patched.') +" + +# Execute normal SGLang binary entrypoint with arguments +exec sglang "$@" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-gcsfuse.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-gcsfuse.yaml new file mode 100644 index 000000000..5f647c181 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-gcsfuse.yaml @@ -0,0 +1,36 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# This patch overrides the base GCS Fuse configuration for virtual GPU slices. +# Deviation from base: +# 1. file-cache:max-size-mb is set to 0 (disabled). +# 2. parallel downloads are disabled (implied by removing the flag). +# Rationale: Virtual GPU slices run on memory-constrained host node pools. +# Disabling the RAM-based file cache prevents node-level evictions and sidecar +# OOM crashes during model weight prefetching. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + volumes: + - name: huggingface-hub-model-bucket + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:0,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,only-dir:replaced-by-kustomize diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-server.yaml new file mode 100644 index 000000000..1e296d9b7 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-server.yaml @@ -0,0 +1,63 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + # The normal sglang command is wrapped by patch-entrypoint.sh. + # This startup wrapper dynamically patches sglang's parallel_state.py + # to omit the 'device_id' argument in torch.distributed.init_process_group + # when world_size == 1. + # + # Why: PyTorch's eager device connection hook (eager_connect_single_device) + # is triggered when device_id is passed, which attempts direct GPU/NCCL hardware + # binding calls that crash with 'cudaErrorNotSupported' inside virtualized + # MIG / vGPU slice sandboxes under GKE. Delaying NCCL init (which is never + # called on tp=1 setups) bypasses the crash entirely. 
+ # + # Tracked in SGLang Issue: https://github.com/sgl-project/sglang/issues/25670 + command: + - "/scripts/patch-entrypoint.sh" + args: + - "serve" + - "--mem-fraction-static=$(GPU_MEMORY_UTILIZATION)" + - "--model-path=/gcs/$(MODEL_ID)" + - "--tp-size=$(TENSOR_PARALLEL_SIZE)" + - "--trust-remote-code" + - "--port=30000" + - "--host=0.0.0.0" + env: + - name: NCCL_P2P_DISABLE + value: "1" + - name: NCCL_SHM_DISABLE + value: "1" + - name: NCCL_NVLS_ENABLE + value: "0" + - name: NCCL_DEBUG + value: "INFO" + volumeMounts: + - mountPath: /scripts + name: entrypoint-patch-volume + volumes: + - name: entrypoint-patch-volume + configMap: + name: entrypoint-patch + defaultMode: 0755 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/deployment.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/deployment.yaml index 127b3cad0..26ae92e37 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/deployment.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/deployment.yaml @@ -31,7 +31,7 @@ spec: gke-gcsfuse/memory-limit: "0" gke-gcsfuse/volumes: "true" labels: - ai.gke.io/inference-server: diffusers + ai.gke.io/inference-server: replaced-by-kustomize ai.gke.io/model: replaced-by-kustomize app: diffusers spec: @@ -85,8 +85,8 @@ spec: path: /health port: 8000 scheme: HTTP - initialDelaySeconds: 60 - periodSeconds: 10 + initialDelaySeconds: 120 + periodSeconds: 30 successThreshold: 1 timeoutSeconds: 1 resources: {} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/kustomization.yaml index 22dc66f91..fc01c9452 100644 --- 
a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/kustomization.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/kustomization.yaml @@ -26,3 +26,13 @@ resources: - ../../base - deployment.yaml - service.yaml +replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/templates/diffusers.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/templates/diffusers.tpl.env index b36920254..bcc398a74 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/templates/diffusers.tpl.env +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/templates/diffusers.tpl.env @@ -1 +1,2 @@ -CONTAINER_IMAGE_URL=${ira_online_gpu_diffusers_flux_image_url} +CONTAINER_IMAGE_URL=${DIFFUSERS_CONTAINER_IMAGE_URL} +INFERENCE_SERVER=${DIFFUSERS_INFERENCE_SERVER} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh index 2863274d0..685fe995a 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh @@ -26,4 +26,27 @@ source "${MY_PATH}/../../../terraform/_shared_config/scripts/set_environment_var 
"${MY_PATH}/../configure_deployment.sh" -envsubst < "${MY_PATH}/base/templates/diffusers.tpl.env" | sponge "${MY_PATH}/base/diffusers.env" +if [[ "${HF_MODEL_ID}" == "black-forest-labs/flux.1-schnell" ]]; then + DIFFUSERS_CONTAINER_IMAGE_URL="${ira_online_gpu_diffusers_flux_image_url}" + DIFFUSERS_INFERENCE_SERVER="diffusers" +elif [[ "${HF_MODEL_ID}" == "black-forest-labs/flux.2-klein-4b" ]]; then + DIFFUSERS_CONTAINER_IMAGE_URL="${ira_online_gpu_diffusers_sglang_diffusers_image_url}" + DIFFUSERS_INFERENCE_SERVER="sglang" +else + echo "[ERROR] Set a container image URL for model: ${HF_MODEL_ID:-"no model set"}" + return 1 +fi + +export DIFFUSERS_CONTAINER_IMAGE_URL +export DIFFUSERS_INFERENCE_SERVER + +envsubst <"${MY_PATH}/base/templates/diffusers.tpl.env" | sponge "${MY_PATH}/base/diffusers.env" + +echo "Configurations for ${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" + +echo "Deployment configuration:" +cat "${MY_PATH}/base/diffusers.env" +echo + +echo "Runtime configuration:" +cat "${MY_PATH}/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}/runtime.env" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/h100-flux-1-schnell/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/h100-flux-1-schnell/kustomization.yaml index 5ced51113..e39e58664 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/h100-flux-1-schnell/kustomization.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/h100-flux-1-schnell/kustomization.yaml @@ -28,6 +28,15 @@ patches: - path: patch-resources.yaml replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment - source: fieldPath: data.APP_LABEL kind: ConfigMap 
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-1-schnell/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-1-schnell/kustomization.yaml index 6cc31e249..a8408c99e 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-1-schnell/kustomization.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-1-schnell/kustomization.yaml @@ -28,6 +28,15 @@ patches: - path: patch-resources.yaml replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment - source: fieldPath: data.APP_LABEL kind: ConfigMap diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/kustomization.yaml new file mode 100644 index 000000000..f0c05f7a4 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/kustomization.yaml @@ -0,0 +1,142 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -l4-flux-2-klein-4b + +patches: + - path: patch-server.yaml + - path: patch-ports.yaml + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + 
options: + delimiter: . + index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-nodeselector.yaml new file mode 100644 index 000000000..42621a442 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-l4-24gb-s16-x1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-ports.yaml new file mode 100644 index 000000000..5b87d66b7 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-ports.yaml @@ -0,0 +1,38 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + readinessProbe: + httpGet: + port: 30000 +--- +apiVersion: v1 +kind: Service +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 30000 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-resources.yaml new file mode 100644 index 000000000..2d997452a --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-resources.yaml @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + cpu: "6" + memory: 45G + nvidia.com/gpu: "1" + requests: + cpu: "6" + memory: 45G + nvidia.com/gpu: "1" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-server.yaml new file mode 100644 index 000000000..4725a1810 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-server.yaml @@ -0,0 +1,34 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + command: + - sglang + - serve + args: + - "--mem-fraction-static=$(GPU_MEMORY_UTILIZATION)" + - "--model-path=/gcs/$(MODEL_ID)" + - "--tp-size=$(TENSOR_PARALLEL_SIZE)" + - "--trust-remote-code" + - "--port=30000" + - "--host=0.0.0.0" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/runtime.env new file mode 100644 index 000000000..1392ebab2 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/runtime.env @@ -0,0 +1,5 @@ +APP_LABEL=diffusers-l4-flux-2-klein-4b +GPU_MEMORY_UTILIZATION=0.95 +MODEL_ID=black-forest-labs/flux.2-klein-4b +MODEL_NAME=flux-2-klein-4b +TENSOR_PARALLEL_SIZE=1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/kustomization.yaml new file mode 100644 index 000000000..537b6b731 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/kustomization.yaml @@ -0,0 +1,142 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -l4-x2-flux-2-klein-4b + +patches: + - path: patch-server.yaml + - path: patch-ports.yaml + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - 
spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-nodeselector.yaml new file mode 100644 index 000000000..3a1d2988a --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-l4-24gb-x2 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-ports.yaml new file mode 100644 index 000000000..cc8584d48 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-ports.yaml @@ -0,0 +1,38 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + readinessProbe: + httpGet: + port: 30000 +--- +apiVersion: v1 +kind: Service +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 30000 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-resources.yaml new file mode 100644 index 000000000..4a23f238d --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-resources.yaml @@ -0,0 +1,33 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + cpu: "20" + memory: 80G + nvidia.com/gpu: "2" + requests: + cpu: "20" + memory: 80G + nvidia.com/gpu: "2" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-server.yaml new file mode 100644 index 000000000..1c267ac3c --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-server.yaml @@ -0,0 +1,46 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + env: + - name: NUM_GPUS + valueFrom: + configMapKeyRef: + name: runtime + key: NUM_GPUS + - name: TENSOR_PARALLEL_SIZE + valueFrom: + configMapKeyRef: + name: runtime + key: TENSOR_PARALLEL_SIZE + command: + - sglang + - serve + args: + - "--model-path=/gcs/$(MODEL_ID)" + - "--tp-size=$(TENSOR_PARALLEL_SIZE)" + - "--trust-remote-code" + - "--port=30000" + - "--host=0.0.0.0" + - "--backend=sglang" + - "--num-gpus=$(NUM_GPUS)" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/runtime.env new file mode 100644 index 000000000..2f2783f52 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=diffusers-l4-x2-flux-2-klein-4b +GPU_MEMORY_UTILIZATION=0.95 +MODEL_ID=black-forest-labs/flux.2-klein-4b +MODEL_NAME=flux-2-klein-4b +TENSOR_PARALLEL_SIZE=2 +NUM_GPUS=2 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/kustomization.yaml new file mode 100644 index 000000000..0bbc8c8cd --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/kustomization.yaml @@ -0,0 +1,142 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -l4-x4-flux-2-klein-4b + +patches: + - path: patch-server.yaml + - path: patch-ports.yaml + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - 
spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-nodeselector.yaml new file mode 100644 index 000000000..330f065e4 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-l4-24gb-x4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-ports.yaml new file mode 100644 index 000000000..cc8584d48 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-ports.yaml @@ -0,0 +1,38 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + readinessProbe: + httpGet: + port: 30000 +--- +apiVersion: v1 +kind: Service +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 30000 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-resources.yaml new file mode 100644 index 000000000..04b58a68f --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-resources.yaml @@ -0,0 +1,33 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + cpu: "40" + memory: 160G + nvidia.com/gpu: "4" + requests: + cpu: "40" + memory: 160G + nvidia.com/gpu: "4" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-server.yaml new file mode 100644 index 000000000..1c267ac3c --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-server.yaml @@ -0,0 +1,46 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + env: + - name: NUM_GPUS + valueFrom: + configMapKeyRef: + name: runtime + key: NUM_GPUS + - name: TENSOR_PARALLEL_SIZE + valueFrom: + configMapKeyRef: + name: runtime + key: TENSOR_PARALLEL_SIZE + command: + - sglang + - serve + args: + - "--model-path=/gcs/$(MODEL_ID)" + - "--tp-size=$(TENSOR_PARALLEL_SIZE)" + - "--trust-remote-code" + - "--port=30000" + - "--host=0.0.0.0" + - "--backend=sglang" + - "--num-gpus=$(NUM_GPUS)" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/runtime.env new file mode 100644 index 000000000..60aeb744b --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=diffusers-l4-x4-flux-2-klein-4b +GPU_MEMORY_UTILIZATION=0.95 +MODEL_ID=black-forest-labs/flux.2-klein-4b +MODEL_NAME=flux-2-klein-4b +TENSOR_PARALLEL_SIZE=4 +NUM_GPUS=4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/kustomization.yaml new file mode 100644 index 000000000..f0935a835 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/kustomization.yaml @@ -0,0 +1,141 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -rtx-pro-6000-1-2-flux-2-klein-4b + +patches: + - path: patch-ports.yaml + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + 
select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base-vgpu diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-nodeselector.yaml new file mode 100644 index 000000000..157faff63 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1-2 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-ports.yaml new file mode 100644 index 000000000..cc8584d48 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-ports.yaml @@ -0,0 +1,38 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + readinessProbe: + httpGet: + port: 30000 +--- +apiVersion: v1 +kind: Service +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 30000 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-resources.yaml new file mode 100644 index 000000000..8193521e2 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-resources.yaml @@ -0,0 +1,40 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + metadata: + annotations: + devices.gke.io/container.inference-server: |+ + - path: /dev/nvidia-caps/nvidia-cap1 + - path: /dev/nvidia-caps/nvidia-cap2 + - path: /dev/nvidia-caps/nvidia-cap3 + - path: /dev/nvidia-caps/nvidia-cap4 + spec: + containers: + - name: inference-server + resources: + limits: + cpu: "6" + memory: 45G + nvidia.com/gpu: "1" + requests: + cpu: "6" + memory: 45G + nvidia.com/gpu: "1" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/runtime.env new file mode 100644 index 000000000..f35250665 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/runtime.env @@ -0,0 +1,5 @@ +APP_LABEL=diffusers-rtx-pro-6000-1-2-flux-2-klein-4b +GPU_MEMORY_UTILIZATION=0.95 +MODEL_ID=black-forest-labs/flux.2-klein-4b +MODEL_NAME=flux-2-klein-4b +TENSOR_PARALLEL_SIZE=1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/kustomization.yaml new file mode 100644 index 000000000..ab6afae70 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/kustomization.yaml @@ -0,0 +1,141 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -rtx-pro-6000-1-4-flux-2-klein-4b + +patches: + - path: patch-ports.yaml + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + 
kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base-vgpu diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-nodeselector.yaml new file mode 100644 index 000000000..d50d8bf60 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1-4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-ports.yaml new file mode 100644 index 000000000..cc8584d48 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-ports.yaml @@ -0,0 +1,38 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + readinessProbe: + httpGet: + port: 30000 +--- +apiVersion: v1 +kind: Service +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 30000 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-resources.yaml new file mode 100644 index 000000000..c6655431e --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-resources.yaml @@ -0,0 +1,40 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + metadata: + annotations: + devices.gke.io/container.inference-server: |+ + - path: /dev/nvidia-caps/nvidia-cap1 + - path: /dev/nvidia-caps/nvidia-cap2 + - path: /dev/nvidia-caps/nvidia-cap3 + - path: /dev/nvidia-caps/nvidia-cap4 + spec: + containers: + - name: inference-server + resources: + limits: + cpu: "6" + memory: 38G + nvidia.com/gpu: "1" + requests: + cpu: "6" + memory: 38G + nvidia.com/gpu: "1" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/runtime.env new file mode 100644 index 000000000..646fdd22f --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/runtime.env @@ -0,0 +1,5 @@ +APP_LABEL=diffusers-rtx-pro-6000-1-4-flux-2-klein-4b +GPU_MEMORY_UTILIZATION=0.95 +MODEL_ID=black-forest-labs/flux.2-klein-4b +MODEL_NAME=flux-2-klein-4b +TENSOR_PARALLEL_SIZE=1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/kustomization.yaml new file mode 100644 index 000000000..51bed340a --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/kustomization.yaml @@ -0,0 +1,152 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -rtx-pro-6000-1-8-flux-2-klein-4b + +patches: + - path: patch-ports.yaml + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + - target: + group: apps + version: v1 + kind: Deployment + name: diffusers + patch: |- + - op: remove + path: /spec/template/spec/volumes/3/emptyDir/medium + - op: remove + path: /spec/template/spec/volumes/4/emptyDir/medium + - op: remove + path: /spec/template/spec/volumes/5/emptyDir/medium + +replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - 
metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base-vgpu diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-nodeselector.yaml new file mode 100644 index 000000000..fd81a2a54 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1-8 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-ports.yaml new file mode 100644 index 000000000..cc8584d48 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-ports.yaml @@ -0,0 +1,38 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + readinessProbe: + httpGet: + port: 30000 +--- +apiVersion: v1 +kind: Service +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 30000 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-resources.yaml new file mode 100644 index 000000000..b90a86b0b --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-resources.yaml @@ -0,0 +1,82 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + metadata: + annotations: + devices.gke.io/container.inference-server: |+ + - path: /dev/nvidia-caps/nvidia-cap1 + - path: /dev/nvidia-caps/nvidia-cap2 + - path: /dev/nvidia-caps/nvidia-cap3 + - path: /dev/nvidia-caps/nvidia-cap4 + spec: + tolerations: + - effect: NoSchedule + key: node.kubernetes.io/memory-pressure + operator: Exists + initContainers: + - name: preload-model + image: us-central1-docker.pkg.dev/accelerated-platforms-dev/frr-l4/gpu/sglang:latest + command: ["/bin/bash", "-c"] + args: + - | + set -eo pipefail + echo "########### Copying model files to local SSD storage..." + mkdir -p /local-model/black-forest-labs + cp -a /gcs/black-forest-labs/flux.2-klein-4b /local-model/black-forest-labs/ + echo "########### Copy completed successfully!" + ls -lh /local-model/black-forest-labs/flux.2-klein-4b + volumeMounts: + - mountPath: /gcs/black-forest-labs/flux.2-klein-4b + name: huggingface-hub-model-bucket + readOnly: true + - mountPath: /local-model + name: local-model-storage + containers: + - name: inference-server + args: + - serve + - --mem-fraction-static=$(GPU_MEMORY_UTILIZATION) + - --model-path=/local-model/black-forest-labs/flux.2-klein-4b + - --tp-size=$(TENSOR_PARALLEL_SIZE) + - --trust-remote-code + - --port=30000 + - --host=0.0.0.0 + - --text-encoder-cpu-offload + - --vae-cpu-offload + - --pin-cpu-memory=False + resources: + limits: + cpu: "4" + memory: 23500Mi + nvidia.com/gpu: "1" + requests: + cpu: "4" + memory: 12Gi + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /local-model + name: local-model-storage + readOnly: true + - $patch: delete + name: fetch-safetensors + volumes: + - name: local-model-storage + emptyDir: {} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/runtime.env 
b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/runtime.env new file mode 100644 index 000000000..5021ea1d5 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/runtime.env @@ -0,0 +1,5 @@ +APP_LABEL=diffusers-rtx-pro-6000-1-8-flux-2-klein-4b +GPU_MEMORY_UTILIZATION=0.75 +MODEL_ID=black-forest-labs/flux.2-klein-4b +MODEL_NAME=flux-2-klein-4b +TENSOR_PARALLEL_SIZE=1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/kustomization.yaml new file mode 100644 index 000000000..ab55bbdca --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/kustomization.yaml @@ -0,0 +1,142 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -rtx-pro-6000-flux-2-klein-4b + +patches: + - path: patch-server.yaml + - path: patch-ports.yaml + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.INFERENCE_SERVER + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/inference-server] + select: + kind: Deployment + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: diffusers + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-nodeselector.yaml new file mode 100644 index 000000000..44af184b7 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-ports.yaml new file mode 100644 index 000000000..cc8584d48 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-ports.yaml @@ -0,0 +1,38 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + readinessProbe: + httpGet: + port: 30000 +--- +apiVersion: v1 +kind: Service +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 30000 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-resources.yaml new file mode 100644 index 000000000..2d997452a --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-resources.yaml @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + cpu: "6" + memory: 45G + nvidia.com/gpu: "1" + requests: + cpu: "6" + memory: 45G + nvidia.com/gpu: "1" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-server.yaml new file mode 100644 index 000000000..4725a1810 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-server.yaml @@ -0,0 +1,34 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: diffusers + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + command: + - sglang + - serve + args: + - "--mem-fraction-static=$(GPU_MEMORY_UTILIZATION)" + - "--model-path=/gcs/$(MODEL_ID)" + - "--tp-size=$(TENSOR_PARALLEL_SIZE)" + - "--trust-remote-code" + - "--port=30000" + - "--host=0.0.0.0" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/runtime.env new file mode 100644 index 000000000..a4eaea70a --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/runtime.env @@ -0,0 +1,5 @@ +APP_LABEL=diffusers-rtx-pro-6000-flux-2-klein-4b +GPU_MEMORY_UTILIZATION=0.95 +MODEL_ID=black-forest-labs/flux.2-klein-4b +MODEL_NAME=flux-2-klein-4b +TENSOR_PARALLEL_SIZE=1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md b/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md index d235a13fd..d855ec88b 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md @@ -149,10 +149,10 @@ For more information about providing values for Terraform input variables, see - Configure the platform. 
- - [Optional] + - \[Optional\] [Hugging Face initialization](/platforms/gke/base/core/huggingface/initialize/README.md) - - [Optional] - [NVIDIA initialization](/platforms/gke/base/core/nvidia/initialize/README.md) + - \[Optional\] + [NVIDIA NGC initialization](/platforms/gke/base/core/nvidia/initialize/README.md) ### Resources created @@ -180,6 +180,7 @@ For more information about providing values for Terraform input variables, see CPU
    +
  • cpu-e2-s-16
  • cpu-n4-s-8
    • @@ -204,10 +205,14 @@ For more information about providing values for Terraform input variables, see
    • gpu-l4-24gb-x2
    • gpu-l4-24gb-x4
    • gpu-l4-24gb-x8
    • +
    • gpu-rtx-pro-6000-96gb-x1
    • +
    • gpu-rtx-pro-6000-96gb-x1-2
    • +
    • gpu-rtx-pro-6000-96gb-x1-4
    • +
    • gpu-rtx-pro-6000-96gb-x1-8
      • -
        - + TPU
          diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch_variables.tf index 6992f930d..2450e427f 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch_variables.tf +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch_variables.tf @@ -52,10 +52,13 @@ locals { ira_offline_batch_project_id = var.ira_offline_batch_project_id != null ? var.ira_offline_batch_project_id : var.platform_default_project_id - ira_online_gpu_diffusers_flux_image_url = var.ira_online_gpu_diffusers_flux_image_url != null ? var.ira_online_gpu_diffusers_flux_image_url : "${local.cloudbuild_ar_image_repository_url}/gpu-diffusers/flux:latest" - ira_online_gpu_kubernetes_namespace_name = var.ira_online_gpu_kubernetes_namespace_name != null ? var.ira_online_gpu_kubernetes_namespace_name : "${local.unique_identifier_prefix}-online-gpu" - ira_online_gpu_kubernetes_service_account_name = var.ira_online_gpu_kubernetes_service_account_name != null ? var.ira_online_gpu_kubernetes_service_account_name : "${local.unique_identifier_prefix}-online-gpu" - ira_online_gpu_vllm_image_url = var.ira_online_gpu_vllm_image_url != null ? var.ira_online_gpu_vllm_image_url : "${local.cloudbuild_ar_image_repository_url}/vllm/gpu:latest" + ira_cpu_k6_benchmark_image_url = var.ira_cpu_k6_benchmark_image_url != null ? var.ira_cpu_k6_benchmark_image_url : "${local.cloudbuild_ar_image_repository_url}/cpu/k6-benchmark:latest" + + ira_online_gpu_diffusers_flux_image_url = var.ira_online_gpu_diffusers_flux_image_url != null ? var.ira_online_gpu_diffusers_flux_image_url : "${local.cloudbuild_ar_image_repository_url}/gpu-diffusers/flux:latest" + ira_online_gpu_diffusers_sglang_diffusers_image_url = var.ira_online_gpu_sglang_diffusers_image_url != null ? 
var.ira_online_gpu_sglang_diffusers_image_url : "${local.cloudbuild_ar_image_repository_url}/gpu/sglang:latest" + ira_online_gpu_kubernetes_namespace_name = var.ira_online_gpu_kubernetes_namespace_name != null ? var.ira_online_gpu_kubernetes_namespace_name : "${local.unique_identifier_prefix}-online-gpu" + ira_online_gpu_kubernetes_service_account_name = var.ira_online_gpu_kubernetes_service_account_name != null ? var.ira_online_gpu_kubernetes_service_account_name : "${local.unique_identifier_prefix}-online-gpu" + ira_online_gpu_vllm_image_url = var.ira_online_gpu_vllm_image_url != null ? var.ira_online_gpu_vllm_image_url : "${local.cloudbuild_ar_image_repository_url}/vllm/gpu:latest" ira_online_tpu_kubernetes_namespace_name = var.ira_online_tpu_kubernetes_namespace_name != null ? var.ira_online_tpu_kubernetes_namespace_name : "${local.unique_identifier_prefix}-online-tpu" ira_online_tpu_kubernetes_service_account_name = var.ira_online_tpu_kubernetes_service_account_name != null ? var.ira_online_tpu_kubernetes_service_account_name : "${local.unique_identifier_prefix}-online-tpu" @@ -219,6 +222,11 @@ variable "ira_online_gpu_diffusers_flux_image_url" { type = string } +variable "ira_cpu_k6_benchmark_image_url" { + default = null + description = "The URL for the k6 benchmark container image." + type = string +} variable "ira_online_gpu_kubernetes_namespace_name" { default = null @@ -238,6 +246,12 @@ variable "ira_online_gpu_vllm_image_url" { type = string } +variable "ira_online_gpu_sglang_diffusers_image_url" { + default = null + description = "The URL for the GPU SGLang Diffusers container image." + type = string +} + variable "ira_online_tpu_kubernetes_namespace_name" { default = null description = "The Kubernetes namespace for the online TPU inference workloads." 
@@ -291,4 +305,3 @@ variable "enable_tpu" { description = "Turns on inference-perf resources for TPU cluster" type = bool } - diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf index 8f573d8f7..824a1a8ca 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf @@ -152,10 +152,18 @@ output "ira_offline_batch_project_id" { value = local.ira_offline_batch_project_id } +output "ira_cpu_k6_benchmark_image_url" { + value = local.ira_cpu_k6_benchmark_image_url +} + output "ira_online_gpu_diffusers_flux_image_url" { value = local.ira_online_gpu_diffusers_flux_image_url } +output "ira_online_gpu_diffusers_sglang_diffusers_image_url" { + value = local.ira_online_gpu_diffusers_sglang_diffusers_image_url +} + output "ira_online_gpu_kubernetes_namespace_name" { value = local.ira_online_gpu_kubernetes_namespace_name } @@ -239,4 +247,3 @@ output "workflow_api_service_account_oauth_display_name" { output "workflow_api_service_account_project_id" { value = local.workflow_api_service_account_project_id } - diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh index ecfbd0c4e..da72d5651 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh @@ -14,7 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
MY_PATH_IRA_ENV="$( - cd "$(dirname "${BASH_SOURCE}")" >/dev/null 2>&1 + SCRIPT_SOURCE="${BASH_SOURCE[0]:-}" + if [[ -z "${SCRIPT_SOURCE:-}" ]]; then + # Fallback in case BASH_SOURCE is not defined, such as when sourcing this + # script from a non-Bash shell + SCRIPT_SOURCE="$0" + fi + + cd "$(dirname "${SCRIPT_SOURCE}")" >/dev/null 2>&1 || return 1 pwd -P )" @@ -36,6 +43,7 @@ if [[ -v HF_MODEL_ID ]]; then HF_MODEL_NAME="${HF_MODEL_ID##*/}" HF_MODEL_NAME="${HF_MODEL_NAME//./-}" - HF_MODEL_NAME="${HF_MODEL_NAME,,}" + # Don't use ,, to make this portable across shells + HF_MODEL_NAME="$(echo "${HF_MODEL_NAME}" | tr '[:upper:]' '[:lower:]')" export HF_MODEL_NAME fi diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/deploy-standard.sh b/platforms/gke/base/use-cases/inference-ref-arch/terraform/deploy-standard.sh index c83ae354b..d2f6a6a66 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/deploy-standard.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/deploy-standard.sh @@ -50,6 +50,7 @@ declare -a CORE_TERRASERVICES_APPLY=( "workloads/jobset" "workloads/lws" "workloads/priority_class" + "workloads/nri_device_injector" "workloads/kueue" ) CORE_TERRASERVICES_APPLY="${CORE_TERRASERVICES_APPLY[*]}" "${ACP_PLATFORM_CORE_DIR}/deploy.sh" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/.terraform.lock.hcl b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/.terraform.lock.hcl new file mode 100644 index 000000000..45a622858 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/.terraform.lock.hcl @@ -0,0 +1,42 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/google" { + version = "6.49.2" + constraints = "6.49.2" + hashes = [ + "h1:+B64rc5fCMrWtjZIsQGx/fftYiQnSInfqrLs76PZNH0=", + "zh:04dbba38cc201d8f35f21c65fe5fe022b2ef30712c59d0b04df1182ee484ee29", + "zh:37478f37b696e214049a7c1e397a6ebcf6b10e3652a6275c5e99ef972a0cd17f", + "zh:3a68292e88e6612ed014e22d53a693859071337fcc49a244936094ae8f2b82d8", + "zh:4adc8c706652b6c170c520bd3815abba7e145aeec26a2abdfa8a98ae85fbfc0d", + "zh:5e8dbf922be32eb54c370260fd71e8124d4d7a3bddc2d0e6b47b15efc30a2224", + "zh:632bccc9396e61947242095738164ae27db060b1c172422b41e3b12e80236ecc", + "zh:66ee64a5621199868c8fa68492124d38b37e1d733d240508c595b124b5123cb7", + "zh:6843060f0673a4e556c248672171c8a29c7faeaee9954cdffeb19a55de7e5184", + "zh:87d3b0bd397de17ea6c8b34c898afb9f08eda28c6c6272d8dd75fe17ceef77f3", + "zh:9d2f0f93f4506dc0002c2dec1b2117626b6376c214653b71629a933ce77e3523", + "zh:e80ccae3d640dca17b496220e3f42f6f0cc4c6fb80ffae9e2bbaea446373c137", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/hashicorp/local" { + version = "2.5.3" + constraints = "2.5.3" + hashes = [ + "h1:1Nkh16jQJMp0EuDmvP/96f5Unnir0z12WyDuoR6HjMo=", + "zh:284d4b5b572eacd456e605e94372f740f6de27b71b4e1fd49b63745d8ecd4927", + "zh:40d9dfc9c549e406b5aab73c023aa485633c1b6b730c933d7bcc2fa67fd1ae6e", + "zh:6243509bb208656eb9dc17d3c525c89acdd27f08def427a0dce22d5db90a4c8b", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:885d85869f927853b6fe330e235cd03c337ac3b933b0d9ae827ec32fa1fdcdbf", + "zh:bab66af51039bdfcccf85b25fe562cbba2f54f6b3812202f4873ade834ec201d", + "zh:c505ff1bf9442a889ac7dca3ac05a8ee6f852e0118dd9a61796a2f6ff4837f09", + "zh:d36c0b5770841ddb6eaf0499ba3de48e5d4fc99f4829b6ab66b0fab59b1aaf4f", + "zh:ddb6a407c7f3ec63efb4dad5f948b54f7f4434ee1a2607a49680d494b1776fe1", + "zh:e0dafdd4500bec23d3ff221e3a9b60621c5273e5df867bc59ef6b7e41f5c91f6", + 
"zh:ece8742fd2882a8fc9d6efd20e2590010d43db386b920b2a9c220cfecc18de47", + "zh:f4c6b3eb8f39105004cf720e202f04f57e3578441cfb76ca27611139bc116a82", + ] +} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..238bf8e95 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild_variables.tf new file mode 120000 index 000000000..8fade6147 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch.auto.tfvars new file mode 120000 index 000000000..c89c6eab2 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/inference-ref-arch.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch_variables.tf new file mode 120000 index 000000000..b2f96723d --- /dev/null +++ 
b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/inference-ref-arch_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform.auto.tfvars new file mode 120000 index 000000000..c9c406bba --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../_shared_config/_platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform_variables.tf new file mode 120000 index 000000000..7ec64070d --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform_variables.tf @@ -0,0 +1 @@ +../../../_shared_config/_platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/cloudbuild.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/cloudbuild.tf new file mode 100644 index 000000000..b7640b5f5 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/cloudbuild.tf @@ -0,0 +1,45 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + image_destination = local.ira_async_cpu_load_generator_image_url +} + +resource "terraform_data" "submit_docker_build_k6_benchmark" { + input = { + acp_root = local.acp_root + cloudbuild_project_id = local.cloudbuild_project_id + cloudbuild_service_account_id = local.cloudbuild_service_account_id + cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name + image_destination = local.ira_cpu_k6_benchmark_image_url + } + + provisioner "local-exec" { + command = <<-EOT +gcloud builds submit \ +--config="cloudbuild.yaml" \ +--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ +--project="${self.input.cloudbuild_project_id}" \ +--quiet \ +--service-account="${self.input.cloudbuild_service_account_id}" \ +--substitutions=_DESTINATION="${self.input.image_destination}" +EOT + interpreter = ["bash", "-c"] + working_dir = "${local.acp_root}/container-images/cpu/k6-benchmark" + } + + triggers_replace = { + source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/cpu/k6-benchmark", "**") : filesha256("${local.acp_root}/container-images/cpu/k6-benchmark/${file}")])) + } +} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/local_file.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/local_file.tf new file mode 100644 index 000000000..2635bb2b3 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/local_file.tf @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + acp_root = "${path.module}/../../../../../../../../.." +} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/versions.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/versions.tf new file mode 100644 index 000000000..34a59fbc9 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/versions.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_version = ">= 1.5.7" + + required_providers { + google = { + source = "hashicorp/google" + version = "6.49.2" + } + local = { + source = "hashicorp/local" + version = "2.5.3" + } + } + + provider_meta "google" { + module_name = "cloud-solutions/acp_ira_images_cpu_batch_load_generator_deploy-v1" + } +} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/cloudbuild.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/cloudbuild.tf index ab6a646b1..5a11160ef 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/cloudbuild.tf +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/cloudbuild.tf @@ -45,3 +45,31 @@ EOT source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/gpu/diffusers-flux/src", "**") : filesha256("${local.acp_root}/container-images/gpu/diffusers-flux/src/${file}")])) } } + +resource "terraform_data" "submit_sglang_diffusers" { + input = { + acp_root = local.acp_root + cloudbuild_project_id = local.cloudbuild_project_id + cloudbuild_service_account_id = local.cloudbuild_service_account_id + cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name + image_destination = local.ira_online_gpu_diffusers_sglang_diffusers_image_url + } + + provisioner "local-exec" { + command = <<-EOT +gcloud builds submit \ +--config="cloudbuild.yaml" \ +--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \ +--project="${self.input.cloudbuild_project_id}" \ +--quiet \ +--service-account="${self.input.cloudbuild_service_account_id}" \ +--substitutions=_DESTINATION="${self.input.image_destination}" +EOT + interpreter = ["bash", "-c"] + working_dir = "${local.acp_root}/container-images/gpu/sglang-diffusers" + } + + triggers_replace = { + source_hash = sha256(join("", [for file in 
fileset("${local.acp_root}/container-images/gpu/sglang-diffusers", "**") : filesha256("${local.acp_root}/container-images/gpu/sglang-diffusers/${file}")])) + } +} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-standard.sh b/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-standard.sh index 1d0e1ba9a..e59cf3f97 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-standard.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-standard.sh @@ -69,6 +69,7 @@ done if [ "${ACP_TEARDOWN_CORE_PLATFORM}" = "true" ]; then declare -a CORE_TERRASERVICES_DESTROY=( "workloads/kueue" + "workloads/nri_device_injector" "workloads/priority_class" "workloads/lws" "workloads/jobset" diff --git a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh index 102621046..3a507e469 100755 --- a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh +++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh @@ -47,9 +47,14 @@ export ACCELERATOR_TYPE="l4" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-load-generator/configure_load_generator.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/vllm/configure_vllm.sh" -"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm/configure_vllm.sh" +# Validate diffusers kustomize +export 
HF_MODEL_ID="black-forest-labs/flux.1-schnell" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh" +export HF_MODEL_ID="black-forest-labs/flux.2-klein-4b" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh" + export ACCELERATOR_TYPE="v5e" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/max-diffusion/configure_max_diffusion.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/vllm/configure_vllm.sh" @@ -65,6 +70,12 @@ export APP_LABEL="vllm-rtx-pro-6000-gemma-3-27b-it-sd-eagle" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/configure_vllm_spec_decoding.sh" +# Validate k6-benchmark kustomize +export ACCELERATOR_TYPE="l4" +export HF_MODEL_NAME="HF_MODEL_NAME" +export K6_REQUEST_BATCH_SIZE=1 +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh" + # Validate offline-batch-inference-gpu kustomize export ACCELERATOR_TYPE="rtx-pro-6000" export HF_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct"