diff --git a/.dev-tools/.gitignore b/.dev-tools/.gitignore
index b93b0f69e..12262726f 100644
--- a/.dev-tools/.gitignore
+++ b/.dev-tools/.gitignore
@@ -8,6 +8,7 @@ platforms/gke/base/core/workloads/inference_gateway/manifests/*
 platforms/gke/base/core/workloads/jobset/manifests/*
 platforms/gke/base/core/workloads/kueue/manifests/*
 platforms/gke/base/core/workloads/lws/manifests/*
+platforms/gke/base/core/workloads/nri_device_injector/manifests/*
 platforms/gke/base/core/workloads/nvidia_nim/*
 platforms/gke/base/core/workloads/priority_class/manifests/*
 platforms/gke/base/kubernetes/*
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 5589878f0..bb7aef62d 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM hashicorp/terraform:1.5.7 AS terraform
-FROM koalaman/shellcheck:v0.10.0 AS shellcheck
-FROM mvdan/shfmt:v3.10.0 AS shfmt
+FROM hashicorp/terraform:1.14.8 AS terraform
+FROM koalaman/shellcheck:v0.11.0 AS shellcheck
+FROM mvdan/shfmt:v3.13.1 AS shfmt
 
 FROM python:3.13-bookworm AS python-builder
 
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index e76a7e0c6..2405c019c 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json",
-  "name": "Cloud Solutions devcontainer",
+  "name": "Accelerated Platforms devcontainer",
   "build": {
     "dockerfile": "Dockerfile"
   },
@@ -13,7 +13,9 @@
         "editor.wordWrap": "off",
         "files.insertFinalNewline": true,
         "files.trimFinalNewlines": true,
+        "geminicodeassist.displayInlineContextHint": false,
         "prettier.resolveGlobalModules": true,
+        "python.defaultInterpreterPath": "/venv/bin/python",
         "redhat.telemetry.enabled": false,
         "telemetry.telemetryLevel": "off",
         "[css]": {
@@ -78,6 +80,7 @@
         "ms-azuretools.vscode-containers",
         "ms-python.black-formatter",
         "ms-python.isort",
+        "ms-python.python",
         "streetsidesoftware.code-spell-checker",
         "timonwong.shellcheck"
       ]
diff --git a/.github/workflows/dictionary/python.txt b/.github/workflows/dictionary/python.txt
index 9d8cbc9f3..45655b0ba 100644
--- a/.github/workflows/dictionary/python.txt
+++ b/.github/workflows/dictionary/python.txt
@@ -3,10 +3,16 @@ aiohttp
 aqtp
 asctime
 asgi
+asynccontextmanager
 asyncio
+certifi
+cffi
 classmethod
 configparser
+contextlib
 coveragerc
+dataclass
+dataclasses
 dataframe
 dbapi
 dbcommands
@@ -17,6 +23,7 @@ fastapi
 fillna
 fromarray
 frombuffer
+fromisoformat
 fsspec
 ftfy
 functools
@@ -29,11 +36,13 @@ getframerate
 getnchannels
 getnframes
 getsampwidth
+grpcio
 gunicorn
 hasattr
 hashlib
 hexdigest
 httpx
+idna
 iloc
 imgf
 inplace
@@ -59,7 +68,10 @@ pgvector
 pipreqs
 pmap
 prng
+protos
+pyasn
 pycache
+pycparser
 pydantic
 pyenv
 pylint
@@ -69,8 +81,10 @@ pythondontwritebytecode
 pythonpath
 pythonunbuffered
 qualname
+quantiles
 readframes
 removesuffix
+reqs
 rerank
 reranked
 retryable
@@ -83,13 +97,16 @@ shutil
 spacy
 splitlines
 sqlalchemy
+strftime
 tensorboard
 tensorboardx
 thejsonlogger
 tqdm
 unittests
 urllib
+urlopen
 urlretrieve
 uvicorn
 venv
 writerow
+writestr
diff --git a/.github/workflows/dictionary/sglang.txt b/.github/workflows/dictionary/sglang.txt
new file mode 100644
index 000000000..b14275eba
--- /dev/null
+++ b/.github/workflows/dictionary/sglang.txt
@@ -0,0 +1,4 @@
+lmsysorg
+musa
+nvls
+sglang
diff --git a/.github/workflows/dictionary/shell.txt b/.github/workflows/dictionary/shell.txt
index 637872058..7b1b5b8e3 100644
--- a/.github/workflows/dictionary/shell.txt
+++ b/.github/workflows/dictionary/shell.txt
@@ -16,6 +16,7 @@ nslookup
 pipefail
 pkill
 shuf
+subshell
 syscall
 xtrace
 zxvf
diff --git a/.gitignore b/.gitignore
index 3d84f34c6..ba12e5b8f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,10 @@ terraform.tfstate*
 # Test
 test/log/*.log
 test/scripts/environment_files/*
+
+# Generated outputs
+*.log
+k6-*.txt
+k6-*.csv
+k6-*.jsonl
+k6-report.md
diff --git a/README.md b/README.md
index c6533e770..183ec12e5 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ the primary runtime.
 - [LLM Inference Optimization: Achieving faster Pod Startup with Google Cloud Storage](/use-cases/inferencing/cost-optimization/gcsfuse/AchievingFasterPodStartup.md)
 - [Optimizing GKE Workloads with Custom Compute Classes](/docs/guides/optimizing-gke-workloads-with-custom-compute-classes/README.md)
 
-### [Deprecated] Playground AI/ML Platform on GKE
+### \[Deprecated\] Playground AI/ML Platform on GKE
 
 The [Playground AI/ML Platform on GKE](/platforms/gke-aiml/playground/README.md)
 is a quick-start implementation of the platform that can be used to familiarize
diff --git a/container-images/cpu/k6-benchmark/Dockerfile b/container-images/cpu/k6-benchmark/Dockerfile
new file mode 100644
index 000000000..ae4beebe2
--- /dev/null
+++ b/container-images/cpu/k6-benchmark/Dockerfile
@@ -0,0 +1,31 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM grafana/k6:1.7.1
+
+USER root
+
+WORKDIR /app
+# Create the /output directory and ensure k6 owns it, along with /app
+RUN mkdir -p /output && chown -R k6:k6 /app /output
+
+COPY --chown=k6:k6 scripts /app/scripts
+# Octal mode: symbolic modes such as a+x are only accepted by newer Dockerfile frontends
+COPY --chmod=0755 --chown=k6:k6 entrypoint.sh /app/entrypoint.sh
+
+# Switch back to the unprivileged k6 user
+USER k6
+
+ENTRYPOINT ["/app/entrypoint.sh"]
+CMD ["--help"]
diff --git a/container-images/cpu/k6-benchmark/README.md b/container-images/cpu/k6-benchmark/README.md
new file mode 100644
index 000000000..d1a3626d1
--- /dev/null
+++ b/container-images/cpu/k6-benchmark/README.md
@@ -0,0 +1,99 @@
+# k6 Benchmark Image
+
+This container image packages the [k6](https://k6.io/) load testing tool with
+specific scripts to benchmark Machine Learning inference workloads.
+
+It is designed to run in environments like Google Kubernetes Engine (GKE) to
+generate consistent, reproducible load against target endpoints and output
+granular metrics to a JSONL file for further analysis. It also includes a Python
+script (`extract_metrics.py`) that can be run manually to process the k6 output
+and generate a price/performance report.
+
+## Usage
+
+You can run this container image via Docker or deploy it as a Job in a
+Kubernetes cluster.
+
+### Environment Variables
+
+The container accepts the following optional environment variables for metric
+output naming and processing:
+
+- `ACCELERATOR_NAME`: A string representing the target hardware (e.g., `l4`,
+  `a100`, `v5p`). If not provided, it defaults to `accelerator-not-set`.
+- `NODE_HOURLY_COST`: The hourly cost of the underlying node in USD. Used by the
+  automatic metric extraction script to compute cost per 1k images. Defaults to
+  `0.0`.
+
+The default benchmark script (`k6-diffusers-flux-2-klein-4b.js`) expects the
+following environment variables:
+
+- `TARGET_URL`: The full URL of the inference endpoint to test (e.g.,
+  `http://model-service:8000/generate`).
+- `BATCH_SIZE`: The batch size to request in the payload (default: `1`).
+- `VUS`: The number of concurrent Virtual Users to simulate (default: `1`).
+
+### Running via Docker
+
+Set the k6 script to run by setting the `CMD` to point to the script path when
+starting the container:
+
+```bash
+# Example: running a different script mounted into the container
+docker run --rm \
+  -e ACCELERATOR_NAME="custom" \
+  -v $(pwd)/custom-script.js:/app/custom-script.js \
+  -v $(pwd)/output:/output \
+  k6-benchmark:latest /app/custom-script.js
+```
+
+The k6 output will be saved in the mapped `/output` directory on your host. The
+filename will be dynamically generated in the format:
+`<name-of-k6-script>-<ACCELERATOR_NAME>-<experiment-start-timestamp>.jsonl`.
+For example: `k6-diffusers-flux-2-klein-4b-l4-20260417T120000Z.jsonl`.
+
+#### Supported Benchmarks
+
+The following benchmark scripts are included:
+
+- **`/app/scripts/k6-diffusers-flux-2-klein-4b.js`**: Benchmark the
+  FLUX.2-klein-4B image generation model.
+
+## Metrics Extraction
+
+The extraction script (`extract_metrics.py`) can be run manually after the
+benchmark finishes to generate a price/performance report.
+
+The extraction script calculates throughput (Images/sec) and latencies (p50,
+p95, p99) strictly from the `benchmark` scenario, and automatically fetches
+corresponding on-node telemetry (Peak VRAM, Avg GPU Utilization) from Google
+Cloud Monitoring if the dependencies are installed and it is running on Google
+Cloud.
+
+To ensure accurate hardware metrics when multiple deployments are running in the
+same project, the script can filter by pod, namespace, or node. If the `--pod`
+argument is omitted, the script automatically uses the `deployment_name`
+(extracted from the `TARGET_URL` hostname) as a prefix to filter for relevant
+pods.
+
+### Script Arguments
+
+- `--file`: Path to the k6 `.jsonl` output file (Required).
+- `--output-csv`: Path to the output CSV file where aggregated results are
+  stored (Optional, default: `k6-benchmark.csv`).
+- `--hourly-cost`: The hourly cost of the underlying GKE node in USD. If set to
+  `0.0`, a warning is emitted and cost metrics will be `0.0` (Optional, default:
+  `0.0`).
+- `--project-id`: Google Cloud Project ID to query DCGM metrics via Cloud
+  Monitoring. If omitted, the script dynamically fetches the project ID from the
+  Google Cloud Metadata server (Optional).
+- `--pod`: Filter metrics by a specific pod name. If omitted, the script
+  automatically uses the `deployment_name` (derived from the `TARGET_URL`
+  hostname) as a prefix filter to match all relevant pods in the deployment
+  (Optional).
+- `--namespace`: Filter metrics by a specific namespace (Optional).
+- `--node`: Filter metrics by a specific node name (Optional).
+- `--vram-metric`: The Prometheus metric string for VRAM usage (Default:
+  `prometheus.googleapis.com/DCGM_FI_DEV_FB_USED/gauge`).
+- `--util-metric`: The Prometheus metric string for GPU utilization (Default:
+  `prometheus.googleapis.com/DCGM_FI_DEV_GPU_UTIL/gauge`).
diff --git a/container-images/cpu/k6-benchmark/cloudbuild.yaml b/container-images/cpu/k6-benchmark/cloudbuild.yaml
new file mode 100644
index 000000000..7d2515a93
--- /dev/null
+++ b/container-images/cpu/k6-benchmark/cloudbuild.yaml
@@ -0,0 +1,28 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+images:
+  - ${_DESTINATION}
+
+options:
+  logging: CLOUD_LOGGING_ONLY
+
+steps:
+  - args:
+      - build
+      - --tag=${_DESTINATION}
+      - .
+    id: "Build k6 benchmark image"
+    name: "docker.io/docker:28.3.3-dind-alpine3.22"
+    waitFor: ["-"]
diff --git a/container-images/cpu/k6-benchmark/entrypoint.sh b/container-images/cpu/k6-benchmark/entrypoint.sh
new file mode 100755
index 000000000..4c4ee4f25
--- /dev/null
+++ b/container-images/cpu/k6-benchmark/entrypoint.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+set -o nounset
+
+# Default accelerator name
+ACCELERATOR="${ACCELERATOR_NAME:-accelerator-not-set}"
+
+# Find the script name from the arguments
+SCRIPT_PATH=""
+for arg in "$@"; do
+  case "$arg" in
+  *.js)
+    SCRIPT_PATH="$arg"
+    ;;
+  esac
+done
+
+if [ -n "${SCRIPT_PATH:-}" ]; then
+  SCRIPT_NAME=$(basename "$SCRIPT_PATH" .js)
+else
+  SCRIPT_NAME="unknown-script"
+fi
+
+TIMESTAMP=$(date -u +"%Y%m%dT%H%M%SZ")
+FILENAME="${SCRIPT_NAME}-${ACCELERATOR}-${TIMESTAMP}.jsonl"
+OUTPUT_FILE_PATH="/output/${FILENAME}"
+echo "Configured metrics output file: ${OUTPUT_FILE_PATH}"
+
+if [ "$*" = "--help" ]; then
+  k6 --help
+else
+  k6 run \
+    --out "json=${OUTPUT_FILE_PATH}" \
+    "$@"
+fi
diff --git a/container-images/cpu/k6-benchmark/extract_metrics.py b/container-images/cpu/k6-benchmark/extract_metrics.py
new file mode 100644
index 000000000..3da346b1c
--- /dev/null
+++ b/container-images/cpu/k6-benchmark/extract_metrics.py
@@ -0,0 +1,623 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import csv
+import json
+import logging
+import os
+import statistics
+import sys
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+
+# Optional: Google Cloud Monitoring
+try:
+    from google.cloud import monitoring_v3
+
+    HAS_GCP = True
+except ImportError:
+    HAS_GCP = False
+
+
+@dataclass
+class ScenarioResult:
+    name: str
+    durations: List[float]
+    start_time: datetime
+    end_time: datetime
+    tags: Dict[str, any]
+    total_requests: int
+    successful_requests: int
+    vus: int
+
+
+def parse_k6_output(filepath: str) -> List[ScenarioResult]:
+    """Parses k6 JSONL and extracts data for all discovered scenarios."""
+    scenarios_data = {}
+    vus_points = []
+
+    logging.info(f"Parsing k6 output file: {filepath}")
+    with open(filepath, "r") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            try:
+                record = json.loads(line)
+            except:
+                continue
+
+            metric_name = record.get("metric")
+            record_type = record.get("type")
+            if record_type != "Point":
+                continue
+
+            data = record.get("data", {})
+            req_tags = data.get("tags", {})
+            value = data.get("value")
+            time_str = data.get("time")
+            if not time_str:
+                continue
+
+            if "." in time_str:
+                base, frac = time_str.split(".")
+                frac = frac.replace("Z", "")[:6]
+                clean_time_str = f"{base}.{frac}Z"
+            else:
+                clean_time_str = time_str
+            if clean_time_str.endswith("Z"):
+                clean_time_str = clean_time_str.replace("Z", "+00:00")
+            dt = datetime.fromisoformat(clean_time_str)
+
+            if metric_name == "vus" and value is not None:
+                vus_points.append((dt, int(value)))
+
+            scenario_name = req_tags.get("scenario")
+            if scenario_name:
+                if scenario_name not in scenarios_data:
+                    scenarios_data[scenario_name] = {
+                        "durations": [],
+                        "total_requests": 0,
+                        "successful_requests": 0,
+                        "start_time": dt,
+                        "end_time": dt,
+                        "tags": {},
+                    }
+                s_entry = scenarios_data[scenario_name]
+                if dt < s_entry["start_time"]:
+                    s_entry["start_time"] = dt
+                if dt > s_entry["end_time"]:
+                    s_entry["end_time"] = dt
+                if metric_name == "http_reqs":
+                    s_entry["total_requests"] += 1
+                    if req_tags.get("expected_response") == "true":
+                        s_entry["successful_requests"] += 1
+                if metric_name == "http_req_duration" and value is not None:
+                    s_entry["durations"].append(value)
+                    if not s_entry["tags"]:
+                        s_entry["tags"] = {
+                            "model": req_tags.get("model", "unknown"),
+                            "accelerator": req_tags.get("accelerator", "unknown"),
+                            "inference_server": req_tags.get(
+                                "inference_server", "unknown"
+                            ),
+                            "width": int(req_tags.get("width", 1024)),
+                            "height": int(req_tags.get("height", 1024)),
+                            "steps": int(req_tags.get("num_inference_steps", 20)),
+                            "seed": req_tags.get("seed", "unknown"),
+                            "batch_size": int(req_tags.get("batch_size", 1)),
+                            "target_url": req_tags.get("target_url", "unknown"),
+                            "deployment_name": req_tags.get(
+                                "deployment_name", "unknown"
+                            ),
+                        }
+
+    results = []
+    for name, data in scenarios_data.items():
+        if not (name.startswith("bench") or name == "benchmark"):
+            continue
+        if not data["durations"]:
+            continue
+        max_vus = 0
+        import re
+
+        m = re.search(r"_v(\d+)_", name)
+        if m:
+            max_vus = int(m.group(1))
+        else:
+            for v_dt, v_val in vus_points:
+                if data["start_time"] <= v_dt <= data["end_time"]:
+                    if v_val > max_vus:
+                        max_vus = v_val
+        results.append(
+            ScenarioResult(
+                name=name,
+                durations=data["durations"],
+                start_time=data["start_time"],
+                end_time=data["end_time"],
+                tags=data["tags"],
+                total_requests=data["total_requests"],
+                successful_requests=data["successful_requests"],
+                vus=max_vus if max_vus > 0 else 1,
+            )
+        )
+    results.sort(key=lambda x: x.start_time)
+    return results
+
+
+def get_typed_value(point_value):
+    if hasattr(point_value, "_pb"):
+        value_type = point_value._pb.WhichOneof("value")
+    elif hasattr(point_value, "WhichOneof"):
+        value_type = point_value.WhichOneof("value")
+    else:
+        value_type = None
+    if value_type == "double_value":
+        return point_value.double_value
+    elif value_type == "int64_value":
+        return point_value.int64_value
+    else:
+        if getattr(point_value, "double_value", 0.0) != 0.0:
+            return point_value.double_value
+        elif getattr(point_value, "int64_value", 0) != 0:
+            return point_value.int64_value
+        return 0.0
+
+
+def fetch_dcgm_metrics(
+    project_id,
+    start_time,
+    end_time,
+    vram_metric,
+    util_metric,
+    power_metric,
+    pod=None,
+    pod_is_prefix=False,
+    namespace=None,
+    node=None,
+):
+    import sys
+
+    print(f"DEBUG: sys.executable = {sys.executable}, HAS_GCP = {HAS_GCP}")
+    if not HAS_GCP or not project_id:
+        print("DEBUG: Exiting early because HAS_GCP is False or project_id is empty")
+        return "N/A", "N/A", "N/A"
+    try:
+        client = monitoring_v3.MetricServiceClient()
+        project_name = f"projects/{project_id}"
+        interval = monitoring_v3.TimeInterval(
+            {"start_time": start_time, "end_time": end_time}
+        )
+        base_filter = ' AND resource.type = "prometheus_target"'
+        if pod:
+            if pod_is_prefix:
+                base_filter += f' AND metric.labels.pod = starts_with("{pod}")'
+            else:
+                base_filter += f' AND metric.labels.pod = "{pod}"'
+        if node:
+            base_filter += f' AND resource.labels.instance = starts_with("{node}")'
+
+        def fetch(m_type):
+            full_filter = f'metric.type = "{m_type}"{base_filter}'
+            print(f"DEBUG: fetch_dcgm_metrics for {pod} with filter: {full_filter}")
+            try:
+                res = client.list_time_series(
+                    request={
+                        "name": project_name,
+                        "filter": full_filter,
+                        "interval": interval,
+                    }
+                )
+                print(f"DEBUG: Found {sum(1 for _ in res)} time series.")
+                return client.list_time_series(
+                    request={
+                        "name": project_name,
+                        "filter": full_filter,
+                        "interval": interval,
+                    }
+                )
+            except Exception as e:
+                print(f"DEBUG: Exception in fetch_dcgm_metrics: {e}")
+                return []
+
+        vram_per_gpu = {}
+        for result in fetch(vram_metric):
+            gpu_idx = result.metric.labels.get("gpu", "0")
+            vram_per_gpu.setdefault(gpu_idx, 0)
+            for point in result.points:
+                val = get_typed_value(point.value)
+                if val > vram_per_gpu[gpu_idx]:
+                    vram_per_gpu[gpu_idx] = val
+
+        compute_per_gpu = {}
+        for result in fetch(util_metric):
+            gpu_idx = result.metric.labels.get("gpu", "0")
+            compute_per_gpu.setdefault(gpu_idx, [])
+            for point in result.points:
+                compute_per_gpu[gpu_idx].append(get_typed_value(point.value))
+
+        power_per_gpu = {}
+        for result in fetch(power_metric):
+            gpu_idx = result.metric.labels.get("gpu", "0")
+            power_per_gpu.setdefault(gpu_idx, [])
+            for point in result.points:
+                power_per_gpu[gpu_idx].append(get_typed_value(point.value))
+
+        avg_compute_per_gpu = {
+            g: sum(vals) / len(vals) for g, vals in compute_per_gpu.items() if vals
+        }
+        avg_power_per_gpu = {
+            g: sum(vals) / len(vals) for g, vals in power_per_gpu.items() if vals
+        }
+
+        total_vram = sum(vram_per_gpu.values())
+        total_compute = sum(avg_compute_per_gpu.values())
+        total_power = sum(avg_power_per_gpu.values())
+
+        avg_compute = (
+            total_compute / len(avg_compute_per_gpu) if avg_compute_per_gpu else 0
+        )
+        avg_power = total_power / len(avg_power_per_gpu) if avg_power_per_gpu else 0
+
+        return {
+            "vram_total": f"{total_vram} MiB" if vram_per_gpu else "N/A",
+            "vram_per_gpu": (
+                json.dumps({g: f"{v} MiB" for g, v in sorted(vram_per_gpu.items())})
+                if vram_per_gpu
+                else "N/A"
+            ),
+            "compute_total": f"{total_compute:.2f}%" if avg_compute_per_gpu else "N/A",
+            "compute_avg": f"{avg_compute:.2f}%" if avg_compute_per_gpu else "N/A",
+            "compute_per_gpu": (
+                json.dumps(
+                    {g: f"{v:.2f}%" for g, v in sorted(avg_compute_per_gpu.items())}
+                )
+                if avg_compute_per_gpu
+                else "N/A"
+            ),
+            "power_total": f"{total_power:.2f} W" if avg_power_per_gpu else "N/A",
+            "power_avg": f"{avg_power:.2f} W" if avg_power_per_gpu else "N/A",
+            "power_per_gpu": (
+                json.dumps(
+                    {g: f"{v:.2f} W" for g, v in sorted(avg_power_per_gpu.items())}
+                )
+                if avg_power_per_gpu
+                else "N/A"
+            ),
+            "raw_total_vram_mib": total_vram,
+        }
+    except Exception as e:
+        logging.error(f"Failed to fetch metrics: {e}")
+        return {}
+
+
+EXPECTED_CSV_HEADER = [  # ordered column names; presumably for the --output-csv file (confirm)
+    "Source File",
+    "Deployment Name",
+    "Target URL",
+    "Model",
+    "Inference Server",
+    "Accelerator",
+    "Resolution",
+    "Inference Steps",
+    "Batch Size",
+    "Virtual Users (VUs)",
+    "Start Time (UTC)",
+    "End Time (UTC)",
+    "Total Time (s)",
+    "Total Requests",
+    "Success Rate (%)",
+    "Throughput (Images/s)",
+    "Request Throughput (RPS)",
+    "Request Latency p50 (s)",
+    "Request Latency p95 (s)",
+    "Request Latency p99 (s)",
+    "Image Latency p50 (s)",
+    "Image Latency p95 (s)",
+    "Image Latency p99 (s)",
+    "Peak VRAM (Total)",
+    "Peak VRAM (Per GPU)",
+    "Peak VRAM Utilization (%)",
+    "Compute (Total)",
+    "Compute (Average)",
+    "Compute (Per GPU)",
+    "Power (Total)",
+    "Power (Average)",
+    "Power (Per GPU)",
+    "Node Hourly Cost ($)",
+    "Cost per 1k Images ($)",
+]
+
+
+def get_gcp_project_id():
+    import urllib.request
+
+    try:
+        url = "http://metadata.google.internal/computeMetadata/v1/project/project-id"
+        req = urllib.request.Request(url, headers={"Metadata-Flavor": "Google"})
+        with urllib.request.urlopen(req, timeout=2) as response:
+            return response.read().decode("utf-8")
+    except:
+        return None
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Extract metrics from multi-scenario k6 JSONL."
+    )
+    parser.add_argument("--file", required=True)
+    parser.add_argument("--output-csv", default="k6-benchmark.csv")
+    parser.add_argument("--hourly-cost", type=float, default=0.0)
+    parser.add_argument("--project-id")
+    parser.add_argument("--pod")
+    parser.add_argument("--namespace")
+    parser.add_argument("--node")
+    parser.add_argument(
+        "--vram-metric", default="prometheus.googleapis.com/DCGM_FI_DEV_FB_USED/gauge"
+    )
+    parser.add_argument(
+        "--util-metric", default="prometheus.googleapis.com/DCGM_FI_DEV_GPU_UTIL/gauge"
+    )
+    parser.add_argument(
+        "--power-metric",
+        default="prometheus.googleapis.com/DCGM_FI_DEV_POWER_USAGE/gauge",
+    )
+
+    args = parser.parse_args()
+    if not args.project_id:
+        args.project_id = get_gcp_project_id()
+
+    scenario_results = parse_k6_output(args.file)
+    if not scenario_results:
+        logging.error("No valid benchmark scenario data found.")
+        sys.exit(1)
+
+    csv_rows, report_sections = [], []
+    input_path = Path(args.file)
+    header = [
+        "=" * 50,
+        f" GKE Price/Performance Benchmark Consolidated Report",
+        f" Source: {input_path.name}",
+        "=" * 50,
+    ]
+
+    summary_cols = [
+        "Scenario",
+        "Res",
+        "B",
+        "VU",
+        "Steps",
+        "Suc%",
+        "Img/s",
+        "RPS",
+        "ReqP50",
+        "ImgP50",
+        "VRAM",
+        "GPU%",
+        "Cost/1k",
+    ]
+    summary_fmt = "{:<20} {:<10} {:<2} {:<2} {:<5} {:<4} {:<7} {:<6} {:<7} {:<7} {:<7} {:<6} {:<8}"
+    summary_table = ["SUMMARY TABLE:", summary_fmt.format(*summary_cols), "-" * 105]
+
+    for res in scenario_results:
+        total_time = (res.end_time - res.start_time).total_seconds()
+        batch_size = res.tags.get("batch_size", 1)
+        throughput = (
+            (res.successful_requests * batch_size) / total_time if total_time > 0 else 0
+        )
+        rps = res.successful_requests / total_time if total_time > 0 else 0
+        success_rate = (
+            (res.successful_requests / res.total_requests) * 100
+            if res.total_requests > 0
+            else 0
+        )
+        p50 = statistics.median(res.durations)
+        if len(res.durations) > 1:
+            q = statistics.quantiles(res.durations, n=100, method="inclusive")
+            p95, p99 = q[94], q[98]
+        else:
+            p95 = p99 = res.durations[0]
+        img_p50, img_p95, img_p99 = p50 / batch_size, p95 / batch_size, p99 / batch_size
+        cost_per_1k = (
+            (args.hourly_cost / (throughput * 3600)) * 1000 if throughput > 0 else 0
+        )
+
+        dcgm_metrics = fetch_dcgm_metrics(
+            args.project_id,
+            res.start_time,
+            res.end_time,
+            args.vram_metric,
+            args.util_metric,
+            args.power_metric,
+            pod=args.pod or res.tags.get("deployment_name"),
+            pod_is_prefix=not args.pod,
+            namespace=args.namespace,
+            node=args.node,
+        )
+
+        vram_total = dcgm_metrics.get("vram_total", "N/A")
+        vram_per_gpu = dcgm_metrics.get("vram_per_gpu", "N/A")
+        comp_total = dcgm_metrics.get("compute_total", "N/A")
+        comp_avg = dcgm_metrics.get("compute_avg", "N/A")
+        comp_per_gpu = dcgm_metrics.get("compute_per_gpu", "N/A")
+        pow_total = dcgm_metrics.get("power_total", "N/A")
+        pow_avg = dcgm_metrics.get("power_avg", "N/A")
+        pow_per_gpu = dcgm_metrics.get("power_per_gpu", "N/A")
+        v_val_mib = dcgm_metrics.get("raw_total_vram_mib", 0)
+
+        vram_util = "N/A"
+        try:
+            accel = res.tags.get("accelerator", "").lower()
+            if "l4-x4" in accel:
+                total_vram_max = 22528 * 4
+            elif "l4-x2" in accel:
+                total_vram_max = 22528 * 2
+            elif "l4" in accel:
+                total_vram_max = 22528
+            elif "6000" in accel:
+                total_vram_max = 98304
+            else:
+                total_vram_max = 0
+
+            if total_vram_max and v_val_mib > 0:
+                vram_util = f"{(v_val_mib / total_vram_max) * 100:.2f}%"
+        except:
+            pass
+
+        summary_table.append(
+            summary_fmt.format(
+                res.name[:20],
+                f"{res.tags.get('width')}x{res.tags.get('height')}",
+                batch_size,
+                res.vus,
+                res.tags.get("steps", 20),
+                f"{success_rate:.0f}",
+                f"{throughput:.2f}",
+                f"{rps:.2f}",
+                f"{p50/1000:.2f}",
+                f"{img_p50/1000:.2f}",
+                f"{v_val_mib/1024:.0f}G" if v_val_mib else "N/A",
+                comp_avg.replace("%", ""),
+                f"${cost_per_1k:.2f}",
+            )
+        )
+
+        report_sections.extend(
+            [
+                "",
+                "=" * 50,
+                " GKE Price/Performance Benchmark Report",
+                "=" * 50,
+                f"Scenario:            {res.name}",
+                f"Model:               {res.tags.get('model')}",
+                f"Inference Server:    {res.tags.get('inference_server')}",
+                f"Accelerator:         {res.tags.get('accelerator')}",
+                f"Resolution:          {res.tags.get('width')}x{res.tags.get('height')}",
+                f"Inference Steps:     {res.tags.get('steps')}",
+                f"Batch Size:          {batch_size}",
+                f"Virtual Users (VUs): {res.vus}",
+                f"Time Window:         {res.start_time.strftime('%Y-%m-%d %H:%M:%S UTC')} to {res.end_time.strftime('%H:%M:%S UTC')} ({total_time:.2f}s)",
+                "-" * 50,
+                "UX Metrics (Off-Node):",
+                f"  Total Requests:    {res.total_requests}",
+                f"  Success Rate:      {success_rate:.2f}%",
+                f"  Throughput:        {throughput:.4f} Images/Second",
+                f"  Request RPS:       {rps:.4f} RPS",
+                f"  Request Latency p50: {p50/1000:.3f} s",
+                f"  Request Latency p95: {p95/1000:.3f} s",
+                f"  Request Latency p99: {p99/1000:.3f} s",
+                f"  Image Latency p50:   {img_p50/1000:.3f} s",
+                f"  Image Latency p95:   {img_p95/1000:.3f} s",
+                f"  Image Latency p99:   {img_p99/1000:.3f} s",
+                "-" * 50,
+                "Hardware Metrics (On-Node DCGM):",
+                f"  Peak VRAM (Total):   {vram_total}",
+                f"  Peak VRAM (Per GPU): {vram_per_gpu}",
+                f"  VRAM Utilization:    {vram_util}",
+                f"  Compute (Total):     {comp_total}",
+                f"  Compute (Average):   {comp_avg}",
+                f"  Compute (Per GPU):   {comp_per_gpu}",
+                f"  Power (Total):       {pow_total}",
+                f"  Power (Average):     {pow_avg}",
+                f"  Power (Per GPU):     {pow_per_gpu}",
+                "-" * 50,
+                "Business Metrics:",
+                f"  Node Hourly Cost:  ${args.hourly_cost:.4f}",
+                f"  Cost per 1k Images:  ${cost_per_1k:.4f}",
+                "=" * 50,
+            ]
+        )
+
+        csv_rows.append(
+            [
+                input_path.name,
+                res.tags.get("deployment_name"),
+                res.tags.get("target_url"),
+                res.tags.get("model"),
+                res.tags.get("inference_server"),
+                res.tags.get("accelerator"),
+                f"{res.tags.get('width')}x{res.tags.get('height')}",
+                res.tags.get("steps"),
+                batch_size,
+                res.vus,
+                res.start_time.strftime("%Y-%m-%d %H:%M:%S"),
+                res.end_time.strftime("%Y-%m-%d %H:%M:%S"),
+                f"{total_time:.2f}",
+                res.total_requests,
+                f"{success_rate:.2f}",
+                f"{throughput:.4f}",
+                f"{rps:.4f}",
+                f"{p50/1000:.3f}",
+                f"{p95/1000:.3f}",
+                f"{p99/1000:.3f}",
+                f"{img_p50/1000:.3f}",
+                f"{img_p95/1000:.3f}",
+                f"{img_p99/1000:.3f}",
+                vram_total,
+                vram_per_gpu,
+                vram_util,
+                comp_total,
+                comp_avg,
+                comp_per_gpu,
+                pow_total,
+                pow_avg,
+                pow_per_gpu,
+                f"{args.hourly_cost:.4f}",
+                f"{cost_per_1k:.4f}",
+            ]
+        )
+
+    output_path = input_path.with_name(f"{input_path.stem}-report.txt")
+    with open(output_path, "w") as f:
+        f.write("\n".join(header + summary_table + report_sections) + "\n")
+
+    csv_out = Path(args.output_csv)
+    write_h = not csv_out.exists()
+    existing_rows = set()
+    if not write_h:
+        with open(csv_out, "r") as f:
+            reader = csv.reader(f)
+            if next(reader, None) != EXPECTED_CSV_HEADER:
+                ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+                csv_out.rename(csv_out.with_name(f"{csv_out.stem}.mismatch.{ts}.csv"))
+                write_h = True
+            else:
+                for row in reader:
+                    if len(row) > 10:
+                        existing_rows.add((row[0], row[6], row[9]))
+
+    with open(csv_out, "a", newline="") as f:
+        writer = csv.writer(f)
+        if write_h:
+            writer.writerow(EXPECTED_CSV_HEADER)
+        appended = 0
+        for row in csv_rows:
+            if (row[0], row[6], row[9]) not in existing_rows:
+                writer.writerow(row)
+                appended += 1
+            else:
+                logging.info(
+                    f"Row for {row[0]} @ {row[6]} with {row[9]} VUs already exists. Skipping."
+                )
+    logging.info(f"Consolidated report saved to {output_path}")
+    logging.info(f"Appended {appended} new rows to {args.output_csv}")
+
+
+if __name__ == "__main__":  # Run main() only when executed directly, not on import.
+    main()
diff --git a/container-images/cpu/k6-benchmark/requirements.in b/container-images/cpu/k6-benchmark/requirements.in
new file mode 100644
index 000000000..e0ab03e55
--- /dev/null
+++ b/container-images/cpu/k6-benchmark/requirements.in
@@ -0,0 +1,15 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+google-cloud-monitoring
diff --git a/container-images/cpu/k6-benchmark/requirements.txt b/container-images/cpu/k6-benchmark/requirements.txt
new file mode 100644
index 000000000..443405a24
--- /dev/null
+++ b/container-images/cpu/k6-benchmark/requirements.txt
@@ -0,0 +1,415 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile requirements.in --generate-hashes -o requirements.txt
+certifi==2026.2.25 \
+    --hash=sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa \
+    --hash=sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7
+    # via requests
+cffi==2.0.0 \
+    --hash=sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb \
+    --hash=sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b \
+    --hash=sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f \
+    --hash=sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9 \
+    --hash=sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44 \
+    --hash=sha256:0f6084a0ea23d05d20c3edcda20c3d006f9b6f3fefeac38f59262e10cef47ee2 \
+    --hash=sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c \
+    --hash=sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75 \
+    --hash=sha256:1cd13c99ce269b3ed80b417dcd591415d3372bcac067009b6e0f59c7d4015e65 \
+    --hash=sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e \
+    --hash=sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a \
+    --hash=sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e \
+    --hash=sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25 \
+    --hash=sha256:2081580ebb843f759b9f617314a24ed5738c51d2aee65d31e02f6f7a2b97707a \
+    --hash=sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe \
+    --hash=sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b \
+    --hash=sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91 \
+    --hash=sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592 \
+    --hash=sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187 \
+    --hash=sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c \
+    --hash=sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1 \
+    --hash=sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94 \
+    --hash=sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba \
+    --hash=sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb \
+    --hash=sha256:3f4d46d8b35698056ec29bca21546e1551a205058ae1a181d871e278b0b28165 \
+    --hash=sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529 \
+    --hash=sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca \
+    --hash=sha256:4647afc2f90d1ddd33441e5b0e85b16b12ddec4fca55f0d9671fef036ecca27c \
+    --hash=sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6 \
+    --hash=sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c \
+    --hash=sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0 \
+    --hash=sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743 \
+    --hash=sha256:61d028e90346df14fedc3d1e5441df818d095f3b87d286825dfcbd6459b7ef63 \
+    --hash=sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5 \
+    --hash=sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5 \
+    --hash=sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4 \
+    --hash=sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d \
+    --hash=sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b \
+    --hash=sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93 \
+    --hash=sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205 \
+    --hash=sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27 \
+    --hash=sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512 \
+    --hash=sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d \
+    --hash=sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c \
+    --hash=sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037 \
+    --hash=sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26 \
+    --hash=sha256:89472c9762729b5ae1ad974b777416bfda4ac5642423fa93bd57a09204712322 \
+    --hash=sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb \
+    --hash=sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c \
+    --hash=sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8 \
+    --hash=sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4 \
+    --hash=sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414 \
+    --hash=sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9 \
+    --hash=sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664 \
+    --hash=sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9 \
+    --hash=sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775 \
+    --hash=sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739 \
+    --hash=sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc \
+    --hash=sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062 \
+    --hash=sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe \
+    --hash=sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9 \
+    --hash=sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92 \
+    --hash=sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5 \
+    --hash=sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13 \
+    --hash=sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d \
+    --hash=sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26 \
+    --hash=sha256:cb527a79772e5ef98fb1d700678fe031e353e765d1ca2d409c92263c6d43e09f \
+    --hash=sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495 \
+    --hash=sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b \
+    --hash=sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6 \
+    --hash=sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c \
+    --hash=sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef \
+    --hash=sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5 \
+    --hash=sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18 \
+    --hash=sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad \
+    --hash=sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3 \
+    --hash=sha256:de8dad4425a6ca6e4e5e297b27b5c824ecc7581910bf9aee86cb6835e6812aa7 \
+    --hash=sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5 \
+    --hash=sha256:e6e73b9e02893c764e7e8d5bb5ce277f1a009cd5243f8228f75f842bf937c534 \
+    --hash=sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49 \
+    --hash=sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2 \
+    --hash=sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5 \
+    --hash=sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453 \
+    --hash=sha256:fe562eb1a64e67dd297ccc4f5addea2501664954f2692b69a76449ec7913ecbf
+    # via cryptography
+charset-normalizer==3.4.7 \
+    --hash=sha256:007d05ec7321d12a40227aae9e2bc6dca73f3cb21058999a1df9e193555a9dcc \
+    --hash=sha256:03853ed82eeebbce3c2abfdbc98c96dc205f32a79627688ac9a27370ea61a49c \
+    --hash=sha256:07d9e39b01743c3717745f4c530a6349eadbfa043c7577eef86c502c15df2c67 \
+    --hash=sha256:08e721811161356f97b4059a9ba7bafb23ea5ee2255402c42881c214e173c6b4 \
+    --hash=sha256:0c96c3b819b5c3e9e165495db84d41914d6894d55181d2d108cc1a69bfc9cce0 \
+    --hash=sha256:0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c \
+    --hash=sha256:0f7eb884681e3938906ed0434f20c63046eacd0111c4ba96f27b76084cd679f5 \
+    --hash=sha256:12a6fff75f6bc66711b73a2f0addfc4c8c15a20e805146a02d147a318962c444 \
+    --hash=sha256:12d8baf840cc7889b37c7c770f478adea7adce3dcb3944d02ec87508e2dcf153 \
+    --hash=sha256:14265bfe1f09498b9d8ec91e9ec9fa52775edf90fcbde092b25f4a33d444fea9 \
+    --hash=sha256:16d971e29578a5e97d7117866d15889a4a07befe0e87e703ed63cd90cb348c01 \
+    --hash=sha256:177a0ba5f0211d488e295aaf82707237e331c24788d8d76c96c5a41594723217 \
+    --hash=sha256:1a87ca9d5df6fe460483d9a5bbf2b18f620cbed41b432e2bddb686228282d10b \
+    --hash=sha256:1c2a768fdd44ee4a9339a9b0b130049139b8ce3c01d2ce09f67f5a68048d477c \
+    --hash=sha256:1c2aed2e5e41f24ea8ef1590b8e848a79b56f3a5564a65ceec43c9d692dc7d8a \
+    --hash=sha256:1dc8b0ea451d6e69735094606991f32867807881400f808a106ee1d963c46a83 \
+    --hash=sha256:1efde3cae86c8c273f1eb3b287be7d8499420cf2fe7585c41d370d3e790054a5 \
+    --hash=sha256:202389074300232baeb53ae2569a60901f7efadd4245cf3a3bf0617d60b439d7 \
+    --hash=sha256:203104ed3e428044fd943bc4bf45fa73c0730391f9621e37fe39ecf477b128cb \
+    --hash=sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c \
+    --hash=sha256:298930cec56029e05497a76988377cbd7457ba864beeea92ad7e844fe74cd1f1 \
+    --hash=sha256:2cd4a60d0e2fb04537162c62bbbb4182f53541fe0ede35cdf270a1c1e723cc42 \
+    --hash=sha256:2d6eb928e13016cea4f1f21d1e10c1cebd5a421bc57ddf5b1142ae3f86824fab \
+    --hash=sha256:2fe249cb4651fd12605b7288b24751d8bfd46d35f12a20b1ba33dea122e690df \
+    --hash=sha256:30b8d1d8c52a48c2c5690e152c169b673487a2a58de1ec7393196753063fcd5e \
+    --hash=sha256:320ade88cfb846b8cd6b4ddf5ee9e80ee0c1f52401f2456b84ae1ae6a1a5f207 \
+    --hash=sha256:3534e7dcbdcf757da6b85a0bbf5b6868786d5982dd959b065e65481644817a18 \
+    --hash=sha256:36836d6ff945a00b88ba1e4572d721e60b5b8c98c155d465f56ad19d68f23734 \
+    --hash=sha256:38c0109396c4cfc574d502df99742a45c72c08eff0a36158b6f04000043dbf38 \
+    --hash=sha256:3946fa46a0cf3e4c8cb1cc52f56bb536310d34f25f01ca9b6c16afa767dab110 \
+    --hash=sha256:3bec022aec2c514d9cf199522a802bd007cd588ab17ab2525f20f9c34d067c18 \
+    --hash=sha256:3c9a494bc5ec77d43cea229c4f6db1e4d8fe7e1bbffa8b6f0f0032430ff8ab44 \
+    --hash=sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d \
+    --hash=sha256:3dedcc22d73ec993f42055eff4fcfed9318d1eeb9a6606c55892a26964964e48 \
+    --hash=sha256:4042d5c8f957e15221d423ba781e85d553722fc4113f523f2feb7b188cc34c5e \
+    --hash=sha256:481551899c856c704d58119b5025793fa6730adda3571971af568f66d2424bb5 \
+    --hash=sha256:4dc1e73c36828f982bfe79fadf5919923f8a6f4df2860804db9a98c48824ce8d \
+    --hash=sha256:4e5163c14bffd570ef2affbfdd77bba66383890797df43dc8b4cc7d6f500bf53 \
+    --hash=sha256:511ef87c8aec0783e08ac18565a16d435372bc1ac25a91e6ac7f5ef2b0bff790 \
+    --hash=sha256:532bc9bf33a68613fd7d65e4b1c71a6a38d7d42604ecf239c77392e9b4e8998c \
+    --hash=sha256:54523e136b8948060c0fa0bc7b1b50c32c186f2fceee897a495406bb6e311d2b \
+    --hash=sha256:5649fd1c7bade02f320a462fdefd0b4bd3ce036065836d4f42e0de958038e116 \
+    --hash=sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d \
+    --hash=sha256:5b77459df20e08151cd6f8b9ef8ef1f961ef73d85c21a555c7eed5b79410ec10 \
+    --hash=sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6 \
+    --hash=sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2 \
+    --hash=sha256:6370e8686f662e6a3941ee48ed4742317cafbe5707e36406e9df792cdb535776 \
+    --hash=sha256:64f02c6841d7d83f832cd97ccf8eb8a906d06eb95d5276069175c696b024b60a \
+    --hash=sha256:65bcd23054beab4d166035cabbc868a09c1a49d1efe458fe8e4361215df40265 \
+    --hash=sha256:66671f93accb62ed07da56613636f3641f1a12c13046ce91ffc923721f23c008 \
+    --hash=sha256:6696b7688f54f5af4462118f0bfa7c1621eeb87154f77fa04b9295ce7a8f2943 \
+    --hash=sha256:6785f414ae0f3c733c437e0f3929197934f526d19dfaa75e18fdb4f94c6fb374 \
+    --hash=sha256:67f6279d125ca0046a7fd386d01b311c6363844deac3e5b069b514ba3e63c246 \
+    --hash=sha256:6c114670c45346afedc0d947faf3c7f701051d2518b943679c8ff88befe14f8e \
+    --hash=sha256:6e0d51f618228538a3e8f46bd246f87a6cd030565e015803691603f55e12afb5 \
+    --hash=sha256:6ed74185b2db44f41ef35fd1617c5888e59792da9bbc9190d6c7300617182616 \
+    --hash=sha256:708838739abf24b2ceb208d0e22403dd018faeef86ddac04319a62ae884c4f15 \
+    --hash=sha256:715479b9a2802ecac752a3b0efa2b0b60285cf962ee38414211abdfccc233b41 \
+    --hash=sha256:733784b6d6def852c814bce5f318d25da2ee65dd4839a0718641c696e09a2960 \
+    --hash=sha256:750e02e074872a3fad7f233b47734166440af3cdea0add3e95163110816d6752 \
+    --hash=sha256:752a45dc4a6934060b3b0dab47e04edc3326575f82be64bc4fc293914566503e \
+    --hash=sha256:7579e913a5339fb8fa133f6bbcfd8e6749696206cf05acdbdca71a1b436d8e72 \
+    --hash=sha256:7641bb8895e77f921102f72833904dcd9901df5d6d72a2ab8f31d04b7e51e4e7 \
+    --hash=sha256:7804338df6fcc08105c7745f1502ba68d900f45fd770d5bdd5288ddccb8a42d8 \
+    --hash=sha256:80d04837f55fc81da168b98de4f4b797ef007fc8a79ab71c6ec9bc4dd662b15b \
+    --hash=sha256:813c0e0132266c08eb87469a642cb30aaff57c5f426255419572aaeceeaa7bf4 \
+    --hash=sha256:82b271f5137d07749f7bf32f70b17ab6eaabedd297e75dce75081a24f76eb545 \
+    --hash=sha256:84c018e49c3bf790f9c2771c45e9313a08c2c2a6342b162cd650258b57817706 \
+    --hash=sha256:8751d2787c9131302398b11e6c8068053dcb55d5a8964e114b6e196cf16cb366 \
+    --hash=sha256:8778f0c7a52e56f75d12dae53ae320fae900a8b9b4164b981b9c5ce059cd1fcb \
+    --hash=sha256:87fad7d9ba98c86bcb41b2dc8dbb326619be2562af1f8ff50776a39e55721c5a \
+    --hash=sha256:8d828b6667a32a728a1ad1d93957cdf37489c57b97ae6c4de2860fa749b8fc1e \
+    --hash=sha256:8e385e4267ab76874ae30db04c627faaaf0b509e1ccc11a95b3fc3e83f855c00 \
+    --hash=sha256:92a0a01ead5e668468e952e4238cccd7c537364eb7d851ab144ab6627dbbe12f \
+    --hash=sha256:94e1885b270625a9a828c9793b4d52a64445299baa1fea5a173bf1d3dd9a1a5a \
+    --hash=sha256:a180c5e59792af262bf263b21a3c49353f25945d8d9f70628e73de370d55e1e1 \
+    --hash=sha256:a277ab8928b9f299723bc1a2dabb1265911b1a76341f90a510368ca44ad9ab66 \
+    --hash=sha256:a5fe03b42827c13cdccd08e6c0247b6a6d4b5e3cdc53fd1749f5896adcdc2356 \
+    --hash=sha256:a6c5863edfbe888d9eff9c8b8087354e27618d9da76425c119293f11712a6319 \
+    --hash=sha256:a89c23ef8d2c6b27fd200a42aa4ac72786e7c60d40efdc76e6011260b6e949c4 \
+    --hash=sha256:adb2597b428735679446b46c8badf467b4ca5f5056aae4d51a19f9570301b1ad \
+    --hash=sha256:ae196f021b5e7c78e918242d217db021ed2a6ace2bc6ae94c0fc596221c7f58d \
+    --hash=sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5 \
+    --hash=sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7 \
+    --hash=sha256:aef65cd602a6d0e0ff6f9930fcb1c8fec60dd2cfcb6facaf4bdb0e5873042db0 \
+    --hash=sha256:af21eb4409a119e365397b2adbaca4c9ccab56543a65d5dbd9f920d6ac29f686 \
+    --hash=sha256:b14b2d9dac08e28bb8046a1a0434b1750eb221c8f5b87a68f4fa11a6f97b5e34 \
+    --hash=sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49 \
+    --hash=sha256:bb8cc7534f51d9a017b93e3e85b260924f909601c3df002bcdb58ddb4dc41a5c \
+    --hash=sha256:bc17a677b21b3502a21f66a8cc64f5bfad4df8a0b8434d661666f8ce90ac3af1 \
+    --hash=sha256:bd6c2a1c7573c64738d716488d2cdd3c00e340e4835707d8fdb8dc1a66ef164e \
+    --hash=sha256:bd9b23791fe793e4968dba0c447e12f78e425c59fc0e3b97f6450f4781f3ee60 \
+    --hash=sha256:c03a41a8784091e67a39648f70c5f97b5b6a37f216896d44d2cdcb82615339a0 \
+    --hash=sha256:c0f081d69a6e58272819b70288d3221a6ee64b98df852631c80f293514d3b274 \
+    --hash=sha256:c35abb8bfff0185efac5878da64c45dafd2b37fb0383add1be155a763c1f083d \
+    --hash=sha256:c36c333c39be2dbca264d7803333c896ab8fa7d4d6f0ab7edb7dfd7aea6e98c0 \
+    --hash=sha256:c45e9440fb78f8ddabcf714b68f936737a121355bf59f3907f4e17721b9d1aae \
+    --hash=sha256:c593052c465475e64bbfe5dbd81680f64a67fdc752c56d7a0ae205dc8aeefe0f \
+    --hash=sha256:cdd68a1fb318e290a2077696b7eb7a21a49163c455979c639bf5a5dcdc46617d \
+    --hash=sha256:ce3412fbe1e31eb81ea42f4169ed94861c56e643189e1e75f0041f3fe7020abe \
+    --hash=sha256:cf1493cd8607bec4d8a7b9b004e699fcf8f9103a9284cc94962cb73d20f9d4a3 \
+    --hash=sha256:cf29836da5119f3c8a8a70667b0ef5fdca3bb12f80fd06487cfa575b3909b393 \
+    --hash=sha256:d4a48e5b3c2a489fae013b7589308a40146ee081f6f509e047e0e096084ceca1 \
+    --hash=sha256:d560742f3c0d62afaccf9f41fe485ed69bd7661a241f86a3ef0f0fb8b1a397af \
+    --hash=sha256:d6038d37043bced98a66e68d3aa2b6a35505dc01328cd65217cefe82f25def44 \
+    --hash=sha256:d61f00a0869d77422d9b2aba989e2d24afa6ffd552af442e0e58de4f35ea6d00 \
+    --hash=sha256:d635aab80466bc95771bb78d5370e74d36d1fe31467b6b29b8b57b2a3cd7d22c \
+    --hash=sha256:dca4bbc466a95ba9c0234ef56d7dd9509f63da22274589ebd4ed7f1f4d4c54e3 \
+    --hash=sha256:dd915403e231e6b1809fe9b6d9fc55cf8fb5e02765ac625d9cd623342a7905d7 \
+    --hash=sha256:e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd \
+    --hash=sha256:e060d01aec0a910bdccb8be71faf34e7799ce36950f8294c8bf612cba65a2c9e \
+    --hash=sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b \
+    --hash=sha256:e17b8d5d6a8c47c85e68ca8379def1303fd360c3e22093a807cd34a71cd082b8 \
+    --hash=sha256:e5f4d355f0a2b1a31bc3edec6795b46324349c9cb25eed068049e4f472fb4259 \
+    --hash=sha256:e712b419df8ba5e42b226c510472b37bd57b38e897d3eca5e8cfd410a29fa859 \
+    --hash=sha256:e74327fb75de8986940def6e8dee4f127cc9752bee7355bb323cc5b2659b6d46 \
+    --hash=sha256:e80c8378d8f3d83cd3164da1ad2df9e37a666cdde7b1cb2298ed0b558064be30 \
+    --hash=sha256:e8ac484bf18ce6975760921bb6148041faa8fef0547200386ea0b52b5d27bf7b \
+    --hash=sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46 \
+    --hash=sha256:ed065083d0898c9d5b4bbec7b026fd755ff7454e6e8b73a67f8c744b13986e24 \
+    --hash=sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a \
+    --hash=sha256:effc3f449787117233702311a1b7d8f59cba9ced946ba727bdc329ec69028e24 \
+    --hash=sha256:f22dec1690b584cea26fade98b2435c132c1b5f68e39f5a0b7627cd7ae31f1dc \
+    --hash=sha256:f495a1652cf3fbab2eb0639776dad966c2fb874d79d87ca07f9d5f059b8bd215 \
+    --hash=sha256:f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063 \
+    --hash=sha256:f59099f9b66f0d7145115e6f80dd8b1d847176df89b234a5a6b3f00437aa0832 \
+    --hash=sha256:f59ad4c0e8f6bba240a9bb85504faa1ab438237199d4cce5f622761507b8f6a6 \
+    --hash=sha256:fbccdc05410c9ee21bbf16a35f4c1d16123dcdeb8a1d38f33654fa21d0234f79 \
+    --hash=sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464
+    # via requests
+cryptography==46.0.7 \
+    --hash=sha256:04959522f938493042d595a736e7dbdff6eb6cc2339c11465b3ff89343b65f65 \
+    --hash=sha256:128c5edfe5e5938b86b03941e94fac9ee793a94452ad1365c9fc3f4f62216832 \
+    --hash=sha256:1d25aee46d0c6f1a501adcddb2d2fee4b979381346a78558ed13e50aa8a59067 \
+    --hash=sha256:24402210aa54baae71d99441d15bb5a1919c195398a87b563df84468160a65de \
+    --hash=sha256:258514877e15963bd43b558917bc9f54cf7cf866c38aa576ebf47a77ddbc43a4 \
+    --hash=sha256:35719dc79d4730d30f1c2b6474bd6acda36ae2dfae1e3c16f2051f215df33ce0 \
+    --hash=sha256:397655da831414d165029da9bc483bed2fe0e75dde6a1523ec2fe63f3c46046b \
+    --hash=sha256:3986ac1dee6def53797289999eabe84798ad7817f3e97779b5061a95b0ee4968 \
+    --hash=sha256:420b1e4109cc95f0e5700eed79908cef9268265c773d3a66f7af1eef53d409ef \
+    --hash=sha256:42a1e5f98abb6391717978baf9f90dc28a743b7d9be7f0751a6f56a75d14065b \
+    --hash=sha256:462ad5cb1c148a22b2e3bcc5ad52504dff325d17daf5df8d88c17dda1f75f2a4 \
+    --hash=sha256:506c4ff91eff4f82bdac7633318a526b1d1309fc07ca76a3ad182cb5b686d6d3 \
+    --hash=sha256:5ad9ef796328c5e3c4ceed237a183f5d41d21150f972455a9d926593a1dcb308 \
+    --hash=sha256:5d1c02a14ceb9148cc7816249f64f623fbfee39e8c03b3650d842ad3f34d637e \
+    --hash=sha256:5e51be372b26ef4ba3de3c167cd3d1022934bc838ae9eaad7e644986d2a3d163 \
+    --hash=sha256:60627cf07e0d9274338521205899337c5d18249db56865f943cbe753aa96f40f \
+    --hash=sha256:65814c60f8cc400c63131584e3e1fad01235edba2614b61fbfbfa954082db0ee \
+    --hash=sha256:73510b83623e080a2c35c62c15298096e2a5dc8d51c3b4e1740211839d0dea77 \
+    --hash=sha256:7bbc6ccf49d05ac8f7d7b5e2e2c33830d4fe2061def88210a126d130d7f71a85 \
+    --hash=sha256:80406c3065e2c55d7f49a9550fe0c49b3f12e5bfff5dedb727e319e1afb9bf99 \
+    --hash=sha256:84d4cced91f0f159a7ddacad249cc077e63195c36aac40b4150e7a57e84fffe7 \
+    --hash=sha256:8a469028a86f12eb7d2fe97162d0634026d92a21f3ae0ac87ed1c4a447886c83 \
+    --hash=sha256:91bbcb08347344f810cbe49065914fe048949648f6bd5c2519f34619142bbe85 \
+    --hash=sha256:935ce7e3cfdb53e3536119a542b839bb94ec1ad081013e9ab9b7cfd478b05006 \
+    --hash=sha256:9694078c5d44c157ef3162e3bf3946510b857df5a3955458381d1c7cfc143ddb \
+    --hash=sha256:a1529d614f44b863a7b480c6d000fe93b59acee9c82ffa027cfadc77521a9f5e \
+    --hash=sha256:abad9dac36cbf55de6eb49badd4016806b3165d396f64925bf2999bcb67837ba \
+    --hash=sha256:b36a4695e29fe69215d75960b22577197aca3f7a25b9cf9d165dcfe9d80bc325 \
+    --hash=sha256:b7b412817be92117ec5ed95f880defe9cf18a832e8cafacf0a22337dc1981b4d \
+    --hash=sha256:c5b1ccd1239f48b7151a65bc6dd54bcfcc15e028c8ac126d3fada09db0e07ef1 \
+    --hash=sha256:cbd5fb06b62bd0721e1170273d3f4d5a277044c47ca27ee257025146c34cbdd1 \
+    --hash=sha256:cdf1a610ef82abb396451862739e3fc93b071c844399e15b90726ef7470eeaf2 \
+    --hash=sha256:cdfbe22376065ffcf8be74dc9a909f032df19bc58a699456a21712d6e5eabfd0 \
+    --hash=sha256:d02c738dacda7dc2a74d1b2b3177042009d5cab7c7079db74afc19e56ca1b455 \
+    --hash=sha256:d151173275e1728cf7839aaa80c34fe550c04ddb27b34f48c232193df8db5842 \
+    --hash=sha256:d23c8ca48e44ee015cd0a54aeccdf9f09004eba9fc96f38c911011d9ff1bd457 \
+    --hash=sha256:d3b99c535a9de0adced13d159c5a9cf65c325601aa30f4be08afd680643e9c15 \
+    --hash=sha256:d5f7520159cd9c2154eb61eb67548ca05c5774d39e9c2c4339fd793fe7d097b2 \
+    --hash=sha256:db0f493b9181c7820c8134437eb8b0b4792085d37dbb24da050476ccb664e59c \
+    --hash=sha256:e06acf3c99be55aa3b516397fe42f5855597f430add9c17fa46bf2e0fb34c9bb \
+    --hash=sha256:e4cfd68c5f3e0bfdad0d38e023239b96a2fe84146481852dffbcca442c245aa5 \
+    --hash=sha256:ea42cbe97209df307fdc3b155f1b6fa2577c0defa8f1f7d3be7d31d189108ad4 \
+    --hash=sha256:ebd6daf519b9f189f85c479427bbd6e9c9037862cf8fe89ee35503bd209ed902 \
+    --hash=sha256:f247c8c1a1fb45e12586afbb436ef21ff1e80670b2861a90353d9b025583d246 \
+    --hash=sha256:fbfd0e5f273877695cb93baf14b185f4878128b250cc9f8e617ea0c025dfb022 \
+    --hash=sha256:fc9ab8856ae6cf7c9358430e49b368f3108f050031442eaeb6b9d87e4dcf4e4f \
+    --hash=sha256:fcd8eac50d9138c1d7fc53a653ba60a2bee81a505f9f8850b6b2888555a45d0e \
+    --hash=sha256:fdd1736fed309b4300346f88f74cd120c27c56852c3838cab416e7a166f67298 \
+    --hash=sha256:ffca7aa1d00cf7d6469b988c581598f2259e46215e0140af408966a24cf086ce
+    # via google-auth
+google-api-core==2.30.3 \
+    --hash=sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8 \
+    --hash=sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b
+    # via google-cloud-monitoring
+google-auth==2.49.2 \
+    --hash=sha256:c1ae38500e73065dcae57355adb6278cf8b5c8e391994ae9cbadbcb9631ab409 \
+    --hash=sha256:c2720924dfc82dedb962c9f52cabb2ab16714fd0a6a707e40561d217574ed6d5
+    # via
+    #   google-api-core
+    #   google-cloud-monitoring
+google-cloud-monitoring==2.30.0 \
+    --hash=sha256:2729f3b88a4798b7757b1d9d31b6cb562bb3544e8173765e4e5cd44d8685b1ed \
+    --hash=sha256:a9530aa9aa246c490810dfa7be32d67e8340d19108acc99cbc02d1ed494fba76
+    # via -r requirements.in
+googleapis-common-protos==1.74.0 \
+    --hash=sha256:57971e4eeeba6aad1163c1f0fc88543f965bb49129b8bb55b2b7b26ecab084f1 \
+    --hash=sha256:702216f78610bb510e3f12ac3cafd281b7ac45cc5d86e90ad87e4d301a3426b5
+    # via
+    #   google-api-core
+    #   grpcio-status
+grpcio==1.80.0 \
+    --hash=sha256:00168469238b022500e486c1c33916acf2f2a9b2c022202cf8a1885d2e3073c1 \
+    --hash=sha256:02e64bb0bb2da14d947a49e6f120a75e947250aebe65f9629b62bb1f5c14e6e9 \
+    --hash=sha256:05d55e1798756282cddd52d56c896b3e7d673e3a8798c2f1cd05ba249a3bb4de \
+    --hash=sha256:09e5e478b3d14afd23f12e49e8b44c8684ac3c5f08561c43a5b9691c54d136ab \
+    --hash=sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921 \
+    --hash=sha256:1b97cd29a8eda100b559b455331c487a80915b6ea6bd91cf3e89836c4ee8d957 \
+    --hash=sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f \
+    --hash=sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257 \
+    --hash=sha256:2bea16af2750fd0a899bf1abd9022244418b55d1f37da2202249ba4ba673838d \
+    --hash=sha256:2dcc70e9f0ba987526e8e8603a610fb4f460e42899e74e7a518bf3c68fe1bf05 \
+    --hash=sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd \
+    --hash=sha256:31b9ac4ad1aa28ffee5503821fafd09e4da0a261ce1c1281c6c8da0423c83b6e \
+    --hash=sha256:33eb763f18f006dc7fee1e69831d38d23f5eccd15b2e0f92a13ee1d9242e5e02 \
+    --hash=sha256:367ce30ba67d05e0592470428f0ec1c31714cab9ef19b8f2e37be1f4c7d32fae \
+    --hash=sha256:3b01e1f5464c583d2f567b2e46ff0d516ef979978f72091fd81f5ab7fa6e2e7f \
+    --hash=sha256:3cb8130ba457d2aa09fa6b7c3ed6b6e4e6a2685fce63cb803d479576c4d80e21 \
+    --hash=sha256:3d4147a97c8344d065d01bbf8b6acec2cf86fb0400d40696c8bdad34a64ffc0e \
+    --hash=sha256:43168871f170d1e4ed16ae03d10cd21efa29f190e710a624cee7e5ae07da6f4f \
+    --hash=sha256:448c884b668b868562b1bda833c5fce6272d26e1926ec46747cda05741d302c1 \
+    --hash=sha256:4560cf0e86514595dbbd330cd65b7afad4b5c4b8c4905c041cfffa138d45e6fd \
+    --hash=sha256:46c2390b59d67f84e882694d489f5b45707c657832d7934859ceb8c33f467069 \
+    --hash=sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411 \
+    --hash=sha256:4ed39fbdcf9b87370f6e8df4e39ca7b38b3e5e9d1b0013c7b6be9639d6578d14 \
+    --hash=sha256:50a9871536d71c4fba24ee856abc03a87764570f0c457dd8db0b4018f379fed9 \
+    --hash=sha256:51b4a7189b0bef2aa30adce3c78f09c83526cf3dddb24c6a96555e3b97340440 \
+    --hash=sha256:52d143637e3872633fc7dd7c3c6a1c84e396b359f3a72e215f8bf69fd82084fc \
+    --hash=sha256:5c07e82e822e1161354e32da2662f741a4944ea955f9f580ec8fb409dd6f6060 \
+    --hash=sha256:627fb7312171cdc52828bd6fac8d7028ff2a64b89f1957b6f3416caa2218d141 \
+    --hash=sha256:68e5851ac4b9afe07e7f84483803ad167852570d65326b34d54ca560bfa53fb6 \
+    --hash=sha256:7b641fc3f1dc647bfd80bd713addc68f6d145956f64677e56d9ebafc0bd72388 \
+    --hash=sha256:8502122a3cc1714038e39a0b071acb1207ca7844208d5ea0d091317555ee7106 \
+    --hash=sha256:873ff5d17d68992ef6605330127425d2fc4e77e612fa3c3e0ed4e668685e3140 \
+    --hash=sha256:886457a7768e408cdce226ad1ca67d2958917d306523a0e21e1a2fdaa75c9c9c \
+    --hash=sha256:8ac393b58aa16991a2f1144ec578084d544038c12242da3a215966b512904d0f \
+    --hash=sha256:8eb613f02d34721f1acf3626dfdb3545bd3c8505b0e52bf8b5710a28d02e8aa7 \
+    --hash=sha256:92d787312e613754d4d8b9ca6d3297e69994a7912a32fa38c4c4e01c272974b0 \
+    --hash=sha256:93b6f823810720912fd131f561f91f5fed0fda372b6b7028a2681b8194d5d294 \
+    --hash=sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f \
+    --hash=sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff \
+    --hash=sha256:a1dc80fe55685b4a543555e6eef975303b36c8db1023b1599b094b92aa77965f \
+    --hash=sha256:a361c20ec1ccd3c3953d20fb6d7b4125093bdd10dff44c5e2bbb39e58917cedc \
+    --hash=sha256:a72d84ad0514db063e21887fbacd1fd7acb4d494a564cae22227cd45c7fbf199 \
+    --hash=sha256:aacdfb4ed3eb919ca997504d27e03d5dba403c85130b8ed450308590a738f7a4 \
+    --hash=sha256:ba0915d51fd4ced2db5ff719f84e270afe0e2d4c45a7bdb1e8d036e4502928c2 \
+    --hash=sha256:ba0db34f7e1d803a878284cd70e4c63cb6ae2510ba51937bf8f45ba997cefcf7 \
+    --hash=sha256:bac1d573dfa84ce59a5547073e28fa7326d53352adda6912e362da0b917fcef4 \
+    --hash=sha256:c51bf8ac4575af2e0678bccfb07e47321fc7acb5049b4482832c5c195e04e13a \
+    --hash=sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0 \
+    --hash=sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193 \
+    --hash=sha256:ce1794f4ea6cc3ca29463f42d665c32ba1b964b48958a66497917fe9069f26e6 \
+    --hash=sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de \
+    --hash=sha256:d8e11f167935b3eb089ac9038e1a063e6d7dbe995c0bb4a661e614583352e76f \
+    --hash=sha256:dc053420fc75749c961e2a4c906398d7c15725d36ccc04ae6d16093167223b58 \
+    --hash=sha256:deb10a1528473c11f72a0939eed36d83e847d7cbb63e8cc5611fb7a912d38614 \
+    --hash=sha256:dfab85db094068ff42e2a3563f60ab3dddcc9d6488a35abf0132daec13209c8a \
+    --hash=sha256:e172cf795a3ba5246d3529e4d34c53db70e888fa582a8ffebd2e6e48bc0cba50 \
+    --hash=sha256:e9e408fc016dffd20661f0126c53d8a31c2821b5c13c5d67a0f5ed5de93319ad \
+    --hash=sha256:ec0a592e926071b4abad50c1495cd0d0d513324b3ff5e7267067c33ba27506e4 \
+    --hash=sha256:f14b618fc30de822681ee986cfdcc2d9327229dc4c98aed16896761cacd468b9 \
+    --hash=sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2 \
+    --hash=sha256:f7691a6788ad9196872f95716df5bc643ebba13c97140b7a5ee5c8e75d1dea81
+    # via
+    #   google-api-core
+    #   google-cloud-monitoring
+    #   grpcio-status
+grpcio-status==1.80.0 \
+    --hash=sha256:4b56990363af50dbf2c2ebb80f1967185c07d87aa25aa2bea45ddb75fc181dbe \
+    --hash=sha256:df73802a4c89a3ea88aa2aff971e886fccce162bc2e6511408b3d67a144381cd
+    # via google-api-core
+idna==3.11 \
+    --hash=sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea \
+    --hash=sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902
+    # via requests
+proto-plus==1.27.2 \
+    --hash=sha256:6432f75893d3b9e70b9c412f1d2f03f65b11fb164b793d14ae2ca01821d22718 \
+    --hash=sha256:b2adde53adadf75737c44d3dcb0104fde65250dfc83ad59168b4aa3e574b6a24
+    # via
+    #   google-api-core
+    #   google-cloud-monitoring
+protobuf==6.33.6 \
+    --hash=sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326 \
+    --hash=sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901 \
+    --hash=sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3 \
+    --hash=sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a \
+    --hash=sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135 \
+    --hash=sha256:bd56799fb262994b2c2faa1799693c95cc2e22c62f56fb43af311cae45d26f0e \
+    --hash=sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3 \
+    --hash=sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2 \
+    --hash=sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593 \
+    --hash=sha256:f443a394af5ed23672bc6c486be138628fbe5c651ccbc536873d7da23d1868cf
+    # via
+    #   google-api-core
+    #   google-cloud-monitoring
+    #   googleapis-common-protos
+    #   grpcio-status
+    #   proto-plus
+pyasn1==0.6.3 \
+    --hash=sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf \
+    --hash=sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde
+    # via pyasn1-modules
+pyasn1-modules==0.4.2 \
+    --hash=sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a \
+    --hash=sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6
+    # via google-auth
+pycparser==3.0 \
+    --hash=sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29 \
+    --hash=sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992
+    # via cffi
+requests==2.33.1 \
+    --hash=sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517 \
+    --hash=sha256:4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a
+    # via google-api-core
+typing-extensions==4.15.0 \
+    --hash=sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466 \
+    --hash=sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548
+    # via grpcio
+urllib3==2.6.3 \
+    --hash=sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed \
+    --hash=sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4
+    # via requests
diff --git a/container-images/cpu/k6-benchmark/scripts/k6-diffusers-flux-2-klein-4b.js b/container-images/cpu/k6-benchmark/scripts/k6-diffusers-flux-2-klein-4b.js
new file mode 100644
index 000000000..59360c4d2
--- /dev/null
+++ b/container-images/cpu/k6-benchmark/scripts/k6-diffusers-flux-2-klein-4b.js
@@ -0,0 +1,250 @@
+// Copyright 2026 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import http from "k6/http";
+import { check, sleep } from "k6";
+import exec from "k6/execution";
+
+const TARGET_URL = __ENV.TARGET_URL || "http://localhost:8000/generate";
+const ACCELERATOR_NAME = __ENV.ACCELERATOR_NAME || "unknown";
+const INFERENCE_SERVER_TYPE = __ENV.INFERENCE_SERVER_TYPE || "unknown";
+
+// Extract hostname for deployment_name tag
+const urlMatch = TARGET_URL.match(/https?:\/\/([^\/:]+)/);
+const DEPLOYMENT_NAME = urlMatch ? urlMatch[1] : "unknown";
+
+// Parse dynamic scenarios
+if (!__ENV.SCENARIOS_JSON) {
+  throw new Error("SCENARIOS_JSON environment variable is required.");
+}
+
+let configScenarios = [];
+try {
+  configScenarios = JSON.parse(__ENV.SCENARIOS_JSON);
+} catch (e) {
+  throw new Error(`Failed to parse SCENARIOS_JSON: ${e.message}`);
+}
+
+const MODEL_ID = configScenarios[0].model_id || "unknown";
+const SEED = 42;
+
+// Validate first scenario for warmup
+if (
+  !configScenarios[0].width ||
+  !configScenarios[0].height ||
+  !configScenarios[0].steps
+) {
+  throw new Error(
+    "Each scenario in SCENARIOS_JSON must specify 'width', 'height', and 'steps'.",
+  );
+}
+
+console.log(
+  `Loaded ${configScenarios.length} benchmark scenarios for model ${MODEL_ID}: ${JSON.stringify(configScenarios)}`,
+);
+
+// Lookup table for scenario configurations
+const SCENARIO_CONFIGS = {
+  warmup: {
+    batch: configScenarios[0].batch,
+    vus: configScenarios[0].vus,
+    steps: configScenarios[0].steps,
+    width: configScenarios[0].width,
+    height: configScenarios[0].height,
+    model_id: MODEL_ID,
+  },
+};
+
+// Build k6 scenarios object
+const scenarios = {
+  warmup: {
+    executor: "constant-vus",
+    vus: configScenarios[0].vus,
+    duration: "5m",
+    exec: "generate",
+    tags: {
+      scenario: "warmup",
+      batch_size: configScenarios[0].batch.toString(),
+      vus: configScenarios[0].vus.toString(),
+      num_inference_steps: configScenarios[0].steps.toString(),
+      width: configScenarios[0].width.toString(),
+      height: configScenarios[0].height.toString(),
+      inference_server: INFERENCE_SERVER_TYPE,
+    },
+  },
+};
+
+let currentTimeOffsetSeconds = 300; // 5m warmup
+const COOL_DOWN_SECONDS = 30;
+
+configScenarios.forEach((s, index) => {
+  if (!s.width || !s.height || !s.steps) {
+    throw new Error(
+      `Scenario ${index} is missing required fields: width, height, or steps.`,
+    );
+  }
+  const accelTag = (__ENV.ACCELERATOR_NAME || "unknown")
+    .toLowerCase()
+    .replace(/_/g, "-");
+  const scenarioName = `bench_${accelTag}_b${s.batch}_v${s.vus}_s${s.steps}_r${s.width}x${s.height}`;
+  const startTime = currentTimeOffsetSeconds + index * COOL_DOWN_SECONDS;
+
+  scenarios[scenarioName] = {
+    executor: "constant-vus",
+    vus: s.vus,
+    duration: s.duration || "10m",
+    startTime: `${startTime}s`,
+    exec: "generate",
+    tags: {
+      scenario: scenarioName,
+      batch_size: s.batch.toString(),
+      vus: s.vus.toString(),
+      num_inference_steps: s.steps.toString(),
+      width: s.width.toString(),
+      height: s.height.toString(),
+      inference_server: INFERENCE_SERVER_TYPE,
+    },
+  };
+
+  SCENARIO_CONFIGS[scenarioName] = {
+    batch: s.batch,
+    vus: s.vus,
+    steps: s.steps,
+    width: s.width,
+    height: s.height,
+    model_id: s.model_id || MODEL_ID,
+  };
+
+  let durationSeconds = 600; // 10m default
+  if (typeof s.duration === "string") {
+    if (s.duration.endsWith("m"))
+      durationSeconds = parseInt(s.duration.slice(0, -1)) * 60;
+    else if (s.duration.endsWith("s"))
+      durationSeconds = parseInt(s.duration.slice(0, -1));
+  }
+  currentTimeOffsetSeconds += durationSeconds;
+});
+
+export const options = {
+  tags: {
+    model: MODEL_ID,
+    accelerator: ACCELERATOR_NAME,
+    seed: SEED.toString(),
+    target_url: TARGET_URL,
+    deployment_name: DEPLOYMENT_NAME, // hostname extracted from TARGET_URL
+  },
+  discardResponseBodies: false, // Need body for validation and error reporting
+  scenarios: scenarios, // warmup plus one entry per SCENARIOS_JSON element
+  thresholds: {
+    http_req_failed: ["rate<0.05"], // breached at >= 5% failed requests
+  },
+};
+
+const params = {
+  headers: {
+    "Content-Type": "application/json",
+  },
+  timeout: "120s", // per-request timeout passed to http.post in generate()
+};
+
+export function setup() {
+  console.log(`Starting dynamic k6 load test against: ${TARGET_URL}`);
+  console.log(
+    `Running ${Object.keys(scenarios).length} scenarios (including warmup)`,
+  );
+}
+
+let lastScenario = "";
+let consecutiveFailures = 0;
+let abortCurrentScenario = false;
+
+export function generate() {
+  const scenarioName = exec.scenario.name;
+  const config = SCENARIO_CONFIGS[scenarioName];
+
+  if (!config) {
+    throw new Error(`No configuration found for scenario: ${scenarioName}`);
+  }
+
+  if (scenarioName !== lastScenario) {
+    console.log(
+      `VU ${exec.vu.idInTest} starting scenario: ${scenarioName} (Batch: ${config.batch}, VUs: ${config.vus})`,
+    );
+    lastScenario = scenarioName;
+    consecutiveFailures = 0;
+    abortCurrentScenario = false;
+  }
+
+  if (abortCurrentScenario) {
+    sleep(1);
+    return;
+  }
+
+  let payload;
+  let endpoint = TARGET_URL;
+
+  if (INFERENCE_SERVER_TYPE === "sglang") {
+    endpoint = `${TARGET_URL}/v1/images/generations`;
+    payload = JSON.stringify({
+      model: `/gcs/${config.model_id}`,
+      prompt:
+        "A highly detailed, cinematic photograph of a futuristic city skyline at sunset, neon lights, 8k resolution, photorealistic",
+      n: config.batch,
+      size: `${config.width}x${config.height}`,
+      num_inference_steps: config.steps,
+      seed: SEED,
+      response_format: "b64_json",
+    });
+  } else {
+    payload = JSON.stringify({
+      prompt:
+        "A highly detailed, cinematic photograph of a futuristic city skyline at sunset, neon lights, 8k resolution, photorealistic",
+      width: config.width,
+      height: config.height,
+      num_inference_steps: config.steps,
+      seed: SEED,
+      batch_size: config.batch,
+    });
+  }
+
+  if (consecutiveFailures === 0) {
+      console.log(`Endpoint: ${endpoint}`);
+      console.log(`Payload: ${payload}`);
+  }
+
+  const res = http.post(endpoint, payload, params);
+
+  const success = check(res, {
+    "is status 200": (r) => r.status === 200,
+    "has body": (r) => r.body && r.body.length > 0,
+  });
+
+  if (!success) {
+    consecutiveFailures++;
+    if (consecutiveFailures === 1) {
+      console.error(
+        `Request failed! Status: ${res.status}. Body: ${res.body || "empty"}`,
+      );
+    }
+    if (consecutiveFailures >= 3) {
+      console.error(`Scenario ${scenarioName} aborted due to 3 consecutive failures.`);
+      abortCurrentScenario = true;
+    }
+    sleep(1);
+  } else {
+    consecutiveFailures = 0;
+  }
+
+  sleep(0.01);
+}
diff --git a/container-images/gpu/diffusers-flux/Dockerfile b/container-images/gpu/diffusers-flux/Dockerfile
index 9af36bb6d..3de731de0 100644
--- a/container-images/gpu/diffusers-flux/Dockerfile
+++ b/container-images/gpu/diffusers-flux/Dockerfile
@@ -25,8 +25,8 @@ COPY --from=primary requirements.txt .
 
 # Install Python and dependencies
 RUN apt-get update && \
-    apt-get install -y python3 python3-pip && \
-    pip install --no-cache-dir -r requirements.txt
+  apt-get install -y python3 python3-pip && \
+  pip install --no-cache-dir -r requirements.txt
 
 COPY --from=primary app.py .
 
diff --git a/container-images/gpu/sglang-diffusers/Dockerfile b/container-images/gpu/sglang-diffusers/Dockerfile
new file mode 100644
index 000000000..31f9d2fd4
--- /dev/null
+++ b/container-images/gpu/sglang-diffusers/Dockerfile
@@ -0,0 +1,23 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM lmsysorg/sglang:v0.5.9-cu129-amd64-runtime
+
+# Editable install of the `python/` package with its `diffusion` extra.
+# NOTE(review): `python` is a relative path; assumes the base image's WORKDIR.
+RUN pip install --no-cache-dir -e "python[diffusion]"
+
+EXPOSE 30000
+ENTRYPOINT [ "sglang" ]
+CMD [ "--help" ]
diff --git a/container-images/gpu/sglang-diffusers/cloudbuild.yaml b/container-images/gpu/sglang-diffusers/cloudbuild.yaml
new file mode 100644
index 000000000..e25d7488b
--- /dev/null
+++ b/container-images/gpu/sglang-diffusers/cloudbuild.yaml
@@ -0,0 +1,29 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+images:
+  - ${_DESTINATION} # image reference supplied via the _DESTINATION substitution
+
+options:
+  logging: CLOUD_LOGGING_ONLY
+  machineType: E2_HIGHCPU_8
+
+steps:
+  - args:
+      - build
+      - --tag=${_DESTINATION}
+      - . # build context is the step's working directory
+    id: "Build SGLang-Diffusers image"
+    name: "docker.io/docker:28.3.3-dind-alpine3.22"
+    waitFor: ["-"] # no dependencies: start as soon as the build begins
diff --git a/cspell.json b/cspell.json
index d4bfb93ef..096367f74 100644
--- a/cspell.json
+++ b/cspell.json
@@ -74,6 +74,10 @@
       "name": "ray",
       "path": ".github/workflows/dictionary/ray.txt"
     },
+    {
+      "name": "sglang",
+      "path": ".github/workflows/dictionary/sglang.txt"
+    },
     {
       "name": "shell",
       "path": ".github/workflows/dictionary/shell.txt"
@@ -113,6 +117,7 @@
     "nvidia",
     "python",
     "ray",
+    "sglang",
     "shell",
     "svg",
     "terraform",
diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-k6-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-k6-hf-model.md
new file mode 100644
index 000000000..567cdbb25
--- /dev/null
+++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-k6-hf-model.md
@@ -0,0 +1,423 @@
+# GKE Inference Benchmarking with k6
+
+This example sets up a benchmarking job on Google Kubernetes Engine (GKE),
+leveraging the Inference reference-architecture for model deployment and the k6
+open-source tool for scalable benchmarking.
+
+This implementation deploys k6 as a Kubernetes Job and can be customized
+with different load scenarios and datasets.
+
+This example is built on top of the
+[GKE Inference reference architecture](/docs/platforms/gke/base/use-cases/inference-ref-arch/README.md).
+
+## Before you begin
+
+1. Deploy and configure the
+   [GKE Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md).
+
+### Requirements
+
+This guide was designed to be run from
+[Cloud Shell](https://cloud.google.com/shell) in the Google Cloud console. Cloud
+Shell has the following tools installed:
+
+- [Google Cloud Command Line Interface (`gcloud` CLI)](https://cloud.google.com/cli)
+- `curl`
+- `envsubst`
+- `jq`
+- `kubectl`
+- `sponge`
+
+## Create and configure the Google Cloud resources
+
+1. Source the environment configuration.
+
+   ```shell
+   source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+   ```
+
+1. Update terraform environment variables depending on the accelerators being
+   used (GPU/TPU/BOTH). Example:
+
+   ```shell
+   export TF_VAR_enable_gpu=true
+   export TF_VAR_enable_tpu=false
+   ```
+
+1. Deploy the benchmark infrastructure:
+
+   ```shell
+   export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+   rm -rf "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench/.terraform/terraform.tfstate" && \
+   terraform -chdir="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench" init && \
+   terraform -chdir="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench" plan -input=false -out=tfplan && \
+   terraform -chdir="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench" apply -input=false tfplan && \
+   rm "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench/tfplan"
+   ```
+
+## Define the Benchmarking Configuration
+
+1. Choose a model:
+
+   - [**FLUX.2-klein-4B**](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B):
+
+     ```shell
+     export HF_MODEL_ID="black-forest-labs/flux.2-klein-4b"
+     ```
+
+1. Select an accelerator:
+
+   | Model           | l4  | RTX Pro 6000 |
+   | --------------- | --- | ------------ |
+   | flux.2-klein-4b | ✅  | ✅           |
+
+   - 1x **NVIDIA Tesla L4 24GB**, running on a `g2-standard-16` Google
+     Kubernetes Engine node:
+
+     ```shell
+     export ACCELERATOR_TYPE="l4"
+     ```
+
+   - **NVIDIA RTX Pro 6000**:
+
+     - 1x **NVIDIA RTX Pro 6000**:
+
+       ```shell
+       export ACCELERATOR_TYPE="rtx-pro-6000"
+       ```
+
+     - 1/2 (half) of a **NVIDIA RTX Pro 6000**:
+
+       ```shell
+       export ACCELERATOR_TYPE="rtx-pro-6000-1-2"
+       ```
+
+     - 1/4 (one fourth) of a **NVIDIA RTX Pro 6000**:
+
+       ```shell
+       export ACCELERATOR_TYPE="rtx-pro-6000-1-4"
+       ```
+
+     - 1/8 (one eighth) of a **NVIDIA RTX Pro 6000**:
+
+       ```shell
+       export ACCELERATOR_TYPE="rtx-pro-6000-1-8"
+       ```
+
+   Ensure that you have enough quota in your project to provision the selected
+   accelerator type. For more information about viewing GPU quotas, see
+   [Allocation quotas: GPU quota](https://cloud.google.com/compute/resource-usage#gpu_quota).
+
+1. Configure sequential benchmarking scenarios using the `K6_SCENARIOS_JSON`
+   variable. This variable accepts a JSON array of objects, where each object
+   represents a specific load configuration to be tested sequentially.
+
+   ```shell
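+   export K6_SCENARIOS_JSON='[{"batch": 1, "vus": 1, "width": 1024, "height": 1024, "steps": 20}, {"batch": 2, "vus": 4, "width": 1024, "height": 1024, "steps": 20}, {"batch": 4, "vus": 4, "width": 1024, "height": 1024, "steps": 20}]'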
+   ```
+
+   **JSON Attribute Definitions:**
+
+   - **`batch`**: The number of prompts sent in a single inference request.
+     Larger batch sizes generally increase GPU utilization but also increase
+     request latency.
+   - **`vus`**: Virtual Users. The number of concurrent worker threads sending
+     requests to the server. Increasing VUs helps saturate the GPU by filling
+     compute gaps between individual requests.
+   - **`duration`** (Optional): The length of time to run this specific scenario
+     (e.g., `"10m"`, `"300s"`). Defaults to `10m` if not specified.
+
+   **Execution Workflow:** The k6 script automatically performs a **5-minute
+   warmup** using the first configured scenario's VU count to ensure the model
+   is loaded and compiled. Between each subsequent scenario, the script enforces
+   a **30-second cool-down period** to allow hardware metrics to return to
+   baseline for clean analysis.
+
+1. Source the environment configuration.
+
+   ```shell
+   source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+   ```
+
+### Destructive Testing & Stability
+
+When testing "frontier" configurations (e.g., Batch Size > 18 on RTX Pro 6000),
+there is a high risk of triggering a **CUDA Out-of-Memory (OOM)** error. This
+will cause the inference server Pod to crash and restart.
+
+To ensure benchmark integrity, follow these stability guidelines:
+
+1. **Order Destructive Tests Last**: Always place scenarios that are likely to
+   crash the server at the very end of your `K6_SCENARIOS_JSON` array. This
+   prevents a crash from polluting the results of subsequent tests.
+1. **The Re-compilation Penalty**: The inference server uses `torch.compile`
+   with CUDA Graphs. If the server restarts due to an OOM, it requires **3-5
+   minutes** to re-warm and re-compile before it can achieve peak performance.
+1. **Recovery Warmups**: If you must run more tests after a potentially
+   destructive scenario in the same Job, insert a "Recovery Warmup" scenario
+   immediately after the risky one. A recovery scenario should use a small load
+   (e.g., `{"batch": 1, "vus": 1, "duration": "5m"}`) to give the server time to
+   restart and re-compile while k6 records the errors during the transition.
+
+### Tested Configurations Summary
+
+The following table illustrates the configurations that we tested serving
+`flux.2-klein-4b`, and which ones run the benchmark suite to completion,
+assuming no other load on the inference server.
+
+| Accelerator          | Backend | Resolution | Batch Size | VUs | Steps | Status   |
+| -------------------- | ------- | ---------- | ---------- | --- | ----- | -------- |
+| **NVIDIA L4**        | SGLang  | 1024x1024  | 1          | 1-4 | 20    | ✅       |
+| **NVIDIA L4**        | SGLang  | 1024x1024  | 2+         | 1   | 20    | ❌ (OOM) |
+| **NVIDIA L4**        | SGLang  | 768x768    | 1          | 1   | 20    | ✅       |
+| **NVIDIA L4**        | SGLang  | 512x512    | 1-4        | 1-4 | 10-20 | ✅       |
+| **NVIDIA L4 x2**     | SGLang  | 1024x1024  | 1          | 1-4 | 20    | ✅       |
+| **NVIDIA L4 x2**     | SGLang  | 1024x1024  | 2+         | 1   | 20    | ❌ (OOM) |
+| **NVIDIA L4 x2**     | SGLang  | 512x512    | 1          | 1   | 20    | ✅       |
+| **NVIDIA L4 x4**     | SGLang  | 1024x1024  | 1          | 1-2 | 20    | ✅       |
+| **NVIDIA L4 x4**     | SGLang  | 1024x1024  | 2+         | 1   | 20    | ❌ (OOM) |
+| **NVIDIA L4 x4**     | SGLang  | 512x512    | 1          | 1   | 20    | ✅       |
+| **RTX Pro 6000**     | SGLang  | 1024x1024  | 1-24       | 1-8 | 10-50 | ✅       |
+| **RTX Pro 6000**     | SGLang  | 512x512    | 1-4        | 1-4 | 10-20 | ✅       |
+| **RTX Pro 6000**     | SGLang  | 768x768    | 1          | 1   | 20    | ✅       |
+| **RTX Pro 6000 1/2** | SGLang  | 1024x1024  | 1-24       | 1-8 | 10-50 | ✅       |
+| **RTX Pro 6000 1/2** | SGLang  | 512x512    | 1-4        | 1-4 | 10-20 | ✅       |
+| **RTX Pro 6000 1/2** | SGLang  | 768x768    | 1          | 1   | 20    | ✅       |
+| **RTX Pro 6000 1/4** | SGLang  | 1024x1024  | 1-3        | 1-4 | 10-20 | ✅       |
+| **RTX Pro 6000 1/4** | SGLang  | 1024x1024  | 4+         | 1-8 | 10-20 | ❌ (OOM) |
+| **RTX Pro 6000 1/4** | SGLang  | 512x512    | 1-4        | 1-4 | 10-20 | ✅       |
+| **RTX Pro 6000 1/4** | SGLang  | 768x768    | 1          | 1   | 20    | ✅       |
+| **RTX Pro 6000 1/8** | SGLang  | All        | N/A        | N/A | N/A   | ❌ (OOM) |
+
+## Automated Execution (Recommended)
+
+You can use the provided orchestrator script to automate the entire lifecycle
+(build, deploy, monitor, and analyze) in a single command using the environment
+variables defined in the previous steps. The script supports running benchmarks
+sequentially across multiple accelerators by providing a comma-separated list:
+
+```shell
+# Example: Testing multiple resolutions and batch sizes in one run
+export ACCELERATOR_TYPE="l4,rtx-pro-6000"
+export K6_SCENARIOS_JSON='[
+  {"batch": 1, "vus": 1, "width": 512, "height": 512, "steps": 10},
+  {"batch": 4, "vus": 4, "width": 768, "height": 768, "steps": 20},
+  {"batch": 16, "vus": 1, "width": 1024, "height": 1024, "steps": 50}
+]'
+
+./platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/run_benchmark.sh \
+  --accelerator "${ACCELERATOR_TYPE}" \
+  --model "${HF_MODEL_ID}" \
+  --scenarios "${K6_SCENARIOS_JSON}" \
+  --build
+```
+
+**Script Flags:**
+
+- `--accelerator`: The accelerator type(s). Supports a single value (e.g., `l4`)
+  or a comma-separated list for sequential runs (e.g., `l4,rtx-pro-6000`).
+- `--model`: The Hugging Face model ID.
+- `--scenarios`: The JSON array of benchmark scenarios. Each scenario MUST
+  specify `batch`, `vus`, `width`, `height`, and `steps`.
+- `--build`: (Optional) Rebuild and push the k6 benchmark container image once
+  before starting the runs.
+- `--sync-only`: (Optional) Skip executing the benchmark workload on the
+  cluster, and jump straight to downloading the latest results from GCS and
+  running the data aggregation pipeline.
+- `--manual-cost`: (Optional) Override the default on-demand hourly price.
+
+## Manual Execution
+
+If you prefer to run the benchmarking steps individually, follow the
+instructions below.
+
+### Build the benchmark container image
+
+1. Source the environment configuration.
+
+   ```shell
+   source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+   ```
+
+1. Build the container image for the k6 benchmark.
+
+   ```shell
+   export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+   rm -rf "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/.terraform/terraform.tfstate" && \
+   terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark init && \
+   terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark plan -input=false -out=tfplan && \
+   terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark apply -input=false tfplan && \
+   rm ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/tfplan
+   ```
+
+   The build usually takes about 1 minute.
+
+### Deploy the benchmark workload
+
+1. Source the environment configuration.
+
+   ```shell
+   source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+   ```
+
+1. Set the benchmark parameters:
+
+   ```shell
+   export K6_SCENARIOS_JSON='[{"batch": 1, "vus": 1, "width": 1024, "height": 1024, "steps": 20}]'
+   ```
+
+1. Configure the deployment:
+
+   ```shell
+   source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh"
+   ```
+
+1. Deploy the benchmark workload.
+
+   ```shell
+   kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/${HF_MODEL_NAME}"
+   ```
+
+1. Watch the deployment until it is ready.
+
+   ```shell
+   watch --color --interval 5 --no-title \
+   "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/k6-benchmark-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1     1            1'
+   echo '\nLogs(last 10 lines):'
+   kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/k6-benchmark-${HF_MODEL_NAME} --all-containers --tail 10"
+   ```
+
+   When the deployment is ready, you will see the following:
+
+   ```text
+   NAME                                           READY   UP-TO-DATE   AVAILABLE   AGE
+   k6-benchmark-<HF_MODEL_NAME>                   1/1     1            1           ###
+   ```
+
+   You can press `CTRL`+`c` to terminate the watch.
+
+### Analyze and Interpret Results
+
+1. Download the files where the benchmarker collected data points:
+
+   ```shell
+     gcloud storage cp -r gs://${hub_models_bucket_bench_results_name}/ .
+   ```
+
+1. Set up the environment to run the metrics summarization script:
+
+   ```shell
+   # Create and activate a Python virtual environment
+   python3 -m venv .venv
+   . .venv/bin/activate
+
+   # Install dependencies
+   pip install --require-hashes -r "${ACP_REPO_DIR}/container-images/cpu/k6-benchmark/requirements.txt"
+   ```
+
+1. Set the hourly cost in USD for the Compute Engine machine you're using to run
+   the model by initializing the `MODEL_MACHINE_HOURLY_COST_USD` variable. For
+   example, if a machine costs `1.147208384` USD per hour, you initialize
+   `MODEL_MACHINE_HOURLY_COST_USD` as follows:
+
+   ```shell
+   export MODEL_MACHINE_HOURLY_COST_USD="1.147208384"
+   ```
+
+   For more information about machine pricing, see:
+
+   - [Accelerator-optimized pricing](https://cloud.google.com/products/compute/pricing/accelerator-optimized)
+
+1. Run the metrics aggregation and reporting script:
+
+   ```shell
+   for f in "${hub_models_bucket_bench_results_name}"/*"${HF_MODEL_NAME}"*"${ACCELERATOR_TYPE}"*.jsonl; do
+     echo "Processing $f..."
+     python3 "${ACP_REPO_DIR}/container-images/cpu/k6-benchmark/extract_metrics.py" \
+       --file "$f" \
+       --hourly-cost "${MODEL_MACHINE_HOURLY_COST_USD}" \
+       --project-id "${cluster_project_id}" \
+       --output-csv k6-benchmark.csv
+   done
+   ```
+
+1. Review aggregated results for each run by examining the contents of the
+   aggregated results files:
+
+   ```shell
+   for f in "${hub_models_bucket_bench_results_name}"/*report.txt; do
+     echo "Visualizing $f contents:"
+     cat "$f"
+   done
+   ```
+
+   The output is similar to the following:
+
+   ```text
+   ==================================================
+    GKE Performance Consolidated Report
+    Source: k6-diffusers-flux-2-klein-4b-rtx-pro-6000-20260422T123505Z.jsonl
+   ==================================================
+   SUMMARY TABLE:
+   Scenario             Img/s      Lat p50    GPU %      Cost/1k
+   ------------------------------------------------------------
+   bench_b1_v1          0.3779     2.648      84.95%     $3.3074
+   bench_b2_v4          0.4497     9.087      99.89%     $2.7798
+
+   --------------------------------------------------
+    SCENARIO: bench_b2_v4 (Batch: 2, VUs: 4)
+   --------------------------------------------------
+    UX Metrics: 0.4497 Img/s, 0.2248 RPS, Success: 100.00%
+    Latency (Req): p50=18.174s, p95=18.196s, p99=30.436s
+    Latency (Img): p50=9.087s, p95=9.098s, p99=15.218s
+    Hardware: VRAM=25984.0 MiB (26.43%), Compute=99.89%, Power=591.53 W
+    Economics: Cost/1k Images = $2.7798
+   ```
+
+1. Review the aggregated results across all runs:
+
+   ```shell
+   column -s, -t < k6-benchmark.csv | less -S
+   ```
+
+   The output is similar to the following:
+
+   ```text
+   Source File                                             Deployment Name               Target URL                               Model            Accelerator   Resolution  Inference Steps  Batch Size  VUs  Start Time (UTC)     End Time (UTC)       Total Time (s)  Total Requests  Throughput (Images/s)  Request Latency p50 (s)  Peak VRAM    Average Compute  Cost per 1k Images ($)
+   k6-diffusers-flux-2-klein-4b-rtx-pro-6000-20260421.jsonl  diffusers-rtx-pro-6000-flux   http://...                               flux-2-klein-4b  rtx-pro-6000  1024x1024   20               2           4    2026-04-22 12:40:28  2026-04-22 12:50:10  582.66          131             0.4497                 18.174                  25984.0 MiB  99.89%           2.7798
+   ```
+
+## Key LLM Performance Metrics
+
+- **_Time-to-First-Token (TTFT)_**: Latency from request start to the first
+  output token. Crucial for perceived responsiveness in chatbots.
+
+- **_Time-per-Output-Token (TPOT)_**: Average time to generate subsequent
+  tokens. Key measure of generation speed and sustained throughput.
+
+- **_Total Latency (P95/P99)_**: End-to-end time for the entire response.
+  Represents the experience of users with the slowest responses.
+
+- **_Throughput (Tokens/s)_**: Total tokens generated per second under load.
+  Measure of infrastructure efficiency and capacity.
+
+## Clean up
+
+1. Delete the benchmarking job.
+
+   ```shell
+   kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/${HF_MODEL_NAME}"
+   ```
+
+1. Destroy the benchmarking resources.
+
+   > Note: This destroys your benchmarking results Cloud Storage bucket only if
+   > it's empty.
+
+   ```shell
+   export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+   cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench && \
+   rm -rf .terraform/ terraform.tfstate* && \
+   terraform init &&
+   terraform destroy -auto-approve
+   ```
diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md
index 475f60081..a34a47766 100644
--- a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md
+++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/diffusers-with-hf-model.md
@@ -8,233 +8,285 @@ This example is built on top of the
 
 ## Before you begin
 
-- The
-  [GKE Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md)
-  is deployed and configured.
+1. Deploy and configure the
+   [GKE Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md).
 
-- Get access to the models.
+1. Get access to the models.
 
-  - For FLUX.1-schnell:
+   - For FLUX.1-schnell:
 
-    - Accept the conditions to access its files and content on the Hugging Face
-      model page.
-      - [**black-forest-labs/FLUX.1-schnell**](https://huggingface.co/black-forest-labs/FLUX.1-schnell)
+     - Accept the conditions to access its files and content on the Hugging Face
+       model page.
+       - [**black-forest-labs/FLUX.1-schnell**](https://huggingface.co/black-forest-labs/FLUX.1-schnell)
 
-- Ensure your
-  [Hugging Face Hub **Read** access token](/platforms/gke/base/core/huggingface/initialize/README.md)
-  has been added to Secret Manager.
+   - For FLUX.2-klein-4B: The model is not gated, so there's no license check.
+
+1. Ensure your
+   [Hugging Face Hub **Read** access token](/platforms/gke/base/core/huggingface/initialize/README.md)
+   has been added to Secret Manager.
 
 ## Create and configure the Google Cloud resources
 
-- Deploy the online GPU resources.
+1. Deploy the online GPU resources.
 
-  ```shell
-  export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
-  cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/online_gpu && \
-  rm -rf .terraform/ terraform.tfstate* && \
-  terraform init && \
-  terraform plan -input=false -out=tfplan && \
-  terraform apply -input=false tfplan && \
-  rm tfplan
-  ```
+   ```shell
+   export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+   cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/online_gpu && \
+   rm -rf .terraform/ terraform.tfstate* && \
+   terraform init && \
+   terraform plan -input=false -out=tfplan && \
+   terraform apply -input=false tfplan && \
+   rm tfplan
+   ```
 
 ## Download the model to Cloud Storage
 
-- Choose the model.
+1. Choose the model.
+
+   - [**FLUX.1-schnell**](https://huggingface.co/black-forest-labs/FLUX.1-schnell):
+
+     ```shell
+     export HF_MODEL_ID="black-forest-labs/flux.1-schnell"
+     ```
 
-  - **FLUX.1-Schnell**:
+   - [**FLUX.2-klein-4B**](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B):
 
-    ```shell
-    export HF_MODEL_ID="black-forest-labs/flux.1-schnell"
-    ```
+     ```shell
+     export HF_MODEL_ID="black-forest-labs/flux.2-klein-4b"
+     ```
 
-- Source the environment configuration.
+1. Source the environment configuration.
 
-  ```shell
-  source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
-  ```
+   ```shell
+   source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+   ```
 
-- Configure the model download job.
+1. Configure the model download job.
 
-  ```shell
-  "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/configure_huggingface.sh"
-  ```
+   ```shell
+   "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/configure_huggingface.sh"
+   ```
 
-- Deploy the model download job.
+1. Deploy the model download job.
 
-  ```shell
-  kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface"
-  ```
+   ```shell
+   kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface"
+   ```
 
-- Watch the model download job until it is complete.
+1. Watch the model download job until it is complete.
 
-  ```shell
-  watch --color --interval 5 --no-title \
-  "kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-hf-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete'
-  echo '\nLogs(last 10 lines):'
-  kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-hf-model-to-gcs --all-containers --tail 10"
-  ```
+   ```shell
+   watch --color --interval 5 --no-title \
+   "kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-hf-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete'
+   echo '\nLogs(last 10 lines):'
+   kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-hf-model-to-gcs --all-containers --tail 10"
+   ```
 
-  When the job is complete, you will see the following:
+   When the job is complete, you will see the following:
 
-  ```text
-  NAME                       STATUS     COMPLETIONS   DURATION   AGE
-  XXXXXXXX-hf-model-to-gcs   Complete   1/1           ###        ###
-  ```
+   ```text
+   NAME                       STATUS     COMPLETIONS   DURATION   AGE
+   XXXXXXXX-hf-model-to-gcs   Complete   1/1           ###        ###
+   ```
 
-  You can press `CTRL`+`c` to terminate the watch.
+   You can press `CTRL`+`c` to terminate the watch.
 
-- Delete the model download job.
+1. Delete the model download job.
 
-  ```shell
-  kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface"
-  ```
+   ```shell
+   kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface"
+   ```
 
 ## Build the container image
 
-- Source the environment configuration.
+1. Source the environment configuration.
 
-  ```shell
-  source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
-  ```
+   ```shell
+   source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+   ```
 
-- Build the container image for the Diffusers inference server.
+1. Build the container image for the Diffusers inference server.
 
-  ```shell
-  export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
-  cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux && \
-  rm -rf .terraform/ terraform.tfstate* && \
-  terraform init && \
-  terraform plan -input=false -out=tfplan && \
-  terraform apply -input=false tfplan && \
-  rm tfplan
-  ```
+   ```shell
+   export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
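+   rm -rf ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/.terraform/ ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/terraform.tfstate* && \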
+   terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux init && \
+   terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux plan -input=false -out=tfplan && \
+   terraform -chdir=${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux apply -input=false tfplan && \
+   rm ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/tfplan
+   ```
 
-  > The build usually takes 10 to 15 minutes.
+   The build usually takes about 25 minutes.
 
 ## Deploy the inference workload
 
-- Source the environment configuration.
+1. Source the environment configuration.
+
+   ```shell
+   source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+   ```
+
+1. Check the model name.
+
+   ```shell
+   echo "HF_MODEL_NAME=${HF_MODEL_NAME}"
+   ```
+
+   > If the `HF_MODEL_NAME` variable is not set, ensure that `HF_MODEL_ID` is
+   > set and source the `set_environment_variables.sh` script:
+   >
+   > ```shell
+   > source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+   > ```
+
+1. Select an accelerator.
+
+   | Model           | NVIDIA L4 | 2x NVIDIA L4 | 4x NVIDIA L4 | NVIDIA H100 | NVIDIA RTX Pro 6000 | 1/2 NVIDIA RTX Pro 6000 | 1/4 NVIDIA RTX Pro 6000 | 1/8 NVIDIA RTX Pro 6000 |
+   | --------------- | --------- | ------------ | ------------ | ----------- | ------------------- | ----------------------- | ----------------------- | ----------------------- |
+   | flux.1-schnell  | ✅        | Not tested   | Not tested   | ✅          | Not tested          | Not tested              | Not tested              | Not tested              |
+   | flux.2-klein-4B | ✅        | ✅           | ✅           | Not tested  | ✅                  | ✅                      | ✅                      | ❌                      |
+
+   > When using fractional GPUs (1/2, 1/4, 1/8), you might see a warning in the
+   > logs of the `inference-server` container:
+   > `No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'`. You can
+   > ignore this warning. It's due to the GPU virtualization layer masking
+   > hardware probes from the PyTorch JIT compiler. It does not affect inference
+   > performance or stability.
+
+   - **NVIDIA Tesla L4 24GB**:
+
+     - 1x **NVIDIA Tesla L4**:
+
+       ```shell
+       export ACCELERATOR_TYPE="l4"
+       ```
+
+     - 2x **NVIDIA Tesla L4**:
+
+       ```shell
+       export ACCELERATOR_TYPE="l4-x2"
+       ```
+
+     - 4x **NVIDIA Tesla L4**:
+
+       ```shell
+       export ACCELERATOR_TYPE="l4-x4"
+       ```
 
-  ```shell
-  source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
-  ```
+   - 1x **NVIDIA H100 80GB**:
 
-- Set the environment variables for the workload.
+     ```shell
+     export ACCELERATOR_TYPE="h100"
+     ```
 
-  - Check the model name.
+   - **NVIDIA RTX Pro 6000**:
 
-    ```shell
-    echo "HF_MODEL_NAME=${HF_MODEL_NAME}"
-    ```
+     - 1x **NVIDIA RTX Pro 6000**:
 
-    > If the `HF_MODEL_NAME` variable is not set, ensure that `HF_MODEL_ID` is
-    > set and source the `set_environment_variables.sh` script:
-    >
-    > ```shell
-    > source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"`
-    > ```
+       ```shell
+       export ACCELERATOR_TYPE="rtx-pro-6000"
+       ```
 
-  - Select an accelerator.
+     - 1/2x (half) of an **NVIDIA RTX Pro 6000**:
 
-    | Model          | l4  | h100 |
-    | -------------- | --- | ---- |
-    | flux.1-schnell | ✅  | ✅   |
+       ```shell
+       export ACCELERATOR_TYPE="rtx-pro-6000-1-2"
+       ```
 
-    - **NVIDIA Tesla L4 24GB**:
+     - 1/4x (one fourth) of an **NVIDIA RTX Pro 6000**:
 
-      ```shell
-      export ACCELERATOR_TYPE="l4"
-      ```
+       ```shell
+       export ACCELERATOR_TYPE="rtx-pro-6000-1-4"
+       ```
 
-    - **NVIDIA H100 80GB**:
+     - 1/8x (one eighth) of an **NVIDIA RTX Pro 6000**:
 
-      ```shell
-      export ACCELERATOR_TYPE="h100"
-      ```
+       ```shell
+       export ACCELERATOR_TYPE="rtx-pro-6000-1-8"
+       ```
 
-    Ensure that you have enough quota in your project to provision the selected
-    accelerator type. For more information, see about viewing GPU quotas, see
-    [Allocation quotas: GPU quota](https://cloud.google.com/compute/resource-usage#gpu_quota).
+   Ensure that you have enough quota in your project to provision the selected
+   accelerator type. For more information about viewing GPU quotas, see
+   [Allocation quotas: GPU quota](https://cloud.google.com/compute/resource-usage#gpu_quota).
 
-- Configure the deployment.
+1. Configure the deployment.
 
-  ```shell
-  "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh"
-  ```
+   ```shell
+   "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh"
+   ```
 
-- Deploy the inference workload.
+1. Deploy the inference workload.
 
-  ```shell
-  kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
-  ```
+   ```shell
+   kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
+   ```
 
-- Watch the deployment until it is ready.
+1. Watch the deployment until it is ready.
 
-  ```shell
-  watch --color --interval 5 --no-title \
-  "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get deployment/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1     1            1'
-  echo '\nLogs(last 10 lines):'
-  kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs deployment/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} --all-containers --tail 10"
-  ```
+   ```shell
+   watch --color --interval 5 --no-title \
+   "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get deployment/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1     1            1'
+   echo '\nLogs(last 10 lines):'
+   kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs deployment/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} --all-containers --tail 10"
+   ```
 
-  When the deployment is ready, you will see the following:
+   When the deployment is ready, you will see the following:
 
-  ```text
-  NAME                                           READY   UP-TO-DATE   AVAILABLE   AGE
-  diffusers-<ACCELERATOR_TYPE>-<HF_MODEL_NAME>   1/1     1            1           ###
-  ```
+   ```text
+   NAME                                           READY   UP-TO-DATE   AVAILABLE   AGE
+   diffusers-<ACCELERATOR_TYPE>-<HF_MODEL_NAME>   1/1     1            1           ###
+   ```
 
-  You can press `CTRL`+`c` to terminate the watch.
+   You can press `CTRL`+`c` to terminate the watch.
 
-- Send a test request.
+1. Send a test request.
 
-  ```shell
-  kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} port-forward service/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} 8000:8000 >/dev/null &
-  PF_PID=$!
-  while ! echo -e '\x1dclose\x0d' | telnet localhost 8000 >/dev/null 2>&1; do
-    sleep 0.1
-  done
-  curl http://localhost:8000/generate \
-  --data '{
-    "height": 512,
-    "num_inference_steps": 4,
-    "prompt": "A photo of a dog playing fetch in a park.",
-    "width": 512
-  }' \
-  --header "Content-Type: application/json" \
-  --output ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/images/${HF_MODEL_NAME}_${ACCELERATOR_TYPE}_image.png \
-  --request POST \
-  --show-error \
-  --silent
-  ls -alh ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/images/${HF_MODEL_NAME}_${ACCELERATOR_TYPE}_image.png
-  kill -9 ${PF_PID}
-  ```
+   ```shell
+   kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} port-forward service/diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} 8000:8000 >/dev/null &
+   PF_PID=$!
+   while ! echo -e '\x1dclose\x0d' | telnet localhost 8000 >/dev/null 2>&1; do
+     sleep 0.1
+   done
+   curl http://localhost:8000/generate \
+   --data '{
+     "height": 512,
+     "num_inference_steps": 4,
+     "prompt": "A photo of a dog playing fetch in a park.",
+     "width": 512
+   }' \
+   --header "Content-Type: application/json" \
+   --output ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/images/${HF_MODEL_NAME}_${ACCELERATOR_TYPE}_image.png \
+   --request POST \
+   --show-error \
+   --silent
+   ls -alh ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/images/${HF_MODEL_NAME}_${ACCELERATOR_TYPE}_image.png
+   kill -9 ${PF_PID}
+   ```
 
-- Delete the workload.
+1. Delete the workload.
 
-  ```shell
-  kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
-  ```
+   ```shell
+   kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
+   ```
 
 ## Clean up
 
-- Destroy the container image.
-
-  ```shell
-  export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
-  cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux && \
-  rm -rf .terraform/ terraform.tfstate* && \
-  terraform init &&
-  terraform destroy -auto-approve
-  ```
-
-- Destroy the online GPU resources.
-
-  ```shell
-  export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
-  cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/online_gpu && \
-  rm -rf .terraform/ terraform.tfstate* && \
-  terraform init &&
-  terraform destroy -auto-approve
-  ```
+1. Destroy the container image.
+
+   ```shell
+   export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+   cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux && \
+   rm -rf .terraform/ terraform.tfstate* && \
+   terraform init &&
+   terraform destroy -auto-approve
+   ```
+
+1. Destroy the online GPU resources.
+
+   ```shell
+   export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+   cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/online_gpu && \
+   rm -rf .terraform/ terraform.tfstate* && \
+   terraform init &&
+   terraform destroy -auto-approve
+   ```
diff --git a/platforms/gke/base/_shared_config/scripts/set_environment_variables.sh b/platforms/gke/base/_shared_config/scripts/set_environment_variables.sh
index 8f18078f9..9a0502324 100755
--- a/platforms/gke/base/_shared_config/scripts/set_environment_variables.sh
+++ b/platforms/gke/base/_shared_config/scripts/set_environment_variables.sh
@@ -15,7 +15,14 @@
 # limitations under the License.
 
 BASH_SOURCE_MY_PATH="$(
-  cd "$(dirname "${BASH_SOURCE}")" >/dev/null 2>&1
+  SCRIPT_SOURCE="${BASH_SOURCE[0]:-}"
+  if [[ -z "${SCRIPT_SOURCE:-}" ]]; then
+    # Fallback in case BASH_SOURCE is not defined, such as when sourcing this
+    # script from a non-Bash shell
+    SCRIPT_SOURCE="$0"
+  fi
+
+  cd "$(dirname "${SCRIPT_SOURCE}")" >/dev/null 2>&1 || return 1
   pwd -P
 )"
 
diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/n4/custom-compute-cpu-n4-8.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/n4/custom-compute-cpu-n4-8.yaml
new file mode 100644
index 000000000..034e9f7f5
--- /dev/null
+++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu/n4/custom-compute-cpu-n4-8.yaml
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: cloud.google.com/v1
+kind: ComputeClass
+metadata:
+  name: cpu-n4-8
+spec:
+  activeMigration:
+    optimizeRulePriority: true
+  nodePoolConfig:
+    imageStreaming:
+      enabled: true
+  nodePoolAutoCreation:
+    enabled: true
+  priorities:
+    # Use reservations if available
+    - machineType: n4-standard-8
+      maxPodsPerNode: 64
+      reservations:
+        affinity: AnyBestEffort
+      spot: false
+
+    # Use on-demand
+    - machineType: n4-standard-8
+      maxPodsPerNode: 64
+      spot: false
diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-2.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-2.yaml
new file mode 100644
index 000000000..a5d3c9f0e
--- /dev/null
+++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-2.yaml
@@ -0,0 +1,96 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: cloud.google.com/v1
+kind: ComputeClass
+metadata:
+  name: gpu-rtx-pro-6000-96gb-x1-2
+spec:
+  activeMigration:
+    optimizeRulePriority: true
+  nodePoolConfig:
+    imageStreaming:
+      enabled: true
+  nodePoolAutoCreation:
+    enabled: true
+  priorities:
+    # Use a specific reservation
+    # - gpu:
+    #     count: 1
+    #     driverVersion: latest
+    #     type: nvidia-rtx-pro-6000
+    #   machineType: g4-standard-24
+    #   maxPodsPerNode: 32
+    #   reservations:
+    #     affinity: Specific
+    #     specific:
+    #       - name: nvidia-rtx-pro-6000-specific
+    #         reservationBlock:
+    #           name: <RESERVATION_NAME>
+    #   spot: false
+
+    # Use any reservation
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-24
+      maxPodsPerNode: 32
+      reservations:
+        affinity: AnyBestEffort
+      spot: false
+
+    # Use on-demand
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-24
+      maxPodsPerNode: 32
+      spot: false
+
+    # Use DWS FlexStart with 7 day limit
+    - flexStart:
+        enabled: true
+        nodeRecycling:
+          leadTimeSeconds: 3600
+      gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-24
+      maxPodsPerNode: 32
+      maxRunDurationSeconds: 604800
+
+    # Use DWS FlexStart with 1 day limit
+    - flexStart:
+        enabled: true
+        nodeRecycling:
+          leadTimeSeconds: 3600
+      gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-24
+      maxPodsPerNode: 32
+      maxRunDurationSeconds: 86400
+
+    # Use spot
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-24
+      maxPodsPerNode: 32
+      spot: true
diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-4.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-4.yaml
new file mode 100644
index 000000000..2fd214151
--- /dev/null
+++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-4.yaml
@@ -0,0 +1,96 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: cloud.google.com/v1
+kind: ComputeClass
+metadata:
+  name: gpu-rtx-pro-6000-96gb-x1-4
+spec:
+  activeMigration:
+    optimizeRulePriority: true
+  nodePoolConfig:
+    imageStreaming:
+      enabled: true
+  nodePoolAutoCreation:
+    enabled: true
+  priorities:
+    # Use a specific reservation
+    # - gpu:
+    #     count: 1
+    #     driverVersion: latest
+    #     type: nvidia-rtx-pro-6000
+    #   machineType: g4-standard-12
+    #   maxPodsPerNode: 32
+    #   reservations:
+    #     affinity: Specific
+    #     specific:
+    #       - name: nvidia-rtx-pro-6000-specific
+    #         reservationBlock:
+    #           name: <RESERVATION_NAME>
+    #   spot: false
+
+    # Use any reservation
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-12
+      maxPodsPerNode: 32
+      reservations:
+        affinity: AnyBestEffort
+      spot: false
+
+    # Use on-demand
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-12
+      maxPodsPerNode: 32
+      spot: false
+
+    # Use DWS FlexStart with 7 day limit
+    - flexStart:
+        enabled: true
+        nodeRecycling:
+          leadTimeSeconds: 3600
+      gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-12
+      maxPodsPerNode: 32
+      maxRunDurationSeconds: 604800
+
+    # Use DWS FlexStart with 1 day limit
+    - flexStart:
+        enabled: true
+        nodeRecycling:
+          leadTimeSeconds: 3600
+      gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-12
+      maxPodsPerNode: 32
+      maxRunDurationSeconds: 86400
+
+    # Use spot
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-12
+      maxPodsPerNode: 32
+      spot: true
diff --git a/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-8.yaml b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-8.yaml
new file mode 100644
index 000000000..df6c0baa1
--- /dev/null
+++ b/platforms/gke/base/core/custom_compute_class/templates/manifests/gpu/rtx-pro-6000-96gb/custom-compute-gpu-rtx-pro-6000-96gb-x1-8.yaml
@@ -0,0 +1,96 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: cloud.google.com/v1
+kind: ComputeClass
+metadata:
+  name: gpu-rtx-pro-6000-96gb-x1-8
+spec:
+  activeMigration:
+    optimizeRulePriority: true
+  nodePoolConfig:
+    imageStreaming:
+      enabled: true
+  nodePoolAutoCreation:
+    enabled: true
+  priorities:
+    # Use a specific reservation
+    # - gpu:
+    #     count: 1
+    #     driverVersion: latest
+    #     type: nvidia-rtx-pro-6000
+    #   machineType: g4-standard-6
+    #   maxPodsPerNode: 32
+    #   reservations:
+    #     affinity: Specific
+    #     specific:
+    #       - name: nvidia-rtx-pro-6000-specific
+    #         reservationBlock:
+    #           name: <RESERVATION_NAME>
+    #   spot: false
+
+    # Use any reservation
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-6
+      maxPodsPerNode: 32
+      reservations:
+        affinity: AnyBestEffort
+      spot: false
+
+    # Use on-demand
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-6
+      maxPodsPerNode: 32
+      spot: false
+
+    # Use DWS FlexStart with 7 day limit
+    - flexStart:
+        enabled: true
+        nodeRecycling:
+          leadTimeSeconds: 3600
+      gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-6
+      maxPodsPerNode: 32
+      maxRunDurationSeconds: 604800
+
+    # Use DWS FlexStart with 1 day limit
+    - flexStart:
+        enabled: true
+        nodeRecycling:
+          leadTimeSeconds: 3600
+      gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-6
+      maxPodsPerNode: 32
+      maxRunDurationSeconds: 86400
+
+    # Use spot
+    - gpu:
+        count: 1
+        driverVersion: latest
+        type: nvidia-rtx-pro-6000
+      machineType: g4-standard-6
+      maxPodsPerNode: 32
+      spot: true
diff --git a/platforms/gke/base/core/nvidia/initialize/README.md b/platforms/gke/base/core/nvidia/initialize/README.md
index 38eb27567..ae31bfde4 100644
--- a/platforms/gke/base/core/nvidia/initialize/README.md
+++ b/platforms/gke/base/core/nvidia/initialize/README.md
@@ -1,4 +1,4 @@
-# NVIDIA initialize
+# NVIDIA NGC initialization
 
 - Set environment variables.
 
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/.terraform.lock.hcl b/platforms/gke/base/core/workloads/nri_device_injector/.terraform.lock.hcl
new file mode 100644
index 000000000..27d625960
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/.terraform.lock.hcl
@@ -0,0 +1,42 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/hashicorp/google" {
+  version     = "7.6.0"
+  constraints = "7.6.0"
+  hashes = [
+    "h1:JYsO3fV5OtaNuRTdjGZC1Z3Ku1ZIrRJGwXwsBjtWudk=",
+    "zh:0c70c768b0a34d7a61de70d0e85cf0057820556647bbce2384972a45d7092e4e",
+    "zh:0cb7aab89cd435c5c8dab9231ea176d64fdf1df1125db15a6b9ead978a93c0b2",
+    "zh:32f25c42214bb356bb67cef6057c9904f2878cd053a7760e5ee3737619f28638",
+    "zh:38b05b1171ab086c88b95d379120fb6c28c9e895ae924557c11c35e138319119",
+    "zh:39d8206d453a614fa0be3aeac8ea3921fb3ab7ed122205cbbcc2a41ca6176cb5",
+    "zh:58d9059aa6b4aab5ede4fc173dcdc7b4d042d0b1a1ab55407dd345931d7f4815",
+    "zh:a4bc001c8ac7700d0107155296250c3b8969511e1a488f3b318f3db62362eef2",
+    "zh:cc75e25db4bb672ebc200a89d6cff9ff0b9911e14e188d1b4429bb3511d2b35f",
+    "zh:d7f7639930735f17b2b4f73814204a9a050186ea7e1c2671a52e0fa7ddf7a001",
+    "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
+    "zh:ff1190ae618dae9243de59caf4149abb4a9b775cb6439f119cd32a30f1a21820",
+    "zh:ff15b7b86787f6fd186211e7c37a72f2cc70374b284aaf063e1f989717441161",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/local" {
+  version     = "2.5.3"
+  constraints = "2.5.3"
+  hashes = [
+    "h1:1Nkh16jQJMp0EuDmvP/96f5Unnir0z12WyDuoR6HjMo=",
+    "zh:284d4b5b572eacd456e605e94372f740f6de27b71b4e1fd49b63745d8ecd4927",
+    "zh:40d9dfc9c549e406b5aab73c023aa485633c1b6b730c933d7bcc2fa67fd1ae6e",
+    "zh:6243509bb208656eb9dc17d3c525c89acdd27f08def427a0dce22d5db90a4c8b",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:885d85869f927853b6fe330e235cd03c337ac3b933b0d9ae827ec32fa1fdcdbf",
+    "zh:bab66af51039bdfcccf85b25fe562cbba2f54f6b3812202f4873ade834ec201d",
+    "zh:c505ff1bf9442a889ac7dca3ac05a8ee6f852e0118dd9a61796a2f6ff4837f09",
+    "zh:d36c0b5770841ddb6eaf0499ba3de48e5d4fc99f4829b6ab66b0fab59b1aaf4f",
+    "zh:ddb6a407c7f3ec63efb4dad5f948b54f7f4434ee1a2607a49680d494b1776fe1",
+    "zh:e0dafdd4500bec23d3ff221e3a9b60621c5273e5df867bc59ef6b7e41f5c91f6",
+    "zh:ece8742fd2882a8fc9d6efd20e2590010d43db386b920b2a9c220cfecc18de47",
+    "zh:f4c6b3eb8f39105004cf720e202f04f57e3578441cfb76ca27611139bc116a82",
+  ]
+}
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_cluster.auto.tfvars b/platforms/gke/base/core/workloads/nri_device_injector/_cluster.auto.tfvars
new file mode 120000
index 000000000..4d9954e5a
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/_cluster.auto.tfvars
@@ -0,0 +1 @@
+../../../_shared_config/cluster.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_cluster_variables.tf b/platforms/gke/base/core/workloads/nri_device_injector/_cluster_variables.tf
new file mode 120000
index 000000000..3f2c29e19
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/_cluster_variables.tf
@@ -0,0 +1 @@
+../../../_shared_config/cluster_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_platform.auto.tfvars b/platforms/gke/base/core/workloads/nri_device_injector/_platform.auto.tfvars
new file mode 120000
index 000000000..c3133e727
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/_platform.auto.tfvars
@@ -0,0 +1 @@
+../../../_shared_config/platform.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_platform_variables.tf b/platforms/gke/base/core/workloads/nri_device_injector/_platform_variables.tf
new file mode 120000
index 000000000..c68738baa
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/_platform_variables.tf
@@ -0,0 +1 @@
+../../../_shared_config/platform_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_workloads.auto.tfvars b/platforms/gke/base/core/workloads/nri_device_injector/_workloads.auto.tfvars
new file mode 120000
index 000000000..b65551f53
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/_workloads.auto.tfvars
@@ -0,0 +1 @@
+../../../_shared_config/workloads.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/_workloads_variables.tf b/platforms/gke/base/core/workloads/nri_device_injector/_workloads_variables.tf
new file mode 120000
index 000000000..fec5c48ce
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/_workloads_variables.tf
@@ -0,0 +1 @@
+../../../_shared_config/workloads_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/main.tf b/platforms/gke/base/core/workloads/nri_device_injector/main.tf
new file mode 100644
index 000000000..ea8b5384c
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/main.tf
@@ -0,0 +1,62 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+locals {
+  kubeconfig_directory = "${path.module}/../../../kubernetes/kubeconfig"
+  kubeconfig_file      = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}"
+
+  manifests_directory         = "${local.namespace_directory}/kube-system"
+  namespace_directory         = "${local.manifests_directory_root}/namespace"
+  version_manifests_directory = "${path.module}/manifests"
+}
+
+data "local_file" "kubeconfig" {
+  filename = local.kubeconfig_file
+}
+
+# Download the upstream NRI device injector manifest (curl --fail aborts on HTTP errors) and stage a copy under the kube-system manifests directory.
+resource "terraform_data" "manifests" {
+  input = {
+    manifests_dir         = local.manifests_directory
+    version_manifests_dir = local.version_manifests_directory
+  }
+
+  provisioner "local-exec" {
+    command     = <<EOT
+mkdir -p "${self.input.version_manifests_dir}" "${self.input.manifests_dir}" && \
+curl --fail --silent --show-error --location https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/refs/heads/master/nri_device_injector/nri-device-injector.yaml --output "${self.input.version_manifests_dir}/manifests.yaml" && \
+cp -r "${self.input.version_manifests_dir}"/* "${self.input.manifests_dir}/"
+EOT
+    interpreter = ["bash", "-c"]
+    working_dir = path.module
+  }
+
+  triggers_replace = {
+    manifests_dir         = local.manifests_directory
+    version_manifests_dir = local.version_manifests_directory
+  }
+}
+
+module "kubectl_apply_manifests" {
+  depends_on = [
+    terraform_data.manifests,
+  ]
+
+  source = "../../../modules/kubectl_apply"
+
+  apply_server_side           = true
+  kubeconfig_file             = data.local_file.kubeconfig.filename
+  manifest                    = local.version_manifests_directory
+  manifest_includes_namespace = true
+}
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/project.tf b/platforms/gke/base/core/workloads/nri_device_injector/project.tf
new file mode 100644
index 000000000..a50809807
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/project.tf
@@ -0,0 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+data "google_project" "cluster" {
+  project_id = local.cluster_project_id
+}
diff --git a/platforms/gke/base/core/workloads/nri_device_injector/versions.tf b/platforms/gke/base/core/workloads/nri_device_injector/versions.tf
new file mode 100644
index 000000000..a61ab18ab
--- /dev/null
+++ b/platforms/gke/base/core/workloads/nri_device_injector/versions.tf
@@ -0,0 +1,32 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_version = ">= 1.5.7"
+
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "7.6.0"
+    }
+    local = {
+      source  = "hashicorp/local"
+      version = "2.5.3"
+    }
+  }
+
+  provider_meta "google" {
+    module_name = "cloud-solutions/acp_gke_base_core_workloads_nri_deploy-v1"
+  }
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/run_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/run_benchmark.sh
new file mode 100755
index 000000000..ee84fc105
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/run_benchmark.sh
@@ -0,0 +1,244 @@
+#!/usr/bin/env bash
+
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# --- Configuration & Discovery ---
+SCRIPT_DIR="$(cd "$(dirname "$0")" >/dev/null 2>&1 && pwd -P)"
+if [[ -z "${ACP_REPO_DIR:-}" ]]; then
+  ACP_REPO_DIR="$(cd "${SCRIPT_DIR}/../../../../../../" >/dev/null 2>&1 && pwd -P)"
+  export ACP_REPO_DIR
+fi
+
+ENV_SCRIPT="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+if [[ -f "${ENV_SCRIPT}" ]]; then
+  echo "[INFO] Sourcing environment variables..."
+  source "${ENV_SCRIPT}"
+else
+  echo "[ERROR] Could not find environment script at ${ENV_SCRIPT}"
+  exit 1
+fi
+
+# --- Default Price Mapping (On-Demand) ---
+# Prices as of April 2026. Users should seek updated prices on:
+# https://cloud.google.com/products/compute/pricing
+function get_hourly_cost() {
+  case "$1" in # prints the on-demand hourly rate for an accelerator key
+  l4) printf '%s\n' "1.147208384" ;; # g2-standard-16 + 1x L4
+  l4-x2) printf '%s\n' "2.000832696" ;; # g2-standard-24 + 2x L4
+  l4-x4) printf '%s\n' "4.001665392" ;; # g2-standard-48 + 4x L4
+  rtx-pro-6000) printf '%s\n' "4.4999" ;; # g4-standard-48 + 1x RTX PRO 6000 (96GB)
+  rtx-pro-6000-1-2) printf '%s\n' "2.5874425" ;; # g4-standard-24 + 1/2x RTX PRO 6000
+  rtx-pro-6000-1-4) printf '%s\n' "1.29372125" ;; # g4-standard-12 + 1/4x RTX PRO 6000
+  rtx-pro-6000-1-8) printf '%s\n' "0.646860625" ;; # g4-standard-6 + 1/8x RTX PRO 6000
+  *) printf '%s\n' "0.0" ;; # unknown accelerator keys fall back to 0.0
+  esac
+}
+
+# --- CLI Arguments ---
+BUILD_IMAGE=false
+SYNC_ONLY=false
+SCENARIOS_JSON=""
+MANUAL_COST=""
+ACCELERATORS_INPUT=""
+
+while [[ "$#" -gt 0 ]]; do
+  case $1 in
+  --build) BUILD_IMAGE=true ;;
+  --sync-only) SYNC_ONLY=true ;;
+  --scenarios)
+    SCENARIOS_JSON="$2"
+    shift
+    ;;
+  --accelerator)
+    ACCELERATORS_INPUT="$2"
+    shift
+    ;;
+  --model)
+    export HF_MODEL_ID="$2"
+    shift
+    ;;
+  --manual-cost)
+    MANUAL_COST="$2"
+    shift
+    ;;
+  *)
+    echo "Unknown parameter: $1"
+    exit 1
+    ;;
+  esac
+  shift
+done
+
+if [[ -z "${ACCELERATORS_INPUT}" ]]; then
+  echo "[ERROR] --accelerator is required (can be comma-separated list)"
+  exit 1
+elif [[ -z "${HF_MODEL_ID}" ]]; then
+  echo "[ERROR] --model is required"
+  exit 1
+fi
+source "${ENV_SCRIPT}" # re-source now that --model may have set HF_MODEL_ID; presumably this derives HF_MODEL_NAME, which --sync-only needs (TODO confirm)
+
+# Minify scenarios JSON and inject model_id to prevent Kustomize parsing errors
+if [[ "${SYNC_ONLY}" != "true" ]]; then
+  if [[ -z "${SCENARIOS_JSON}" ]]; then
+    SCENARIOS_JSON='[{"batch": 1, "vus": 1}]'
+  fi
+  export K6_SCENARIOS_JSON
+  K6_SCENARIOS_JSON=$(echo "${SCENARIOS_JSON}" | jq -c --arg m "${HF_MODEL_ID}" 'map(. + {model_id: $m})')
+fi
+
+# --- Phase 1: Build (Once) ---
+if [[ "${BUILD_IMAGE}" == "true" ]]; then
+  echo "[INFO] Building benchmark container image..."
+  cd "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark"
+  terraform init -input=false && terraform apply -auto-approve -input=false
+fi
+
+# --- Phase 2-5: Sequential Accelerator Loop ---
+IFS=',' read -ra ADDR <<<"${ACCELERATORS_INPUT}"
+for ACCEL in "${ADDR[@]}"; do
+  export ACCELERATOR_TYPE="${ACCEL}"
+  echo ""
+  echo "======================================================================"
+  echo " STARTING SUITE FOR ACCELERATOR: ${ACCELERATOR_TYPE}"
+  echo "======================================================================"
+
+  if [[ "${SYNC_ONLY}" != "true" ]]; then
+    # Refresh deployment config for this accelerator
+    echo "[INFO] Configuring deployment for ${HF_MODEL_NAME} on ${ACCELERATOR_TYPE}..."
+    CONFIG_DIR="${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark"
+    pushd "${CONFIG_DIR}" >/dev/null
+    source "./configure_deployment.sh"
+    popd >/dev/null
+
+    echo "[INFO] Cleaning up existing benchmark jobs..."
+    kubectl delete --ignore-not-found --kustomize "${CONFIG_DIR}/${HF_MODEL_NAME}"
+    kubectl wait --for=delete pod -l job-name="k6-benchmark-${HF_MODEL_NAME}" -n "${ira_online_gpu_kubernetes_namespace_name}" --timeout=60s || true
+
+    echo "[INFO] Launching benchmark Job..."
+    kubectl apply --kustomize "${CONFIG_DIR}/${HF_MODEL_NAME}"
+
+    # --- Phase 4: Monitoring ---
+    echo "[INFO] Monitoring benchmark Job..."
+
+    LOG_PID=""
+
+    # Smart Monitoring Loop
+    (while true; do
+      # Get current Pod name and status
+      POD_INFO=$(kubectl get pods -n "${ira_online_gpu_kubernetes_namespace_name}" -l job-name="k6-benchmark-${HF_MODEL_NAME}" -o jsonpath='{.items[0].metadata.name} {.items[0].status.phase}' 2>/dev/null || echo "None None")
+      read -r POD_NAME POD_STATUS <<<"$POD_INFO"
+
+      TIMESTAMP=$(date +"%T")
+
+      if [[ "${POD_STATUS}" == "Running" ]]; then
+        # If Pod is running but we aren't tailing logs yet, start tailing
+        if [[ -z "${LOG_PID}" ]] || ! kill -0 "${LOG_PID}" 2>/dev/null; then
+          echo "[${TIMESTAMP}] Pod is Running. Starting log stream..."
+          kubectl logs -n "${ira_online_gpu_kubernetes_namespace_name}" "${POD_NAME}" -c k6-benchmark -f &
+          LOG_PID=$!
+        fi
+        echo "[HEARTBEAT] ${TIMESTAMP} | Pod: ${POD_NAME} | Status: ${POD_STATUS}"
+      elif [[ "${POD_STATUS}" == "Pending" ]]; then
+        # If pending, show the latest event to track scale-up/image pull
+        EVENT=$(kubectl get events -n "${ira_online_gpu_kubernetes_namespace_name}" --field-selector involvedObject.name="${POD_NAME}" --sort-by='.lastTimestamp' -o jsonpath='{.items[-1].message}' 2>/dev/null || echo "Waiting for events...")
+        echo "[HEARTBEAT] ${TIMESTAMP} | Status: Pending | Last Event: ${EVENT}"
+      elif [[ "${POD_STATUS}" == "None" ]]; then
+        echo "[HEARTBEAT] ${TIMESTAMP} | Waiting for Pod to be created..."
+      else
+        echo "[HEARTBEAT] ${TIMESTAMP} | Status: ${POD_STATUS}"
+      fi
+
+      sleep 60
+    done) &
+    MONITOR_PID=$!
+
+    echo "[INFO] Waiting for Job completion (max 6h)..."
+    TIMEOUT=21600 # 6 hours
+    ELAPSED=0
+    SLEEP_INTERVAL=10
+    while true; do
+      # Check terminal state
+      STATUS=$(kubectl get job "k6-benchmark-${HF_MODEL_NAME}" -n "${ira_online_gpu_kubernetes_namespace_name}" -o jsonpath='{.status.conditions[?(@.status=="True")].type}' 2>/dev/null || echo "Unknown")
+
+      if [[ "$STATUS" == *"Complete"* ]]; then
+        echo "[INFO] Job completed successfully."
+        break
+      elif [[ "$STATUS" == *"Failed"* ]]; then
+        echo "[ERROR] Job failed or was aborted by k6 thresholds. Check container logs for details."
+        break
+      elif [[ "$STATUS" == "Unknown" ]] && ! kubectl get job "k6-benchmark-${HF_MODEL_NAME}" -n "${ira_online_gpu_kubernetes_namespace_name}" &>/dev/null; then
+        echo "[WARN] Job not found. Stopping wait."
+        break
+      fi
+
+      if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
+        echo "[ERROR] Timeout reached waiting for Job completion."
+        pkill -P "${MONITOR_PID}" 2>/dev/null || true # stop the monitor's children, then the monitor, before exiting
+        kill "${MONITOR_PID}" 2>/dev/null || true
+        exit 1
+      fi
+
+      sleep $SLEEP_INTERVAL
+      ELAPSED=$((ELAPSED + SLEEP_INTERVAL))
+    done
+
+    # Stop the monitor's children (sleep, kubectl logs -f) before the monitor itself:
+    # once the monitor subshell exits they are re-parented and `pkill -P` no longer matches them.
+    pkill -P "${MONITOR_PID}" 2>/dev/null || true
+    kill "${MONITOR_PID}" 2>/dev/null || true
+  else
+    echo "[INFO] --sync-only flag detected. Skipping Job deployment and monitoring."
+  fi
+
+  echo "[INFO] Syncing results from GCS..."
+  RESULTS_DIR="${ACP_REPO_DIR}/${hub_models_bucket_bench_results_name}"
+  mkdir -p "${RESULTS_DIR}"
+  gcloud storage cp -r "gs://${hub_models_bucket_bench_results_name}/*.jsonl" "${RESULTS_DIR}/"
+
+  # Find the most recent file for this SPECIFIC accelerator run
+  LATEST_JSONL=$(ls -t "${RESULTS_DIR}"/*"${HF_MODEL_NAME}"*"${ACCELERATOR_TYPE}"*.jsonl | head -n 1)
+  COST="${MANUAL_COST:-$(get_hourly_cost "${ACCELERATOR_TYPE}")}"
+
+  echo "[INFO] ----------------------------------------------------------------------"
+  echo "[INFO] Analyzing: ${LATEST_JSONL}"
+  echo "[INFO] Accelerator: ${ACCELERATOR_TYPE} | Cost: \$${COST}/hr"
+  echo "[INFO] DISCLAIMER: This rate represents public on-demand pricing."
+  echo "[INFO] It does NOT account for CUDs, SUDs, or custom private pricing."
+  echo "[INFO] For current and accurate pricing, visit:"
+  echo "[INFO] https://cloud.google.com/products/compute/pricing"
+  echo "[INFO] ----------------------------------------------------------------------"
+
+  . "${ACP_REPO_DIR}/.venv/bin/activate"
+  python3 "${ACP_REPO_DIR}/container-images/cpu/k6-benchmark/extract_metrics.py" \
+    --file "${LATEST_JSONL}" \
+    --hourly-cost "${COST}" \
+    --project-id "${cluster_project_id}" \
+    --namespace "${ira_online_gpu_kubernetes_namespace_name}" \
+    --output-csv "${ACP_REPO_DIR}/k6-benchmark.csv"
+
+  if [[ "${SYNC_ONLY}" != "true" ]]; then
+    echo "[INFO] Cleaning up Job resources for ${ACCELERATOR_TYPE}..."
+    kubectl delete --ignore-not-found --kustomize "${CONFIG_DIR}/${HF_MODEL_NAME}"
+  fi
+done
+
+echo ""
+echo "======================================================================"
+echo " ALL BENCHMARK SUITES COMPLETE"
+echo " Final Aggregated CSV: ${ACP_REPO_DIR}/k6-benchmark.csv"
+echo "======================================================================"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/batch-pubsub-subscriber.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/async-pubsub-subscriber.tpl.env
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/batch-pubsub-subscriber.tpl.env
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/async-pubsub-subscriber.tpl.env
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh
index 93ece036e..92fa0c342 100755
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh
@@ -13,10 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -o errexit
-set -o nounset
-set -o pipefail
-
 MY_PATH="$(
   cd "$(dirname "$0")" >/dev/null 2>&1
   pwd -P
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/job.yaml
new file mode 100644
index 000000000..6ee64692a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/job.yaml
@@ -0,0 +1,71 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: k6-benchmark
+  namespace: replaced-by-kustomize
+spec:
+  backoffLimit: 0
+  template:
+    metadata:
+      labels:
+        app: k6-benchmark
+      annotations:
+        gke-gcsfuse/volumes: "true"
+        cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
+    spec:
+      restartPolicy: Never # with backoffLimit 0, OnFailure terminates the Pod once the limit is hit and its logs are lost; Never keeps the failed Pod
+      containers:
+        - args: []
+          command: []
+          image: replaced-by-kustomize
+          imagePullPolicy: Always
+          name: k6-benchmark
+          resources: {}
+          volumeMounts:
+            - mountPath: /output
+              name: k6-benchmark-bucket-results
+              readOnly: false
+          env:
+            - name: TARGET_URL
+              valueFrom:
+                configMapKeyRef:
+                  key: K6_TARGET_URL
+                  name: deployment
+            - name: ACCELERATOR_NAME
+              valueFrom:
+                configMapKeyRef:
+                  key: ACCELERATOR_NAME
+                  name: deployment
+            - name: SCENARIOS_JSON
+              valueFrom:
+                configMapKeyRef:
+                  key: K6_SCENARIOS_JSON
+                  name: deployment
+            - name: INFERENCE_SERVER_TYPE
+              valueFrom:
+                configMapKeyRef:
+                  key: INFERENCE_SERVER_TYPE
+                  name: deployment
+      serviceAccountName: replaced-by-kustomize
+      terminationGracePeriodSeconds: 0
+      volumes:
+        - csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: replaced-by-kustomize
+              mountOptions: "implicit-dirs,uid=12345,gid=12345"
+          name: k6-benchmark-bucket-results
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/kustomization.yaml
new file mode 100644
index 000000000..7f988c7f4
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/kustomization.yaml
@@ -0,0 +1,25 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - deployment.env
+    name: deployment
+    namespace: replaced-by-kustomize
+
+resources:
+  - job.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/templates/deployment.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/templates/deployment.tpl.env
new file mode 100644
index 000000000..32ad48a33
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/base/templates/deployment.tpl.env
@@ -0,0 +1,8 @@
+INFERENCE_KUBERNETES_NAMESPACE=${ira_online_gpu_kubernetes_namespace_name}
+INFERENCE_KUBERNETES_SERVICE_ACCOUNT=${ira_inference_perf_bench_kubernetes_service_account_name}
+CONTAINER_IMAGE_URL=${ira_cpu_k6_benchmark_image_url}
+ACCELERATOR_NAME=${ACCELERATOR_TYPE}
+BENCHMARK_RESULTS_BUCKET_NAME=${hub_models_bucket_bench_results_name}
+K6_TARGET_URL=${K6_TARGET_URL}
+K6_SCENARIOS_JSON=${K6_SCENARIOS_JSON}
+INFERENCE_SERVER_TYPE=${K6_INFERENCE_SERVER_TYPE}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh
new file mode 100755
index 000000000..f2e41dbc9
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+MY_PATH="$(
+  cd "$(dirname "$0")" >/dev/null 2>&1
+  pwd -P
+)"
+
+source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh"
+
+if [[ -z "${ACCELERATOR_TYPE:-}" ]]; then
+  echo "ACCELERATOR_TYPE is not set"
+  return 1
+fi
+
+if [[ -z "${HF_MODEL_NAME:-}" ]]; then
+  echo "HF_MODEL_NAME is not set"
+  echo "If the HF_MODEL_NAME variable is not set, ensure that HF_MODEL_ID is set and source the set_environment_variables.sh script:"
+  echo "source \"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh\""
+  return 1
+fi
+
+if [[ -z "${K6_SCENARIOS_JSON:-}" ]]; then
+  echo "K6_SCENARIOS_JSON is not set."
+  return 1
+fi
+export K6_SCENARIOS_JSON
+
+echo "Configuring deployment for ${HF_MODEL_NAME} running on ${ACCELERATOR_TYPE}"
+
+if [[ "${HF_MODEL_NAME:-}" == "flux-2-klein-4b" ]]; then
+  K6_TARGET_URL="http://diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}:8000"
+  K6_INFERENCE_SERVER_TYPE="sglang"
+elif [[ "${HF_MODEL_NAME:-}" == "flux-1-schnell" ]]; then
+  K6_TARGET_URL="http://diffusers-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}:8000/generate"
+  K6_INFERENCE_SERVER_TYPE="diffusers"
+else
+  echo "Model not supported: ${HF_MODEL_NAME:-"HF_MODEL_NAME variable not set"}"
+  return 1
+fi
+
+export K6_TARGET_URL
+export K6_INFERENCE_SERVER_TYPE
+
+envsubst <"${MY_PATH}/base/templates/deployment.tpl.env" | sponge "${MY_PATH}/base/deployment.env"
+
+echo "Deployment configuration:"
+cat "${MY_PATH}/base/deployment.env"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/kustomization.yaml
new file mode 100644
index 000000000..a859b82b4
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/kustomization.yaml
@@ -0,0 +1,86 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - runtime.env # supplies APP_LABEL, read by the first replacement below
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -flux-2-klein-4b
+
+patches:
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements:
+  - source: # APP_LABEL -> `app` label on the Job's pod template
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.app
+        select:
+          kind: Job
+  - source: # CONTAINER_IMAGE_URL -> image of the k6-benchmark container
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: deployment # not generated in this file; presumably comes from ../base - TODO confirm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=k6-benchmark].image
+        select:
+          kind: Job
+  - source: # INFERENCE_KUBERNETES_NAMESPACE -> namespace of every ConfigMap and Job
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Job
+  - source: # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> Job's serviceAccountName and the ServiceAccount's name
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Job
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source: # BENCHMARK_RESULTS_BUCKET_NAME -> bucketName of the k6-benchmark-bucket-results CSI volume
+      fieldPath: data.BENCHMARK_RESULTS_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=k6-benchmark-bucket-results].csi.volumeAttributes.bucketName
+        select:
+          kind: Job
+
+resources:
+  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..832c00108
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: k6-benchmark # patch target; the Job itself is not defined in this directory (presumably ../base)
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector: # pin the benchmark pod to the cpu-n4-8 compute class
+        cloud.google.com/compute-class: cpu-n4-8
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-resources.yaml
new file mode 100644
index 000000000..e81641a62
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/patch-resources.yaml
@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: k6-benchmark # patch target; the Job itself is not defined in this directory (presumably ../base)
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: k6-benchmark
+          args: # k6 script specific to the flux-2-klein-4b model
+            - "/app/scripts/k6-diffusers-flux-2-klein-4b.js"
+          resources: # requests are set equal to limits
+            limits:
+              cpu: "2"
+              memory: 1G
+            requests:
+              cpu: "2"
+              memory: 1G
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/runtime.env
new file mode 100644
index 000000000..c8f0f2217
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/flux-2-klein-4b/runtime.env
@@ -0,0 +1 @@
+APP_LABEL=k6-benchmark-flux-2-klein-4b
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/kustomization.yaml
new file mode 100644
index 000000000..353b1abc9
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/kustomization.yaml
@@ -0,0 +1,29 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../base
+
+configMapGenerator: # packages the entrypoint wrapper so patch-server.yaml can mount it at /scripts
+  - files:
+      - patch-entrypoint.sh
+    name: entrypoint-patch # referenced by the entrypoint-patch-volume volume in patch-server.yaml
+    namespace: replaced-by-kustomize
+
+patches:
+  - path: patch-server.yaml # wraps the sglang command with patch-entrypoint.sh
+  - path: patch-gcsfuse.yaml # overrides the GCS Fuse mountOptions of the model bucket volume
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-entrypoint.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-entrypoint.sh
new file mode 100644
index 000000000..c7f27a2fe
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-entrypoint.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+
+# Entrypoint wrapper: patch sglang's parallel_state.py in place, then exec sglang
+# with the arguments passed to this script.
+#
+# The patch adds "or world_size == 1" to the condition that selects an empty
+# extra_args dict instead of dict(device_id=device_id). It is applied by exact
+# text match; when the source matches neither the original nor the patched form,
+# a warning is written to stderr and sglang is still started (best effort).
+python3 -c "
+import sys
+
+path = '/sgl-workspace/sglang/python/sglang/multimodal_gen/runtime/distributed/parallel_state.py'
+with open(path, 'r') as f:
+    content = f.read()
+
+old_code = '''        extra_args = (
+            {}
+            if (
+                current_platform.is_mps()
+                or current_platform.is_musa()
+                or current_platform.is_npu()
+            )
+            else dict(device_id=device_id)
+        )'''
+
+new_code = '''        extra_args = (
+            {}
+            if (
+                current_platform.is_mps()
+                or current_platform.is_musa()
+                or current_platform.is_npu()
+                or world_size == 1
+            )
+            else dict(device_id=device_id)
+        )'''
+
+if old_code in content:
+    content = content.replace(old_code, new_code)
+    with open(path, 'w') as f:
+        f.write(content)
+    print('[Patch] Successfully patched parallel_state.py!')
+elif new_code in content:
+    print('[Patch] parallel_state.py is already patched.')
+else:
+    # Source matches neither form: keep starting sglang, but make the miss visible.
+    print('[Patch] WARNING: target code not found; parallel_state.py was NOT patched.', file=sys.stderr)
+"
+
+# Execute normal SGLang binary entrypoint with arguments
+exec sglang "$@"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-gcsfuse.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-gcsfuse.yaml
new file mode 100644
index 000000000..5f647c181
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-gcsfuse.yaml
@@ -0,0 +1,36 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This patch overrides the base GCS Fuse configuration for virtual GPU slices.
+# Deviation from base:
+# 1. file-cache:max-size-mb is set to 0 (disabled).
+# 2. parallel downloads are disabled (implied by removing the flag).
+# Rationale: Virtual GPU slices run on memory-constrained host node pools.
+# Disabling the RAM-based file cache prevents node-level evictions and sidecar
+# OOM crashes during model weight prefetching.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      volumes:
+        - name: huggingface-hub-model-bucket # same volume the overlays' MODEL_BUCKET_NAME / MODEL_ID replacements target
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes: # the trailing only-dir: value below is a placeholder filled by the overlays' MODEL_ID replacement
+              mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:0,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,only-dir:replaced-by-kustomize
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-server.yaml
new file mode 100644
index 000000000..1e296d9b7
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base-vgpu/patch-server.yaml
@@ -0,0 +1,63 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          # The normal sglang command is wrapped by patch-entrypoint.sh.
+          # This startup wrapper dynamically patches sglang's parallel_state.py
+          # to omit the 'device_id' argument in torch.distributed.init_process_group
+          # when world_size == 1.
+          #
+          # Why: PyTorch's eager device connection hook (eager_connect_single_device)
+          # is triggered when device_id is passed, which attempts direct GPU/NCCL hardware
+          # binding calls that crash with 'cudaErrorNotSupported' inside virtualized
+          # MIG / vGPU slice sandboxes under GKE. Delaying NCCL init (which is never
+          # called on tp=1 setups) bypasses the crash entirely.
+          #
+          # Tracked in SGLang Issue: https://github.com/sgl-project/sglang/issues/25670
+          command:
+            - "/scripts/patch-entrypoint.sh" # provided by the entrypoint-patch-volume mount below
+          args: # passed through unchanged to `exec sglang "$@"` by the wrapper
+            - "serve"
+            - "--mem-fraction-static=$(GPU_MEMORY_UTILIZATION)"
+            - "--model-path=/gcs/$(MODEL_ID)"
+            - "--tp-size=$(TENSOR_PARALLEL_SIZE)"
+            - "--trust-remote-code"
+            - "--port=30000"
+            - "--host=0.0.0.0"
+          env:
+            - name: NCCL_P2P_DISABLE
+              value: "1"
+            - name: NCCL_SHM_DISABLE
+              value: "1"
+            - name: NCCL_NVLS_ENABLE
+              value: "0"
+            - name: NCCL_DEBUG
+              value: "INFO"
+          volumeMounts:
+            - mountPath: /scripts # must match the directory used in `command` above
+              name: entrypoint-patch-volume
+      volumes:
+        - name: entrypoint-patch-volume
+          configMap:
+            name: entrypoint-patch # ConfigMap generated from patch-entrypoint.sh by this directory's kustomization.yaml
+            defaultMode: 0755 # the script is run directly as the container command
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/deployment.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/deployment.yaml
index 127b3cad0..26ae92e37 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/deployment.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/deployment.yaml
@@ -31,7 +31,7 @@ spec:
         gke-gcsfuse/memory-limit: "0"
         gke-gcsfuse/volumes: "true"
       labels:
-        ai.gke.io/inference-server: diffusers
+        ai.gke.io/inference-server: replaced-by-kustomize
         ai.gke.io/model: replaced-by-kustomize
         app: diffusers
     spec:
@@ -85,8 +85,8 @@ spec:
               path: /health
               port: 8000
               scheme: HTTP
-            initialDelaySeconds: 60
-            periodSeconds: 10
+            initialDelaySeconds: 120
+            periodSeconds: 30
             successThreshold: 1
             timeoutSeconds: 1
           resources: {}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/kustomization.yaml
index 22dc66f91..fc01c9452 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/kustomization.yaml
@@ -26,3 +26,13 @@ resources:
   - ../../base
   - deployment.yaml
   - service.yaml
+replacements:
+  - source:
+      fieldPath: data.INFERENCE_SERVER
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/templates/diffusers.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/templates/diffusers.tpl.env
index b36920254..bcc398a74 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/templates/diffusers.tpl.env
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/base/templates/diffusers.tpl.env
@@ -1 +1,2 @@
-CONTAINER_IMAGE_URL=${ira_online_gpu_diffusers_flux_image_url}
+CONTAINER_IMAGE_URL=${DIFFUSERS_CONTAINER_IMAGE_URL}
+INFERENCE_SERVER=${DIFFUSERS_INFERENCE_SERVER}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh
index 2863274d0..685fe995a 100755
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh
@@ -26,4 +26,27 @@ source "${MY_PATH}/../../../terraform/_shared_config/scripts/set_environment_var
 
 "${MY_PATH}/../configure_deployment.sh"
 
-envsubst < "${MY_PATH}/base/templates/diffusers.tpl.env" | sponge "${MY_PATH}/base/diffusers.env"
+# Select the container image URL and inference server label for the configured model.
+if [[ "${HF_MODEL_ID:-}" == "black-forest-labs/flux.1-schnell" ]]; then
+  DIFFUSERS_CONTAINER_IMAGE_URL="${ira_online_gpu_diffusers_flux_image_url}"
+  DIFFUSERS_INFERENCE_SERVER="diffusers"
+elif [[ "${HF_MODEL_ID:-}" == "black-forest-labs/flux.2-klein-4b" ]]; then
+  DIFFUSERS_CONTAINER_IMAGE_URL="${ira_online_gpu_diffusers_sglang_diffusers_image_url}"
+  DIFFUSERS_INFERENCE_SERVER="sglang"
+else
+  echo "[ERROR] Set a container image URL for model: ${HF_MODEL_ID:-"no model set"}" >&2
+  return 1 2>/dev/null || exit 1 # `return` only works when sourced; fall back to `exit` when executed
+fi
+
+export DIFFUSERS_CONTAINER_IMAGE_URL DIFFUSERS_INFERENCE_SERVER # consumed by envsubst below
+
+envsubst <"${MY_PATH}/base/templates/diffusers.tpl.env" | sponge "${MY_PATH}/base/diffusers.env"
+
+echo "Configurations for ${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
+
+echo "Deployment configuration:"
+cat "${MY_PATH}/base/diffusers.env"
+echo
+
+echo "Runtime configuration:"
+cat "${MY_PATH}/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}/runtime.env"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/h100-flux-1-schnell/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/h100-flux-1-schnell/kustomization.yaml
index 5ced51113..e39e58664 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/h100-flux-1-schnell/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/h100-flux-1-schnell/kustomization.yaml
@@ -28,6 +28,15 @@ patches:
   - path: patch-resources.yaml
 
 replacements:
+  - source:
+      fieldPath: data.INFERENCE_SERVER
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
   - source:
       fieldPath: data.APP_LABEL
       kind: ConfigMap
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-1-schnell/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-1-schnell/kustomization.yaml
index 6cc31e249..a8408c99e 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-1-schnell/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-1-schnell/kustomization.yaml
@@ -28,6 +28,15 @@ patches:
   - path: patch-resources.yaml
 
 replacements:
+  - source:
+      fieldPath: data.INFERENCE_SERVER
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
   - source:
       fieldPath: data.APP_LABEL
       kind: ConfigMap
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/kustomization.yaml
new file mode 100644
index 000000000..f0c05f7a4
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/kustomization.yaml
@@ -0,0 +1,142 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - runtime.env # supplies APP_LABEL, MODEL_ID and MODEL_NAME read by the replacements below
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -l4-flux-2-klein-4b
+
+patches:
+  - path: patch-server.yaml
+  - path: patch-ports.yaml
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements:
+  - source: # INFERENCE_SERVER -> ai.gke.io/inference-server label on the pod template
+      fieldPath: data.INFERENCE_SERVER
+      kind: ConfigMap
+      name: diffusers # not generated in this file; presumably comes from ../base - TODO confirm
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
+  - source: # APP_LABEL -> Deployment selector + pod label, and the Service selector
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source: # CONTAINER_IMAGE_URL -> image of the inference-server container
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source: # INFERENCE_KUBERNETES_NAMESPACE -> namespace of every ConfigMap, Deployment, Service and ServiceAccount
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment # not generated in this file; presumably comes from ../base - TODO confirm
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source: # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> Deployment's serviceAccountName and the ServiceAccount's name
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source: # MODEL_BUCKET_NAME -> first "."-delimited segment of the model volume's bucketName
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0
+        select:
+          kind: Deployment
+  - source: # MODEL_ID -> text after "only-dir:" in mountOptions, and segment index 2 of the "/"-split mountPath
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2
+        select:
+          kind: Deployment
+  - source: # MODEL_NAME -> ai.gke.io/model label on the pod template
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:
+  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..42621a442
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: diffusers # patch target; the Deployment itself is not defined in this directory
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector: # pin the inference pod to the gpu-l4-24gb-s16-x1 compute class
+        cloud.google.com/compute-class: gpu-l4-24gb-s16-x1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-ports.yaml
new file mode 100644
index 000000000..5b87d66b7
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-ports.yaml
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          readinessProbe:
+            httpGet:
+              port: 30000 # matches --port=30000 passed to sglang in patch-server.yaml
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  ports:
+    - port: 8000 # Service keeps listening on 8000 for clients
+      protocol: TCP
+      targetPort: 30000 # forwards to the port sglang listens on (see patch-server.yaml)
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-resources.yaml
new file mode 100644
index 000000000..2d997452a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-resources.yaml
@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          resources: # requests are set equal to limits; a single GPU is requested
+            limits:
+              cpu: "6"
+              memory: 45G
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "6"
+              memory: 45G
+              nvidia.com/gpu: "1"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-server.yaml
new file mode 100644
index 000000000..4725a1810
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/patch-server.yaml
@@ -0,0 +1,34 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          command: # run sglang directly (no entrypoint wrapper in this overlay)
+            - sglang
+            - serve
+          args: # the $(NAME) references match keys in this overlay's runtime.env; presumably exposed as container env vars by the base Deployment - TODO confirm
+            - "--mem-fraction-static=$(GPU_MEMORY_UTILIZATION)"
+            - "--model-path=/gcs/$(MODEL_ID)"
+            - "--tp-size=$(TENSOR_PARALLEL_SIZE)"
+            - "--trust-remote-code"
+            - "--port=30000" # patch-ports.yaml points the readiness probe and Service targetPort at this port
+            - "--host=0.0.0.0"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/runtime.env
new file mode 100644
index 000000000..1392ebab2
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-flux-2-klein-4b/runtime.env
@@ -0,0 +1,5 @@
+APP_LABEL=diffusers-l4-flux-2-klein-4b
+GPU_MEMORY_UTILIZATION=0.95
+MODEL_ID=black-forest-labs/flux.2-klein-4b
+MODEL_NAME=flux-2-klein-4b
+TENSOR_PARALLEL_SIZE=1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/kustomization.yaml
new file mode 100644
index 000000000..537b6b731
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/kustomization.yaml
@@ -0,0 +1,142 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # Overlay for the l4-x2-flux-2-klein-4b variant, built on ../base.
+
+configMapGenerator:  # Generates the "runtime" ConfigMap from the key=value pairs in runtime.env.
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize  # Placeholder; the INFERENCE_KUBERNETES_NAMESPACE replacement below targets ConfigMap namespaces.
+
+nameSuffix: -l4-x2-flux-2-klein-4b  # Suffix kustomize appends to resource names in this overlay.
+
+patches:  # Patch files located in this directory.
+  - path: patch-server.yaml
+  - path: patch-ports.yaml
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements:  # Each entry copies one ConfigMap data value into the listed target fields.
+  - source:  # INFERENCE_SERVER -> Deployment pod-template label ai.gke.io/inference-server.
+      fieldPath: data.INFERENCE_SERVER
+      kind: ConfigMap
+      name: diffusers  # Not generated in this file; presumably provided by ../base - verify.
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
+  - source:  # APP_LABEL (from runtime.env) -> Deployment selector and pod label "app", plus the Service selector.
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:  # CONTAINER_IMAGE_URL -> image of the inference-server container.
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:  # INFERENCE_KUBERNETES_NAMESPACE -> metadata.namespace of ConfigMap, Deployment, Service and ServiceAccount.
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # Not generated in this file; presumably provided by ../base - verify.
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:  # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> Deployment serviceAccountName and the ServiceAccount's own name.
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:  # MODEL_BUCKET_NAME -> bucketName attribute of the huggingface-hub-model-bucket CSI volume.
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:  # Split the existing value on "." and replace only segment 0.
+          delimiter: .
+          index: 0
+        select:
+          kind: Deployment
+  - source:  # MODEL_ID (from runtime.env) -> volume mountOptions and the two container mountPaths.
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:  # Split the existing value on "only-dir:" and replace segment 1 (the text after the delimiter).
+          delimiter: "only-dir:"
+          index: 1
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:  # Split the existing mountPath on "/" and replace segment 2; the base mountPath value is not visible here.
+          delimiter: /
+          index: 2
+        select:
+          kind: Deployment
+  - source:  # MODEL_NAME (from runtime.env) -> Deployment pod-template label ai.gke.io/model.
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:  # The only resource source for this overlay.
+  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..3a1d2988a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment  # Patch for the "diffusers" Deployment; listed under `patches` in this directory's kustomization.yaml.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      nodeSelector:  # Pods are scheduled only onto nodes carrying this label.
+        cloud.google.com/compute-class: gpu-l4-24gb-x2  # Compute class name; presumably defined elsewhere in the platform - verify it exists.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-ports.yaml
new file mode 100644
index 000000000..cc8584d48
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-ports.yaml
@@ -0,0 +1,38 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # First document: sets the readiness-probe port on the inference-server container.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          readinessProbe:
+            httpGet:
+              port: 30000  # Same port as --port=30000 in this directory's patch-server.yaml.
+---
+apiVersion: v1
+kind: Service  # Second document: Service port mapping.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Service namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  ports:
+    - port: 8000  # Port exposed by the Service.
+      protocol: TCP
+      targetPort: 30000  # Pod port that Service traffic is forwarded to.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-resources.yaml
new file mode 100644
index 000000000..4a23f238d
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-resources.yaml
@@ -0,0 +1,33 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment  # Patch for the "diffusers" Deployment; listed under `patches` in this directory's kustomization.yaml.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          resources:  # Requests and limits are set to identical values.
+            limits:
+              cpu: "20"
+              memory: 80G  # Decimal suffix: G means 10^9 bytes (not Gi).
+              nvidia.com/gpu: "2"  # Same value as NUM_GPUS in this directory's runtime.env.
+            requests:
+              cpu: "20"
+              memory: 80G
+              nvidia.com/gpu: "2"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-server.yaml
new file mode 100644
index 000000000..1c267ac3c
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/patch-server.yaml
@@ -0,0 +1,46 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment  # Patch for the "diffusers" Deployment: adds env vars and sets the server command/args.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          env:  # Both values are read from the "runtime" ConfigMap, which kustomization.yaml generates from runtime.env.
+            - name: NUM_GPUS
+              valueFrom:
+                configMapKeyRef:
+                  name: runtime
+                  key: NUM_GPUS
+            - name: TENSOR_PARALLEL_SIZE
+              valueFrom:
+                configMapKeyRef:
+                  name: runtime
+                  key: TENSOR_PARALLEL_SIZE
+          command:  # Entrypoint: `sglang serve`.
+            - sglang
+            - serve
+          args:  # $(NAME) is Kubernetes env-var reference syntax, resolved from the container's environment.
+            - "--model-path=/gcs/$(MODEL_ID)"  # MODEL_ID is not defined in this patch; presumably set on the base container - verify.
+            - "--tp-size=$(TENSOR_PARALLEL_SIZE)"
+            - "--trust-remote-code"
+            - "--port=30000"  # Same port as the readinessProbe and Service targetPort in patch-ports.yaml.
+            - "--host=0.0.0.0"  # Bind address: all interfaces.
+            - "--backend=sglang"
+            - "--num-gpus=$(NUM_GPUS)"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/runtime.env
new file mode 100644
index 000000000..2f2783f52
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x2-flux-2-klein-4b/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=diffusers-l4-x2-flux-2-klein-4b
+GPU_MEMORY_UTILIZATION=0.95
+MODEL_ID=black-forest-labs/flux.2-klein-4b
+MODEL_NAME=flux-2-klein-4b
+TENSOR_PARALLEL_SIZE=2
+NUM_GPUS=2
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/kustomization.yaml
new file mode 100644
index 000000000..0bbc8c8cd
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/kustomization.yaml
@@ -0,0 +1,142 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # Overlay for the l4-x4-flux-2-klein-4b variant, built on ../base.
+
+configMapGenerator:  # Generates the "runtime" ConfigMap from the key=value pairs in runtime.env.
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize  # Placeholder; the INFERENCE_KUBERNETES_NAMESPACE replacement below targets ConfigMap namespaces.
+
+nameSuffix: -l4-x4-flux-2-klein-4b  # Suffix kustomize appends to resource names in this overlay.
+
+patches:  # Patch files located in this directory.
+  - path: patch-server.yaml
+  - path: patch-ports.yaml
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements:  # Each entry copies one ConfigMap data value into the listed target fields.
+  - source:  # INFERENCE_SERVER -> Deployment pod-template label ai.gke.io/inference-server.
+      fieldPath: data.INFERENCE_SERVER
+      kind: ConfigMap
+      name: diffusers  # Not generated in this file; presumably provided by ../base - verify.
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
+  - source:  # APP_LABEL (from runtime.env) -> Deployment selector and pod label "app", plus the Service selector.
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:  # CONTAINER_IMAGE_URL -> image of the inference-server container.
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:  # INFERENCE_KUBERNETES_NAMESPACE -> metadata.namespace of ConfigMap, Deployment, Service and ServiceAccount.
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # Not generated in this file; presumably provided by ../base - verify.
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:  # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> Deployment serviceAccountName and the ServiceAccount's own name.
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:  # MODEL_BUCKET_NAME -> bucketName attribute of the huggingface-hub-model-bucket CSI volume.
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:  # Split the existing value on "." and replace only segment 0.
+          delimiter: .
+          index: 0
+        select:
+          kind: Deployment
+  - source:  # MODEL_ID (from runtime.env) -> volume mountOptions and the two container mountPaths.
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:  # Split the existing value on "only-dir:" and replace segment 1 (the text after the delimiter).
+          delimiter: "only-dir:"
+          index: 1
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:  # Split the existing mountPath on "/" and replace segment 2; the base mountPath value is not visible here.
+          delimiter: /
+          index: 2
+        select:
+          kind: Deployment
+  - source:  # MODEL_NAME (from runtime.env) -> Deployment pod-template label ai.gke.io/model.
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:  # The only resource source for this overlay.
+  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..330f065e4
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment  # Patch for the "diffusers" Deployment; listed under `patches` in this directory's kustomization.yaml.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      nodeSelector:  # Pods are scheduled only onto nodes carrying this label.
+        cloud.google.com/compute-class: gpu-l4-24gb-x4  # Compute class name; presumably defined elsewhere in the platform - verify it exists.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-ports.yaml
new file mode 100644
index 000000000..cc8584d48
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-ports.yaml
@@ -0,0 +1,38 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # First document: sets the readiness-probe port on the inference-server container.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          readinessProbe:
+            httpGet:
+              port: 30000  # Same port as --port=30000 in this directory's patch-server.yaml.
+---
+apiVersion: v1
+kind: Service  # Second document: Service port mapping.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Service namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  ports:
+    - port: 8000  # Port exposed by the Service.
+      protocol: TCP
+      targetPort: 30000  # Pod port that Service traffic is forwarded to.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-resources.yaml
new file mode 100644
index 000000000..04b58a68f
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-resources.yaml
@@ -0,0 +1,33 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment  # Patch for the "diffusers" Deployment; listed under `patches` in this directory's kustomization.yaml.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          resources:  # Requests and limits are set to identical values.
+            limits:
+              cpu: "40"
+              memory: 160G  # Decimal suffix: G means 10^9 bytes (not Gi).
+              nvidia.com/gpu: "4"  # Same value as NUM_GPUS in this directory's runtime.env.
+            requests:
+              cpu: "40"
+              memory: 160G
+              nvidia.com/gpu: "4"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-server.yaml
new file mode 100644
index 000000000..1c267ac3c
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/patch-server.yaml
@@ -0,0 +1,46 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment  # Patch for the "diffusers" Deployment: adds env vars and sets the server command/args.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          env:  # Both values are read from the "runtime" ConfigMap, which kustomization.yaml generates from runtime.env.
+            - name: NUM_GPUS
+              valueFrom:
+                configMapKeyRef:
+                  name: runtime
+                  key: NUM_GPUS
+            - name: TENSOR_PARALLEL_SIZE
+              valueFrom:
+                configMapKeyRef:
+                  name: runtime
+                  key: TENSOR_PARALLEL_SIZE
+          command:  # Entrypoint: `sglang serve`.
+            - sglang
+            - serve
+          args:  # $(NAME) is Kubernetes env-var reference syntax, resolved from the container's environment.
+            - "--model-path=/gcs/$(MODEL_ID)"  # MODEL_ID is not defined in this patch; presumably set on the base container - verify.
+            - "--tp-size=$(TENSOR_PARALLEL_SIZE)"
+            - "--trust-remote-code"
+            - "--port=30000"  # Same port as the readinessProbe and Service targetPort in patch-ports.yaml.
+            - "--host=0.0.0.0"  # Bind address: all interfaces.
+            - "--backend=sglang"
+            - "--num-gpus=$(NUM_GPUS)"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/runtime.env
new file mode 100644
index 000000000..60aeb744b
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/l4-x4-flux-2-klein-4b/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=diffusers-l4-x4-flux-2-klein-4b
+GPU_MEMORY_UTILIZATION=0.95
+MODEL_ID=black-forest-labs/flux.2-klein-4b
+MODEL_NAME=flux-2-klein-4b
+TENSOR_PARALLEL_SIZE=4
+NUM_GPUS=4
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/kustomization.yaml
new file mode 100644
index 000000000..f0935a835
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/kustomization.yaml
@@ -0,0 +1,141 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # Overlay for the rtx-pro-6000-1-2-flux-2-klein-4b variant, built on ../base-vgpu.
+
+configMapGenerator:  # Generates the "runtime" ConfigMap from the key=value pairs in runtime.env.
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize  # Placeholder; the INFERENCE_KUBERNETES_NAMESPACE replacement below targets ConfigMap namespaces.
+
+nameSuffix: -rtx-pro-6000-1-2-flux-2-klein-4b  # Suffix kustomize appends to resource names in this overlay.
+
+patches:  # Patch files located in this directory; no patch-server.yaml is listed for this overlay.
+  - path: patch-ports.yaml
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements:  # Each entry copies one ConfigMap data value into the listed target fields.
+  - source:  # INFERENCE_SERVER -> Deployment pod-template label ai.gke.io/inference-server.
+      fieldPath: data.INFERENCE_SERVER
+      kind: ConfigMap
+      name: diffusers  # Not generated in this file; presumably provided by ../base-vgpu - verify.
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
+  - source:  # APP_LABEL (from runtime.env) -> Deployment selector and pod label "app", plus the Service selector.
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:  # CONTAINER_IMAGE_URL -> image of the inference-server container.
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:  # INFERENCE_KUBERNETES_NAMESPACE -> metadata.namespace of ConfigMap, Deployment, Service and ServiceAccount.
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # Not generated in this file; presumably provided by ../base-vgpu - verify.
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:  # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> Deployment serviceAccountName and the ServiceAccount's own name.
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:  # MODEL_BUCKET_NAME -> bucketName attribute of the huggingface-hub-model-bucket CSI volume.
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:  # Split the existing value on "." and replace only segment 0.
+          delimiter: .
+          index: 0
+        select:
+          kind: Deployment
+  - source:  # MODEL_ID (from runtime.env) -> volume mountOptions and the two container mountPaths.
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:  # Split the existing value on "only-dir:" and replace segment 1 (the text after the delimiter).
+          delimiter: "only-dir:"
+          index: 1
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:  # Split the existing mountPath on "/" and replace segment 2; the base mountPath value is not visible here.
+          delimiter: /
+          index: 2
+        select:
+          kind: Deployment
+  - source:  # MODEL_NAME (from runtime.env) -> Deployment pod-template label ai.gke.io/model.
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:  # The only resource source for this overlay.
+  - ../base-vgpu
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..157faff63
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # Patch for the "diffusers" Deployment; listed under `patches` in this directory's kustomization.yaml.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      nodeSelector:  # Pods are scheduled only onto nodes carrying this label.
+        cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1-2  # Compute class name; presumably defined elsewhere in the platform - verify it exists.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-ports.yaml
new file mode 100644
index 000000000..cc8584d48
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-ports.yaml
@@ -0,0 +1,38 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # First document: sets the readiness-probe port on the inference-server container.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Deployment namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          readinessProbe:
+            httpGet:
+              port: 30000  # NOTE(review): this overlay has no patch-server.yaml setting --port; confirm ../base-vgpu serves on 30000.
+---
+apiVersion: v1
+kind: Service  # Second document: Service port mapping.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize  # Placeholder; kustomization.yaml replaces Service namespaces with INFERENCE_KUBERNETES_NAMESPACE.
+spec:
+  ports:
+    - port: 8000  # Port exposed by the Service.
+      protocol: TCP
+      targetPort: 30000  # Pod port that Service traffic is forwarded to; same value as the readiness-probe port above.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-resources.yaml
new file mode 100644
index 000000000..8193521e2
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/patch-resources.yaml
@@ -0,0 +1,40 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: pod annotation plus inference-server resources.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    metadata:
+      annotations: # Value below is a literal block listing four /dev/nvidia-caps device paths for the inference-server container.
+        devices.gke.io/container.inference-server: |+
+          - path: /dev/nvidia-caps/nvidia-cap1
+          - path: /dev/nvidia-caps/nvidia-cap2
+          - path: /dev/nvidia-caps/nvidia-cap3
+          - path: /dev/nvidia-caps/nvidia-cap4
+    spec:
+      containers:
+        - name: inference-server
+          resources:
+            limits: # Identical to the requests below.
+              cpu: "6"
+              memory: 45G # Decimal gigabytes (G), not Gi.
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "6"
+              memory: 45G
+              nvidia.com/gpu: "1"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/runtime.env
new file mode 100644
index 000000000..f35250665
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-2-flux-2-klein-4b/runtime.env
@@ -0,0 +1,5 @@
+APP_LABEL=diffusers-rtx-pro-6000-1-2-flux-2-klein-4b
+GPU_MEMORY_UTILIZATION=0.95
+MODEL_ID=black-forest-labs/flux.2-klein-4b
+MODEL_NAME=flux-2-klein-4b
+TENSOR_PARALLEL_SIZE=1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/kustomization.yaml
new file mode 100644
index 000000000..ab6afae70
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/kustomization.yaml
@@ -0,0 +1,141 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator: # Builds the "runtime" ConfigMap from runtime.env.
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -rtx-pro-6000-1-4-flux-2-klein-4b # Suffix kustomize appends to resource names.
+
+patches: # Patch files located next to this kustomization.
+  - path: patch-ports.yaml
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements: # Each entry copies one ConfigMap value into the listed fields.
+  - source:
+      fieldPath: data.INFERENCE_SERVER # -> ai.gke.io/inference-server pod label
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.APP_LABEL # -> "app" selector/label on the Deployment and Service
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:
+      fieldPath: data.CONTAINER_IMAGE_URL # -> inference-server container image
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE # -> metadata.namespace of the four kinds below
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT # -> serviceAccountName and the ServiceAccount's own name
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME # -> bucketName of the huggingface-hub-model-bucket CSI volume
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0 # Only the first "."-delimited segment is replaced.
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_ID # -> only-dir mount option and the model-bucket mount paths
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1 # Segment that follows "only-dir:".
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2 # Segment at 0-based index 2 of the "/"-split mountPath.
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_NAME # -> ai.gke.io/model pod label
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:
+  - ../base-vgpu # Base that the patches and replacements above are applied to.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..d50d8bf60
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: sets only the pod nodeSelector.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize # Placeholder; kustomization.yaml replaces metadata.namespace.
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1-4 # Pods schedule only onto nodes carrying this label value.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-ports.yaml
new file mode 100644
index 000000000..cc8584d48
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-ports.yaml
@@ -0,0 +1,38 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: sets only the inference-server readiness probe port.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          readinessProbe:
+            httpGet:
+              port: 30000 # Same port the Service below uses as targetPort.
+---
+apiVersion: v1
+kind: Service # Patch: sets the Service port mapping.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  ports:
+    - port: 8000 # Port the Service exposes.
+      protocol: TCP
+      targetPort: 30000 # Pod port that receives the traffic.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-resources.yaml
new file mode 100644
index 000000000..c6655431e
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/patch-resources.yaml
@@ -0,0 +1,40 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: pod annotation plus inference-server resources.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    metadata:
+      annotations: # Value below is a literal block listing four /dev/nvidia-caps device paths for the inference-server container.
+        devices.gke.io/container.inference-server: |+
+          - path: /dev/nvidia-caps/nvidia-cap1
+          - path: /dev/nvidia-caps/nvidia-cap2
+          - path: /dev/nvidia-caps/nvidia-cap3
+          - path: /dev/nvidia-caps/nvidia-cap4
+    spec:
+      containers:
+        - name: inference-server
+          resources:
+            limits: # Identical to the requests below.
+              cpu: "6"
+              memory: 38G # Decimal gigabytes (G), not Gi.
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "6"
+              memory: 38G
+              nvidia.com/gpu: "1"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/runtime.env
new file mode 100644
index 000000000..646fdd22f
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-4-flux-2-klein-4b/runtime.env
@@ -0,0 +1,5 @@
+APP_LABEL=diffusers-rtx-pro-6000-1-4-flux-2-klein-4b
+GPU_MEMORY_UTILIZATION=0.95
+MODEL_ID=black-forest-labs/flux.2-klein-4b
+MODEL_NAME=flux-2-klein-4b
+TENSOR_PARALLEL_SIZE=1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/kustomization.yaml
new file mode 100644
index 000000000..51bed340a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/kustomization.yaml
@@ -0,0 +1,153 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator: # Builds the "runtime" ConfigMap from runtime.env.
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -rtx-pro-6000-1-8-flux-2-klein-4b # Suffix kustomize appends to resource names.
+
+patches: # Three patch files plus one inline JSON patch.
+  - path: patch-ports.yaml
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+  - target: # Inline JSON patch: drops emptyDir.medium from the volumes at indices 3, 4 and 5.
+      group: apps
+      version: v1
+      kind: Deployment
+      name: diffusers
+    patch: |-
+      - op: remove
+        path: /spec/template/spec/volumes/3/emptyDir/medium
+      - op: remove
+        path: /spec/template/spec/volumes/4/emptyDir/medium
+      - op: remove
+        path: /spec/template/spec/volumes/5/emptyDir/medium
+
+replacements: # Each entry copies one ConfigMap value into the listed fields.
+  - source:
+      fieldPath: data.INFERENCE_SERVER # -> ai.gke.io/inference-server pod label
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.APP_LABEL # -> "app" selector/label on the Deployment and Service
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:
+      fieldPath: data.CONTAINER_IMAGE_URL # -> image of the inference-server and preload-model containers
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+          - spec.template.spec.initContainers.[name=preload-model].image # Overrides the literal image set in patch-resources.yaml.
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE # -> metadata.namespace of the four kinds below
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT # -> serviceAccountName and the ServiceAccount's own name
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME # -> bucketName of the huggingface-hub-model-bucket CSI volume
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0 # Only the first "."-delimited segment is replaced.
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_ID # -> only-dir mount option and the inference-server model-bucket mount path
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1 # Segment that follows "only-dir:".
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2 # Segment at 0-based index 2 of the "/"-split mountPath.
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_NAME # -> ai.gke.io/model pod label
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:
+  - ../base-vgpu # Base that the patches and replacements above are applied to.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..fd81a2a54
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: sets only the pod nodeSelector.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize # Placeholder; kustomization.yaml replaces metadata.namespace.
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1-8 # Pods schedule only onto nodes carrying this label value.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-ports.yaml
new file mode 100644
index 000000000..cc8584d48
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-ports.yaml
@@ -0,0 +1,38 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: sets only the inference-server readiness probe port.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          readinessProbe:
+            httpGet:
+              port: 30000 # Same port the Service below uses as targetPort.
+---
+apiVersion: v1
+kind: Service # Patch: sets the Service port mapping.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  ports:
+    - port: 8000 # Port the Service exposes.
+      protocol: TCP
+      targetPort: 30000 # Pod port that receives the traffic.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-resources.yaml
new file mode 100644
index 000000000..b90a86b0b
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/patch-resources.yaml
@@ -0,0 +1,82 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: adds a model-preload init container and reconfigures inference-server.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    metadata:
+      annotations: # Value below is a literal block listing four /dev/nvidia-caps device paths for the inference-server container.
+        devices.gke.io/container.inference-server: |+
+          - path: /dev/nvidia-caps/nvidia-cap1
+          - path: /dev/nvidia-caps/nvidia-cap2
+          - path: /dev/nvidia-caps/nvidia-cap3
+          - path: /dev/nvidia-caps/nvidia-cap4
+    spec:
+      tolerations: # Tolerates the NoSchedule memory-pressure node taint.
+        - effect: NoSchedule
+          key: node.kubernetes.io/memory-pressure
+          operator: Exists
+      initContainers:
+        - name: preload-model # Copies the model from the /gcs mount into the local-model-storage volume.
+          image: us-central1-docker.pkg.dev/accelerated-platforms-dev/frr-l4/gpu/sglang:latest # NOTE(review): literal dev-project image with a mutable :latest tag.
+          command: ["/bin/bash", "-c"]
+          args:
+            - |
+              set -eo pipefail
+              echo "########### Copying model files to local SSD storage..."
+              mkdir -p /local-model/black-forest-labs
+              cp -a /gcs/black-forest-labs/flux.2-klein-4b /local-model/black-forest-labs/
+              echo "########### Copy completed successfully!"
+              ls -lh /local-model/black-forest-labs/flux.2-klein-4b
+          volumeMounts:
+            - mountPath: /gcs/black-forest-labs/flux.2-klein-4b # Source of the copy, mounted read-only.
+              name: huggingface-hub-model-bucket
+              readOnly: true
+            - mountPath: /local-model # Destination of the copy.
+              name: local-model-storage
+      containers:
+        - name: inference-server
+          args:
+            - serve
+            - --mem-fraction-static=$(GPU_MEMORY_UTILIZATION)
+            - --model-path=/local-model/black-forest-labs/flux.2-klein-4b # Path populated by the preload-model init container.
+            - --tp-size=$(TENSOR_PARALLEL_SIZE)
+            - --trust-remote-code
+            - --port=30000 # Same port as the readiness probe and Service targetPort in patch-ports.yaml.
+            - --host=0.0.0.0
+            - --text-encoder-cpu-offload
+            - --vae-cpu-offload
+            - --pin-cpu-memory=False
+          resources:
+            limits:
+              cpu: "4"
+              memory: 23500Mi
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "4"
+              memory: 12Gi # Lower than the 23500Mi limit above.
+              nvidia.com/gpu: "1"
+          volumeMounts:
+            - mountPath: /local-model
+              name: local-model-storage
+              readOnly: true # The server only reads the preloaded copy.
+        - $patch: delete # Strategic-merge directive: removes the container named below.
+          name: fetch-safetensors
+      volumes:
+        - name: local-model-storage
+          emptyDir: {} # No medium or sizeLimit specified.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/runtime.env
new file mode 100644
index 000000000..5021ea1d5
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-1-8-flux-2-klein-4b/runtime.env
@@ -0,0 +1,5 @@
+APP_LABEL=diffusers-rtx-pro-6000-1-8-flux-2-klein-4b
+GPU_MEMORY_UTILIZATION=0.75
+MODEL_ID=black-forest-labs/flux.2-klein-4b
+MODEL_NAME=flux-2-klein-4b
+TENSOR_PARALLEL_SIZE=1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/kustomization.yaml
new file mode 100644
index 000000000..ab55bbdca
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/kustomization.yaml
@@ -0,0 +1,142 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator: # Builds the "runtime" ConfigMap from runtime.env.
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -rtx-pro-6000-flux-2-klein-4b # Suffix kustomize appends to resource names.
+
+patches: # Patch files located next to this kustomization.
+  - path: patch-server.yaml
+  - path: patch-ports.yaml
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements: # Each entry copies one ConfigMap value into the listed fields.
+  - source:
+      fieldPath: data.INFERENCE_SERVER # -> ai.gke.io/inference-server pod label
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/inference-server]
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.APP_LABEL # -> "app" selector/label on the Deployment and Service
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:
+      fieldPath: data.CONTAINER_IMAGE_URL # -> inference-server container image
+      kind: ConfigMap
+      name: diffusers
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE # -> metadata.namespace of the four kinds below
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT # -> serviceAccountName and the ServiceAccount's own name
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME # -> bucketName of the huggingface-hub-model-bucket CSI volume
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0 # Only the first "."-delimited segment is replaced.
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_ID # -> only-dir mount option and the model-bucket mount paths
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1 # Segment that follows "only-dir:".
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2 # Segment at 0-based index 2 of the "/"-split mountPath.
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_NAME # -> ai.gke.io/model pod label
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:
+  - ../base # Base that the patches and replacements above are applied to.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..44af184b7
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: sets only the pod nodeSelector.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize # Placeholder; kustomization.yaml replaces metadata.namespace.
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1 # Pods schedule only onto nodes carrying this label value.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-ports.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-ports.yaml
new file mode 100644
index 000000000..cc8584d48
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-ports.yaml
@@ -0,0 +1,38 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment # Patch: sets only the inference-server readiness probe port.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          readinessProbe:
+            httpGet:
+              port: 30000 # Same port the Service below uses as targetPort.
+---
+apiVersion: v1
+kind: Service # Patch: sets the Service port mapping.
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  ports:
+    - port: 8000 # Port the Service exposes.
+      protocol: TCP
+      targetPort: 30000 # Pod port that receives the traffic.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-resources.yaml
new file mode 100644
index 000000000..2d997452a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-resources.yaml
@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          resources:
+            limits:
+              cpu: "6"
+              memory: 45G
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "6"
+              memory: 45G
+              nvidia.com/gpu: "1"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-server.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-server.yaml
new file mode 100644
index 000000000..4725a1810
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/patch-server.yaml
@@ -0,0 +1,34 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: diffusers
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          command:
+            - sglang
+            - serve
+          args:
+            - "--mem-fraction-static=$(GPU_MEMORY_UTILIZATION)"
+            - "--model-path=/gcs/$(MODEL_ID)"
+            - "--tp-size=$(TENSOR_PARALLEL_SIZE)"
+            - "--trust-remote-code"
+            - "--port=30000"
+            - "--host=0.0.0.0"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/runtime.env
new file mode 100644
index 000000000..a4eaea70a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/rtx-pro-6000-flux-2-klein-4b/runtime.env
@@ -0,0 +1,5 @@
+APP_LABEL=diffusers-rtx-pro-6000-flux-2-klein-4b
+GPU_MEMORY_UTILIZATION=0.95
+MODEL_ID=black-forest-labs/flux.2-klein-4b
+MODEL_NAME=flux-2-klein-4b
+TENSOR_PARALLEL_SIZE=1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md b/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md
index d235a13fd..d855ec88b 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md
@@ -149,10 +149,10 @@ For more information about providing values for Terraform input variables, see
 
 - Configure the platform.
 
-  - [Optional]
+  - \[Optional\]
     [Hugging Face initialization](/platforms/gke/base/core/huggingface/initialize/README.md)
-  - [Optional]
-    [NVIDIA initialization](/platforms/gke/base/core/nvidia/initialize/README.md)
+  - \[Optional\]
+    [NVIDIA NGC initialization](/platforms/gke/base/core/nvidia/initialize/README.md)
 
 ### Resources created
 
@@ -180,6 +180,7 @@ For more information about providing values for Terraform input variables, see
           <a href="/platforms/gke/base/core/custom_compute_class/templates/manifests/cpu">CPU</a>
         </summary>
         <ul>
+          <li>cpu-e2-s-16</li>
           <li>cpu-n4-s-8</li>
         <ul>
       </details>
@@ -204,10 +205,14 @@ For more information about providing values for Terraform input variables, see
           <li>gpu-l4-24gb-x2</li>
           <li>gpu-l4-24gb-x4</li>
           <li>gpu-l4-24gb-x8</li>
+          <li>gpu-rtx-pro-6000-96gb-x1</li>
+          <li>gpu-rtx-pro-6000-96gb-x1-2</li>
+          <li>gpu-rtx-pro-6000-96gb-x1-4</li>
+          <li>gpu-rtx-pro-6000-96gb-x1-8</li>
         <ul>
       </details>
     - <details>
-        <summary>        
+        <summary>
           <a href="/platforms/gke/base/core/custom_compute_class/templates/manifests/tpu">TPU</a>
         </summary>
         <ul>
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch_variables.tf
index 6992f930d..2450e427f 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch_variables.tf
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch_variables.tf
@@ -52,10 +52,13 @@ locals {
 
   ira_offline_batch_project_id = var.ira_offline_batch_project_id != null ? var.ira_offline_batch_project_id : var.platform_default_project_id
 
-  ira_online_gpu_diffusers_flux_image_url        = var.ira_online_gpu_diffusers_flux_image_url != null ? var.ira_online_gpu_diffusers_flux_image_url : "${local.cloudbuild_ar_image_repository_url}/gpu-diffusers/flux:latest"
-  ira_online_gpu_kubernetes_namespace_name       = var.ira_online_gpu_kubernetes_namespace_name != null ? var.ira_online_gpu_kubernetes_namespace_name : "${local.unique_identifier_prefix}-online-gpu"
-  ira_online_gpu_kubernetes_service_account_name = var.ira_online_gpu_kubernetes_service_account_name != null ? var.ira_online_gpu_kubernetes_service_account_name : "${local.unique_identifier_prefix}-online-gpu"
-  ira_online_gpu_vllm_image_url                  = var.ira_online_gpu_vllm_image_url != null ? var.ira_online_gpu_vllm_image_url : "${local.cloudbuild_ar_image_repository_url}/vllm/gpu:latest"
+  ira_cpu_k6_benchmark_image_url = var.ira_cpu_k6_benchmark_image_url != null ? var.ira_cpu_k6_benchmark_image_url : "${local.cloudbuild_ar_image_repository_url}/cpu/k6-benchmark:latest"
+
+  ira_online_gpu_diffusers_flux_image_url             = var.ira_online_gpu_diffusers_flux_image_url != null ? var.ira_online_gpu_diffusers_flux_image_url : "${local.cloudbuild_ar_image_repository_url}/gpu-diffusers/flux:latest"
+  ira_online_gpu_diffusers_sglang_diffusers_image_url = var.ira_online_gpu_sglang_diffusers_image_url != null ? var.ira_online_gpu_sglang_diffusers_image_url : "${local.cloudbuild_ar_image_repository_url}/gpu/sglang:latest"
+  ira_online_gpu_kubernetes_namespace_name            = var.ira_online_gpu_kubernetes_namespace_name != null ? var.ira_online_gpu_kubernetes_namespace_name : "${local.unique_identifier_prefix}-online-gpu"
+  ira_online_gpu_kubernetes_service_account_name      = var.ira_online_gpu_kubernetes_service_account_name != null ? var.ira_online_gpu_kubernetes_service_account_name : "${local.unique_identifier_prefix}-online-gpu"
+  ira_online_gpu_vllm_image_url                       = var.ira_online_gpu_vllm_image_url != null ? var.ira_online_gpu_vllm_image_url : "${local.cloudbuild_ar_image_repository_url}/vllm/gpu:latest"
 
   ira_online_tpu_kubernetes_namespace_name       = var.ira_online_tpu_kubernetes_namespace_name != null ? var.ira_online_tpu_kubernetes_namespace_name : "${local.unique_identifier_prefix}-online-tpu"
   ira_online_tpu_kubernetes_service_account_name = var.ira_online_tpu_kubernetes_service_account_name != null ? var.ira_online_tpu_kubernetes_service_account_name : "${local.unique_identifier_prefix}-online-tpu"
@@ -219,6 +222,11 @@ variable "ira_online_gpu_diffusers_flux_image_url" {
   type        = string
 }
 
+variable "ira_cpu_k6_benchmark_image_url" {
+  default     = null
+  description = "The URL for the k6 benchmark container image."
+  type        = string
+}
 
 variable "ira_online_gpu_kubernetes_namespace_name" {
   default     = null
@@ -238,6 +246,12 @@ variable "ira_online_gpu_vllm_image_url" {
   type        = string
 }
 
+variable "ira_online_gpu_sglang_diffusers_image_url" {
+  default     = null
+  description = "The URL for the GPU SGLang Diffusers container image."
+  type        = string
+}
+
 variable "ira_online_tpu_kubernetes_namespace_name" {
   default     = null
   description = "The Kubernetes namespace for the online TPU inference workloads."
@@ -291,4 +305,3 @@ variable "enable_tpu" {
   description = "Turns on inference-perf resources for TPU cluster"
   type        = bool
 }
-
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf
index 8f573d8f7..824a1a8ca 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf
@@ -152,10 +152,18 @@ output "ira_offline_batch_project_id" {
   value = local.ira_offline_batch_project_id
 }
 
+output "ira_cpu_k6_benchmark_image_url" {
+  value = local.ira_cpu_k6_benchmark_image_url
+}
+
 output "ira_online_gpu_diffusers_flux_image_url" {
   value = local.ira_online_gpu_diffusers_flux_image_url
 }
 
+output "ira_online_gpu_diffusers_sglang_diffusers_image_url" {
+  value = local.ira_online_gpu_diffusers_sglang_diffusers_image_url
+}
+
 output "ira_online_gpu_kubernetes_namespace_name" {
   value = local.ira_online_gpu_kubernetes_namespace_name
 }
@@ -239,4 +247,3 @@ output "workflow_api_service_account_oauth_display_name" {
 output "workflow_api_service_account_project_id" {
   value = local.workflow_api_service_account_project_id
 }
-
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh
index ecfbd0c4e..da72d5651 100755
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh
@@ -14,7 +14,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 MY_PATH_IRA_ENV="$(
-  cd "$(dirname "${BASH_SOURCE}")" >/dev/null 2>&1
+  SCRIPT_SOURCE="${BASH_SOURCE[0]:-}"
+  if [[ -z "${SCRIPT_SOURCE:-}" ]]; then
+    # Fallback in case BASH_SOURCE is not defined, such as when sourcing this
+    # script from a non-Bash shell
+    SCRIPT_SOURCE="$0"
+  fi
+
+  cd "$(dirname "${SCRIPT_SOURCE}")" >/dev/null 2>&1 || exit 1 # exit, not return: this is a subshell
   pwd -P
 )"
 
@@ -36,6 +43,7 @@ if [[ -v HF_MODEL_ID ]]; then
 
   HF_MODEL_NAME="${HF_MODEL_ID##*/}"
   HF_MODEL_NAME="${HF_MODEL_NAME//./-}"
-  HF_MODEL_NAME="${HF_MODEL_NAME,,}"
+  # Don't use ,, to make this portable across shells
+  HF_MODEL_NAME="$(echo "${HF_MODEL_NAME}" | tr '[:upper:]' '[:lower:]')"
   export HF_MODEL_NAME
 fi
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/deploy-standard.sh b/platforms/gke/base/use-cases/inference-ref-arch/terraform/deploy-standard.sh
index c83ae354b..d2f6a6a66 100755
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/deploy-standard.sh
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/deploy-standard.sh
@@ -50,6 +50,7 @@ declare -a CORE_TERRASERVICES_APPLY=(
   "workloads/jobset"
   "workloads/lws"
   "workloads/priority_class"
+  "workloads/nri_device_injector"
   "workloads/kueue"
 )
 CORE_TERRASERVICES_APPLY="${CORE_TERRASERVICES_APPLY[*]}" "${ACP_PLATFORM_CORE_DIR}/deploy.sh"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/.terraform.lock.hcl b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/.terraform.lock.hcl
new file mode 100644
index 000000000..45a622858
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/.terraform.lock.hcl
@@ -0,0 +1,42 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/hashicorp/google" {
+  version     = "6.49.2"
+  constraints = "6.49.2"
+  hashes = [
+    "h1:+B64rc5fCMrWtjZIsQGx/fftYiQnSInfqrLs76PZNH0=",
+    "zh:04dbba38cc201d8f35f21c65fe5fe022b2ef30712c59d0b04df1182ee484ee29",
+    "zh:37478f37b696e214049a7c1e397a6ebcf6b10e3652a6275c5e99ef972a0cd17f",
+    "zh:3a68292e88e6612ed014e22d53a693859071337fcc49a244936094ae8f2b82d8",
+    "zh:4adc8c706652b6c170c520bd3815abba7e145aeec26a2abdfa8a98ae85fbfc0d",
+    "zh:5e8dbf922be32eb54c370260fd71e8124d4d7a3bddc2d0e6b47b15efc30a2224",
+    "zh:632bccc9396e61947242095738164ae27db060b1c172422b41e3b12e80236ecc",
+    "zh:66ee64a5621199868c8fa68492124d38b37e1d733d240508c595b124b5123cb7",
+    "zh:6843060f0673a4e556c248672171c8a29c7faeaee9954cdffeb19a55de7e5184",
+    "zh:87d3b0bd397de17ea6c8b34c898afb9f08eda28c6c6272d8dd75fe17ceef77f3",
+    "zh:9d2f0f93f4506dc0002c2dec1b2117626b6376c214653b71629a933ce77e3523",
+    "zh:e80ccae3d640dca17b496220e3f42f6f0cc4c6fb80ffae9e2bbaea446373c137",
+    "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/local" {
+  version     = "2.5.3"
+  constraints = "2.5.3"
+  hashes = [
+    "h1:1Nkh16jQJMp0EuDmvP/96f5Unnir0z12WyDuoR6HjMo=",
+    "zh:284d4b5b572eacd456e605e94372f740f6de27b71b4e1fd49b63745d8ecd4927",
+    "zh:40d9dfc9c549e406b5aab73c023aa485633c1b6b730c933d7bcc2fa67fd1ae6e",
+    "zh:6243509bb208656eb9dc17d3c525c89acdd27f08def427a0dce22d5db90a4c8b",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:885d85869f927853b6fe330e235cd03c337ac3b933b0d9ae827ec32fa1fdcdbf",
+    "zh:bab66af51039bdfcccf85b25fe562cbba2f54f6b3812202f4873ade834ec201d",
+    "zh:c505ff1bf9442a889ac7dca3ac05a8ee6f852e0118dd9a61796a2f6ff4837f09",
+    "zh:d36c0b5770841ddb6eaf0499ba3de48e5d4fc99f4829b6ab66b0fab59b1aaf4f",
+    "zh:ddb6a407c7f3ec63efb4dad5f948b54f7f4434ee1a2607a49680d494b1776fe1",
+    "zh:e0dafdd4500bec23d3ff221e3a9b60621c5273e5df867bc59ef6b7e41f5c91f6",
+    "zh:ece8742fd2882a8fc9d6efd20e2590010d43db386b920b2a9c220cfecc18de47",
+    "zh:f4c6b3eb8f39105004cf720e202f04f57e3578441cfb76ca27611139bc116a82",
+  ]
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild.auto.tfvars
new file mode 120000
index 000000000..238bf8e95
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild.auto.tfvars
@@ -0,0 +1 @@
+../../../_shared_config/_cloudbuild.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild_variables.tf
new file mode 120000
index 000000000..8fade6147
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_cloudbuild_variables.tf
@@ -0,0 +1 @@
+../../../_shared_config/_cloudbuild_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch.auto.tfvars
new file mode 120000
index 000000000..c89c6eab2
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch.auto.tfvars
@@ -0,0 +1 @@
+../../../_shared_config/inference-ref-arch.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch_variables.tf
new file mode 120000
index 000000000..b2f96723d
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_inference-ref-arch_variables.tf
@@ -0,0 +1 @@
+../../../_shared_config/inference-ref-arch_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform.auto.tfvars
new file mode 120000
index 000000000..c9c406bba
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform.auto.tfvars
@@ -0,0 +1 @@
+../../../_shared_config/_platform.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform_variables.tf
new file mode 120000
index 000000000..7ec64070d
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/_platform_variables.tf
@@ -0,0 +1 @@
+../../../_shared_config/_platform_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/cloudbuild.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/cloudbuild.tf
new file mode 100644
index 000000000..b7640b5f5
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/cloudbuild.tf
@@ -0,0 +1,45 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+locals {
+  image_destination = local.ira_cpu_k6_benchmark_image_url
+}
+
+resource "terraform_data" "submit_docker_build_k6_benchmark" {
+  input = {
+    acp_root                      = local.acp_root
+    cloudbuild_project_id         = local.cloudbuild_project_id
+    cloudbuild_service_account_id = local.cloudbuild_service_account_id
+    cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name
+    image_destination             = local.ira_cpu_k6_benchmark_image_url
+  }
+
+  provisioner "local-exec" {
+    command     = <<-EOT
+gcloud builds submit \
+--config="cloudbuild.yaml" \
+--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \
+--project="${self.input.cloudbuild_project_id}" \
+--quiet \
+--service-account="${self.input.cloudbuild_service_account_id}" \
+--substitutions=_DESTINATION="${self.input.image_destination}"
+EOT
+    interpreter = ["bash", "-c"]
+    working_dir = "${local.acp_root}/container-images/cpu/k6-benchmark"
+  }
+
+  triggers_replace = {
+    source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/cpu/k6-benchmark", "**") : filesha256("${local.acp_root}/container-images/cpu/k6-benchmark/${file}")]))
+  }
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/local_file.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/local_file.tf
new file mode 100644
index 000000000..2635bb2b3
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/local_file.tf
@@ -0,0 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+locals {
+  acp_root = "${path.module}/../../../../../../../../.."
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/versions.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/versions.tf
new file mode 100644
index 000000000..34a59fbc9
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/cpu/k6_benchmark/versions.tf
@@ -0,0 +1,32 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_version = ">= 1.5.7"
+
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "6.49.2"
+    }
+    local = {
+      source  = "hashicorp/local"
+      version = "2.5.3"
+    }
+  }
+
+  provider_meta "google" {
+    module_name = "cloud-solutions/acp_ira_images_cpu_k6_benchmark_deploy-v1"
+  }
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/cloudbuild.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/cloudbuild.tf
index ab6a646b1..5a11160ef 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/cloudbuild.tf
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/images/gpu/diffusers_flux/cloudbuild.tf
@@ -45,3 +45,31 @@ EOT
     source_hash          = sha256(join("", [for file in fileset("${local.acp_root}/container-images/gpu/diffusers-flux/src", "**") : filesha256("${local.acp_root}/container-images/gpu/diffusers-flux/src/${file}")]))
   }
 }
+
+resource "terraform_data" "submit_sglang_diffusers" {
+  input = {
+    acp_root                      = local.acp_root
+    cloudbuild_project_id         = local.cloudbuild_project_id
+    cloudbuild_service_account_id = local.cloudbuild_service_account_id
+    cloudbuild_source_bucket_name = local.cloudbuild_source_bucket_name
+    image_destination             = local.ira_online_gpu_diffusers_sglang_diffusers_image_url
+  }
+
+  provisioner "local-exec" {
+    command     = <<-EOT
+gcloud builds submit \
+--config="cloudbuild.yaml" \
+--gcs-source-staging-dir="gs://${self.input.cloudbuild_source_bucket_name}/source" \
+--project="${self.input.cloudbuild_project_id}" \
+--quiet \
+--service-account="${self.input.cloudbuild_service_account_id}" \
+--substitutions=_DESTINATION="${self.input.image_destination}"
+EOT
+    interpreter = ["bash", "-c"]
+    working_dir = "${local.acp_root}/container-images/gpu/sglang-diffusers"
+  }
+
+  triggers_replace = {
+    source_hash = sha256(join("", [for file in fileset("${local.acp_root}/container-images/gpu/sglang-diffusers", "**") : filesha256("${local.acp_root}/container-images/gpu/sglang-diffusers/${file}")]))
+  }
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-standard.sh b/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-standard.sh
index 1d0e1ba9a..e59cf3f97 100755
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-standard.sh
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-standard.sh
@@ -69,6 +69,7 @@ done
 if [ "${ACP_TEARDOWN_CORE_PLATFORM}" = "true" ]; then
   declare -a CORE_TERRASERVICES_DESTROY=(
     "workloads/kueue"
+    "workloads/nri_device_injector"
     "workloads/priority_class"
     "workloads/lws"
     "workloads/jobset"
diff --git a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh
index 102621046..3a507e469 100755
--- a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh
+++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh
@@ -47,9 +47,14 @@ export ACCELERATOR_TYPE="l4"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-load-generator/configure_load_generator.sh"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/configure_pubsub_subscriber.sh"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/vllm/configure_vllm.sh"
-"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm/configure_vllm.sh"
 
+# Validate diffusers kustomize
+export HF_MODEL_ID="black-forest-labs/flux.1-schnell"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh"
+export HF_MODEL_ID="black-forest-labs/flux.2-klein-4b"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh"
+
 export ACCELERATOR_TYPE="v5e"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/max-diffusion/configure_max_diffusion.sh"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/vllm/configure_vllm.sh"
@@ -65,6 +70,12 @@ export APP_LABEL="vllm-rtx-pro-6000-gemma-3-27b-it-sd-eagle"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/configure_vllm_spec_decoding.sh"
 
+# Validate k6-benchmark kustomize
+export ACCELERATOR_TYPE="l4"
+export HF_MODEL_NAME="HF_MODEL_NAME"
+export K6_REQUEST_BATCH_SIZE=1
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/k6-benchmark/configure_deployment.sh"
+
 # Validate offline-batch-inference-gpu kustomize
 export ACCELERATOR_TYPE="rtx-pro-6000"
 export HF_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct"