From 46383c3cfc0a654c795405061e9066782e511329 Mon Sep 17 00:00:00 2001
From: syeda-anjum <syedaanjum@google.com>
Date: Sat, 18 Apr 2026 20:41:54 +0000
Subject: [PATCH 1/4] llm-d on TPUs, new branch

---
 .../llmd/vllm/base/deployment.yaml            | 198 ++++++++++++++++++
 .../llmd/vllm/base/kustomization.yaml         |  26 +++
 .../llmd/vllm/base/templates/vllm.tpl.env     |   2 +
 .../llmd/vllm/configure_vllm.sh               |  28 +++
 .../vllm/v5e-gemma-3-1b-it/kustomization.yaml | 131 ++++++++++++
 .../v5e-gemma-3-1b-it/patch-nodeselector.yaml |  24 +++
 .../v5e-gemma-3-1b-it/patch-resources.yaml    |  29 +++
 .../llmd/vllm/v5e-gemma-3-1b-it/runtime.env   |   6 +
 .../v5e-gemma-3-27b-it/kustomization.yaml     | 131 ++++++++++++
 .../patch-nodeselector.yaml                   |  24 +++
 .../v5e-gemma-3-27b-it/patch-resources.yaml   |  29 +++
 .../llmd/vllm/v5e-gemma-3-27b-it/runtime.env  |   6 +
 .../vllm/v5e-gemma-3-4b-it/kustomization.yaml | 131 ++++++++++++
 .../v5e-gemma-3-4b-it/patch-nodeselector.yaml |  24 +++
 .../v5e-gemma-3-4b-it/patch-resources.yaml    |  29 +++
 .../llmd/vllm/v5e-gemma-3-4b-it/runtime.env   |   6 +
 .../vllm/v5e-qwen3-32b/kustomization.yaml     | 131 ++++++++++++
 .../v5e-qwen3-32b/patch-nodeselector.yaml     |  24 +++
 .../vllm/v5e-qwen3-32b/patch-resources.yaml   |  29 +++
 .../llmd/vllm/v5e-qwen3-32b/runtime.env       |   6 +
 .../v6e-gemma-3-27b-it/kustomization.yaml     | 131 ++++++++++++
 .../patch-nodeselector.yaml                   |  24 +++
 .../v6e-gemma-3-27b-it/patch-resources.yaml   |  29 +++
 .../llmd/vllm/v6e-gemma-3-27b-it/runtime.env  |   6 +
 .../vllm/v6e-qwen3-32b/kustomization.yaml     | 131 ++++++++++++
 .../v6e-qwen3-32b/patch-nodeselector.yaml     |  24 +++
 .../vllm/v6e-qwen3-32b/patch-resources.yaml   |  29 +++
 .../llmd/vllm/v6e-qwen3-32b/runtime.env       |   6 +
 28 files changed, 1394 insertions(+)
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env
 create mode 100755 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/runtime.env
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/runtime.env

diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml
new file mode 100644
index 000000000..bd913a736
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml
@@ -0,0 +1,198 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ms-inference-scheduling-llmd-modelservice  # overlays append their nameSuffix to this name
+  namespace: replaced-by-kustomize  # overlays set this from ConfigMap "deployment" (INFERENCE_KUBERNETES_NAMESPACE)
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      llmd.ai/inferenceServing: "true"
+      llmd.ai/model: random_model  # NOTE(review): static value; no visible overlay replacement rewrites it - confirm intended
+      llmd.ai/role: decode
+      app: replaced-by-kustomize  # overlays set this from ConfigMap "runtime" (APP_LABEL)
+  template:
+    metadata:
+      annotations:
+        gke-gcsfuse/cpu-limit: "0"
+        gke-gcsfuse/ephemeral-storage-limit: "0"
+        gke-gcsfuse/memory-limit: "0"
+        gke-gcsfuse/volumes: "true"
+      labels:
+        llmd.ai/inferenceServing: "true"
+        llmd.ai/model: random_model  # kept identical to spec.selector.matchLabels above
+        llmd.ai/role: decode
+        ai.gke.io/model: replaced-by-kustomize  # overlays set this from ConfigMap "runtime" (MODEL_NAME)
+        app: replaced-by-kustomize  # overlays set this from ConfigMap "runtime" (APP_LABEL)
+    spec:
+      initContainers:
+        - name: routing-proxy
+          args:
+            - --port=8000  # same port as the containerPort declared below
+            - --vllm-port=8200  # same port the inference-server container passes to "vllm serve --port"
+            - --connector=nixlv2
+            - --zap-encoder=json
+            - --zap-log-level=debug
+            - --secure-proxy=false
+          image: replaced-by-kustomize  # placeholder: an overlay replacement (ROUTING_PROXY_IMAGE in ConfigMap "vllm") must set it, otherwise this literal is deployed
+          imagePullPolicy: Always
+          ports:
+            - containerPort: 8000
+          resources: {}
+          restartPolicy: Always  # init container with restartPolicy Always, i.e. it keeps running next to the main containers
+          securityContext:
+            allowPrivilegeEscalation: false
+            runAsNonRoot: true
+      serviceAccountName: replaced-by-kustomize  # overlays set this from ConfigMap "deployment" (INFERENCE_KUBERNETES_SERVICE_ACCOUNT)
+      volumes:
+        - emptyDir: {}
+          name: metrics-volume  # mounted at /.config in inference-server
+        - emptyDir: {}
+          name: torch-compile-cache  # mounted at /.cache in inference-server
+        - emptyDir:
+            medium: Memory
+            sizeLimit: 20Gi
+          name: dev-shm  # memory-backed /dev/shm for inference-server
+        - csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:  # NOTE(review): mountOptions below ends in "only-dir:<placeholder>,,uid=2000,gid=2000" (double comma); overlays rewrite the text after "only-dir:" - confirm uid/gid survive
+              bucketName: cloud-storage-bucket-name  # placeholder; overlays set it from ConfigMap "deployment" (MODEL_BUCKET_NAME)
+              mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,only-dir:replaced-by-kustomize,,uid=2000,gid=2000
+              skipCSIBucketAccessCheck: "true"
+          name: huggingface-hub-model-bucket
+        - emptyDir:
+            medium: Memory
+          name: gke-gcsfuse-cache
+        - emptyDir:
+            medium: Memory
+          name: gke-gcsfuse-tmp
+        - emptyDir:
+            medium: Memory
+          name: gke-gcsfuse-buffer
+      containers:
+        - args:
+            - |
+              echo "########### $(date) - Starting parallel-fetch-safetensors for model: ${MODEL_ID}"
+              ls -alR /gcs
+              find /gcs/${MODEL_ID}/*safetensors -type f | xargs -I {} -P 15 sh -c 'echo "########### $(date) - Fetching: {}"; dd if={} of=/dev/null'
+              echo "########### $(date) - Finished parallel-fetch-safetensors"
+              sleep infinity
+          command: ["/bin/sh", "-c"]  # the script above reads every *safetensors file once (dd to /dev/null, 15 in parallel), then sleeps forever
+          env:
+            - name: MODEL_ID
+              valueFrom:
+                configMapKeyRef:
+                  key: MODEL_ID
+                  name: runtime
+          image: busybox  # NOTE(review): untagged image; consider pinning a tag or digest
+          name: fetch-safetensors
+          volumeMounts:
+            - mountPath: /gcs  # overlays rewrite this path using MODEL_ID (delimiter "/", index 2)
+              name: huggingface-hub-model-bucket
+              readOnly: true
+        - name: inference-server
+          image: replaced-by-kustomize  # overlays set this from ConfigMap "vllm" (CONTAINER_IMAGE_URL); NOTE(review): vllm.tpl.env points it at an llm-d-cuda image although this tree is online-inference-tpu - confirm
+          command: ["vllm", "serve"]
+          args:
+            - /gcs/$(MODEL_ID)  # model directory under the gcsfuse mount; $(VAR) references the env entries below
+            - "--port"
+            - "8200"  # port targeted by the probes below and by routing-proxy --vllm-port
+            - "--served-model-name"
+            - "$(MODEL_ID)"
+            - "--kv-transfer-config"
+            - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
+            - "--disable-uvicorn-access-log"
+            - "--tensor-parallel-size"
+            - "$(TENSOR_PARALLEL_SIZE)"
+            - "--gpu-memory-utilization"
+            - "$(GPU_MEMORY_UTILIZATION)"
+            - "--max-model-len"
+            - "$(MAX_MODEL_LEN)"
+          env:
+            - name: UCX_TLS
+              value: cuda_ipc,cuda_copy,tcp  # NOTE(review): CUDA transports listed in a TPU manifest - confirm these apply on TPU nodes
+            - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.podIP  # the pod's own IP
+            - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+              value: "5557"  # same number as the first containerPort below
+            - name: VLLM_LOGGING_LEVEL
+              value: DEBUG
+            - name: DP_SIZE
+              value: "1"
+            - name: DP_SIZE_LOCAL
+              value: "1"
+            - name: GPU_MEMORY_UTILIZATION  # this and the three entries below come from the per-model ConfigMap "runtime"
+              valueFrom:
+                configMapKeyRef:
+                  key: GPU_MEMORY_UTILIZATION
+                  name: runtime
+            - name: MAX_MODEL_LEN
+              valueFrom:
+                configMapKeyRef:
+                  key: MAX_MODEL_LEN
+                  name: runtime
+            - name: MODEL_ID
+              valueFrom:
+                configMapKeyRef:
+                  key: MODEL_ID
+                  name: runtime
+            - name: TENSOR_PARALLEL_SIZE
+              valueFrom:
+                configMapKeyRef:
+                  key: TENSOR_PARALLEL_SIZE
+                  name: runtime
+          ports:
+            - containerPort: 5557
+              protocol: TCP
+            - containerPort: 8200  # the vllm serve port, exposed under the port name "metrics"
+              name: metrics
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 3
+            httpGet:
+              path: /health
+              port: 8200
+            periodSeconds: 10
+            timeoutSeconds: 5
+          readinessProbe:
+            failureThreshold: 3
+            httpGet:
+              path: /v1/models
+              port: 8200
+            periodSeconds: 5
+            timeoutSeconds: 2
+          startupProbe:
+            failureThreshold: 60  # with periodSeconds 30: up to 60 x 30s of failed probes after the 15s initial delay
+            httpGet:
+              path: /v1/models
+              port: 8200
+            initialDelaySeconds: 15
+            periodSeconds: 30
+            timeoutSeconds: 5
+          resources: {}  # empty here; overlays add google.com/tpu via patch-resources.yaml
+          volumeMounts:
+            - mountPath: /.config
+              name: metrics-volume
+            - mountPath: /.cache
+              name: torch-compile-cache
+            - mountPath: /dev/shm
+              name: dev-shm
+            - mountPath: /gcs  # overlays rewrite this path using MODEL_ID (delimiter "/", index 2)
+              name: huggingface-hub-model-bucket
+              readOnly: true
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml
new file mode 100644
index 000000000..0314fa83a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+kind: Kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+
+resources:  # shared base directory plus the llm-d vLLM Deployment defined next to this file
+  - ../../../base
+  - deployment.yaml
+
+configMapGenerator:  # emits ConfigMap "vllm", the source of the image replacements in the overlays
+  - name: vllm
+    namespace: replaced-by-kustomize
+    envs:
+      - vllm.env  # written by ../configure_vllm.sh from templates/vllm.tpl.env
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env
new file mode 100644
index 000000000..ad4d6211e
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env
@@ -0,0 +1,2 @@
+CONTAINER_IMAGE_URL=ghcr.io/llm-d/llm-d-cuda:v0.5.0
+ROUTING_PROXY_IMAGE=ghcr.io/llm-d/llm-d-routing-sidecar:v0.4.0-rc.1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh
new file mode 100755
index 000000000..818f410e5
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+MY_PATH="$(
+  cd "$(dirname "$0")" >/dev/null 2>&1
+  pwd -P
+)"
+
+source "${MY_PATH}/../../../../examples/llmd/_shared_config/scripts/set_environment_variables.sh"
+"${MY_PATH}/../../configure_deployment.sh"
+
+envsubst <"${MY_PATH}/base/templates/vllm.tpl.env" | sponge "${MY_PATH}/base/vllm.env"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml
new file mode 100644
index 000000000..f90964787
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml
@@ -0,0 +1,152 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Model-specific settings; consumed by the replacements below and by the
+# Deployment's configMapKeyRef env entries (ConfigMap "runtime").
+configMapGenerator:
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: "-v5e-gemma-3-1b-it"
+
+patches:
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+# Copy values from the generated ConfigMaps into the placeholder fields of the
+# base resources.
+replacements:
+  - source:
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: vllm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      # Index 1 is the text after "only-dir:" in the gcsfuse mountOptions.
+      # NOTE(review): that also overwrites the trailing ",,uid=2000,gid=2000"
+      # of the base value - confirm this is intended.
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1
+        select:
+          kind: Deployment
+      # NOTE(review): the base mountPath "/gcs" has no "/"-separated element at
+      # index 2; presumably kustomize appends the model id, giving
+      # "/gcs/<MODEL_ID>" - verify with `kustomize build`.
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+  # Without this replacement the routing-proxy init container keeps the literal
+  # "replaced-by-kustomize" image from the base Deployment.
+  - source:
+      fieldPath: data.ROUTING_PROXY_IMAGE
+      kind: ConfigMap
+      name: vllm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.initContainers.[name=routing-proxy].image
+        select:
+          kind: Deployment
+
+resources:
+  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml
new file mode 100644
index 000000000..832e2fceb
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ms-inference-scheduling-llmd-modelservice  # must equal the base Deployment name (without nameSuffix) or the patch has no target
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: tpu-v5e-1x1  # pin the pods to the tpu-v5e-1x1 compute class
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml
new file mode 100644
index 000000000..b3371ea99
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ms-inference-scheduling-llmd-modelservice  # must equal the base Deployment name (without nameSuffix) or the patch has no target
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server  # merged by name into the base container that runs "vllm serve"
+          resources:
+            limits:
+              google.com/tpu: "1"  # same number as TENSOR_PARALLEL_SIZE in this overlay's runtime.env
+            requests:
+              google.com/tpu: "1"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env
new file mode 100644
index 000000000..8cc55f05c
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=vllm-v5e-gemma-3-1b-it
+GPU_MEMORY_UTILIZATION=0.9
+MAX_MODEL_LEN=1024
+MODEL_ID=google/gemma-3-1b-it
+MODEL_NAME=gemma-3-1b-it
+TENSOR_PARALLEL_SIZE=1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml
new file mode 100644
index 000000000..fe78bce08
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml
@@ -0,0 +1,152 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Model-specific settings; consumed by the replacements below and by the
+# Deployment's configMapKeyRef env entries (ConfigMap "runtime").
+configMapGenerator:
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -v5e-gemma-3-27b-it
+
+patches:
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+# Copy values from the generated ConfigMaps into the placeholder fields of the
+# base resources.
+replacements:
+  - source:
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: vllm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      # Index 1 is the text after "only-dir:" in the gcsfuse mountOptions.
+      # NOTE(review): that also overwrites the trailing ",,uid=2000,gid=2000"
+      # of the base value - confirm this is intended.
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1
+        select:
+          kind: Deployment
+      # NOTE(review): the base mountPath "/gcs" has no "/"-separated element at
+      # index 2; presumably kustomize appends the model id, giving
+      # "/gcs/<MODEL_ID>" - verify with `kustomize build`.
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+  # Without this replacement the routing-proxy init container keeps the literal
+  # "replaced-by-kustomize" image from the base Deployment.
+  - source:
+      fieldPath: data.ROUTING_PROXY_IMAGE
+      kind: ConfigMap
+      name: vllm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.initContainers.[name=routing-proxy].image
+        select:
+          kind: Deployment
+
+resources:
+  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml
new file mode 100644
index 000000000..789ec78c6
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ms-inference-scheduling-llmd-modelservice  # must equal the base Deployment name (without nameSuffix) or the patch has no target
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: tpu-v5e-2x4  # pin the pods to the tpu-v5e-2x4 compute class
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml
new file mode 100644
index 000000000..a2f2513e0
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ms-inference-scheduling-llmd-modelservice  # must equal the base Deployment name (without nameSuffix) or the patch has no target
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server  # merged by name into the base container that runs "vllm serve"
+          resources:
+            limits:
+              google.com/tpu: "8"  # same number as TENSOR_PARALLEL_SIZE in this overlay's runtime.env
+            requests:
+              google.com/tpu: "8"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env
new file mode 100644
index 000000000..cc066c34f
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=vllm-v5e-gemma-3-27b-it
+GPU_MEMORY_UTILIZATION=0.9
+MAX_MODEL_LEN=1024
+MODEL_ID=google/gemma-3-27b-it
+MODEL_NAME=gemma-3-27b-it
+TENSOR_PARALLEL_SIZE=8
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml
new file mode 100644
index 000000000..ffb5e7862
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml
@@ -0,0 +1,131 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # overlay on ../base: vLLM serving google/gemma-3-4b-it on TPU v5e
+
+configMapGenerator:
+  - envs:
+      - runtime.env  # per-overlay settings (APP_LABEL, MODEL_ID, MODEL_NAME, ...)
+    name: runtime  # source of the APP_LABEL, MODEL_ID and MODEL_NAME replacements below
+    namespace: replaced-by-kustomize  # placeholder; set by the namespace replacement below
+
+nameSuffix: -v5e-gemma-3-4b-it  # same suffix as the overlay directory and APP_LABEL
+
+patches:  # overlay-specific partial Deployment manifests
+  - path: patch-nodeselector.yaml  # sets the compute-class nodeSelector
+  - path: patch-resources.yaml  # sets google.com/tpu requests/limits on inference-server
+
+replacements:  # each entry copies one ConfigMap data value into the listed target fields
+  - source:  # APP_LABEL -> app label on the pod template and in both selectors
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:  # CONTAINER_IMAGE_URL -> image of the inference-server container
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: vllm  # not generated in this overlay; presumably comes from ../base - confirm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:  # INFERENCE_KUBERNETES_NAMESPACE -> namespace of all four resource kinds below
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # not generated in this overlay; presumably comes from ../base - confirm
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:  # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> ServiceAccount name and pod reference
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:  # MODEL_BUCKET_NAME -> bucketName of the huggingface-hub-model-bucket CSI volume
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0  # replaces segment 0 of the value split on "."
+        select:
+          kind: Deployment
+  - source:  # MODEL_ID -> model directory in the volume mountOptions and container mountPaths
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1  # replaces the segment that follows "only-dir:"
+        select:
+          kind: Deployment
+      - fieldPaths:  # NOTE(review): fetch-safetensors must be under containers in ../base - confirm
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2  # replaces segment 2 (0-based) of the mountPath split on "/"
+        select:
+          kind: Deployment
+  - source:  # MODEL_NAME -> ai.gke.io/model label on the pod template
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:
+  - ../base  # shared manifests that this overlay customizes
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml
new file mode 100644
index 000000000..d2d74eca4
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # partial manifest: sets only the pod nodeSelector of Deployment vllm
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the real namespace
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: tpu-v5e-2x2  # presumably 2x2 = 4 chips, as requested in patch-resources.yaml - confirm
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml
new file mode 100644
index 000000000..117ab1b36
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # partial manifest: overrides only the TPU resources of Deployment vllm
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the real namespace
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server  # container whose resources are overridden
+          resources:
+            limits:
+              google.com/tpu: "4"  # equals TENSOR_PARALLEL_SIZE=4 in this overlay's runtime.env
+            requests:
+              google.com/tpu: "4"  # kept identical to the limit
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env
new file mode 100644
index 000000000..d00314f82
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=vllm-v5e-gemma-3-4b-it
+GPU_MEMORY_UTILIZATION=0.9
+MAX_MODEL_LEN=1024
+MODEL_ID=google/gemma-3-4b-it
+MODEL_NAME=gemma-3-4b-it
+TENSOR_PARALLEL_SIZE=4
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml
new file mode 100644
index 000000000..64201eff4
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml
@@ -0,0 +1,131 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # overlay on ../base: vLLM serving qwen/qwen3-32b on TPU v5e
+
+configMapGenerator:
+  - envs:
+      - runtime.env  # per-overlay settings (APP_LABEL, MODEL_ID, MODEL_NAME, ...)
+    name: runtime  # source of the APP_LABEL, MODEL_ID and MODEL_NAME replacements below
+    namespace: replaced-by-kustomize  # placeholder; set by the namespace replacement below
+
+nameSuffix: -v5e-qwen3-32b  # same suffix as the overlay directory and APP_LABEL
+
+patches:  # overlay-specific partial Deployment manifests
+  - path: patch-nodeselector.yaml  # sets the compute-class nodeSelector
+  - path: patch-resources.yaml  # sets google.com/tpu requests/limits on inference-server
+
+replacements:  # each entry copies one ConfigMap data value into the listed target fields
+  - source:  # APP_LABEL -> app label on the pod template and in both selectors
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:  # CONTAINER_IMAGE_URL -> image of the inference-server container
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: vllm  # not generated in this overlay; presumably comes from ../base - confirm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:  # INFERENCE_KUBERNETES_NAMESPACE -> namespace of all four resource kinds below
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # not generated in this overlay; presumably comes from ../base - confirm
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:  # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> ServiceAccount name and pod reference
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:  # MODEL_BUCKET_NAME -> bucketName of the huggingface-hub-model-bucket CSI volume
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0  # replaces segment 0 of the value split on "."
+        select:
+          kind: Deployment
+  - source:  # MODEL_ID -> model directory in the volume mountOptions and container mountPaths
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1  # replaces the segment that follows "only-dir:"
+        select:
+          kind: Deployment
+      - fieldPaths:  # NOTE(review): fetch-safetensors must be under containers in ../base - confirm
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2  # replaces segment 2 (0-based) of the mountPath split on "/"
+        select:
+          kind: Deployment
+  - source:  # MODEL_NAME -> ai.gke.io/model label on the pod template
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:
+  - ../base  # shared manifests that this overlay customizes
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml
new file mode 100644
index 000000000..789ec78c6
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # partial manifest: sets only the pod nodeSelector of Deployment vllm
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the real namespace
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: tpu-v5e-2x4  # presumably 2x4 = 8 chips, as requested in patch-resources.yaml - confirm
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml
new file mode 100644
index 000000000..a2f2513e0
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # partial manifest: overrides only the TPU resources of Deployment vllm
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the real namespace
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server  # container whose resources are overridden
+          resources:
+            limits:
+              google.com/tpu: "8"  # equals TENSOR_PARALLEL_SIZE=8 in this overlay's runtime.env
+            requests:
+              google.com/tpu: "8"  # kept identical to the limit
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env
new file mode 100644
index 000000000..90a9f9620
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=vllm-v5e-qwen3-32b
+GPU_MEMORY_UTILIZATION=0.95
+MAX_MODEL_LEN=32768
+MODEL_ID=qwen/qwen3-32b
+MODEL_NAME=qwen3-32b
+TENSOR_PARALLEL_SIZE=8
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml
new file mode 100644
index 000000000..38517e631
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml
@@ -0,0 +1,131 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # overlay on ../base: vLLM serving google/gemma-3-27b-it on TPU v6e
+
+configMapGenerator:
+  - envs:
+      - runtime.env  # per-overlay settings (APP_LABEL, MODEL_ID, MODEL_NAME, ...)
+    name: runtime  # source of the APP_LABEL, MODEL_ID and MODEL_NAME replacements below
+    namespace: replaced-by-kustomize  # placeholder; set by the namespace replacement below
+
+nameSuffix: -v6e-gemma-3-27b-it  # same suffix as the overlay directory and APP_LABEL
+
+patches:  # overlay-specific partial Deployment manifests
+  - path: patch-nodeselector.yaml  # sets the compute-class nodeSelector
+  - path: patch-resources.yaml  # sets google.com/tpu requests/limits on inference-server
+
+replacements:  # each entry copies one ConfigMap data value into the listed target fields
+  - source:  # APP_LABEL -> app label on the pod template and in both selectors
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:  # CONTAINER_IMAGE_URL -> image of the inference-server container
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: vllm  # not generated in this overlay; presumably comes from ../base - confirm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:  # INFERENCE_KUBERNETES_NAMESPACE -> namespace of all four resource kinds below
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # not generated in this overlay; presumably comes from ../base - confirm
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:  # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> ServiceAccount name and pod reference
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:  # MODEL_BUCKET_NAME -> bucketName of the huggingface-hub-model-bucket CSI volume
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0  # replaces segment 0 of the value split on "."
+        select:
+          kind: Deployment
+  - source:  # MODEL_ID -> model directory in the volume mountOptions and container mountPaths
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1  # replaces the segment that follows "only-dir:"
+        select:
+          kind: Deployment
+      - fieldPaths:  # NOTE(review): fetch-safetensors must be under containers in ../base - confirm
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2  # replaces segment 2 (0-based) of the mountPath split on "/"
+        select:
+          kind: Deployment
+  - source:  # MODEL_NAME -> ai.gke.io/model label on the pod template
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:
+  - ../base  # shared manifests that this overlay customizes
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-nodeselector.yaml
new file mode 100644
index 000000000..f984468b3
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # partial manifest: sets only the pod nodeSelector of Deployment vllm
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the real namespace
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: tpu-v6e-2x4  # presumably 2x4 = 8 chips, as requested in patch-resources.yaml - confirm
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-resources.yaml
new file mode 100644
index 000000000..a2f2513e0
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # partial manifest: overrides only the TPU resources of Deployment vllm
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the real namespace
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server  # container whose resources are overridden
+          resources:
+            limits:
+              google.com/tpu: "8"  # equals TENSOR_PARALLEL_SIZE=8 in this overlay's runtime.env
+            requests:
+              google.com/tpu: "8"  # kept identical to the limit
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/runtime.env
new file mode 100644
index 000000000..43a0308b6
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=vllm-v6e-gemma-3-27b-it
+GPU_MEMORY_UTILIZATION=0.95
+MAX_MODEL_LEN=16384
+MODEL_ID=google/gemma-3-27b-it
+MODEL_NAME=gemma-3-27b-it
+TENSOR_PARALLEL_SIZE=8
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/kustomization.yaml
new file mode 100644
index 000000000..a6fd4ac44
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/kustomization.yaml
@@ -0,0 +1,131 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # overlay on ../base: vLLM serving qwen/qwen3-32b on TPU v6e
+
+configMapGenerator:
+  - envs:
+      - runtime.env  # per-overlay settings (APP_LABEL, MODEL_ID, MODEL_NAME, ...)
+    name: runtime  # source of the APP_LABEL, MODEL_ID and MODEL_NAME replacements below
+    namespace: replaced-by-kustomize  # placeholder; set by the namespace replacement below
+
+nameSuffix: -v6e-qwen3-32b  # same suffix as the overlay directory and APP_LABEL
+
+patches:  # overlay-specific partial Deployment manifests
+  - path: patch-nodeselector.yaml  # sets the compute-class nodeSelector
+  - path: patch-resources.yaml  # sets google.com/tpu requests/limits on inference-server
+
+replacements:  # each entry copies one ConfigMap data value into the listed target fields
+  - source:  # APP_LABEL -> app label on the pod template and in both selectors
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service
+  - source:  # CONTAINER_IMAGE_URL -> image of the inference-server container
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: vllm  # not generated in this overlay; presumably comes from ../base - confirm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image
+        select:
+          kind: Deployment
+  - source:  # INFERENCE_KUBERNETES_NAMESPACE -> namespace of all four resource kinds below
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # not generated in this overlay; presumably comes from ../base - confirm
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:  # INFERENCE_KUBERNETES_SERVICE_ACCOUNT -> ServiceAccount name and pod reference
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:  # MODEL_BUCKET_NAME -> bucketName of the huggingface-hub-model-bucket CSI volume
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0  # replaces segment 0 of the value split on "."
+        select:
+          kind: Deployment
+  - source:  # MODEL_ID -> model directory in the volume mountOptions and container mountPaths
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"
+          index: 1  # replaces the segment that follows "only-dir:"
+        select:
+          kind: Deployment
+      - fieldPaths:  # NOTE(review): fetch-safetensors must be under containers in ../base - confirm
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /
+          index: 2  # replaces segment 2 (0-based) of the mountPath split on "/"
+        select:
+          kind: Deployment
+  - source:  # MODEL_NAME -> ai.gke.io/model label on the pod template
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+
+resources:
+  - ../base  # shared manifests that this overlay customizes
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-nodeselector.yaml
new file mode 100644
index 000000000..e0f4839a7
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # partial manifest: sets only the pod nodeSelector of Deployment vllm
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the real namespace
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: tpu-v6e-2x2  # presumably 2x2 = 4 chips, as requested in patch-resources.yaml - confirm
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-resources.yaml
new file mode 100644
index 000000000..117ab1b36
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # partial manifest: overrides only the TPU resources of Deployment vllm
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the real namespace
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server  # container whose resources are overridden
+          resources:
+            limits:
+              google.com/tpu: "4"  # equals TENSOR_PARALLEL_SIZE=4 in this overlay's runtime.env
+            requests:
+              google.com/tpu: "4"  # kept identical to the limit
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/runtime.env
new file mode 100644
index 000000000..625f9245d
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=vllm-v6e-qwen3-32b
+GPU_MEMORY_UTILIZATION=0.95
+MAX_MODEL_LEN=32768
+MODEL_ID=qwen/qwen3-32b
+MODEL_NAME=qwen3-32b
+TENSOR_PARALLEL_SIZE=4

From a1cef20ba16d3f997cedf7d2257c20e5865cc66f Mon Sep 17 00:00:00 2001
From: Syeda Anjum <syedaanjum@google.com>
Date: Fri, 1 May 2026 12:22:55 -0500
Subject: [PATCH 2/4] fix: rename template file to match script expectation in
 async-pubsub-subscriber

---
 ...-pubsub-subscriber.tpl.env => async-pubsub-subscriber.tpl.env} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/{batch-pubsub-subscriber.tpl.env => async-pubsub-subscriber.tpl.env} (100%)

diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/batch-pubsub-subscriber.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/async-pubsub-subscriber.tpl.env
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/batch-pubsub-subscriber.tpl.env
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/async-pubsub-subscriber.tpl.env

From 44e185e81ce20075de34cdaac6f9bba6840f2e0a Mon Sep 17 00:00:00 2001
From: Syeda Anjum <syedaanjum@google.com>
Date: Fri, 1 May 2026 12:41:54 -0500
Subject: [PATCH 3/4] feat: add Gemma-4 templates and README for llm-d on TPUs

---
 .../llmd/llmd-vllm-with-hf-model-tpu.md       | 558 ++++++++++++++++++
 .../v6e-gemma-4-26b-a4b/kustomization.yaml    | 140 +++++
 .../patch-nodeselector.yaml                   |  24 +
 .../v6e-gemma-4-26b-a4b/patch-resources.yaml  |  29 +
 .../v6e-gemma-4-26b-a4b/patch-vllm-args.yaml  |  17 +
 .../llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env |   6 +
 .../vllm/v6e-gemma-4-31b/kustomization.yaml   | 141 +++++
 .../v6e-gemma-4-31b/patch-nodeselector.yaml   |  24 +
 .../vllm/v6e-gemma-4-31b/patch-resources.yaml |  29 +
 .../vllm/v6e-gemma-4-31b/patch-vllm-args.yaml |  17 +
 .../llmd/vllm/v6e-gemma-4-31b/runtime.env     |   6 +
 11 files changed, 991 insertions(+)
 create mode 100644 docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-vllm-args.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-vllm-args.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/runtime.env

diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md
new file mode 100644
index 000000000..fa4565c4a
--- /dev/null
+++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md
@@ -0,0 +1,558 @@
+# Intelligent inference scheduling with llm-d on TPUs
+
+## Prerequisite
+
+This architecture and workflow assumes that the reader is familiar with the
+following GKE, Google Cloud Networking and llm-d components:
+
+- [Gateway API resources](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/gateway-api#gateway_resources)
+- [GKE Gateway Controller](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/gateway-api#gateway_controller)
+- [Google Cloud Load Balancer through GKE](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/service-load-balancer#load_balancer_types)
+- [Gateway API Inference Extension(GAIE)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal)
+- [vLLM-Optimized Inference Scheduler](https://llm-d.ai/docs/architecture)
+
+## Architecture
+
+![image](../images/llm-d.png)
+
+## Workflow
+
+- User securely hits the Cloud Endpoint DNS from a web browser.
+- The DNS resolves to an External IP mapped to a
+  `Global External Load Balancer`.
+- The `Global External Load Balancer` has a `HTTPRoute` that points to the
+  Gradio chat GKE service as the backend. It also has a backend policy
+  specifying that the request to the backend will have
+  `IAP(Identity-aware proxy)` authentication enabled.
+- The `Global External Load Balancer` routes the request via `IAP` to the Gradio
+  chat GKE service backend.
+- Gradio chat GKE service forwards the request to the Gradio GKE Deployment and
+  the user will see the chat interface loading on the browser.
+- When the user sends a request via chat interface, the request reaches the
+  Gradio GKE deployment as explained in previous steps.
+- The Gradio GKE deployment takes the chat message and routes the request to the
+  `Internal Regional Load Balancer` fronting the llm-d deployment.
+- The `Internal Regional Load Balancer` has a `HTTPRoute` attached to it that
+  points to an `InferencePool` as the backend. This `InferencePool` contains the
+  pods running the model server, specifically running the inference of the model
+  of your choice via `vllm`.
+- The `InferencePool` has a reference to the GAIE endpoint picker (`EPP`), which
+  means that the `GKE Gateway Controller`, instead of routing the request to a
+  backend in round-robin fashion, consults the `EPP` to determine the backend
+  to which the traffic should be routed.
+- The `EPP` has
+  [scheduling profiles](https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md)
+  that define how to score the pods in the `InferencePool`. The scoring is done
+  on the metrics coming out of the pods.
+- Once the `EPP` identifies the pod which should be used based on the scores, it
+  returns its IP address to the `GKE Gateway Controller` corresponding to the
+  `Internal Regional Load Balancer` which then routes the request to the pod.
+
+## Pull the source code
+
+- Open [Cloud Shell](https://cloud.google.com/shell).
+
+- Clone the repository and change directory to the guide directory
+
+  ```
+  git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \
+  cd accelerated-platforms && \
+  export ACP_REPO_DIR="$(pwd)"
+  ```
+
+  To set the `ACP_REPO_DIR` value for new shell instances, write the value to
+  your shell initialization file.
+
+  `bash`
+
+  ```
+  sed -n -i -e '/^export ACP_REPO_DIR=/!p' -i -e '$aexport ACP_REPO_DIR="'"${ACP_REPO_DIR}"'"' ${HOME}/.bashrc
+  ```
+
+  `zsh`
+
+  ```
+  sed -n -i -e '/^export ACP_REPO_DIR=/!p' -i -e '$aexport ACP_REPO_DIR="'"${ACP_REPO_DIR}"'"' ${HOME}/.zshrc
+  ```
+
+## Configure
+
+Terraform loads variables in the following order, with later sources taking
+precedence over earlier ones:
+
+- Environment variables (`TF_VAR_<variable_name>`)
+- Any `*.auto.tfvars` or `*.auto.tfvars.json` files, processed in lexical order
+  of their filenames.
+- Any `-var` and `-var-file` options on the command line, in the order provided.
+
+- Set the platform defaults project ID
+
+  ```
+  export TF_VAR_platform_default_project_id="<PROJECT_ID>"
+  ```
+
+  **-- OR --**
+
+  ```
+  platform_default_project_id="<PROJECT_ID>"
+  sed -i '/^platform_default_project_id[[:blank:]]*=/{h;s/=.*/= "'"${platform_default_project_id}"'"/};${x;/^$/{s//platform_default_project_id = "'"${platform_default_project_id}"'"/;H};x}' ${ACP_REPO_DIR}/platforms/gke/base/_shared_config/platform.auto.tfvars
+  ```
+
+- Optional: By default, the platform name is set to `dev`. If you want to
+  change it, set the platform name.
+
+  ```
+  platform_name="<PLATFORM_NAME>"
+  sed -i '/^platform_name[[:blank:]]*=/{h;s/=.*/= "'"${platform_name}"'"/};${x;/^$/{s//platform_name="'"${platform_name}"'"/;H};x}' ${ACP_REPO_DIR}/platforms/gke/base/_shared_config/platform.auto.tfvars
+  ```
+
+- Optional: Run the following step if you want to run inference with a model
+  other than `google/gemma-4-31b`, which is the default model for this deployment.
+
+  ```
+  llmd_model_id="<MODEL_ID>"
+  sed -i "/^llmd_model_id[[:blank:]]*=/{h;s|=.*|= \"${llmd_model_id}\"|};\${x;/^$/{s|.*|llmd_model_id=\"${llmd_model_id}\"|;H};x}" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/llmd.auto.tfvars"
+  ```
+
+  Valid values for `MODEL_ID` are:
+
+  - `google/gemma-4-31b` **(default)**
+  - `google/gemma-4-26b-a4b`
+  - `google/gemma-3-27b-it`
+  - `google/gemma-3-4b-it`
+  - `google/gemma-3-1b-it`
+  - `qwen/qwen3-32b`
+
+- To choose an accelerator for the model you want to run, refer to the
+  following table.
+
+  | Model                          | v5e | v6e |
+  | ------------------------------ | --- | --- |
+  | gemma-4-31b                    | ❌  | ✅  |
+  | gemma-4-26b-a4b                | ❌  | ✅  |
+  | gemma-3-27b-it                 | ✅  | ✅  |
+  | gemma-3-4b-it                  | ✅  | ❌  |
+  | gemma-3-1b-it                  | ✅  | ❌  |
+  | qwen3-32b                      | ✅  | ✅  |
+
+- Optional: Run the following step if you want to run the model on an
+  accelerator other than `v6e`, which is the default accelerator for
+  this deployment.
+
+  ```
+  llmd_accelerator_type="<ACCELERATOR>"
+  sed -i '/^llmd_accelerator_type[[:blank:]]*=/{h;s/=.*/= "'"${llmd_accelerator_type}"'"/};${x;/^$/{s//llmd_accelerator_type="'"${llmd_accelerator_type}"'"/;H};x}' ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/llmd.auto.tfvars
+  ```
+
+  Valid values for `ACCELERATOR` are:
+
+  - `v5e`
+  - `v6e` **(default)**
+
+## Configure Identity-Aware Proxy (IAP)
+
+Identity-Aware Proxy (IAP) lets you establish a central authorization layer for
+applications accessed by HTTPS, so you can use an application-level access
+control model instead of relying on network-level firewalls.
+
+IAP policies scale across your organization. You can define access policies
+centrally and apply them to all of your applications and resources. When you
+assign a dedicated team to create and enforce policies, you protect your project
+from incorrect policy definition or implementation in any application.
+
+For more information on IAP, see the
+[Identity-Aware Proxy documentation](https://cloud.google.com/iap/docs/concepts-overview#gke)
+
+### Configure OAuth consent screen for IAP
+
+For this guide we will configure a generic OAuth consent screen setup for
+internal use. Internal use means that only users within your organization can be
+granted IAM permissions to access the IAP secured applications and resources.
+
+See the
+[Configuring the OAuth consent screen documentation](https://developers.google.com/workspace/guides/configure-oauth-consent)
+for additional information.
+
+- Set environment variables.
+
+  ```shell
+  source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/scripts/set_environment_variables.sh"
+  ```
+
+- Ensure that IAP is enabled.
+
+  ```shell
+  gcloud services enable iap.googleapis.com \
+  --project="${llmd_iap_oath_branding_project_id}"
+  ```
+
+- Check if the branding is already configured.
+
+  ```shell
+  gcloud iap oauth-brands list \
+  --project="${llmd_iap_oath_branding_project_id}"
+  ```
+
+  > If an entry is displayed, the branding is already configured.
+
+- Configure the branding.
+
+  ```shell
+  gcloud iap oauth-brands create \
+  --application_title="IAP Secured Application" \
+  --project="${llmd_iap_oath_branding_project_id}" \
+  --support_email="<SUPPORT_EMAIL_ADDRESS>"
+  ```
+
+  Replace `<SUPPORT_EMAIL_ADDRESS>` with a group email address that you are a
+  manager on or your personal email address. The email address should be
+  supplied without the domain.
+
+### Default IAP access
+
+For simplicity, in this guide access to the IAP secured applications will be
+configured to allow all users in the organization. Access can be configured per
+IAP application or resource.
+
+- Set the IAP allow domain
+
+  ```
+  IAP_DOMAIN=$(gcloud auth list --filter=status:ACTIVE --format="value(account)" | awk -F@ '{print $2}')
+  echo "IAP_DOMAIN=${IAP_DOMAIN}"
+  ```
+
+  **If the domain of the active `gcloud` user is different from the organization
+  that the `llmd_iap_oath_branding_project_id` project is in, you will need to
+  manually set `IAP_DOMAIN` environment variable**
+
+  ```
+  IAP_DOMAIN="<project_id's organization domain>"
+  ```
+
+- Set the IAP domain in the configuration file
+
+  ```
+  sed -i '/^llmd_iap_domain[[:blank:]]*=/{h;s/=.*/= "'"${IAP_DOMAIN}"'"/};${x;/^$/{s//llmd_iap_domain="'"${IAP_DOMAIN}"'"/;H};x}' ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/llmd.auto.tfvars
+  ```
+
+### Install Terraform 1.8.0+
+
+> [!IMPORTANT]  
+> At the time this guide was written, Cloud Shell had Terraform v1.5.7 installed
+> by default. Terraform version 1.8.0 or later is required for this guide.
+
+- Run the `install_terraform.sh` script to install Terraform 1.8.0.
+
+  ```shell
+  "${ACP_REPO_DIR}/tools/bin/install_terraform.sh"
+  ```
+
+## Deploy the entire stack except the model server
+
+```
+${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/deploy-llmd.sh
+```
+
+## Resources created
+
+The `deploy-llmd.sh` script will perform the following steps:
+
+- Set up base GKE cluster platform.
+- Create resources required to deploy llm-d on the GKE cluster and access it.
+- Deploy a gradio chat frontend backed by Identity-Aware Proxy.
+- Creates a custom Cloud Monitoring dashboard named `llm-d dashboard`
+
+At this time, you have all resources shown in the architecture diagram created
+except the model server. In order to run the model server, first download the
+model from Hugging Face to a GCS bucket as instructed in the next steps. Note
+that we will not be downloading the model directly from Hugging Face as it slows
+down the model server startup. Instead, we will use GCSFuse to download the
+model from the GCS bucket, which is faster. For more details on how downloading
+the model from GCS saves time, take a look at the
+[Storage optimization guide](../../../../../../use-cases/inferencing/cost-optimization/gcsfuse/README.md).
+
+## Download the model to Cloud Storage
+
+- [Generate a Hugging Face token](https://huggingface.co/docs/hub/security-tokens)
+  with token type **Read**.
+- Add the token to the secret manager
+
+  ```
+  HF_TOKEN_READ=<YOUR_HUGGINGFACE_READ_TOKEN>
+  echo ${HF_TOKEN_READ} | gcloud secrets versions add ${huggingface_hub_access_token_read_secret_manager_secret_name} --data-file=- --project=${huggingface_secret_manager_project_id}
+  ```
+
+- Source the environment configuration.
+
+  ```shell
+  source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/scripts/set_environment_variables.sh"
+  ```
+
+- Configure the model download job.
+
+  ```shell
+  "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/configure_huggingface.sh"
+  ```
+
+- Deploy the model download job.
+
+  ```shell
+  kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface"
+  ```
+
+- Watch the model download job until it is complete.
+
+  ```shell
+  watch --color --interval 5 --no-title \
+  "kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-hf-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete'
+  echo '\nLogs(last 10 lines):'
+  kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-hf-model-to-gcs --all-containers --tail 10"
+  ```
+
+  When the job is complete, you will see the following:
+
+  ```text
+  NAME                       STATUS     COMPLETIONS   DURATION   AGE
+  XXXXXXXX-hf-model-to-gcs   Complete   1/1           ###        ###
+  ```
+
+  You can press `CTRL`+`c` to terminate the watch.
+
+- Delete the model download job.
+
+  ```shell
+  kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface"
+  ```
+
+## Deploy the model server
+
+- Configure the model server
+
+  ```shell
+  "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh"
+  ```
+
+- Deploy the model server
+
+  ```shell
+  kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
+  ```
+
+  The Kubernetes manifests are based on the
+  [Inference Quickstart recommendations](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference-quickstart).
+
+- Watch the deployment until it is ready.
+
+  ```shell
+  watch --color --interval 5 --no-title \
+  "kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} get deployment/ms-inference-scheduling-llmd-modelservice-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1     1            1'
+  echo '\nLogs(last 10 lines):'
+  kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} logs deployment/ms-inference-scheduling-llmd-modelservice-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} --all-containers --tail 10"
+  ```
+
+## Verify llm-d deployment is up and running
+
+- Set the environment variables
+
+  ```
+  source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/scripts/set_environment_variables.sh"
+  ```
+
+- Get cluster credentials
+
+  ```
+  ${cluster_credentials_command}
+  ```
+
+- Check all the deployments
+
+  ```
+  kubectl get deployments -n ${ira_online_tpu_kubernetes_namespace_name}
+  ```
+
+  You should see three deployments similar to the following:
+
+  ```
+  NAME                                              READY   UP-TO-DATE   AVAILABLE   AGE
+  gaie-inference-scheduling-epp                     1/1     1            1           XXXX
+  gradio-XXXX                                       1/1     1            1           XXXX
+  ms-inference-scheduling-llmd-modelservice-XXXX    2/2     2            2           XXXX
+  ```
+
+  Note:
+
+  - gaie-inference-scheduling-epp is the Gateway API Inference Extension
+    endpoint picker.
+  - gradio-XXXX is the front end chat interface abstracting the model server.
+  - ms-inference-scheduling-llmd-modelservice-XXXX is the model server running
+    inference of the model you chose. It may take some time for this deployment
+    to be up completely depending upon the TPU availability
+
+- Check all the resources
+
+  ```
+  kubectl get all -n ${ira_online_tpu_kubernetes_namespace_name}
+  ```
+
+  You should see output similar to the following:
+
+  ```
+  NAME                                                    READY    STATUS    RESTARTS    AGE
+  pod/gaie-inference-scheduling-epp-XXXX                   1/1      Running    0          XX
+  pod/gradio-XXXX                                          1/1      Running    0          XX
+  pod/ms-inference-scheduling-llmd-modelservice-XXXX       4/4      Running    0          XX
+  pod/ms-inference-scheduling-llmd-modelservice-XXXX       4/4      Running    0          XX
+
+  NAME                                                  TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)             AGE
+  service/gaie-inference-scheduling-epp                 ClusterIP   34.118.230.43    <none>        9002/TCP,9090/TCP   XX
+  service/gaie-inference-scheduling-ips-XXXX            ClusterIP   None             <none>        54321/TCP           XX
+  service/gradio-svc-XXXX                               ClusterIP   34.118.232.165   <none>        8080/TCP            XX
+
+  NAME                                                             READY   UP-TO-DATE   AVAILABLE   AGE
+  deployment.apps/gaie-inference-scheduling-epp                    1/1     1            1           XX
+  deployment.apps/gradio-XXXX                                      1/1     1            1           XX
+  deployment.apps/ms-inference-scheduling-llmd-modelservice-XXXX   2/2     2            2           XX
+
+  NAME                                                              DESIRED   CURRENT   READY   AGE
+  replicaset.apps/gaie-inference-scheduling-epp-XXXX                1         1         1       XX
+  replicaset.apps/gradio-XXXX                                       1         1         1       XX
+  replicaset.apps/ms-inference-scheduling-llmd-modelservice-XXXX    2         2         2       XX
+  ```
+
+- Wait for the model server deployment to be ready before accessing the chat
+  interface.
+
+  ```
+  watch --color --interval 5 --no-title \
+  "kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} get deployment/${llmd_ms_deployment_name}-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1     1            1'"
+  ```
+
+- When the deployment is ready, you will see output similar to the following:
+
+  ```
+  NAME                                             READY   UP-TO-DATE   AVAILABLE   AGE
+  ms-inference-scheduling-llmd-modelservice-XXXX   2/2     2            2           XX
+  ```
+
+- Output the Chat URL.
+
+  ```
+  echo -e "\nChat URL: https://${llmd_endpoints_hostname}\n"
+  ```
+
+- Open the Chat URL in a web browser.
+
+> [!TIP]  
+> If the browser doesn't load the Gradio chat interface, the SSL certificate
+> could still be getting provisioned. Check the status of the certificate by
+> running the following command:
+>
+> `gcloud compute ssl-certificates describe ${llmd_ssl_certificate_name} --project ${cluster_project_id} --format=json | jq -r '.managed.status'`
+>
+> If the output of the command is `PROVISIONING`, it means the certificate has
+> not been provisioned yet. Wait for the status to change to `ACTIVE`
+
+## Generate load on the model server
+
+In this section, you will generate some load on the model server and view the
+metrics on the monitoring dashboard. Then, you will run the stress test to spawn
+many requests to build the processing queue. Note that the scripts used in this
+section spawn requests to the gradio endpoint which will route the request to
+the model server via llm-d's intelligent scheduling. This is done to replicate a
+real-world scenario where the model server is running behind a front end. Due to
+the additional front end layer(in this case, gradio), the metrics will indicate
+a slightly lower performance compared to the scenario where the requests are
+directly sent to the model server via llm-d internal load balancer eliminating
+the latency caused by the front end layer.
+
+1. In order to send a request to the gradio chat interface fronting llm-d and
+   model server, the active `gcloud` account needs to have the
+   [Service Account Token Creator](https://cloud.google.com/iam/docs/roles-permissions/iam#iam.serviceAccountTokenCreator)
+   role for the stress test service account. The following command will add the
+   role to the active `gcloud` account.
+
+   ```shell
+   gcloud iam service-accounts add-iam-policy-binding ${stress_test_service_account_email} \
+   --member="user:$(gcloud auth list --filter=status:ACTIVE --format="value(account)")" \
+   --project="${stress_test_service_account_project_id}" \
+   --role="roles/iam.serviceAccountTokenCreator"
+   ```
+
+   The stress test service account has the role
+   `roles/iap.httpsResourceAccessor` and can access the gradio chat application
+   secured by Identity-Aware proxy.
+
+2. Generate JSON Web Token (JWT)
+
+   ```shell
+   cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/initialize/scripts && \
+   cat > jwt-claim.json << EOF
+   {
+    "iss": "${stress_test_service_account_email}",
+    "sub": "${stress_test_service_account_email}",
+    "aud": "https://${llmd_endpoints_hostname}/gradio_api/api/sync_chat/",
+    "iat": $(date +%s),
+    "exp": $((`date +%s` + 3600))
+   }
+   EOF
+   ```
+
+   Wait a couple of minutes, as the IAM permission change can take some time to
+   propagate.
+
+   ```shell
+   gcloud iam service-accounts sign-jwt --iam-account="${stress_test_service_account_email}" jwt-claim.json token.jwt
+   ```
+
+3. Set up python virtual environment and install required packages
+
+   ```
+   python3 -m venv venv &&
+   source venv/bin/activate &&
+   pip install aiohttp
+   ```
+
+4. Run the script to generate some load.
+
+   ```shell
+   python ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/initialize/scripts/generate_load.py
+   ```
+
+   The response should look like this:
+
+   ```
+   Preparing to send the requests to the MODEL google/gemma-4-31b
+   Starting Load: 500 concurrent users...
+   User 01 | Status: 200
+   User 02 | Status: 200
+   User 04 | Status: 200
+   User 05 | Status: 200
+   ```
+
+5. Go to
+   [Cloud Monitoring Dashboard page](https://console.cloud.google.com/monitoring/dashboards?pli=1)
+   and search for `llm-d dashboard`. Open the dashboard. You will see various
+   metrics getting populated including TTFT, TPOT, Input Token/s, Output
+   Token/s, etc. You will see something similar to the following image.
+
+![dashboard](../images/llmd-dashboard.png)
+
+- You can view the metrics published by `vllm` and `gaie` on the dashboard. Note
+  that some of the network metrics like `Throughput TX Bytes per Pod` are only
+  applicable to non-spot and A3 and higher machine types.
+
+Note: If you want to run a different combination of model and accelerator
+(e.g. `google/gemma-3-27b-it` on `v5e`), update the Terraform variables
+`llmd_model_id` and `llmd_accelerator_type` in
+`"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/llmd.auto.tfvars"`
+to the model and accelerator of your choice and run
+[model download](#download-the-model-to-cloud-storage) and
+[model server deployment](#deploy-the-model-server) steps again.
+
+## Teardown
+
+Teardown the llm-d platform
+
+```shell
+${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-llmd.sh
+```
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/kustomization.yaml
new file mode 100644
index 000000000..9480e9f07
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/kustomization.yaml
@@ -0,0 +1,140 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - runtime.env  # key=value file in this overlay directory
+    name: runtime  # used below as the source ConfigMap of several replacements
+    namespace: replaced-by-kustomize  # placeholder; overwritten by the namespace replacement below
+
+images:
+  - name: replaced-by-kustomize  # image name to match in the rendered resources
+    newName: vllm/vllm-tpu
+    newTag: gemma4
+
+nameSuffix: -v6e-gemma-4-26b-a4b  # appended to resource names; same string as this overlay's directory
+
+patches:
+  - path: patch-vllm-args.yaml
+    target:  # explicit target: the Deployment named vllm
+      kind: Deployment
+      name: vllm
+  - path: patch-nodeselector.yaml  # no target given; the file carries its own kind and name
+  - path: patch-resources.yaml  # no target given; the file carries its own kind and name
+
+replacements:  # copy values out of ConfigMaps into fields of other resources
+  - source:
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime  # the ConfigMap generated from runtime.env above
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app  # same label value on the Service selector as on the pod template
+        select:
+          kind: Service
+  # - source:
+  #     fieldPath: data.CONTAINER_IMAGE_URL
+  #     kind: ConfigMap
+  #     name: vllm
+  #   targets:
+  #     - fieldPaths:
+  #         - spec.template.spec.containers.[name=inference-server].image
+  #       select:
+  #         kind: Deployment
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # not generated in this file; presumably supplied by ../base (TODO confirm)
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name  # the ServiceAccount is renamed to the same value the Deployment references
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .  # split the existing bucketName value on "."
+          index: 0  # and replace only the first segment
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"  # split the mount options string on this marker
+          index: 1  # and replace the segment that follows it
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /  # split the existing mountPath on "/"
+          index: 2  # and replace the segment at index 2
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]  # bracketed so the dots in the label key are not read as path separators
+        select:
+          kind: Deployment
+
+resources:
+  - ../base  # sibling base directory whose manifests this overlay customizes
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-nodeselector.yaml
new file mode 100644
index 000000000..e0f4839a7
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # patch fragment: only nodeSelector is set here
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector:  # restrict scheduling to nodes carrying this label value
+        cloud.google.com/compute-class: tpu-v6e-2x2
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-resources.yaml
new file mode 100644
index 000000000..117ab1b36
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # patch fragment: only the TPU resources are set here
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          resources:  # NOTE(review): presumably tracks TENSOR_PARALLEL_SIZE=4 — confirm
+            limits:
+              google.com/tpu: "4"
+            requests:
+              google.com/tpu: "4"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-vllm-args.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-vllm-args.yaml
new file mode 100644
index 000000000..835df7aec
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-vllm-args.yaml
@@ -0,0 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- op: add  # JSON Patch (RFC 6902): the trailing "/-" in path appends to the array
+  path: /spec/template/spec/containers/1/args/-  # NOTE(review): positional index — confirm
+  value: "--chat-template=/gcs/google/gemma-4-26b-a4b/chat_template.jinja"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env
new file mode 100644
index 000000000..bf8c4759e
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=vllm-v6e-gemma-4-26b-a4b
+GPU_MEMORY_UTILIZATION=0.95
+MAX_MODEL_LEN=16384
+MODEL_ID=google/gemma-4-26b-a4b
+MODEL_NAME=gemma-4-26b-a4b
+TENSOR_PARALLEL_SIZE=4
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/kustomization.yaml
new file mode 100644
index 000000000..bf566d459
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/kustomization.yaml
@@ -0,0 +1,141 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:  # generates the "runtime" ConfigMap read by the replacements below
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize  # placeholder; a replacement below sets metadata.namespace
+
+images:
+  - name: replaced-by-kustomize  # image name to match in the rendered resources
+    newName: vllm/vllm-tpu
+    newTag: gemma4
+
+nameSuffix: -v6e-gemma-4-31b  # suffix appended to rendered resource names
+
+patches:
+  - path: patch-vllm-args.yaml  # list of JSON Patch ops, so it needs an explicit target
+    target:
+      kind: Deployment
+      name: vllm
+  - path: patch-nodeselector.yaml  # partial Deployment manifest; names its own target
+  - path: patch-resources.yaml  # partial Deployment manifest; names its own target
+
+
+replacements:
+  - source:  # APP_LABEL from the generated "runtime" ConfigMap
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app  # Service selector receives the same value as the pod label
+        select:
+          kind: Service
+  # - source:  # disabled; presumably superseded by the images: override above — confirm or delete
+  #     fieldPath: data.CONTAINER_IMAGE_URL
+  #     kind: ConfigMap
+  #     name: vllm
+  #   targets:
+  #     - fieldPaths:
+  #         - spec.template.spec.containers.[name=inference-server].image
+  #       select:
+  #         kind: Deployment
+  - source:  # one namespace value written to all four resource kinds below
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: deployment  # not generated in this file — presumably supplied via ../base; confirm
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Service
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:  # service account name, taken from the "deployment" ConfigMap
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - metadata.name  # the ServiceAccount object is renamed to the same value
+        select:
+          kind: ServiceAccount
+  - source:  # bucket name for the huggingface-hub-model-bucket CSI volume
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .  # split the existing bucketName on "."
+          index: 0  # and overwrite only the first segment
+        select:
+          kind: Deployment
+  - source:  # MODEL_ID is spliced into the volume mount options and both mount paths
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:"  # split mountOptions on this literal
+          index: 1  # overwrite the segment that follows "only-dir:"
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: /  # split mountPath on "/"
+          index: 2  # overwrite segment 2; presumably the part after the mount root — confirm
+        select:
+          kind: Deployment
+  - source:  # MODEL_NAME becomes the pod template's ai.gke.io/model label
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]  # brackets keep the dotted key as one segment
+        select:
+          kind: Deployment
+
+resources:
+  - ../base  # manifests that this overlay customizes
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-nodeselector.yaml
new file mode 100644
index 000000000..e0f4839a7
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # patch fragment: only nodeSelector is set here
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the namespace
+spec:
+  template:
+    spec:
+      nodeSelector:  # restrict scheduling to nodes carrying this label value
+        cloud.google.com/compute-class: tpu-v6e-2x2
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-resources.yaml
new file mode 100644
index 000000000..117ab1b36
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-resources.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # patch fragment: only the TPU resources are set here
+metadata:
+  name: vllm
+  namespace: replaced-by-kustomize  # placeholder; the overlay's replacements set the namespace
+spec:
+  template:
+    spec:
+      containers:
+        - name: inference-server
+          resources:  # NOTE(review): presumably tracks TENSOR_PARALLEL_SIZE=4 — confirm
+            limits:
+              google.com/tpu: "4"
+            requests:
+              google.com/tpu: "4"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-vllm-args.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-vllm-args.yaml
new file mode 100644
index 000000000..ea89e9f93
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-vllm-args.yaml
@@ -0,0 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- op: add  # JSON Patch (RFC 6902): the trailing "/-" in path appends to the array
+  path: /spec/template/spec/containers/1/args/-  # NOTE(review): positional index — confirm
+  value: "--chat-template=/gcs/google/gemma-4-31b/chat_template.jinja"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/runtime.env
new file mode 100644
index 000000000..3edbcd9d8
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/runtime.env
@@ -0,0 +1,6 @@
+APP_LABEL=vllm-v6e-gemma-4-31b
+GPU_MEMORY_UTILIZATION=0.95
+MAX_MODEL_LEN=16384
+MODEL_ID=google/gemma-4-31b
+MODEL_NAME=gemma-4-31b
+TENSOR_PARALLEL_SIZE=4

From 74a944285163cee728faf2a273135ceaaebeea5c Mon Sep 17 00:00:00 2001
From: Syeda Anjum <syedaanjum@google.com>
Date: Fri, 1 May 2026 13:02:28 -0500
Subject: [PATCH 4/4] removing v5e examples as the accelerator is not
 officially supported in the llm-d documentation

---
 .../vllm/v5e-gemma-3-1b-it/kustomization.yaml | 131 ------------------
 .../v5e-gemma-3-1b-it/patch-nodeselector.yaml |  24 ----
 .../v5e-gemma-3-1b-it/patch-resources.yaml    |  29 ----
 .../llmd/vllm/v5e-gemma-3-1b-it/runtime.env   |   6 -
 .../v5e-gemma-3-27b-it/kustomization.yaml     | 131 ------------------
 .../patch-nodeselector.yaml                   |  24 ----
 .../v5e-gemma-3-27b-it/patch-resources.yaml   |  29 ----
 .../llmd/vllm/v5e-gemma-3-27b-it/runtime.env  |   6 -
 .../vllm/v5e-gemma-3-4b-it/kustomization.yaml | 131 ------------------
 .../v5e-gemma-3-4b-it/patch-nodeselector.yaml |  24 ----
 .../v5e-gemma-3-4b-it/patch-resources.yaml    |  29 ----
 .../llmd/vllm/v5e-gemma-3-4b-it/runtime.env   |   6 -
 .../vllm/v5e-qwen3-32b/kustomization.yaml     | 131 ------------------
 .../v5e-qwen3-32b/patch-nodeselector.yaml     |  24 ----
 .../vllm/v5e-qwen3-32b/patch-resources.yaml   |  29 ----
 .../llmd/vllm/v5e-qwen3-32b/runtime.env       |   6 -
 16 files changed, 760 deletions(-)
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml
 delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env

diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml
deleted file mode 100644
index f90964787..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-configMapGenerator:
-  - envs:
-      - runtime.env
-    name: runtime
-    namespace: replaced-by-kustomize
-
-nameSuffix: "-v5e-gemma-3-1b-it"
-
-patches:
-  - path: patch-nodeselector.yaml
-  - path: patch-resources.yaml
-
-replacements:
-  - source:
-      fieldPath: data.APP_LABEL
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.selector.matchLabels.app
-          - spec.template.metadata.labels.app
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - spec.selector.app
-        select:
-          kind: Service
-  - source:
-      fieldPath: data.CONTAINER_IMAGE_URL
-      kind: ConfigMap
-      name: vllm
-    targets:
-      - fieldPaths:
-          - spec.template.spec.containers.[name=inference-server].image
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: ConfigMap
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: Service
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: ServiceAccount
-  - source:
-      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - spec.template.spec.serviceAccountName
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - metadata.name
-        select:
-          kind: ServiceAccount
-  - source:
-      fieldPath: data.MODEL_BUCKET_NAME
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
-        options:
-          delimiter: .
-          index: 0
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.MODEL_ID
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
-        options:
-          delimiter: "only-dir:"
-          index: 1
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
-          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
-        options:
-          delimiter: /
-          index: 2
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.MODEL_NAME
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.template.metadata.labels.[ai.gke.io/model]
-        select:
-          kind: Deployment
-
-resources:
-  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml
deleted file mode 100644
index 832e2fceb..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-  namespace: replaced-by-kustomize
-spec:
-  template:
-    spec:
-      nodeSelector:
-        cloud.google.com/compute-class: tpu-v5e-1x1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml
deleted file mode 100644
index b3371ea99..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-  namespace: replaced-by-kustomize
-spec:
-  template:
-    spec:
-      containers:
-        - name: inference-server
-          resources:
-            limits:
-              google.com/tpu: "1"
-            requests:
-              google.com/tpu: "1"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env
deleted file mode 100644
index 8cc55f05c..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env
+++ /dev/null
@@ -1,6 +0,0 @@
-APP_LABEL=vllm-v5e-gemma-3-1b-it
-GPU_MEMORY_UTILIZATION=0.9
-MAX_MODEL_LEN=1024
-MODEL_ID=google/gemma-3-1b-it
-MODEL_NAME=gemma-3-1b-it
-TENSOR_PARALLEL_SIZE=1
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml
deleted file mode 100644
index fe78bce08..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-configMapGenerator:
-  - envs:
-      - runtime.env
-    name: runtime
-    namespace: replaced-by-kustomize
-
-nameSuffix: -v5e-gemma-3-27b-it
-
-patches:
-  - path: patch-nodeselector.yaml
-  - path: patch-resources.yaml
-
-replacements:
-  - source:
-      fieldPath: data.APP_LABEL
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.selector.matchLabels.app
-          - spec.template.metadata.labels.app
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - spec.selector.app
-        select:
-          kind: Service
-  - source:
-      fieldPath: data.CONTAINER_IMAGE_URL
-      kind: ConfigMap
-      name: vllm
-    targets:
-      - fieldPaths:
-          - spec.template.spec.containers.[name=inference-server].image
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: ConfigMap
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: Service
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: ServiceAccount
-  - source:
-      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - spec.template.spec.serviceAccountName
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - metadata.name
-        select:
-          kind: ServiceAccount
-  - source:
-      fieldPath: data.MODEL_BUCKET_NAME
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
-        options:
-          delimiter: .
-          index: 0
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.MODEL_ID
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
-        options:
-          delimiter: "only-dir:"
-          index: 1
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
-          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
-        options:
-          delimiter: /
-          index: 2
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.MODEL_NAME
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.template.metadata.labels.[ai.gke.io/model]
-        select:
-          kind: Deployment
-
-resources:
-  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml
deleted file mode 100644
index 789ec78c6..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-  namespace: replaced-by-kustomize
-spec:
-  template:
-    spec:
-      nodeSelector:
-        cloud.google.com/compute-class: tpu-v5e-2x4
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml
deleted file mode 100644
index a2f2513e0..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-  namespace: replaced-by-kustomize
-spec:
-  template:
-    spec:
-      containers:
-        - name: inference-server
-          resources:
-            limits:
-              google.com/tpu: "8"
-            requests:
-              google.com/tpu: "8"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env
deleted file mode 100644
index cc066c34f..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env
+++ /dev/null
@@ -1,6 +0,0 @@
-APP_LABEL=vllm-v5e-gemma-3-27b-it
-GPU_MEMORY_UTILIZATION=0.9
-MAX_MODEL_LEN=1024
-MODEL_ID=google/gemma-3-27b-it
-MODEL_NAME=gemma-3-27b-it
-TENSOR_PARALLEL_SIZE=8
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml
deleted file mode 100644
index ffb5e7862..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-configMapGenerator:
-  - envs:
-      - runtime.env
-    name: runtime
-    namespace: replaced-by-kustomize
-
-nameSuffix: -v5e-gemma-3-4b-it
-
-patches:
-  - path: patch-nodeselector.yaml
-  - path: patch-resources.yaml
-
-replacements:
-  - source:
-      fieldPath: data.APP_LABEL
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.selector.matchLabels.app
-          - spec.template.metadata.labels.app
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - spec.selector.app
-        select:
-          kind: Service
-  - source:
-      fieldPath: data.CONTAINER_IMAGE_URL
-      kind: ConfigMap
-      name: vllm
-    targets:
-      - fieldPaths:
-          - spec.template.spec.containers.[name=inference-server].image
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: ConfigMap
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: Service
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: ServiceAccount
-  - source:
-      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - spec.template.spec.serviceAccountName
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - metadata.name
-        select:
-          kind: ServiceAccount
-  - source:
-      fieldPath: data.MODEL_BUCKET_NAME
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
-        options:
-          delimiter: .
-          index: 0
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.MODEL_ID
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
-        options:
-          delimiter: "only-dir:"
-          index: 1
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
-          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
-        options:
-          delimiter: /
-          index: 2
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.MODEL_NAME
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.template.metadata.labels.[ai.gke.io/model]
-        select:
-          kind: Deployment
-
-resources:
-  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml
deleted file mode 100644
index d2d74eca4..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-  namespace: replaced-by-kustomize
-spec:
-  template:
-    spec:
-      nodeSelector:
-        cloud.google.com/compute-class: tpu-v5e-2x2
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml
deleted file mode 100644
index 117ab1b36..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-  namespace: replaced-by-kustomize
-spec:
-  template:
-    spec:
-      containers:
-        - name: inference-server
-          resources:
-            limits:
-              google.com/tpu: "4"
-            requests:
-              google.com/tpu: "4"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env
deleted file mode 100644
index d00314f82..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env
+++ /dev/null
@@ -1,6 +0,0 @@
-APP_LABEL=vllm-v5e-gemma-3-4b-it
-GPU_MEMORY_UTILIZATION=0.9
-MAX_MODEL_LEN=1024
-MODEL_ID=google/gemma-3-4b-it
-MODEL_NAME=gemma-3-4b-it
-TENSOR_PARALLEL_SIZE=4
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml
deleted file mode 100644
index 64201eff4..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-configMapGenerator:
-  - envs:
-      - runtime.env
-    name: runtime
-    namespace: replaced-by-kustomize
-
-nameSuffix: -v5e-qwen3-32b
-
-patches:
-  - path: patch-nodeselector.yaml
-  - path: patch-resources.yaml
-
-replacements:
-  - source:
-      fieldPath: data.APP_LABEL
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.selector.matchLabels.app
-          - spec.template.metadata.labels.app
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - spec.selector.app
-        select:
-          kind: Service
-  - source:
-      fieldPath: data.CONTAINER_IMAGE_URL
-      kind: ConfigMap
-      name: vllm
-    targets:
-      - fieldPaths:
-          - spec.template.spec.containers.[name=inference-server].image
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: ConfigMap
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: Service
-      - fieldPaths:
-          - metadata.namespace
-        select:
-          kind: ServiceAccount
-  - source:
-      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - spec.template.spec.serviceAccountName
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - metadata.name
-        select:
-          kind: ServiceAccount
-  - source:
-      fieldPath: data.MODEL_BUCKET_NAME
-      kind: ConfigMap
-      name: deployment
-    targets:
-      - fieldPaths:
-          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
-        options:
-          delimiter: .
-          index: 0
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.MODEL_ID
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
-        options:
-          delimiter: "only-dir:"
-          index: 1
-        select:
-          kind: Deployment
-      - fieldPaths:
-          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
-          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
-        options:
-          delimiter: /
-          index: 2
-        select:
-          kind: Deployment
-  - source:
-      fieldPath: data.MODEL_NAME
-      kind: ConfigMap
-      name: runtime
-    targets:
-      - fieldPaths:
-          - spec.template.metadata.labels.[ai.gke.io/model]
-        select:
-          kind: Deployment
-
-resources:
-  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml
deleted file mode 100644
index 789ec78c6..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-  namespace: replaced-by-kustomize
-spec:
-  template:
-    spec:
-      nodeSelector:
-        cloud.google.com/compute-class: tpu-v5e-2x4
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml
deleted file mode 100644
index a2f2513e0..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm
-  namespace: replaced-by-kustomize
-spec:
-  template:
-    spec:
-      containers:
-        - name: inference-server
-          resources:
-            limits:
-              google.com/tpu: "8"
-            requests:
-              google.com/tpu: "8"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env
deleted file mode 100644
index 90a9f9620..000000000
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env
+++ /dev/null
@@ -1,6 +0,0 @@
-APP_LABEL=vllm-v5e-qwen3-32b
-GPU_MEMORY_UTILIZATION=0.95
-MAX_MODEL_LEN=32768
-MODEL_ID=qwen/qwen3-32b
-MODEL_NAME=qwen3-32b
-TENSOR_PARALLEL_SIZE=8