From 46383c3cfc0a654c795405061e9066782e511329 Mon Sep 17 00:00:00 2001 From: syeda-anjum Date: Sat, 18 Apr 2026 20:41:54 +0000 Subject: [PATCH 1/4] llm-d on TPUs, new branch --- .../llmd/vllm/base/deployment.yaml | 198 ++++++++++++++++++ .../llmd/vllm/base/kustomization.yaml | 26 +++ .../llmd/vllm/base/templates/vllm.tpl.env | 2 + .../llmd/vllm/configure_vllm.sh | 28 +++ .../vllm/v5e-gemma-3-1b-it/kustomization.yaml | 131 ++++++++++++ .../v5e-gemma-3-1b-it/patch-nodeselector.yaml | 24 +++ .../v5e-gemma-3-1b-it/patch-resources.yaml | 29 +++ .../llmd/vllm/v5e-gemma-3-1b-it/runtime.env | 6 + .../v5e-gemma-3-27b-it/kustomization.yaml | 131 ++++++++++++ .../patch-nodeselector.yaml | 24 +++ .../v5e-gemma-3-27b-it/patch-resources.yaml | 29 +++ .../llmd/vllm/v5e-gemma-3-27b-it/runtime.env | 6 + .../vllm/v5e-gemma-3-4b-it/kustomization.yaml | 131 ++++++++++++ .../v5e-gemma-3-4b-it/patch-nodeselector.yaml | 24 +++ .../v5e-gemma-3-4b-it/patch-resources.yaml | 29 +++ .../llmd/vllm/v5e-gemma-3-4b-it/runtime.env | 6 + .../vllm/v5e-qwen3-32b/kustomization.yaml | 131 ++++++++++++ .../v5e-qwen3-32b/patch-nodeselector.yaml | 24 +++ .../vllm/v5e-qwen3-32b/patch-resources.yaml | 29 +++ .../llmd/vllm/v5e-qwen3-32b/runtime.env | 6 + .../v6e-gemma-3-27b-it/kustomization.yaml | 131 ++++++++++++ .../patch-nodeselector.yaml | 24 +++ .../v6e-gemma-3-27b-it/patch-resources.yaml | 29 +++ .../llmd/vllm/v6e-gemma-3-27b-it/runtime.env | 6 + .../vllm/v6e-qwen3-32b/kustomization.yaml | 131 ++++++++++++ .../v6e-qwen3-32b/patch-nodeselector.yaml | 24 +++ .../vllm/v6e-qwen3-32b/patch-resources.yaml | 29 +++ .../llmd/vllm/v6e-qwen3-32b/runtime.env | 6 + 28 files changed, 1394 insertions(+) create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml create mode 100644 
platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env create mode 100755 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml create 
mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/runtime.env create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/runtime.env diff --git 
a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml new file mode 100644 index 000000000..bd913a736 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml @@ -0,0 +1,198 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ms-inference-scheduling-llmd-modelservice + namespace: replaced-by-kustomize +spec: + replicas: 2 + selector: + matchLabels: + llmd.ai/inferenceServing: "true" + llmd.ai/model: random_model + llmd.ai/role: decode + app: replaced-by-kustomize + template: + metadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + labels: + llmd.ai/inferenceServing: "true" + llmd.ai/model: random_model + llmd.ai/role: decode + ai.gke.io/model: replaced-by-kustomize + app: replaced-by-kustomize + spec: + initContainers: + - name: routing-proxy + args: + - --port=8000 + - --vllm-port=8200 + - --connector=nixlv2 + - --zap-encoder=json + - --zap-log-level=debug + - --secure-proxy=false + image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.4.0-rc.1 # pinned here because no overlay replacement rewrites this field; keep in sync with ROUTING_PROXY_IMAGE in templates/vllm.tpl.env + imagePullPolicy: Always + ports: + - containerPort: 8000 + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + serviceAccountName: replaced-by-kustomize + volumes: + - emptyDir: {} + name: metrics-volume + - emptyDir: {} + name: torch-compile-cache + - emptyDir: + medium: Memory + sizeLimit: 20Gi + name: dev-shm + - csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: # only-dir must stay the LAST mount option: the overlays replace everything after "only-dir:" with MODEL_ID + bucketName: cloud-storage-bucket-name + mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,uid=2000,gid=2000,only-dir:replaced-by-kustomize + skipCSIBucketAccessCheck: "true" + name: huggingface-hub-model-bucket + - emptyDir: + medium: Memory + name: gke-gcsfuse-cache + - emptyDir: + medium: Memory + name: gke-gcsfuse-tmp + - emptyDir: + medium: Memory + name: gke-gcsfuse-buffer + containers: + - args: + - 
| + echo "########### $(date) - Starting parallel-fetch-safetensors for model: ${MODEL_ID}" + ls -alR /gcs + find /gcs/${MODEL_ID}/*safetensors -type f | xargs -I {} -P 15 sh -c 'echo "########### $(date) - Fetching: {}"; dd if={} of=/dev/null' + echo "########### $(date) - Finished parallel-fetch-safetensors" + sleep infinity + command: ["/bin/sh", "-c"] # runs the script above: reads every safetensors file once, then idles + env: + - name: MODEL_ID + valueFrom: + configMapKeyRef: + key: MODEL_ID + name: runtime + image: busybox # NOTE(review): untagged image reference - consider pinning a version + name: fetch-safetensors + volumeMounts: + - mountPath: /gcs + name: huggingface-hub-model-bucket + readOnly: true + - name: inference-server + image: replaced-by-kustomize # overwritten with CONTAINER_IMAGE_URL by the overlay replacements + command: ["vllm", "serve"] + args: + - /gcs/$(MODEL_ID) + - "--port" + - "8200" + - "--served-model-name" + - "$(MODEL_ID)" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + - "--disable-uvicorn-access-log" + - "--tensor-parallel-size" + - "$(TENSOR_PARALLEL_SIZE)" + - "--gpu-memory-utilization" + - "$(GPU_MEMORY_UTILIZATION)" + - "--max-model-len" + - "$(MAX_MODEL_LEN)" + env: + - name: UCX_TLS # NOTE(review): lists CUDA transports although the overlays schedule this pod on TPU nodes - confirm + value: cuda_ipc,cuda_copy,tcp + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT # same port number as the first containerPort below + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: DP_SIZE + value: "1" + - name: DP_SIZE_LOCAL + value: "1" + - name: GPU_MEMORY_UTILIZATION + valueFrom: + configMapKeyRef: + key: GPU_MEMORY_UTILIZATION + name: runtime + - name: MAX_MODEL_LEN + valueFrom: + configMapKeyRef: + key: MAX_MODEL_LEN + name: runtime + - name: MODEL_ID + valueFrom: + configMapKeyRef: + key: MODEL_ID + name: runtime + - name: TENSOR_PARALLEL_SIZE + valueFrom: + configMapKeyRef: + key: TENSOR_PARALLEL_SIZE + name: runtime + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + livenessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8200 + periodSeconds: 10 + timeoutSeconds: 5 + 
readinessProbe: + failureThreshold: 3 + httpGet: + path: /v1/models + port: 8200 + periodSeconds: 5 + timeoutSeconds: 2 + startupProbe: + failureThreshold: 60 + httpGet: + path: /v1/models + port: 8200 + initialDelaySeconds: 15 + periodSeconds: 30 + timeoutSeconds: 5 + resources: {} + volumeMounts: + - mountPath: /.config + name: metrics-volume + - mountPath: /.cache + name: torch-compile-cache + - mountPath: /dev/shm + name: dev-shm + - mountPath: /gcs + name: huggingface-hub-model-bucket + readOnly: true diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml new file mode 100644 index 000000000..0314fa83a --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml @@ -0,0 +1,26 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - vllm.env + name: vllm + namespace: replaced-by-kustomize + +resources: + - ../../../base + - deployment.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env new file mode 100644 index 000000000..ad4d6211e --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env @@ -0,0 +1,2 @@ +CONTAINER_IMAGE_URL=ghcr.io/llm-d/llm-d-cuda:v0.5.0 +ROUTING_PROXY_IMAGE=ghcr.io/llm-d/llm-d-routing-sidecar:v0.4.0-rc.1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh new file mode 100755 index 000000000..818f410e5 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o errexit +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 && + pwd -P # only reached when cd succeeded, so an unresolvable script directory aborts the script via errexit +)" + +source "${MY_PATH}/../../../../examples/llmd/_shared_config/scripts/set_environment_variables.sh" +"${MY_PATH}/../../configure_deployment.sh" + +envsubst <"${MY_PATH}/base/templates/vllm.tpl.env" | sponge "${MY_PATH}/base/vllm.env" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml new file mode 100644 index 000000000..f90964787 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml @@ -0,0 +1,131 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: "-v5e-gemma-3-1b-it" + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml new file mode 100644 index 000000000..832e2fceb --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ms-inference-scheduling-llmd-modelservice # must equal the base Deployment's metadata.name (before nameSuffix) or the patch matches nothing + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v5e-1x1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml new file mode 100644 index 000000000..b3371ea99 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ms-inference-scheduling-llmd-modelservice # must equal the base Deployment's metadata.name (before nameSuffix) or the patch matches nothing + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "1" + requests: + google.com/tpu: "1" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env new file mode 100644 index 000000000..8cc55f05c --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=vllm-v5e-gemma-3-1b-it +GPU_MEMORY_UTILIZATION=0.9 +MAX_MODEL_LEN=1024 +MODEL_ID=google/gemma-3-1b-it +MODEL_NAME=gemma-3-1b-it +TENSOR_PARALLEL_SIZE=1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml new file mode 100644 index 000000000..fe78bce08 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml @@ -0,0 +1,131 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -v5e-gemma-3-27b-it + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml new file mode 100644 index 000000000..789ec78c6 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ms-inference-scheduling-llmd-modelservice # must equal the base Deployment's metadata.name (before nameSuffix) or the patch matches nothing + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v5e-2x4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml new file mode 100644 index 000000000..a2f2513e0 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ms-inference-scheduling-llmd-modelservice # must equal the base Deployment's metadata.name (before nameSuffix) or the patch matches nothing + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "8" + requests: + google.com/tpu: "8" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env new file mode 100644 index 000000000..cc066c34f --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=vllm-v5e-gemma-3-27b-it +GPU_MEMORY_UTILIZATION=0.9 +MAX_MODEL_LEN=1024 +MODEL_ID=google/gemma-3-27b-it +MODEL_NAME=gemma-3-27b-it +TENSOR_PARALLEL_SIZE=8 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml new file mode 100644 index 000000000..ffb5e7862 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml @@ -0,0 +1,131 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -v5e-gemma-3-4b-it + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml new file mode 100644 index 000000000..d2d74eca4 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v5e-2x2 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml new file mode 100644 index 000000000..117ab1b36 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "4" + requests: + google.com/tpu: "4" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env new file mode 100644 index 000000000..d00314f82 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=vllm-v5e-gemma-3-4b-it +GPU_MEMORY_UTILIZATION=0.9 +MAX_MODEL_LEN=1024 +MODEL_ID=google/gemma-3-4b-it +MODEL_NAME=gemma-3-4b-it +TENSOR_PARALLEL_SIZE=4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml new file mode 100644 index 000000000..64201eff4 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml @@ -0,0 +1,131 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -v5e-qwen3-32b + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml new file mode 100644 index 000000000..789ec78c6 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v5e-2x4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml new file mode 100644 index 000000000..a2f2513e0 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "8" + requests: + google.com/tpu: "8" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env new file mode 100644 index 000000000..90a9f9620 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=vllm-v5e-qwen3-32b +GPU_MEMORY_UTILIZATION=0.95 +MAX_MODEL_LEN=32768 +MODEL_ID=qwen/qwen3-32b +MODEL_NAME=qwen3-32b +TENSOR_PARALLEL_SIZE=8 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml new file mode 100644 index 000000000..38517e631 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml @@ -0,0 +1,131 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -v6e-gemma-3-27b-it + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-nodeselector.yaml new file mode 100644 index 000000000..f984468b3 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v6e-2x4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-resources.yaml new file mode 100644 index 000000000..a2f2513e0 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "8" + requests: + google.com/tpu: "8" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/runtime.env new file mode 100644 index 000000000..43a0308b6 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=vllm-v6e-gemma-3-27b-it +GPU_MEMORY_UTILIZATION=0.95 +MAX_MODEL_LEN=16384 +MODEL_ID=google/gemma-3-27b-it +MODEL_NAME=gemma-3-27b-it +TENSOR_PARALLEL_SIZE=8 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/kustomization.yaml new file mode 100644 index 000000000..a6fd4ac44 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/kustomization.yaml @@ -0,0 +1,131 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -v6e-qwen3-32b + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + - source: + fieldPath: data.CONTAINER_IMAGE_URL + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=inference-server].image + select: + kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-nodeselector.yaml new file mode 100644 index 000000000..e0f4839a7 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v6e-2x2 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-resources.yaml new file mode 100644 index 000000000..117ab1b36 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "4" + requests: + google.com/tpu: "4" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/runtime.env new file mode 100644 index 000000000..625f9245d --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-qwen3-32b/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=vllm-v6e-qwen3-32b +GPU_MEMORY_UTILIZATION=0.95 +MAX_MODEL_LEN=32768 +MODEL_ID=qwen/qwen3-32b +MODEL_NAME=qwen3-32b +TENSOR_PARALLEL_SIZE=4 From a1cef20ba16d3f997cedf7d2257c20e5865cc66f Mon Sep 17 00:00:00 2001 From: Syeda Anjum Date: Fri, 1 May 2026 12:22:55 -0500 Subject: [PATCH 2/4] fix: rename template file to match script expectation in async-pubsub-subscriber --- ...-pubsub-subscriber.tpl.env => async-pubsub-subscriber.tpl.env} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/{batch-pubsub-subscriber.tpl.env => async-pubsub-subscriber.tpl.env} (100%) diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/batch-pubsub-subscriber.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/async-pubsub-subscriber.tpl.env similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/batch-pubsub-subscriber.tpl.env rename to 
platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/async-pubsub-subscriber/base/templates/async-pubsub-subscriber.tpl.env From 44e185e81ce20075de34cdaac6f9bba6840f2e0a Mon Sep 17 00:00:00 2001 From: Syeda Anjum Date: Fri, 1 May 2026 12:41:54 -0500 Subject: [PATCH 3/4] feat: add Gemma-4 templates and README for llm-d on TPUs --- .../llmd/llmd-vllm-with-hf-model-tpu.md | 558 ++++++++++++++++++ .../v6e-gemma-4-26b-a4b/kustomization.yaml | 140 +++++ .../patch-nodeselector.yaml | 24 + .../v6e-gemma-4-26b-a4b/patch-resources.yaml | 29 + .../v6e-gemma-4-26b-a4b/patch-vllm-args.yaml | 17 + .../llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env | 6 + .../vllm/v6e-gemma-4-31b/kustomization.yaml | 141 +++++ .../v6e-gemma-4-31b/patch-nodeselector.yaml | 24 + .../vllm/v6e-gemma-4-31b/patch-resources.yaml | 29 + .../vllm/v6e-gemma-4-31b/patch-vllm-args.yaml | 17 + .../llmd/vllm/v6e-gemma-4-31b/runtime.env | 6 + 11 files changed, 991 insertions(+) create mode 100644 docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-vllm-args.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env create mode 100644 
platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-vllm-args.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/runtime.env diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md new file mode 100644 index 000000000..fa4565c4a --- /dev/null +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md @@ -0,0 +1,558 @@ +# Intelligent inference scheduling with llm-d on TPUs + +## Prerequisite + +This architecture and workflow assumes that the reader is familiar with the +following GKE, Google Cloud Networking and llm-d components: + +- [Gateway API resources](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/gateway-api#gateway_resources) +- [GKE Gateway Controller](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/gateway-api#gateway_controller) +- [Google Cloud Load Balancer through GKE](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/service-load-balancer#load_balancer_types) +- [Gateway API Inference Extension(GAIE)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal) +- [vLLM-Optimized Inference Schedule](https://llm-d.ai/docs/architecture) + +## Architecture + 
+![image](../images/llm-d.png) + +## Workflow + +- User securely hits the Cloud Endpoint DNS from a web browser. +- The DNS resolves to an External IP mapped to a + `Global External Load Balancer`. +- The `Global External Load Balancer` has a `HTTPRoute` that points to the + Gradio chat GKE service as the backend. It also has a backend policy + specifying that the request to the backend will have + `IAP(Identity-aware proxy)` authentication enabled. +- The `Global External Load Balancer` routes the request via `IAP` to the Gradio + chat GKE service backend. +- Gradio chat GKE service forwards the request to the Gradio GKE Deployment and + the user will see the chat interface loading on the browser. +- When the user sends a request via chat interface, the request reaches the + Gradio GKE deployment as explained in previous steps. +- The Gradio GKE deployment takes the chat message and routes the request to the + `Internal Regional Load Balancer` fronting the llm-d deployment. +- The `Internal Regional Load Balancer` has a `HTTPRoute` attached to it that + points to an `InferencePool` as the backend. This `InferencePool` contains the + pods running the model server, specifically running the inference of the model + of your choice via `vllm`. +- The `InferencePool` has a reference to the GAIE endpoint picker(`EPP`) which + means that the `GKE Gateway Controller`, instead of routing the request to the + backend in round-robin fashion, will consult the `EPP` to provide it with the + backend where the traffic should be routed. +- The `EPP` has + [scheduling profiles](https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md) + that define how to score the pods in the `InferencePool`. The scoring is done + on the metrics coming out of the pods.
+- Once the `EPP` identifies the pod which should be used based on the scores, it + returns its IP address to the `GKE Gateway Controller` corresponding to the + `Internal Regional Load Balancer` which then routes the request to the pod. + +## Pull the source code + +- Open [Cloud Shell](https://cloud.google.com/shell). + +- Clone the repository and change directory to the guide directory + + ``` + git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ + cd accelerated-platforms && \ + export ACP_REPO_DIR="$(pwd)" + ``` + + To set the `ACP_REPO_DIR` value for new shell instances, write the value to + your shell initialization file. + + `bash` + + ``` + sed -n -i -e '/^export ACP_REPO_DIR=/!p' -i -e '$aexport ACP_REPO_DIR="'"${ACP_REPO_DIR}"'"' ${HOME}/.bashrc + ``` + + `zsh` + + ``` + sed -n -i -e '/^export ACP_REPO_DIR=/!p' -i -e '$aexport ACP_REPO_DIR="'"${ACP_REPO_DIR}"'"' ${HOME}/.zshrc + ``` + +## Configure + +Terraform loads variables in the following order, with later sources taking +precedence over earlier ones: + +- Environment variables (`TF_VAR_<name>`) +- Any `*.auto.tfvars` or `*.auto.tfvars.json` files, processed in lexical order of their filenames. +- Any `-var` and `-var-file` options on the command line, in the order they are + provided. + +- Set the platform defaults project ID + + ``` + export TF_VAR_platform_default_project_id="" + ``` + + **-- OR --** + + ``` + platform_default_project_id="" + sed -i '/^platform_default_project_id[[:blank:]]*=/{h;s/=.*/= "'"${platform_default_project_id}"'"/};${x;/^$/{s//platform_default_project_id = "'"${platform_default_project_id}"'"/;H};x}' ${ACP_REPO_DIR}/platforms/gke/base/_shared_config/platform.auto.tfvars + ``` + +- Optional : By default, the platform name is set to `dev`.
If you want to + change it, set the platform name + + ``` + platform_name="" + sed -i '/^platform_name[[:blank:]]*=/{h;s/=.*/= "'"${platform_name}"'"/};${x;/^$/{s//platform_name="'"${platform_name}"'"/;H};x}' ${ACP_REPO_DIR}/platforms/gke/base/_shared_config/platform.auto.tfvars + ``` + +- Optional : Run the following step if you want to run the inference of a model + other than `google/gemma-4-31b` which is the default model for this deployment. + + ``` + llmd_model_id="" + sed -i "/^llmd_model_id[[:blank:]]*=/{h;s|=.*|= \"${llmd_model_id}\"|};\${x;/^$/{s|.*|llmd_model_id=\"${llmd_model_id}\"|;H};x}" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/llmd.auto.tfvars" + ``` + + Valid values for `llmd_model_id` are: + + - `google/gemma-4-31b` **(default)** + - `google/gemma-4-26b-a4b` + - `google/gemma-3-27b-it` + - `google/gemma-3-4b-it` + - `google/gemma-3-1b-it` + - `qwen/qwen3-32b` + +- In order to choose an accelerator for the model you want to run, refer to + the following table. + + | Model | v5e | v6e | + | ------------------------------ | --- | --- | + | gemma-4-31b | ❌ | ✅ | + | gemma-4-26b-a4b | ❌ | ✅ | + | gemma-3-27b-it | ✅ | ✅ | + | gemma-3-4b-it | ✅ | ❌ | + | gemma-3-1b-it | ✅ | ❌ | + | qwen3-32b | ✅ | ✅ | + +- Optional : Run the following step if you want to run the model on an + accelerator other than `v6e` which is the default accelerator for + this deployment.
+ + ``` + llmd_accelerator_type="" + sed -i '/^llmd_accelerator_type[[:blank:]]*=/{h;s/=.*/= "'"${llmd_accelerator_type}"'"/};${x;/^$/{s//llmd_accelerator_type="'"${llmd_accelerator_type}"'"/;H};x}' ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/llmd.auto.tfvars + ``` + + Valid values for `llmd_accelerator_type` are: + + - `v5e` + - `v6e` **(default)** + +## Configure Identity-Aware Proxy (IAP) + +Identity-Aware Proxy (IAP) lets you establish a central authorization layer for +applications accessed by HTTPS, so you can use an application-level access +control model instead of relying on network-level firewalls. + +IAP policies scale across your organization. You can define access policies +centrally and apply them to all of your applications and resources. When you +assign a dedicated team to create and enforce policies, you protect your project +from incorrect policy definition or implementation in any application. + +For more information on IAP, see the +[Identity-Aware Proxy documentation](https://cloud.google.com/iap/docs/concepts-overview#gke) + +### Configure OAuth consent screen for IAP + +For this guide we will configure a generic OAuth consent screen setup for +internal use. Internal use means that only users within your organization can be +granted IAM permissions to access the IAP secured applications and resources. + +See the +[Configuring the OAuth consent screen documentation](https://developers.google.com/workspace/guides/configure-oauth-consent) +for additional information. + +- Set environment variables. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/scripts/set_environment_variables.sh" + ``` + +- Ensure that IAP is enabled. + + ```shell + gcloud services enable iap.googleapis.com \ + --project="${llmd_iap_oath_branding_project_id}" + ``` + +- Check if the branding is already configured.
+ + ```shell + gcloud iap oauth-brands list \ + --project="${llmd_iap_oath_branding_project_id}" + ``` + + > If an entry is displayed, the branding is already configured. + +- Configure the branding. + + ```shell + gcloud iap oauth-brands create \ + --application_title="IAP Secured Application" \ + --project="${llmd_iap_oath_branding_project_id}" \ + --support_email="" + ``` + + Replace `` with a group email address that you are a + manager on or your personal email address. The email address should be + supplied without the domain. + +### Default IAP access + +For simplicity, in this guide access to the IAP secured applications will be +configured to allow all users in the organization. Access can be configured per +IAP application or resources. + +- Set the IAP allow domain + + ``` + IAP_DOMAIN=$(gcloud auth list --filter=status:ACTIVE --format="value(account)" | awk -F@ '{print $2}') + echo "IAP_DOMAIN=${IAP_DOMAIN}" + ``` + + **If the domain of the active `gcloud` user is different from the organization + that the `llmd_iap_oath_branding_project_id` project is in, you will need to + manually set the `IAP_DOMAIN` environment variable** + + ``` + IAP_DOMAIN="" + ``` + +- Set the IAP domain in the configuration file + + ``` + sed -i '/^llmd_iap_domain[[:blank:]]*=/{h;s/=.*/= "'"${IAP_DOMAIN}"'"/};${x;/^$/{s//llmd_iap_domain="'"${IAP_DOMAIN}"'"/;H};x}' ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/llmd.auto.tfvars + ``` + +### Install Terraform 1.8.0+ + +> [!IMPORTANT] +> At the time this guide was written, Cloud Shell had Terraform v1.5.7 installed +> by default. Terraform version 1.8.0 or later is required for this guide. + +- Run the `install_terraform.sh` script to install Terraform 1.8.0.
+ + ```shell + "${ACP_REPO_DIR}/tools/bin/install_terraform.sh" + ``` + +## Deploy the entire stack except the model server + +``` +${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/deploy-llmd.sh +``` + +## Resources created + +The `deploy-llmd.sh` script will perform the following steps: + +- Set up base GKE cluster platform. +- Create resources required to deploy llm-d on the GKE cluster and access it. +- Deploy a gradio chat frontend backed by Identity-Aware Proxy. +- Creates a custom Cloud Monitoring dashboard named `llm-d dashboard` + +At this time, you have all resources shown in the architecture diagram created +except the model server. In order to run the model server, first download the +model from Hugging Face to a GCS bucket as instructed in the next steps. Note +that we will not be downloading the model directly from HuggingFace as it slows +down the modelserver startup time. Instead, we will use GCSFuse to download the +model from the GCS bucket which is faster. For more details on how downloading +the model from GCS saves time, take a look at +[Storage optimization guide](../../../../../../use-cases/inferencing/cost-optimization/gcsfuse/README.md) + +## Download the model to Cloud Storage + +- [Generate a Hugging Face token](https://huggingface.co/docs/hub/security-tokens) + with token type **Read**. +- Add the token to the secret manager + + ``` + HF_TOKEN_READ= + echo ${HF_TOKEN_READ} | gcloud secrets versions add ${huggingface_hub_access_token_read_secret_manager_secret_name} --data-file=- --project=${huggingface_secret_manager_project_id} + ``` + +- Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/scripts/set_environment_variables.sh" + ``` + +- Configure the model download job.
+ + ```shell + "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/configure_huggingface.sh" + ``` + +- Deploy the model download job. + + ```shell + kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface" + ``` + +- Watch the model download job until it is complete. + + ```shell + watch --color --interval 5 --no-title \ + "kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-hf-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete' + echo '\nLogs(last 10 lines):' + kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-hf-model-to-gcs --all-containers --tail 10" + ``` + + When the job is complete, you will see the following: + + ```text + NAME STATUS COMPLETIONS DURATION AGE + XXXXXXXX-hf-model-to-gcs Complete 1/1 ### ### + ``` + + You can press `CTRL`+`c` to terminate the watch. + +- Delete the model download job. + + ```shell + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface" + ``` + +## Deploy the model server + +- Configure the model server + + ```shell + "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh" + ``` + +- Deploy the model server + + ```shell + kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" + ``` + + The Kubernetes manifests are based on the + [Inference Quickstart recommendations](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference-quickstart). + +- Watch the deployment until it is ready. 
+ + ```shell + watch --color --interval 5 --no-title \ + "kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} get deployment/ms-inference-scheduling-llmd-modelservice-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' + echo '\nLogs(last 10 lines):' + kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} logs deployment/ms-inference-scheduling-llmd-modelservice-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} --all-containers --tail 10" + ``` + +## Verify llm-d deployment is up and running + +- Set the environment variables + + ``` + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/scripts/set_environment_variables.sh" + ``` + +- Get cluster credentials + + ``` + ${cluster_credentials_command} + ``` + +- Check all the deployments + + ``` + kubectl get deployments -n ${ira_online_tpu_kubernetes_namespace_name} + ``` + + You should see three deployments similar to the following: + + ``` + NAME READY UP-TO-DATE AVAILABLE AGE + gaie-inference-scheduling-epp 1/1 1 1 XXXX + gradio-XXXX 1/1 1 1 XXXX + ms-inference-scheduling-llmd-modelservice-XXXX 2/2 2 2 XXXX + ``` + + Note: + + - gaie-inference-scheduling-epp is the Gateway API Inference Extension + endpoint picker. + - gradio-XXXX is the front end chat interface abstracting the model server. + - ms-inference-scheduling-llmd-modelservice-XXXX is the model server running + inference of the model you chose.
It may take some time for this deployment + to be up completely depending upon the TPU availability + +- Check all the resources + + ``` + kubectl get all -n ${ira_online_tpu_kubernetes_namespace_name} + ``` + + You should see output similar to the following: + + ``` + NAME READY STATUS RESTARTS AGE + pod/gaie-inference-scheduling-epp-XXXX 1/1 Running 0 XX + pod/gradio-XXXX 1/1 Running 0 XX + pod/ms-inference-scheduling-llmd-modelservice-XXXX 4/4 Running 0 XX + pod/ms-inference-scheduling-llmd-modelservice-XXXX 4/4 Running 0 XX + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + service/gaie-inference-scheduling-epp ClusterIP 34.118.230.43 9002/TCP,9090/TCP XX + service/gaie-inference-scheduling-ips-XXXX ClusterIP None 54321/TCP XX + service/gradio-svc-XXXX ClusterIP 34.118.232.165 8080/TCP XX + + NAME READY UP-TO-DATE AVAILABLE AGE + deployment.apps/gaie-inference-scheduling-epp 1/1 1 1 XX + deployment.apps/gradio-XXXX 1/1 1 1 XX + deployment.apps/ms-inference-scheduling-llmd-modelservice-XXXX 2/2 2 2 XX + + NAME DESIRED CURRENT READY AGE + replicaset.apps/gaie-inference-scheduling-epp-XXXX 1 1 1 XX + replicaset.apps/gradio-XXXX 1 1 1 XX + replicaset.apps/ms-inference-scheduling-llmd-modelservice-XXXX 2 2 2 XX + ``` + +- Wait for the model server deployment to be ready before accessing the chat + interface. + + ``` + watch --color --interval 5 --no-title \ + "kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} get deployment/${llmd_ms_deployment_name}-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1'" + ``` + +- When the deployment is ready, you will see output similar to the following + + ``` + NAME READY UP-TO-DATE AVAILABLE AGE + ms-inference-scheduling-llmd-modelservice-XXXX 2/2 2 2 XX + ``` + +- Output the Chat URL. + + ``` + echo -e "\nChat URL: https://${llmd_endpoints_hostname}\n" + ``` + +- Open the Chat URL in a web browser.
+ +> [!TIP] +> If the browser doesn't load the Gradio chat interface, the SSL certificate +> could still be getting provisioned. Check the status of the certificate by +> running the following command: +> +> `gcloud compute ssl-certificates describe ${llmd_ssl_certificate_name} --project ${cluster_project_id} --format=json | jq -r '.managed.status'` +> +> If the output of the command is `PROVISIONING`, it means the certificate has +> not been provisioned yet. Wait for the status to change to `ACTIVE` + +## Generate load on the model server + +In this section, you will generate some load on the model server and view the +metrics on the monitoring dashboard. Then, you will run the stress test to spawn +many requests to build the processing queue. Note that the scripts used in this +section spawn requests to the gradio endpoint which will route the request to +the model server via llm-d's intelligent scheduling. This is done to replicate a +real-world scenario where the model server is running behind a front end. Due to +the additional front end layer(in this case, gradio), the metrics will indicate +a slightly lower performance compared to the scenario where the requests are +directly sent to the model server via llm-d internal load balancer eliminating +the latency caused by the front end layer. + +1. In order to send a request to the gradio chat interface fronting llm-d and + model server, the active `gcloud` account needs to have the + [Service Account Token Creator](https://cloud.google.com/iam/docs/roles-permissions/iam#iam.serviceAccountTokenCreator) + role for the stress test service account. The following command will add the + role to the active `gcloud` account. 
+ + ```shell + gcloud iam service-accounts add-iam-policy-binding ${stress_test_service_account_email} \ + --member="user:$(gcloud auth list --filter=status:ACTIVE --format="value(account)")" \ + --project="${stress_test_service_account_project_id}" \ + --role="roles/iam.serviceAccountTokenCreator" + ``` + + The stress test service account has the role + `roles/iap.httpsResourceAccessor` and can access the gradio chat application + secured by Identity-Aware Proxy. + +2. Generate JSON Web Token (JWT) + + ```shell + cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/initialize/scripts && \ + cat > jwt-claim.json << EOF + { + "iss": "${stress_test_service_account_email}", + "sub": "${stress_test_service_account_email}", + "aud": "https://${llmd_endpoints_hostname}/gradio_api/api/sync_chat/", + "iat": $(date +%s), + "exp": $((`date +%s` + 3600)) + } + EOF + ``` + + Wait for a couple of minutes as the IAM permissions could take some time to + reflect the changes. + + ```shell + gcloud iam service-accounts sign-jwt --iam-account="${stress_test_service_account_email}" jwt-claim.json token.jwt + ``` + +3. Set up python virtual environment and install required packages + + ``` + python3 -m venv venv && + source venv/bin/activate && + pip install aiohttp + ``` + +4. Run the script to generate some load. + + ```shell + python ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/initialize/scripts/generate_load.py + ``` + + The response should look like this: + + ``` + Preparing to send the requests to the MODEL google/gemma-4-31b + Starting Load: 500 concurrent users... + User 01 | Status: 200 + User 02 | Status: 200 + User 04 | Status: 200 + User 05 | Status: 200 + ``` + +5. Go to + [Cloud Monitoring Dashboard page](https://console.cloud.google.com/monitoring/dashboards?pli=1) + and search for `llm-d dashboard`. Open the dashboard.
You will see various + metrics getting populated including TTFT, TPOT, Input Token/s , Output + Token/s etc. You will see something similar to the following pic. + +![dashboard](../images/llmd-dashboard.png) + +- You can view the metrics published by `vllm` and `gaie` on the dashboard. Note + that some of the network metrics like `Throughput TX Bytes per Pod` are only + applicable to non-spot and A3 and higher machine types. + +Note : Now, if you want to run different combinations of model and +accelerator(e.g. google/gemma-3-27b-it on v5e), update the Terraform variables +`llmd_model_id` and `llmd_accelerator_type` in +`"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/examples/llmd/_shared_config/llmd.auto.tfvars"` +to the model and accelerator of your choice and run +[model download](#download-the-model-to-cloud-storage) and +[model server deployment](#deploy-the-model-server) steps again. + +## Teardown + +Teardown the llm-d platform + +```shell +${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/teardown-llmd.sh +``` diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/kustomization.yaml new file mode 100644 index 000000000..9480e9f07 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/kustomization.yaml @@ -0,0 +1,140 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +images: + - name: replaced-by-kustomize + newName: vllm/vllm-tpu + newTag: gemma4 + +nameSuffix: -v6e-gemma-4-26b-a4b + +patches: + - path: patch-vllm-args.yaml + target: + kind: Deployment + name: vllm + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + # - source: + # fieldPath: data.CONTAINER_IMAGE_URL + # kind: ConfigMap + # name: vllm + # targets: + # - fieldPaths: + # - spec.template.spec.containers.[name=inference-server].image + # select: + # kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: 
ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-nodeselector.yaml new file mode 100644 index 000000000..e0f4839a7 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v6e-2x2 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-resources.yaml new file mode 100644 index 000000000..117ab1b36 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "4" + requests: + google.com/tpu: "4" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-vllm-args.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-vllm-args.yaml new file mode 100644 index 000000000..835df7aec --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/patch-vllm-args.yaml @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- op: add + path: /spec/template/spec/containers/1/args/- + value: "--chat-template=/gcs/google/gemma-4-26b-a4b/chat_template.jinja" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env new file mode 100644 index 000000000..bf8c4759e --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-26b-a4b/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=vllm-v6e-gemma-4-26b-a4b +GPU_MEMORY_UTILIZATION=0.95 +MAX_MODEL_LEN=16384 +MODEL_ID=google/gemma-4-26b-a4b +MODEL_NAME=gemma-4-26b-a4b +TENSOR_PARALLEL_SIZE=4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/kustomization.yaml new file mode 100644 index 000000000..bf566d459 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/kustomization.yaml @@ -0,0 +1,141 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +images: + - name: replaced-by-kustomize + newName: vllm/vllm-tpu + newTag: gemma4 + +nameSuffix: -v6e-gemma-4-31b + +patches: + - path: patch-vllm-args.yaml + target: + kind: Deployment + name: vllm + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.selector.matchLabels.app + - spec.template.metadata.labels.app + select: + kind: Deployment + - fieldPaths: + - spec.selector.app + select: + kind: Service + # - source: + # fieldPath: data.CONTAINER_IMAGE_URL + # kind: ConfigMap + # name: vllm + # targets: + # - fieldPaths: + # - spec.template.spec.containers.[name=inference-server].image + # select: + # kind: Deployment + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Deployment + - fieldPaths: + - metadata.namespace + select: + kind: Service + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Deployment + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: deployment + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Deployment + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Deployment + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Deployment + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-nodeselector.yaml new file mode 100644 index 000000000..e0f4839a7 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: tpu-v6e-2x2 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-resources.yaml new file mode 100644 index 000000000..117ab1b36 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-resources.yaml @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: inference-server + resources: + limits: + google.com/tpu: "4" + requests: + google.com/tpu: "4" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-vllm-args.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-vllm-args.yaml new file mode 100644 index 000000000..ea89e9f93 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/patch-vllm-args.yaml @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +- op: add + path: /spec/template/spec/containers/1/args/- + value: "--chat-template=/gcs/google/gemma-4-31b/chat_template.jinja" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/runtime.env new file mode 100644 index 000000000..3edbcd9d8 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-4-31b/runtime.env @@ -0,0 +1,6 @@ +APP_LABEL=vllm-v6e-gemma-4-31b +GPU_MEMORY_UTILIZATION=0.95 +MAX_MODEL_LEN=16384 +MODEL_ID=google/gemma-4-31b +MODEL_NAME=gemma-4-31b +TENSOR_PARALLEL_SIZE=4 From 74a944285163cee728faf2a273135ceaaebeea5c Mon Sep 17 00:00:00 2001 From: Syeda Anjum Date: Fri, 1 May 2026 13:02:28 -0500 Subject: [PATCH 4/4] removing v5e examples as the accelerator is not officially supported on llm-d documentation --- .../vllm/v5e-gemma-3-1b-it/kustomization.yaml | 131 ------------------ .../v5e-gemma-3-1b-it/patch-nodeselector.yaml | 24 ---- .../v5e-gemma-3-1b-it/patch-resources.yaml | 29 ---- .../llmd/vllm/v5e-gemma-3-1b-it/runtime.env | 6 - .../v5e-gemma-3-27b-it/kustomization.yaml | 131 ------------------ .../patch-nodeselector.yaml | 24 ---- .../v5e-gemma-3-27b-it/patch-resources.yaml | 29 ---- .../llmd/vllm/v5e-gemma-3-27b-it/runtime.env | 6 - .../vllm/v5e-gemma-3-4b-it/kustomization.yaml | 131 ------------------ .../v5e-gemma-3-4b-it/patch-nodeselector.yaml | 24 ---- .../v5e-gemma-3-4b-it/patch-resources.yaml | 29 ---- .../llmd/vllm/v5e-gemma-3-4b-it/runtime.env | 6 - .../vllm/v5e-qwen3-32b/kustomization.yaml | 131 ------------------ .../v5e-qwen3-32b/patch-nodeselector.yaml | 24 ---- .../vllm/v5e-qwen3-32b/patch-resources.yaml | 29 ---- .../llmd/vllm/v5e-qwen3-32b/runtime.env | 6 - 16 files changed, 760 deletions(-) delete mode 100644 
platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env delete mode 100644 
platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml deleted file mode 100644 index f90964787..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/kustomization.yaml +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -configMapGenerator: - - envs: - - runtime.env - name: runtime - namespace: replaced-by-kustomize - -nameSuffix: "-v5e-gemma-3-1b-it" - -patches: - - path: patch-nodeselector.yaml - - path: patch-resources.yaml - -replacements: - - source: - fieldPath: data.APP_LABEL - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.selector.matchLabels.app - - spec.template.metadata.labels.app - select: - kind: Deployment - - fieldPaths: - - spec.selector.app - select: - kind: Service - - source: - fieldPath: data.CONTAINER_IMAGE_URL - kind: ConfigMap - name: vllm - targets: - - fieldPaths: - - spec.template.spec.containers.[name=inference-server].image - select: - kind: Deployment - - source: - fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - metadata.namespace - select: - kind: ConfigMap - - fieldPaths: - - metadata.namespace - select: - kind: Deployment - - fieldPaths: - - metadata.namespace - select: - kind: Service - - fieldPaths: - - metadata.namespace - select: - kind: ServiceAccount - - source: - fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - spec.template.spec.serviceAccountName - select: - kind: Deployment - - fieldPaths: - - metadata.name - select: - kind: ServiceAccount - - source: - fieldPath: data.MODEL_BUCKET_NAME - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName - options: - delimiter: . 
- index: 0 - select: - kind: Deployment - - source: - fieldPath: data.MODEL_ID - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions - options: - delimiter: "only-dir:" - index: 1 - select: - kind: Deployment - - fieldPaths: - - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath - - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath - options: - delimiter: / - index: 2 - select: - kind: Deployment - - source: - fieldPath: data.MODEL_NAME - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.template.metadata.labels.[ai.gke.io/model] - select: - kind: Deployment - -resources: - - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml deleted file mode 100644 index 832e2fceb..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-nodeselector.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm - namespace: replaced-by-kustomize -spec: - template: - spec: - nodeSelector: - cloud.google.com/compute-class: tpu-v5e-1x1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml deleted file mode 100644 index b3371ea99..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/patch-resources.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm - namespace: replaced-by-kustomize -spec: - template: - spec: - containers: - - name: inference-server - resources: - limits: - google.com/tpu: "1" - requests: - google.com/tpu: "1" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env deleted file mode 100644 index 8cc55f05c..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-1b-it/runtime.env +++ /dev/null @@ -1,6 +0,0 @@ -APP_LABEL=vllm-v5e-gemma-3-1b-it -GPU_MEMORY_UTILIZATION=0.9 -MAX_MODEL_LEN=1024 -MODEL_ID=google/gemma-3-1b-it -MODEL_NAME=gemma-3-1b-it -TENSOR_PARALLEL_SIZE=1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml deleted file mode 100644 index fe78bce08..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/kustomization.yaml +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -configMapGenerator: - - envs: - - runtime.env - name: runtime - namespace: replaced-by-kustomize - -nameSuffix: -v5e-gemma-3-27b-it - -patches: - - path: patch-nodeselector.yaml - - path: patch-resources.yaml - -replacements: - - source: - fieldPath: data.APP_LABEL - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.selector.matchLabels.app - - spec.template.metadata.labels.app - select: - kind: Deployment - - fieldPaths: - - spec.selector.app - select: - kind: Service - - source: - fieldPath: data.CONTAINER_IMAGE_URL - kind: ConfigMap - name: vllm - targets: - - fieldPaths: - - spec.template.spec.containers.[name=inference-server].image - select: - kind: Deployment - - source: - fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - metadata.namespace - select: - kind: ConfigMap - - fieldPaths: - - metadata.namespace - select: - kind: Deployment - - fieldPaths: - - metadata.namespace - select: - kind: Service - - fieldPaths: - - metadata.namespace - select: - kind: ServiceAccount - - source: - fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - spec.template.spec.serviceAccountName - select: - kind: Deployment - - fieldPaths: - - metadata.name - select: - kind: ServiceAccount - - source: - fieldPath: data.MODEL_BUCKET_NAME - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName - options: - delimiter: . 
- index: 0 - select: - kind: Deployment - - source: - fieldPath: data.MODEL_ID - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions - options: - delimiter: "only-dir:" - index: 1 - select: - kind: Deployment - - fieldPaths: - - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath - - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath - options: - delimiter: / - index: 2 - select: - kind: Deployment - - source: - fieldPath: data.MODEL_NAME - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.template.metadata.labels.[ai.gke.io/model] - select: - kind: Deployment - -resources: - - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml deleted file mode 100644 index 789ec78c6..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-nodeselector.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm - namespace: replaced-by-kustomize -spec: - template: - spec: - nodeSelector: - cloud.google.com/compute-class: tpu-v5e-2x4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml deleted file mode 100644 index a2f2513e0..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/patch-resources.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm - namespace: replaced-by-kustomize -spec: - template: - spec: - containers: - - name: inference-server - resources: - limits: - google.com/tpu: "8" - requests: - google.com/tpu: "8" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env deleted file mode 100644 index cc066c34f..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-27b-it/runtime.env +++ /dev/null @@ -1,6 +0,0 @@ -APP_LABEL=vllm-v5e-gemma-3-27b-it -GPU_MEMORY_UTILIZATION=0.9 -MAX_MODEL_LEN=1024 -MODEL_ID=google/gemma-3-27b-it -MODEL_NAME=gemma-3-27b-it -TENSOR_PARALLEL_SIZE=8 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml deleted file mode 100644 index ffb5e7862..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/kustomization.yaml +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -configMapGenerator: - - envs: - - runtime.env - name: runtime - namespace: replaced-by-kustomize - -nameSuffix: -v5e-gemma-3-4b-it - -patches: - - path: patch-nodeselector.yaml - - path: patch-resources.yaml - -replacements: - - source: - fieldPath: data.APP_LABEL - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.selector.matchLabels.app - - spec.template.metadata.labels.app - select: - kind: Deployment - - fieldPaths: - - spec.selector.app - select: - kind: Service - - source: - fieldPath: data.CONTAINER_IMAGE_URL - kind: ConfigMap - name: vllm - targets: - - fieldPaths: - - spec.template.spec.containers.[name=inference-server].image - select: - kind: Deployment - - source: - fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - metadata.namespace - select: - kind: ConfigMap - - fieldPaths: - - metadata.namespace - select: - kind: Deployment - - fieldPaths: - - metadata.namespace - select: - kind: Service - - fieldPaths: - - metadata.namespace - select: - kind: ServiceAccount - - source: - fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - spec.template.spec.serviceAccountName - select: - kind: Deployment - - fieldPaths: - - metadata.name - select: - kind: ServiceAccount - - source: - fieldPath: data.MODEL_BUCKET_NAME - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName - options: - delimiter: . 
- index: 0 - select: - kind: Deployment - - source: - fieldPath: data.MODEL_ID - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions - options: - delimiter: "only-dir:" - index: 1 - select: - kind: Deployment - - fieldPaths: - - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath - - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath - options: - delimiter: / - index: 2 - select: - kind: Deployment - - source: - fieldPath: data.MODEL_NAME - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.template.metadata.labels.[ai.gke.io/model] - select: - kind: Deployment - -resources: - - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml deleted file mode 100644 index d2d74eca4..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-nodeselector.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm - namespace: replaced-by-kustomize -spec: - template: - spec: - nodeSelector: - cloud.google.com/compute-class: tpu-v5e-2x2 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml deleted file mode 100644 index 117ab1b36..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/patch-resources.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm - namespace: replaced-by-kustomize -spec: - template: - spec: - containers: - - name: inference-server - resources: - limits: - google.com/tpu: "4" - requests: - google.com/tpu: "4" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env deleted file mode 100644 index d00314f82..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-gemma-3-4b-it/runtime.env +++ /dev/null @@ -1,6 +0,0 @@ -APP_LABEL=vllm-v5e-gemma-3-4b-it -GPU_MEMORY_UTILIZATION=0.9 -MAX_MODEL_LEN=1024 -MODEL_ID=google/gemma-3-4b-it -MODEL_NAME=gemma-3-4b-it -TENSOR_PARALLEL_SIZE=4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml deleted file mode 100644 index 64201eff4..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/kustomization.yaml +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -configMapGenerator: - - envs: - - runtime.env - name: runtime - namespace: replaced-by-kustomize - -nameSuffix: -v5e-qwen3-32b - -patches: - - path: patch-nodeselector.yaml - - path: patch-resources.yaml - -replacements: - - source: - fieldPath: data.APP_LABEL - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.selector.matchLabels.app - - spec.template.metadata.labels.app - select: - kind: Deployment - - fieldPaths: - - spec.selector.app - select: - kind: Service - - source: - fieldPath: data.CONTAINER_IMAGE_URL - kind: ConfigMap - name: vllm - targets: - - fieldPaths: - - spec.template.spec.containers.[name=inference-server].image - select: - kind: Deployment - - source: - fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - metadata.namespace - select: - kind: ConfigMap - - fieldPaths: - - metadata.namespace - select: - kind: Deployment - - fieldPaths: - - metadata.namespace - select: - kind: Service - - fieldPaths: - - metadata.namespace - select: - kind: ServiceAccount - - source: - fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - spec.template.spec.serviceAccountName - select: - kind: Deployment - - fieldPaths: - - metadata.name - select: - kind: ServiceAccount - - source: - fieldPath: data.MODEL_BUCKET_NAME - kind: ConfigMap - name: deployment - targets: - - fieldPaths: - - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName - options: - delimiter: . 
- index: 0 - select: - kind: Deployment - - source: - fieldPath: data.MODEL_ID - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions - options: - delimiter: "only-dir:" - index: 1 - select: - kind: Deployment - - fieldPaths: - - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath - - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath - options: - delimiter: / - index: 2 - select: - kind: Deployment - - source: - fieldPath: data.MODEL_NAME - kind: ConfigMap - name: runtime - targets: - - fieldPaths: - - spec.template.metadata.labels.[ai.gke.io/model] - select: - kind: Deployment - -resources: - - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml deleted file mode 100644 index 789ec78c6..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-nodeselector.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm - namespace: replaced-by-kustomize -spec: - template: - spec: - nodeSelector: - cloud.google.com/compute-class: tpu-v5e-2x4 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml deleted file mode 100644 index a2f2513e0..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/patch-resources.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm - namespace: replaced-by-kustomize -spec: - template: - spec: - containers: - - name: inference-server - resources: - limits: - google.com/tpu: "8" - requests: - google.com/tpu: "8" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env deleted file mode 100644 index 90a9f9620..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/v5e-qwen3-32b/runtime.env +++ /dev/null @@ -1,6 +0,0 @@ -APP_LABEL=vllm-v5e-qwen3-32b -GPU_MEMORY_UTILIZATION=0.95 -MAX_MODEL_LEN=32768 -MODEL_ID=qwen/qwen3-32b -MODEL_NAME=qwen3-32b -TENSOR_PARALLEL_SIZE=8