GoogleCloudPlatform · syeda-anjum · Apr 18, 2026 · May 1, 2026 · May 1, 2026 · May 1, 2026
diff --git a/...forms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md b/...forms/gke/base/use-cases/inference-ref-arch/llmd/llmd-vllm-with-hf-model-tpu.md
diff --git a/...ference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml b/...ference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/deployment.yaml
@@ -0,0 +1,198 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment  # kustomize base for the llm-d model-service pods; every "replaced-by-kustomize" value is a placeholder
+metadata:
+  namespace: replaced-by-kustomize
+  name: ms-inference-scheduling-llmd-modelservice
+spec:
+  replicas: 2
+  selector:
+    matchLabels:  # each label here is repeated in the pod template labels below
+      app: replaced-by-kustomize
+      llmd.ai/inferenceServing: 'true'
+      llmd.ai/model: random_model
+      llmd.ai/role: decode
+  template:
+    metadata:
+      labels:
+        ai.gke.io/model: replaced-by-kustomize
+        app: replaced-by-kustomize
+        llmd.ai/inferenceServing: 'true'
+        llmd.ai/model: random_model
+        llmd.ai/role: decode
+      annotations:  # NOTE(review): presumably read by the GKE Cloud Storage FUSE sidecar injector — confirm
+        gke-gcsfuse/volumes: 'true'
+        gke-gcsfuse/cpu-limit: '0'
+        gke-gcsfuse/ephemeral-storage-limit: '0'
+        gke-gcsfuse/memory-limit: '0'
+    spec:
+      serviceAccountName: replaced-by-kustomize
+      initContainers:
+        - name: routing-proxy
+          image: replaced-by-kustomize  # placeholder; has to be overridden before the pod can pull it
+          imagePullPolicy: Always
+          restartPolicy: Always  # init container with restartPolicy Always, i.e. the Kubernetes sidecar form
+          args:
+            - --port=8000
+            - --vllm-port=8200  # same port the inference-server container passes to vllm via --port
+            - --connector=nixlv2
+            - --zap-encoder=json
+            - --zap-log-level=debug
+            - --secure-proxy=false
+          ports:
+            - containerPort: 8000
+          resources: {}
+          securityContext:
+            runAsNonRoot: true
+            allowPrivilegeEscalation: false
+      volumes:  # pod-level volumes shared by the containers of this Deployment
+        - emptyDir: {}
+          name: metrics-volume
+        - emptyDir: {}
+          name: torch-compile-cache
+        - emptyDir:
+            medium: Memory  # RAM-backed; mounted at /dev/shm by the inference-server container
+            sizeLimit: 20Gi
+          name: dev-shm
+        - csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:  # keep only-dir as the LAST mount option: overlays rewrite everything after "only-dir:" (replacement delimiter "only-dir:", index 1), so options placed after it are lost
+              bucketName: cloud-storage-bucket-name  # placeholder; overlays substitute the real bucket name
+              mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,uid=2000,gid=2000,only-dir:replaced-by-kustomize
+              skipCSIBucketAccessCheck: "true"
+          name: huggingface-hub-model-bucket
+        - emptyDir:  # NOTE(review): the three gke-gcsfuse-* volumes are presumably picked up by the injected GCS FUSE sidecar — confirm
+            medium: Memory
+          name: gke-gcsfuse-cache
+        - emptyDir:
+            medium: Memory
+          name: gke-gcsfuse-tmp
+        - emptyDir:
+            medium: Memory
+          name: gke-gcsfuse-buffer
+      containers:
+        - args:  # warm-up helper: lists /gcs, reads every *safetensors file under /gcs/${MODEL_ID} with dd (up to 15 at once), then sleeps forever
+            - |
+              echo "########### $(date) - Starting parallel-fetch-safetensors for model: ${MODEL_ID}"
+              ls -alR /gcs
+              find /gcs/${MODEL_ID}/*safetensors -type f | xargs -I {} -P 15 sh -c 'echo "########### $(date) - Fetching: {}"; dd if={} of=/dev/null'
+              echo "########### $(date) - Finished parallel-fetch-safetensors"
+              sleep infinity
+          command: ["/bin/sh", "-c"]
+          env:
+            - name: MODEL_ID  # expanded by the shell script above
+              valueFrom:
+                configMapKeyRef:
+                  key: MODEL_ID
+                  name: runtime
+          image: busybox  # NOTE(review): no tag or digest, so the registry default tag is pulled — consider pinning
+          name: fetch-safetensors
+          volumeMounts:
+            - mountPath: /gcs  # base path only; the script above expects the model files below /gcs/${MODEL_ID}
+              name: huggingface-hub-model-bucket
+              readOnly: true
+        - name: inference-server
+          image: replaced-by-kustomize  # NOTE(review): the env template in this change points at an llm-d-cuda image although the manifest lives under online-inference-tpu — confirm
+          command: [vllm, serve]
+          args:  # $(VAR) references are expanded by Kubernetes from the env entries below
+            - /gcs/$(MODEL_ID)
+            - --port
+            - '8200'
+            - --served-model-name
+            - $(MODEL_ID)
+            - --kv-transfer-config
+            - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
+            - --disable-uvicorn-access-log
+            - --tensor-parallel-size
+            - $(TENSOR_PARALLEL_SIZE)
+            - --gpu-memory-utilization
+            - $(GPU_MEMORY_UTILIZATION)
+            - --max-model-len
+            - $(MAX_MODEL_LEN)
+          env:
+            - name: UCX_TLS
+              value: cuda_ipc,cuda_copy,tcp  # NOTE(review): CUDA transports in a manifest under online-inference-tpu — confirm
+            - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.podIP  # the pod's own IP
+            - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+              value: '5557'  # matches the first containerPort below
+            - name: VLLM_LOGGING_LEVEL
+              value: DEBUG
+            - name: DP_SIZE
+              value: '1'
+            - name: DP_SIZE_LOCAL
+              value: '1'
+            - name: GPU_MEMORY_UTILIZATION
+              valueFrom:
+                configMapKeyRef:
+                  name: runtime
+                  key: GPU_MEMORY_UTILIZATION
+            - name: MAX_MODEL_LEN
+              valueFrom:
+                configMapKeyRef:
+                  name: runtime
+                  key: MAX_MODEL_LEN
+            - name: MODEL_ID
+              valueFrom:
+                configMapKeyRef:
+                  name: runtime
+                  key: MODEL_ID
+            - name: TENSOR_PARALLEL_SIZE
+              valueFrom:
+                configMapKeyRef:
+                  name: runtime
+                  key: TENSOR_PARALLEL_SIZE
+          resources: {}
+          ports:
+            - protocol: TCP
+              containerPort: 5557
+            - name: metrics
+              protocol: TCP
+              containerPort: 8200  # the port vllm is told to listen on via --port
+          startupProbe:  # 15 s initial delay, then up to 60 attempts 30 s apart
+            httpGet:
+              port: 8200
+              path: /v1/models
+            initialDelaySeconds: 15
+            periodSeconds: 30
+            timeoutSeconds: 5
+            failureThreshold: 60
+          readinessProbe:
+            httpGet:
+              port: 8200
+              path: /v1/models
+            periodSeconds: 5
+            timeoutSeconds: 2
+            failureThreshold: 3
+          livenessProbe:
+            httpGet:
+              port: 8200
+              path: /health
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 3
+          volumeMounts:
+            - name: metrics-volume
+              mountPath: /.config
+            - name: torch-compile-cache
+              mountPath: /.cache
+            - name: dev-shm
+              mountPath: /dev/shm
+            - name: huggingface-hub-model-bucket
+              mountPath: /gcs
+              readOnly: true
diff --git a/...ence-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml b/...ence-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/kustomization.yaml
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # vLLM base: shared resources, this directory's Deployment and a generated "vllm" ConfigMap
+
+resources:
+  - ../../../base
+  - deployment.yaml
+
+configMapGenerator:
+  - name: vllm
+    namespace: replaced-by-kustomize  # placeholder value
+    envs:
+      - vllm.env  # not part of this change as a file; configure_vllm.sh writes base/vllm.env from the template
diff --git a/...-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env b/...-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/base/templates/vllm.tpl.env
@@ -0,0 +1,2 @@
+CONTAINER_IMAGE_URL=ghcr.io/llm-d/llm-d-cuda:v0.5.0
+ROUTING_PROXY_IMAGE=ghcr.io/llm-d/llm-d-routing-sidecar:v0.4.0-rc.1
diff --git a/.../inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh b/.../inference-ref-arch/kubernetes-manifests/online-inference-tpu/llmd/vllm/configure_vllm.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -o errexit  # stop at the first command that fails
+set -o nounset  # referencing an unset variable is an error
+set -o pipefail  # a pipeline fails when any of its stages fails
+
+MY_PATH="$(
+  cd "$(dirname "$0")" >/dev/null 2>&1
+  pwd -P
+)"  # MY_PATH: physical directory (pwd -P) that holds this script
+
+source "${MY_PATH}/../../../../examples/llmd/_shared_config/scripts/set_environment_variables.sh"  # NOTE(review): presumably exports the shared llm-d settings; it runs in this shell, so it can also read or reset MY_PATH — confirm
+"${MY_PATH}/../../configure_deployment.sh"  # run the configure script located two directories above this one
+
+envsubst <"${MY_PATH}/base/templates/vllm.tpl.env" | sponge "${MY_PATH}/base/vllm.env"  # render the template into base/vllm.env, substituting environment variables; sponge buffers the output before writing the file
diff --git a/...kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml b/...kubernetes-manifests/online-inference-tpu/llmd/vllm/v6e-gemma-3-27b-it/kustomization.yaml
@@ -0,0 +1,131 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # v6e / gemma-3-27b-it overlay built on ../base
+
+nameSuffix: -v6e-gemma-3-27b-it
+
+configMapGenerator:
+  - name: runtime  # source of the APP_LABEL, MODEL_ID and MODEL_NAME replacements in this file
+    namespace: replaced-by-kustomize
+    envs:
+      - runtime.env
+
+patches:
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements:  # copy values out of generated ConfigMaps into fields of the rendered resources
+  - source:
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime  # the ConfigMap generated from runtime.env in this file
+    targets:
+      - fieldPaths:
+          - spec.selector.matchLabels.app
+          - spec.template.metadata.labels.app
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.selector.app
+        select:
+          kind: Service  # no Service is defined in this file; presumably it arrives through ../base — confirm
+  - source:
+      fieldPath: data.CONTAINER_IMAGE_URL
+      kind: ConfigMap
+      name: vllm  # not generated in this file; the base kustomization in this change generates a ConfigMap named vllm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=inference-server].image  # only the inference-server container image is set here
+        select:
+          kind: Deployment
+  - source:  # namespace for every ConfigMap, Deployment, Service and ServiceAccount
+      kind: ConfigMap
+      name: deployment
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+    targets:
+      - select:
+          kind: ConfigMap
+        fieldPaths:
+          - metadata.namespace
+      - select:
+          kind: Deployment
+        fieldPaths:
+          - metadata.namespace
+      - select:
+          kind: Service
+        fieldPaths:
+          - metadata.namespace
+      - select:
+          kind: ServiceAccount
+        fieldPaths:
+          - metadata.namespace
+  - source:  # one service account name, written to both the pod spec and the ServiceAccount object
+      kind: ConfigMap
+      name: deployment
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+    targets:
+      - select:
+          kind: Deployment
+        fieldPaths:
+          - spec.template.spec.serviceAccountName
+      - select:
+          kind: ServiceAccount
+        fieldPaths:
+          - metadata.name
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: deployment  # not generated in this file; presumably provided through the bases — confirm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:  # split the current value on "." and replace the first piece
+          delimiter: .
+          index: 0
+        select:
+          kind: Deployment
+  - source:
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:  # split on "only-dir:" and replace the piece after it, i.e. everything that follows the marker
+          delimiter: "only-dir:"
+          index: 1
+        select:
+          kind: Deployment
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:  # NOTE(review): "/gcs" split on "/" has two pieces, so index 2 is past the end; kustomize presumably appends, giving /gcs/<MODEL_ID> — confirm
+          delimiter: /
+          index: 2
+        select:
+          kind: Deployment
+  # Label the pods with the model name taken from the runtime ConfigMap.
+  - source:
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Deployment
+  # The routing-proxy sidecar ships with a placeholder image in the base
+  # Deployment; set it from ROUTING_PROXY_IMAGE, which the vllm env template
+  # defines next to CONTAINER_IMAGE_URL. Without this entry the placeholder
+  # reaches the cluster and the sidecar image cannot be pulled.
+  - source:
+      fieldPath: data.ROUTING_PROXY_IMAGE
+      kind: ConfigMap
+      name: vllm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.initContainers.[name=routing-proxy].image
+        select:
+          kind: Deployment
+
+resources:
+  - ../base
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		CONTAINER_IMAGE_URL=ghcr.io/llm-d/llm-d-cuda:v0.5.0
		ROUTING_PROXY_IMAGE=ghcr.io/llm-d/llm-d-routing-sidecar:v0.4.0-rc.1