Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Decode-role vLLM model server for llm-d inference scheduling.
# Every value written as `replaced-by-kustomize` is a placeholder that is
# expected to be rewritten by kustomize `replacements` in an overlay.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ms-inference-scheduling-llmd-modelservice
  namespace: replaced-by-kustomize
spec:
  replicas: 2
  selector:
    matchLabels:
      llmd.ai/inferenceServing: "true"
      llmd.ai/model: random_model
      llmd.ai/role: decode
      app: replaced-by-kustomize
  template:
    metadata:
      annotations:
        # Cloud Storage FUSE CSI sidecar settings.
        # NOTE(review): "0" is understood to remove the sidecar resource limit
        # rather than set it to zero; confirm against the GKE documentation.
        gke-gcsfuse/cpu-limit: "0"
        gke-gcsfuse/ephemeral-storage-limit: "0"
        gke-gcsfuse/memory-limit: "0"
        gke-gcsfuse/volumes: "true"
      labels:
        llmd.ai/inferenceServing: "true"
        llmd.ai/model: random_model
        llmd.ai/role: decode
        ai.gke.io/model: replaced-by-kustomize
        app: replaced-by-kustomize
    spec:
      initContainers:
        # `restartPolicy: Always` on an init container makes it a sidecar that
        # keeps running next to the main containers. It listens on 8000 and is
        # pointed at the vLLM server on 8200.
        - name: routing-proxy
          args:
            - --port=8000
            - --vllm-port=8200
            - --connector=nixlv2
            - --zap-encoder=json
            - --zap-log-level=debug
            - --secure-proxy=false
          image: replaced-by-kustomize
          imagePullPolicy: Always
          ports:
            - containerPort: 8000
          resources: {}
          restartPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
      serviceAccountName: replaced-by-kustomize
      volumes:
        - emptyDir: {}
          name: metrics-volume
        - emptyDir: {}
          name: torch-compile-cache
        # Memory-backed /dev/shm for the inference server, capped at 20Gi.
        - emptyDir:
            medium: Memory
            sizeLimit: 20Gi
          name: dev-shm
        - csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: cloud-storage-bucket-name
              # `only-dir:` MUST stay the last option. The overlay replacement
              # splits this string on "only-dir:" and overwrites everything
              # after it with the model id, so any option placed after it
              # (previously `uid`/`gid`, behind a stray empty `,,` entry) is
              # dropped from the rendered manifest.
              mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,uid=2000,gid=2000,only-dir:replaced-by-kustomize
              skipCSIBucketAccessCheck: "true"
          name: huggingface-hub-model-bucket
        # Memory-backed scratch volumes named after the gcsfuse sidecar.
        # NOTE(review): presumably these override the sidecar's default cache,
        # tmp and buffer volumes; confirm the `gke-gcsfuse-tmp` name is honored.
        - emptyDir:
            medium: Memory
          name: gke-gcsfuse-cache
        - emptyDir:
            medium: Memory
          name: gke-gcsfuse-tmp
        - emptyDir:
            medium: Memory
          name: gke-gcsfuse-buffer
      containers:
        # Reads every *safetensors file under the model directory once, with up
        # to 15 parallel `dd` readers that discard the data, then sleeps
        # forever so the container never exits. Presumably this warms the
        # gcsfuse file cache ahead of the inference server (TODO confirm).
        - args:
            - |
              echo "########### $(date) - Starting parallel-fetch-safetensors for model: ${MODEL_ID}"
              ls -alR /gcs
              find /gcs/${MODEL_ID}/*safetensors -type f | xargs -I {} -P 15 sh -c 'echo "########### $(date) - Fetching: {}"; dd if={} of=/dev/null'
              echo "########### $(date) - Finished parallel-fetch-safetensors"
              sleep infinity
          command: ["/bin/sh", "-c"]
          env:
            - name: MODEL_ID
              valueFrom:
                configMapKeyRef:
                  key: MODEL_ID
                  name: runtime
          # NOTE(review): untagged image resolves to `latest`; consider pinning.
          image: busybox
          name: fetch-safetensors
          volumeMounts:
            - mountPath: /gcs
              name: huggingface-hub-model-bucket
              readOnly: true
        # vLLM OpenAI-compatible server. `$(VAR)` references in args are
        # expanded by Kubernetes from the env entries declared below.
        - name: inference-server
          image: replaced-by-kustomize
          command: ["vllm", "serve"]
          args:
            - /gcs/$(MODEL_ID)
            - "--port"
            - "8200"
            - "--served-model-name"
            - "$(MODEL_ID)"
            - "--kv-transfer-config"
            - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
            - "--disable-uvicorn-access-log"
            - "--tensor-parallel-size"
            - "$(TENSOR_PARALLEL_SIZE)"
            - "--gpu-memory-utilization"
            - "$(GPU_MEMORY_UTILIZATION)"
            - "--max-model-len"
            - "$(MAX_MODEL_LEN)"
          env:
            - name: UCX_TLS
              value: cuda_ipc,cuda_copy,tcp
            # The NIXL side channel is advertised on the pod IP, port 5557
            # (the same port is exposed under `ports` below).
            - name: VLLM_NIXL_SIDE_CHANNEL_HOST
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: VLLM_NIXL_SIDE_CHANNEL_PORT
              value: "5557"
            - name: VLLM_LOGGING_LEVEL
              value: DEBUG
            - name: DP_SIZE
              value: "1"
            - name: DP_SIZE_LOCAL
              value: "1"
            # Tunables below are read from the `runtime` ConfigMap.
            - name: GPU_MEMORY_UTILIZATION
              valueFrom:
                configMapKeyRef:
                  key: GPU_MEMORY_UTILIZATION
                  name: runtime
            - name: MAX_MODEL_LEN
              valueFrom:
                configMapKeyRef:
                  key: MAX_MODEL_LEN
                  name: runtime
            - name: MODEL_ID
              valueFrom:
                configMapKeyRef:
                  key: MODEL_ID
                  name: runtime
            - name: TENSOR_PARALLEL_SIZE
              valueFrom:
                configMapKeyRef:
                  key: TENSOR_PARALLEL_SIZE
                  name: runtime
          ports:
            - containerPort: 5557
              protocol: TCP
            - containerPort: 8200
              name: metrics
              protocol: TCP
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8200
            periodSeconds: 10
            timeoutSeconds: 5
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /v1/models
              port: 8200
            periodSeconds: 5
            timeoutSeconds: 2
          # Allows roughly 30 minutes (60 failures x 30s) after a 15s initial
          # delay before the container is considered failed to start.
          startupProbe:
            failureThreshold: 60
            httpGet:
              path: /v1/models
              port: 8200
            initialDelaySeconds: 15
            periodSeconds: 30
            timeoutSeconds: 5
          resources: {}
          volumeMounts:
            - mountPath: /.config
              name: metrics-volume
            - mountPath: /.cache
              name: torch-compile-cache
            - mountPath: /dev/shm
              name: dev-shm
            - mountPath: /gcs
              name: huggingface-hub-model-bucket
              readOnly: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Base layer: pulls in the shared base three directories up plus the local
# Deployment, and generates the `vllm` ConfigMap from vllm.env.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

configMapGenerator:
  # One ConfigMap key per KEY=VALUE line in vllm.env.
  - name: vllm
    namespace: replaced-by-kustomize
    envs:
      - vllm.env

resources:
  - ../../../base
  - deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Container image references for the vLLM deployment.
# NOTE(review): presumably loaded into the kustomize-generated `vllm` ConfigMap
# and read by overlay replacements; confirm which file in the tree this is.
# NOTE(review): the routing sidecar is pinned to a release candidate
# (v0.4.0-rc.1) while the server image is v0.5.0; confirm this is intended.
CONTAINER_IMAGE_URL=ghcr.io/llm-d/llm-d-cuda:v0.5.0
ROUTING_PROXY_IMAGE=ghcr.io/llm-d/llm-d-routing-sidecar:v0.4.0-rc.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort on any failing command, unset variable, or failing pipeline stage.
set -o errexit
set -o nounset
set -o pipefail

# Physical (symlink-resolved) directory that contains this script.
# `&&` matters: bash does not apply errexit inside a command substitution by
# default, so without it a failed `cd` would fall through to `pwd -P` and
# MY_PATH would silently become the caller's working directory.
MY_PATH="$(
  cd "$(dirname "$0")" >/dev/null 2>&1 &&
    pwd -P
)"

# Load the shared llm-d environment into this shell, then run the common
# deployment configuration step located two directories up.
source "${MY_PATH}/../../../../examples/llmd/_shared_config/scripts/set_environment_variables.sh"
"${MY_PATH}/../../configure_deployment.sh"

# Render base/vllm.env from its template, substituting environment variables.
# `sponge` (moreutils) buffers all input before writing the target file.
envsubst <"${MY_PATH}/base/templates/vllm.tpl.env" | sponge "${MY_PATH}/base/vllm.env"
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Overlay for the gemma-3-27b-it variant: generates the `runtime` ConfigMap,
# suffixes resource names, applies node-selector/resource patches, and copies
# ConfigMap values over the `replaced-by-kustomize` placeholders in ../base.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

configMapGenerator:
  - envs:
      - runtime.env
    name: runtime
    namespace: replaced-by-kustomize

nameSuffix: -v6e-gemma-3-27b-it

patches:
  - path: patch-nodeselector.yaml
  - path: patch-resources.yaml

replacements:
  # `app` label/selector on the Deployment and the Service selector.
  - source:
      fieldPath: data.APP_LABEL
      kind: ConfigMap
      name: runtime
    targets:
      - fieldPaths:
          - spec.selector.matchLabels.app
          - spec.template.metadata.labels.app
        select:
          kind: Deployment
      - fieldPaths:
          - spec.selector.app
        select:
          kind: Service
  # Inference server image.
  - source:
      fieldPath: data.CONTAINER_IMAGE_URL
      kind: ConfigMap
      name: vllm
    targets:
      - fieldPaths:
          - spec.template.spec.containers.[name=inference-server].image
        select:
          kind: Deployment
  # Namespace of every namespaced resource kind handled by this overlay.
  - source:
      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
      kind: ConfigMap
      name: deployment
    targets:
      - fieldPaths:
          - metadata.namespace
        select:
          kind: ConfigMap
      - fieldPaths:
          - metadata.namespace
        select:
          kind: Deployment
      - fieldPaths:
          - metadata.namespace
        select:
          kind: Service
      - fieldPaths:
          - metadata.namespace
        select:
          kind: ServiceAccount
  # Service account: referenced by the Deployment and used as the
  # ServiceAccount's own name.
  - source:
      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
      kind: ConfigMap
      name: deployment
    targets:
      - fieldPaths:
          - spec.template.spec.serviceAccountName
        select:
          kind: Deployment
      - fieldPaths:
          - metadata.name
        select:
          kind: ServiceAccount
  # Bucket backing the gcsfuse CSI volume.
  - source:
      fieldPath: data.MODEL_BUCKET_NAME
      kind: ConfigMap
      name: deployment
    targets:
      - fieldPaths:
          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
        options:
          delimiter: .
          index: 0
        select:
          kind: Deployment
  - source:
      fieldPath: data.MODEL_ID
      kind: ConfigMap
      name: runtime
    targets:
      # Splits mountOptions on "only-dir:" and overwrites the part after it,
      # so `only-dir:` has to be the final option in that string.
      - fieldPaths:
          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
        options:
          delimiter: "only-dir:"
          index: 1
        select:
          kind: Deployment
      # Index 2 is past the end of "/gcs" split on "/"; the intent appears to
      # be appending the model id as an extra path segment (TODO confirm).
      - fieldPaths:
          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
          - spec.template.spec.containers.[name=inference-server].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
        options:
          delimiter: /
          index: 2
        select:
          kind: Deployment
  - source:
      fieldPath: data.MODEL_NAME
      kind: ConfigMap
      name: runtime
    targets:
      - fieldPaths:
          - spec.template.metadata.labels.[ai.gke.io/model]
        select:
          kind: Deployment
  # Routing sidecar image. Without this replacement the `routing-proxy` init
  # container keeps the literal placeholder image `replaced-by-kustomize`.
  # NOTE(review): assumes the `vllm` ConfigMap carries ROUTING_PROXY_IMAGE and
  # that every Deployment in ../base has a `routing-proxy` init container.
  - source:
      fieldPath: data.ROUTING_PROXY_IMAGE
      kind: ConfigMap
      name: vllm
    targets:
      - fieldPaths:
          - spec.template.spec.initContainers.[name=routing-proxy].image
        select:
          kind: Deployment

resources:
  - ../base
Loading
Loading