From 619aa5e8777b4f96f08cda2dda9b4bdc35cfdaa0 Mon Sep 17 00:00:00 2001 From: gushob21 Date: Wed, 15 Apr 2026 21:42:46 +0000 Subject: [PATCH 1/3] Adding vllm auto tuner initial files fixing pipeline Fixing permissions on scripts adding terraform for vllm auto tuner Adding a README.md for vLLM-auto-tune and renaming related files and directories fixing pipelines fixing linting issues Refining README instructions reducing the auto tuner manifests to only h100 and g4 Making README changes and improving TF code Adding reference architecture for vllm auto tuner for qwen32B model on H100 and RTX Prod 6000 --- .github/workflows/dictionary/vllm.txt | 1 + .../online-inference-gpu/vllm-auto-tuner.md | 303 ++++++++++++++++++ .../kustomization.yaml | 22 ++ .../templates/job.tpl.env | 3 + .../configure_auto_tune_job.sh | 36 +++ .../base/configmap-scripts.yaml | 57 ++++ .../vllm-auto-tuning/base/job.yaml | 169 ++++++++++ .../vllm-auto-tuning/base/kustomization.yaml | 26 ++ .../base/templates/vllm.tpl.env | 2 + .../vllm-auto-tuning/configure_vllm.sh | 29 ++ .../h100-qwen3-32b/kustomization.yaml | 134 ++++++++ .../h100-qwen3-32b/patch-nodeselector.yaml | 24 ++ .../h100-qwen3-32b/patch-resources.yaml | 33 ++ .../h100-qwen3-32b/runtime.env | 12 + .../rtx-pro-6000-qwen3-32b/kustomization.yaml | 134 ++++++++ .../patch-nodeselector.yaml | 24 ++ .../patch-resources.yaml | 33 ++ .../rtx-pro-6000-qwen3-32b/runtime.env | 12 + .../terraform/_shared_config/outputs.tf | 9 +- .../vllm-auto-tuning.auto.tfvars | 1 + .../vllm-auto-tuning_variables.tf | 44 +++ .../vllm-auto-tuning/.terraform.lock.hcl | 42 +++ .../vllm-auto-tuning/_cloudbuild.auto.tfvars | 1 + .../vllm-auto-tuning/_cloudbuild_variables.tf | 1 + .../vllm-auto-tuning/_cluster.auto.tfvars | 1 + .../vllm-auto-tuning/_cluster_variables.tf | 1 + .../vllm-auto-tuning/_huggingface.auto.tfvars | 1 + .../_huggingface_variables.tf | 1 + .../_inference-ref-arch.auto.tfvars | 1 + .../_inference-ref-arch_variables.tf | 1 + 
.../vllm-auto-tuning/_platform.auto.tfvars | 1 + .../vllm-auto-tuning/_platform_variables.tf | 1 + .../_vllm-auto-tuning.auto.tfvars | 1 + .../_vllm-auto-tuning_variables.tf | 1 + .../terraform/vllm-auto-tuning/iam.tf | 32 ++ .../terraform/vllm-auto-tuning/kubernetes.tf | 103 ++++++ .../terraform/vllm-auto-tuning/project.tf | 18 ++ .../vllm-auto-tuning/secret_manager.tf | 18 ++ .../terraform/vllm-auto-tuning/storage.tf | 28 ++ .../templates/kubernetes/namespace.tftpl.yaml | 18 ++ .../kubernetes/secretproviderclass.tftpl.yaml | 25 ++ .../kubernetes/serviceaccount.tftpl.yaml | 19 ++ .../terraform/vllm-auto-tuning/versions.tf | 32 ++ 43 files changed, 1454 insertions(+), 1 deletion(-) create mode 100644 docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-auto-tuner.md create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/templates/job.tpl.env create mode 100755 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/configure_auto_tune_job.sh create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/configmap-scripts.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/job.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/templates/vllm.tpl.env create mode 100755 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh create mode 
100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/runtime.env create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/kustomization.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-nodeselector.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-resources.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/runtime.env create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/.terraform.lock.hcl create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild.auto.tfvars create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild_variables.tf create mode 120000 
platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster.auto.tfvars create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster_variables.tf create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface.auto.tfvars create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface_variables.tf create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch.auto.tfvars create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch_variables.tf create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform.auto.tfvars create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform_variables.tf create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning.auto.tfvars create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning_variables.tf create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/iam.tf create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/kubernetes.tf create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/project.tf create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/secret_manager.tf create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/storage.tf create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/namespace.tftpl.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/secretproviderclass.tftpl.yaml 
create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/serviceaccount.tftpl.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/versions.tf diff --git a/.github/workflows/dictionary/vllm.txt b/.github/workflows/dictionary/vllm.txt index 1cb5e4d73..cc03e3b4b 100644 --- a/.github/workflows/dictionary/vllm.txt +++ b/.github/workflows/dictionary/vllm.txt @@ -1,3 +1,4 @@ +autotuner dailymail dtype flashinfer diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-auto-tuner.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-auto-tuner.md new file mode 100644 index 000000000..250980144 --- /dev/null +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-auto-tuner.md @@ -0,0 +1,303 @@ +# Automated vLLM Server Parameter Tuning + +This guide is aimed at finding the maximum throughput that we can get with the +single chips of nvidia-h100-80gb and nvidia-rtx-pro-6000 while running inference +of Qwen/Qwen3-32B on Google Kubernetes Engine. We will use vLLM as the +inference server and try to find the parameters that provide us the best +performance. vLLM provides a script to automate the process of finding the +optimal server parameter combination (max-num-seqs and max-num-batched-tokens) +to maximize throughput for a vLLM server. It also supports additional +constraints such as E2E latency and prefix cache hit rate. More details are on +the +[official GitHub repository](https://github.com/vllm-project/vllm/tree/main/benchmarks/auto_tune). + +## Before you begin + +- Make sure that the + [GKE Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md) + is deployed and configured. + +## Workflow + +This example will run through the following steps: + +1.
Apply the terraform, which will: + + - Create a GCS bucket for storing optimization results. + - Create a Kubernetes namespace where the vLLM auto-tune job will run. + - Create the Kubernetes service account for the running vLLM auto-tune job. + - Grant the required IAM permissions for workload identity KSA. + +2. Create the custom Kubernetes manifest for the vLLM auto-tune job. +3. Run the vLLM auto-tune job based on an out-of-the-box configuration provided + with the reference architecture. +4. Store the results generated from the vLLM auto-tune job in the GCS bucket. + +## Resources Created + +- Cloud Storage Bucket +- Kubernetes namespace +- Kubernetes Service Account in the namespace +- IAM Permissions for Kubernetes service account + - roles/secretmanager.secretAccessor for Hugging Face token in Secret Manager + - roles/storage.bucketViewer for results bucket + - roles/storage.objectUser for results bucket + +## Pull the source code + +- Open [Cloud Shell](https://cloud.google.com/shell). + +- Clone the repository and set the repository directory environment variable. + + ```shell + git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ + cd accelerated-platforms && \ + export ACP_REPO_DIR="$(pwd)" + ``` + +To set the `ACP_REPO_DIR` value for new shell instances, write the value to your +shell initialization file. + +`bash` + +```shell +sed -n -i -e '/^export ACP_REPO_DIR=/!p' -i -e '$aexport ACP_REPO_DIR="'"${ACP_REPO_DIR}"'"' ${HOME}/.bashrc +``` + +`zsh` + +```shell +sed -n -i -e '/^export ACP_REPO_DIR=/!p' -i -e '$aexport ACP_REPO_DIR="'"${ACP_REPO_DIR}"'"' ${HOME}/.zshrc +``` + +## Configuration + +Terraform loads variables in the following order, with later sources taking +precedence over earlier ones: + +- Environment variables (`TF_VAR_`) +- Any `*.auto.tfvars` or `*.auto.tfvars.json` files, processed in lexical order of their filenames. +- Any `-var` and `-var-file` options on the command line, in the order they are + provided.
+ +For more information about providing values for Terraform input variables, see +[Terraform input variables](https://developer.hashicorp.com/terraform/language/values/variables). + +- Set the platform default project ID + + ```shell + export TF_VAR_platform_default_project_id="" + ``` + + **-- OR --** + + ```shell + vi ${ACP_REPO_DIR}/platforms/gke/base/_shared_config/platform.auto.tfvars + ``` + + ```hcl + platform_default_project_id = "" + ``` + +### Install Terraform 1.8.0+ + +> [!IMPORTANT] +> At the time this guide was written, Cloud Shell had Terraform v1.5.7 installed +> by default. Terraform version 1.8.0 or later is required for this guide. + +- Check the Terraform version in your Cloud Shell. + + ```shell + terraform version + ``` + +- Run the `install_terraform.sh` script to install Terraform 1.8.0. + + ```shell + "${ACP_REPO_DIR}/tools/bin/install_terraform.sh" + ``` + +## Deploy + +### Run Terraform to create the resources + +```shell +export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" +cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning && \ +rm -rf .terraform/ terraform.tfstate* && \ +terraform init && \ +terraform plan -input=false -out=tfplan && \ +terraform apply -input=false tfplan && \ +rm tfplan +``` + +### Configure the environment variables + +The goal here is to find the optimal vLLM parameters for serving +Qwen/Qwen3-32B on a single chip of nvidia-h100-80gb (A3 High) and +nvidia-rtx-pro-6000 (G4) in order to get the maximum throughput. This reference +architecture provides templated manifests to serve the following single +accelerator chip and model combination. + + | Model | h100 | RTX Pro 6000 | + | ------------------------------ | ---- | ------------ | + | qwen3-32b | ✅ | ✅ | + +- Select an accelerator.
+ + - **NVIDIA H100 80GB**: + + ```shell + export ACCELERATOR_TYPE="h100" + ``` + + - **NVIDIA RTX PRO 6000 96GB**: + + ```shell + export ACCELERATOR_TYPE="rtx-pro-6000" + ``` + + Ensure that you have enough quota in your project to provision the selected + accelerator type. For more information about viewing GPU quotas, see + [Allocation quotas: GPU quota](https://cloud.google.com/compute/resource-usage#gpu_quota). + +- Set the model. + + - **Qwen3-32B**: + + ```shell + export HF_MODEL_ID="qwen/qwen3-32b" + ``` + +- Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` + +- Check the model name. + + ```shell + echo "HF_MODEL_NAME=${HF_MODEL_NAME}" + ``` + +- Export the vLLM application label. + + ```shell + export APP_LABEL="vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" + ``` + +### Download the model from HuggingFace to GCS bucket + +- [Generate a Hugging Face token](https://huggingface.co/docs/hub/security-tokens) + with token type **Read**. +- Add the token to Secret Manager. + + ```shell + HF_TOKEN_READ= + echo ${HF_TOKEN_READ} | gcloud secrets versions add ${huggingface_hub_access_token_read_secret_manager_secret_name} --data-file=- --project=${huggingface_secret_manager_project_id} + ``` + +- Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` + +- Configure the model download job. + + ```shell + "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/configure_huggingface.sh" + ``` + +- Deploy the model download job. + + ```shell + kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface" + ``` + +- Watch the model download job until it is complete.
+ + ```shell + watch --color --interval 5 --no-title \ + "kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-hf-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete' + echo '\nLogs(last 10 lines):' + kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-hf-model-to-gcs --all-containers --tail 10" + ``` + + When the job is complete, you will see the following: + + ```text + NAME STATUS COMPLETIONS DURATION AGE + XXXXXXXX-hf-model-to-gcs Complete 1/1 ### ### + ``` + + You can press `CTRL`+`c` to terminate the watch. + +- Delete the model download job. + + ```shell + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface" + ``` + +### Deploy the vLLM auto-tune job + +- Configure the vLLM auto-tune job. + + ```shell + "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh" + ``` + +- Deploy the vLLM auto-tune job. 
+ + ```shell + kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" + ``` + +### Check the status of the job + +```shell + watch --color --interval 5 --no-title \ + "kubectl --namespace=${ira_auto_tuning_vllm_kubernetes_namespace_name} get job/vllm-auto-tuning-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete' + echo '\nLogs(last 10 lines):' + kubectl --namespace=${ira_auto_tuning_vllm_kubernetes_namespace_name} logs job/vllm-auto-tuning-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} --all-containers --tail 10" +``` + +When the job is complete, you will see the following: + +```text +NAME STATUS COMPLETIONS DURATION AGE +vllm-auto-tuning-XXXXXX Complete 1/1 XXX XXX +``` + +## Analyze and Interpret Results + +Download the results from the GCS bucket to your +[Cloud Shell](https://cloud.google.com/shell). + +```shell +gcloud storage cp --recursive gs://${ira_auto_tuning_vllm_results_bucket} . +``` + +View the files to see the result. + +## Clean up + +- Delete the vLLM auto-tune job. + + ```shell + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}" + ``` + +- Destroy the vLLM auto-tune resources created via Terraform.
+ + > Note: This will destroy your benchmarking results GCS bucket only if + > it is empty. + + ```shell + export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" + cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning && \ + rm -rf .terraform/ terraform.tfstate* && \ + terraform init && \ + terraform destroy -auto-approve + ``` diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/kustomization.yaml new file mode 100644 index 000000000..ad063239b --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/kustomization.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - job.env + name: job + namespace: replaced-by-kustomize diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/templates/job.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/templates/job.tpl.env new file mode 100644 index 000000000..d02692bd6 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/templates/job.tpl.env @@ -0,0 +1,3 @@ +INFERENCE_KUBERNETES_NAMESPACE=${ira_auto_tuning_vllm_kubernetes_namespace_name} +INFERENCE_KUBERNETES_SERVICE_ACCOUNT=${ira_auto_tuning_vllm_kubernetes_service_account_name} +MODEL_BUCKET_NAME=${huggingface_hub_models_bucket_name} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/configure_auto_tune_job.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/configure_auto_tune_job.sh new file mode 100755 index 000000000..e802fda1f --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/configure_auto_tune_job.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o errexit +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" +ENV_FILE_1="${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" +ENV_FILE_2="${MY_PATH}/../../examples/llmd/_shared_config/scripts/set_environment_variables.sh" + +if [ -f "$ENV_FILE_1" ]; then + source "$ENV_FILE_1" +elif [ -f "$ENV_FILE_2" ]; then + source "$ENV_FILE_2" +else + echo "Warning: No environment variable file found." >&2 + +fi + +envsubst <"${MY_PATH}/base-vllm-auto-tune-job/templates/job.tpl.env" | sponge "${MY_PATH}/base-vllm-auto-tune-job/job.env" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/configmap-scripts.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/configmap-scripts.yaml new file mode 100644 index 000000000..5160d39b0 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/configmap-scripts.yaml @@ -0,0 +1,57 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-auto-tuning-script + namespace: replaced-by-kustomize +data: + auto_tune.sh: | + #!/bin/bash + set -euo pipefail + + # 1. Install missing system tools + apt-get update && apt-get install -y bc git curl + + # 2.
Clone the repo to a neutral location + mkdir -p /tmp/vllm-source + git clone --depth 1 https://github.com/vllm-project/vllm.git /tmp/vllm-source + + # 3. Move only the benchmarks to your working directory + # and DELETE the source folder that causes the circular import. + mkdir -p /vllm-workspace/benchmarks + cp -r /tmp/vllm-source/benchmarks/auto_tune /vllm-workspace/benchmarks/ + rm -rf /tmp/vllm-source # Get rid of the conflicting source code + + # 4. Fix permissions and create the empty vllm folder the script expects + chmod +x /vllm-workspace/benchmarks/auto_tune/auto_tune.sh + mkdir -p /vllm-workspace/vllm # Script needs this to exist, but keep it empty + + # 5. Set paths + export BASE="/vllm-workspace" + export SCRIPT_DIR="/vllm-workspace/benchmarks/auto_tune" + + # 6. Run the tuner + cd /vllm-workspace + sed -i 's|"${common_args_array\[@\]}" > "$vllm_log"|"${common_args_array\[@\]}" "--kv-cache-dtype" "fp8" > "$vllm_log"|' benchmarks/auto_tune/auto_tune.sh + bash benchmarks/auto_tune/auto_tune.sh + + # 7. Results handling + LATEST_DIR=$(ls -td auto-benchmark/*/ | head -1 || true) + if [ -n "$LATEST_DIR" ]; then + cp -r "$LATEST_DIR" /output + fi + + echo "Tuning complete." diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/job.yaml new file mode 100644 index 000000000..1835ca994 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/job.yaml @@ -0,0 +1,169 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: vllm-auto-tuning + namespace: replaced-by-kustomize +spec: + template: + metadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + labels: + ai.gke.io/model: replaced-by-kustomize + app: vllm-auto-tuning + spec: + serviceAccountName: replaced-by-kustomize + containers: + - args: + - | + echo "########### $(date) - Starting parallel-fetch-safetensors for model: ${MODEL_ID}" + ls -alR /gcs + find /gcs/${MODEL_ID}/*safetensors -type f | xargs -I {} -P 15 sh -c 'echo "########### $(date) - Fetching: {}"; dd if={} of=/dev/null' + echo "########### $(date) - Finished parallel-fetch-safetensors" + command: ["/bin/sh", "-c"] + env: + - name: MODEL_ID + valueFrom: + configMapKeyRef: + key: MODEL_ID + name: runtime + image: busybox + name: fetch-safetensors + volumeMounts: + - mountPath: /gcs + name: huggingface-hub-model-bucket + readOnly: true + - name: autotuner + image: replaced-by-kustomize + command: ["/bin/bash", "-c"] + env: + - name: VLLM_CACHE_ROOT + value: /gcs + - name: DOWNLOAD_DIR + value: /gcs + - name: HF_HUB_OFFLINE + value: "1" + - name: MODEL + valueFrom: + configMapKeyRef: + key: MODEL_LOCATION + name: runtime + - name: TRANSFORMERS_OFFLINE + value: "1" + - name: VLLM_AUTO_TUNING_RESULTS_BUCKET + valueFrom: + configMapKeyRef: + key: VLLM_AUTO_TUNING_RESULTS_BUCKET + name: vllm + - name: GPU_MEMORY_UTILIZATION + valueFrom: + configMapKeyRef: + key: GPU_MEMORY_UTILIZATION + name: runtime + - 
name: TP + value: "1" + - name: INPUT_LEN + valueFrom: + configMapKeyRef: + key: INPUT_LEN + name: runtime + - name: OUTPUT_LEN + valueFrom: + configMapKeyRef: + key: OUTPUT_LEN + name: runtime + - name: MAX_MODEL_LEN + valueFrom: + configMapKeyRef: + key: MAX_MODEL_LEN + name: runtime + - name: NUM_SEQS_LIST + valueFrom: + configMapKeyRef: + key: NUM_SEQS_LIST + name: runtime + - name: NUM_BATCHED_TOKENS_LIST + valueFrom: + configMapKeyRef: + key: NUM_BATCHED_TOKENS_LIST + name: runtime + - name: MIN_CACHE_HIT_PCT + valueFrom: + configMapKeyRef: + key: MIN_CACHE_HIT_PCT + name: runtime + - name: MAX_LATENCY_ALLOWED_MS + valueFrom: + configMapKeyRef: + key: MAX_LATENCY_ALLOWED_MS + name: runtime + args: ["/mnt/config/auto_tune.sh"] + volumeMounts: + - name: config-volume + mountPath: /mnt/config + - mountPath: /gcs + name: huggingface-hub-model-bucket + - mountPath: /output + name: vllm-auto-tuner-results + - name: dshm + mountPath: /dev/shm + volumes: + - name: config-volume + configMap: + name: vllm-auto-tuning-script + defaultMode: 0755 + - name: dshm + emptyDir: + medium: Memory + sizeLimit: "16Gi" + - name: results + emptyDir: {} + - csi: + driver: secrets-store-gke.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: huggingface-token-read + name: huggingface-token + - csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: cloud-storage-bucket-name + mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,only-dir:replaced-by-kustomize,uid=2000,gid=2000 + skipCSIBucketAccessCheck: "true" + name: huggingface-hub-model-bucket + - csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: cloud-storage-bucket-name + mountOptions: 
implicit-dirs,uid=2000,gid=2000 + skipCSIBucketAccessCheck: "true" + name: vllm-auto-tuner-results + - emptyDir: + medium: Memory + name: gke-gcsfuse-cache + - emptyDir: + medium: Memory + name: gke-gcsfuse-tmp + - emptyDir: + medium: Memory + name: gke-gcsfuse-buffer + restartPolicy: OnFailure + securityContext: + fsGroup: 10000 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/kustomization.yaml new file mode 100644 index 000000000..af66c2d75 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/kustomization.yaml @@ -0,0 +1,26 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - vllm.env + name: vllm + namespace: replaced-by-kustomize +resources: + - ../../base-vllm-auto-tune-job + - configmap-scripts.yaml + - job.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/templates/vllm.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/templates/vllm.tpl.env new file mode 100644 index 000000000..7e2a93553 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/templates/vllm.tpl.env @@ -0,0 +1,2 @@ +VLLM_AUTO_TUNER_IMAGE=vllm/vllm-openai:latest +VLLM_AUTO_TUNING_RESULTS_BUCKET=${ira_auto_tuning_vllm_results_bucket} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh new file mode 100755 index 000000000..e2945e271 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o errexit +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +source "${MY_PATH}/../../../terraform/_shared_config/scripts/set_environment_variables.sh" + +"${MY_PATH}/../configure_auto_tune_job.sh" + +envsubst <"${MY_PATH}/base/templates/vllm.tpl.env" | sponge "${MY_PATH}/base/vllm.env" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/kustomization.yaml new file mode 100644 index 000000000..ca23115d9 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/kustomization.yaml @@ -0,0 +1,134 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -h100-qwen3-32b + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.app + select: + kind: Job + - source: + fieldPath: data.VLLM_AUTO_TUNER_IMAGE + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=autotuner].image + select: + kind: Job + - source: + fieldPath: data.VLLM_AUTO_TUNING_RESULTS_BUCKET + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=vllm-auto-tuner-results].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Job + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: job + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Job + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: job + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: job + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Job + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Job + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=autotuner].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Job + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Job + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-nodeselector.yaml new file mode 100644 index 000000000..8dfd58e81 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: batch/v1 +kind: Job +metadata: + name: vllm-auto-tuning + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-h100-80gb-high-x1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-resources.yaml new file mode 100644 index 000000000..23d3c5f6d --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-resources.yaml @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: batch/v1 +kind: Job +metadata: + name: vllm-auto-tuning + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: autotuner + resources: + limits: + cpu: "10" + memory: 128G + nvidia.com/gpu: "1" + requests: + cpu: "10" + memory: 128G + nvidia.com/gpu: "1" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/runtime.env new file mode 100644 index 000000000..f410f3064 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/runtime.env @@ -0,0 +1,12 @@ +APP_LABEL=vllm-h100-qwen3-32b +GPU_MEMORY_UTILIZATION=0.90 +INPUT_LEN=1028 +MAX_LATENCY_ALLOWED_MS=100000000000 +MAX_MODEL_LEN=32768 +MIN_CACHE_HIT_PCT=0 +MODEL_ID=qwen/qwen3-32b +MODEL_NAME=qwen3-32b +MODEL_LOCATION=/gcs/qwen/qwen3-32b +NUM_BATCHED_TOKENS_LIST=4096 8192 16384 32768 65536 +NUM_SEQS_LIST=64 128 256 512 +OUTPUT_LEN=128 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/kustomization.yaml new file mode 100644 index 000000000..dc3f3c328 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/kustomization.yaml @@ -0,0 +1,134 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - envs: + - runtime.env + name: runtime + namespace: replaced-by-kustomize + +nameSuffix: -rtx-pro-6000-qwen3-32b + +patches: + - path: patch-nodeselector.yaml + - path: patch-resources.yaml + +replacements: + - source: + fieldPath: data.APP_LABEL + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.app + select: + kind: Job + - source: + fieldPath: data.VLLM_AUTO_TUNER_IMAGE + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.containers.[name=autotuner].image + select: + kind: Job + - source: + fieldPath: data.VLLM_AUTO_TUNING_RESULTS_BUCKET + kind: ConfigMap + name: vllm + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=vllm-auto-tuner-results].csi.volumeAttributes.bucketName + options: + delimiter: . 
+ index: 0 + select: + kind: Job + - source: + fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE + kind: ConfigMap + name: job + targets: + - fieldPaths: + - metadata.namespace + select: + kind: ConfigMap + - fieldPaths: + - metadata.namespace + select: + kind: Job + - fieldPaths: + - metadata.namespace + select: + kind: ServiceAccount + - source: + fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: job + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + - fieldPaths: + - metadata.name + select: + kind: ServiceAccount + - source: + fieldPath: data.MODEL_BUCKET_NAME + kind: ConfigMap + name: job + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName + options: + delimiter: . + index: 0 + select: + kind: Job + - source: + fieldPath: data.MODEL_ID + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions + options: + delimiter: "only-dir:" + index: 1 + select: + kind: Job + - fieldPaths: + - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + - spec.template.spec.containers.[name=autotuner].volumeMounts.[name=huggingface-hub-model-bucket].mountPath + options: + delimiter: / + index: 2 + select: + kind: Job + - source: + fieldPath: data.MODEL_NAME + kind: ConfigMap + name: runtime + targets: + - fieldPaths: + - spec.template.metadata.labels.[ai.gke.io/model] + select: + kind: Job + +resources: + - ../base diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-nodeselector.yaml new file mode 100644 index 000000000..53e256d73 
--- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-nodeselector.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: vllm-auto-tuning + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-resources.yaml new file mode 100644 index 000000000..23d3c5f6d --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-resources.yaml @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: vllm-auto-tuning + namespace: replaced-by-kustomize +spec: + template: + spec: + containers: + - name: autotuner + resources: + limits: + cpu: "10" + memory: 128G + nvidia.com/gpu: "1" + requests: + cpu: "10" + memory: 128G + nvidia.com/gpu: "1" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/runtime.env new file mode 100644 index 000000000..cbb6c9c61 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/runtime.env @@ -0,0 +1,12 @@ +APP_LABEL=vllm-rtx-pro-6000-qwen3-32b +GPU_MEMORY_UTILIZATION=0.90 +INPUT_LEN=1028 +MAX_LATENCY_ALLOWED_MS=100000000000 +MAX_MODEL_LEN=32768 +MIN_CACHE_HIT_PCT=0 +MODEL_ID=qwen/qwen3-32b +MODEL_NAME=qwen3-32b +MODEL_LOCATION=/gcs/qwen/qwen3-32b +NUM_BATCHED_TOKENS_LIST=4096 8192 16384 32768 65536 +NUM_SEQS_LIST=64 128 256 512 +OUTPUT_LEN=128 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf index 8f573d8f7..64f7f0dd5 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf @@ -104,6 
+104,14 @@ output "ira_async_pubsub_prompt_messages_topic_name" { value = local.ira_async_pubsub_prompt_messages_topic_name } +output "ira_auto_tuning_vllm_kubernetes_namespace_name" { + value = local.ira_auto_tuning_vllm_kubernetes_namespace_name +} + +output "ira_auto_tuning_vllm_results_bucket" { + value = local.ira_auto_tuning_vllm_results_bucket +} + output "ira_inference_perf_bench_kubernetes_service_account_name" { value = local.ira_inference_perf_bench_kubernetes_service_account_name } @@ -239,4 +247,3 @@ output "workflow_api_service_account_oauth_display_name" { output "workflow_api_service_account_project_id" { value = local.workflow_api_service_account_project_id } - diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars @@ -0,0 +1 @@ + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf new file mode 100644 index 000000000..ccc9e3a25 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf @@ -0,0 +1,44 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +locals { + ira_auto_tuning_vllm_kubernetes_namespace_name = var.ira_auto_tuning_vllm_kubernetes_namespace_name != null ? var.ira_auto_tuning_vllm_kubernetes_namespace_name : "${local.unique_identifier_prefix}-vllm-auto-tuning" + ira_auto_tuning_vllm_kubernetes_service_account_name = var.ira_auto_tuning_vllm_kubernetes_service_account_name != null ? var.ira_auto_tuning_vllm_kubernetes_service_account_name : "${local.unique_identifier_prefix}-vllm-auto-tuning-ksa" + ira_auto_tuning_vllm_results_bucket = var.ira_auto_tuning_vllm_results_bucket != null ? var.ira_auto_tuning_vllm_results_bucket : "${local.cluster_project_id}-${local.unique_identifier_prefix}-vllm-auto-tuning-results" + ira_auto_tuning_vllm_secretproviderclass = var.ira_auto_tuning_vllm_secretproviderclass != null ? var.ira_auto_tuning_vllm_secretproviderclass : "huggingface-token-read" +} + +variable "ira_auto_tuning_vllm_kubernetes_namespace_name" { + default = null + description = "The Kubernetes namespace for the vLLM auto-tuning workloads." + type = string +} + +variable "ira_auto_tuning_vllm_kubernetes_service_account_name" { + default = null + description = "The Kubernetes service account for the vLLM auto-tuning workloads." + type = string +} + +variable "ira_auto_tuning_vllm_results_bucket" { + default = null + description = "The GCS bucket for storing auto-tuning results." + type = string +} + +variable "ira_auto_tuning_vllm_secretproviderclass" { + default = null + description = "The name of the SecretProviderClass used to access the Hugging Face read token." 
+ type = string +} + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/.terraform.lock.hcl b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/.terraform.lock.hcl new file mode 100644 index 000000000..27d625960 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/.terraform.lock.hcl @@ -0,0 +1,42 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/google" { + version = "7.6.0" + constraints = "7.6.0" + hashes = [ + "h1:JYsO3fV5OtaNuRTdjGZC1Z3Ku1ZIrRJGwXwsBjtWudk=", + "zh:0c70c768b0a34d7a61de70d0e85cf0057820556647bbce2384972a45d7092e4e", + "zh:0cb7aab89cd435c5c8dab9231ea176d64fdf1df1125db15a6b9ead978a93c0b2", + "zh:32f25c42214bb356bb67cef6057c9904f2878cd053a7760e5ee3737619f28638", + "zh:38b05b1171ab086c88b95d379120fb6c28c9e895ae924557c11c35e138319119", + "zh:39d8206d453a614fa0be3aeac8ea3921fb3ab7ed122205cbbcc2a41ca6176cb5", + "zh:58d9059aa6b4aab5ede4fc173dcdc7b4d042d0b1a1ab55407dd345931d7f4815", + "zh:a4bc001c8ac7700d0107155296250c3b8969511e1a488f3b318f3db62362eef2", + "zh:cc75e25db4bb672ebc200a89d6cff9ff0b9911e14e188d1b4429bb3511d2b35f", + "zh:d7f7639930735f17b2b4f73814204a9a050186ea7e1c2671a52e0fa7ddf7a001", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:ff1190ae618dae9243de59caf4149abb4a9b775cb6439f119cd32a30f1a21820", + "zh:ff15b7b86787f6fd186211e7c37a72f2cc70374b284aaf063e1f989717441161", + ] +} + +provider "registry.terraform.io/hashicorp/local" { + version = "2.5.3" + constraints = "2.5.3" + hashes = [ + "h1:1Nkh16jQJMp0EuDmvP/96f5Unnir0z12WyDuoR6HjMo=", + "zh:284d4b5b572eacd456e605e94372f740f6de27b71b4e1fd49b63745d8ecd4927", + "zh:40d9dfc9c549e406b5aab73c023aa485633c1b6b730c933d7bcc2fa67fd1ae6e", + "zh:6243509bb208656eb9dc17d3c525c89acdd27f08def427a0dce22d5db90a4c8b", + 
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:885d85869f927853b6fe330e235cd03c337ac3b933b0d9ae827ec32fa1fdcdbf", + "zh:bab66af51039bdfcccf85b25fe562cbba2f54f6b3812202f4873ade834ec201d", + "zh:c505ff1bf9442a889ac7dca3ac05a8ee6f852e0118dd9a61796a2f6ff4837f09", + "zh:d36c0b5770841ddb6eaf0499ba3de48e5d4fc99f4829b6ab66b0fab59b1aaf4f", + "zh:ddb6a407c7f3ec63efb4dad5f948b54f7f4434ee1a2607a49680d494b1776fe1", + "zh:e0dafdd4500bec23d3ff221e3a9b60621c5273e5df867bc59ef6b7e41f5c91f6", + "zh:ece8742fd2882a8fc9d6efd20e2590010d43db386b920b2a9c220cfecc18de47", + "zh:f4c6b3eb8f39105004cf720e202f04f57e3578441cfb76ca27611139bc116a82", + ] +} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild.auto.tfvars new file mode 120000 index 000000000..c730c32e8 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/cloudbuild.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild_variables.tf new file mode 120000 index 000000000..5a143590a --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cloudbuild_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster.auto.tfvars new file mode 120000 index 000000000..98a694db9 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster.auto.tfvars @@ -0,0 +1 @@ 
+../../../../_shared_config/cluster.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster_variables.tf new file mode 120000 index 000000000..00625515b --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/cluster_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface.auto.tfvars new file mode 120000 index 000000000..276530b81 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/huggingface.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface_variables.tf new file mode 120000 index 000000000..f384bc7e1 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/huggingface_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch.auto.tfvars new file mode 120000 index 000000000..31e8d28d2 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/inference-ref-arch.auto.tfvars \ No newline at end of file diff 
--git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch_variables.tf new file mode 120000 index 000000000..502fdca32 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch_variables.tf @@ -0,0 +1 @@ +../_shared_config/inference-ref-arch_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform.auto.tfvars new file mode 120000 index 000000000..125a652cf --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform.auto.tfvars @@ -0,0 +1 @@ +../../../../_shared_config/platform.auto.tfvars \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform_variables.tf new file mode 120000 index 000000000..486b3eaef --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform_variables.tf @@ -0,0 +1 @@ +../../../../_shared_config/platform_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning.auto.tfvars new file mode 120000 index 000000000..3499ccadc --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning.auto.tfvars @@ -0,0 +1 @@ +../_shared_config/vllm-auto-tuning.auto.tfvars \ No newline at end of file diff --git 
a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning_variables.tf new file mode 120000 index 000000000..ba00da1f7 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning_variables.tf @@ -0,0 +1 @@ +../_shared_config/vllm-auto-tuning_variables.tf \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/iam.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/iam.tf new file mode 100644 index 000000000..6f02f6041 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/iam.tf @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +resource "google_storage_bucket_iam_member" "auto_tuning_bucket_access_at_ksa" { + bucket = google_storage_bucket.auto_tuning_results.name + member = local.ira_auto_tuning_vllm_ksa_member + role = "roles/storage.objectUser" +} + +resource "google_storage_bucket_iam_member" "hub_models_bucket_access_at_ksa" { + bucket = local.huggingface_hub_models_bucket_name + member = local.ira_auto_tuning_vllm_ksa_member + role = "roles/storage.objectUser" +} + +resource "google_secret_manager_secret_iam_member" "hub_token_read_access_at_ksa" { + member = local.ira_auto_tuning_vllm_ksa_member + project = data.google_secret_manager_secret.hub_access_token_read.project + role = "roles/secretmanager.secretAccessor" + secret_id = data.google_secret_manager_secret.hub_access_token_read.secret_id +} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/kubernetes.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/kubernetes.tf new file mode 100644 index 000000000..85bc698ff --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/kubernetes.tf @@ -0,0 +1,103 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +locals { + cluster_wi_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject" + ira_auto_tuning_vllm_ksa_member = "${local.cluster_wi_principal_prefix}/ns/${local.ira_auto_tuning_vllm_kubernetes_namespace_name}/sa/${local.ira_auto_tuning_vllm_kubernetes_service_account_name}" + ira_auto_tuning_vllm_kubernetes_namespace_directory = "${local.namespaces_directory}/${local.ira_auto_tuning_vllm_kubernetes_namespace_name}" + kubeconfig_directory = "${path.module}/../../../../kubernetes/kubeconfig" + kubeconfig_file = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}" + manifests_directory_root = "${path.module}/../../../../kubernetes/manifests" + namespaces_directory = "${local.manifests_directory_root}/namespace" +} + +data "local_file" "kubeconfig" { + filename = local.kubeconfig_file +} + +# Create Namespace +resource "local_file" "namespace_yaml" { + content = templatefile( + "${path.module}/templates/kubernetes/namespace.tftpl.yaml", + { + kubernetes_namespace = local.ira_auto_tuning_vllm_kubernetes_namespace_name + } + ) + file_permission = "0644" + filename = "${local.namespaces_directory}/namespace-${local.ira_auto_tuning_vllm_kubernetes_namespace_name}.yaml" +} + +module "kubectl_apply_namespace" { + depends_on = [ + local_file.namespace_yaml, + ] + + source = "../../../../modules/kubectl_apply" + + delete_timeout = "60s" + error_on_delete_failure = false + kubeconfig_file = data.local_file.kubeconfig.filename + manifest = "${local.namespaces_directory}/namespace-${local.ira_auto_tuning_vllm_kubernetes_namespace_name}.yaml" + manifest_includes_namespace = true +} + + +resource "local_file" "serviceaccount_yaml" { + content = templatefile( + "${path.module}/templates/kubernetes/serviceaccount.tftpl.yaml", + { + name = local.ira_auto_tuning_vllm_kubernetes_service_account_name + namespace = 
local.ira_auto_tuning_vllm_kubernetes_namespace_name + } + ) + filename = "${local.ira_auto_tuning_vllm_kubernetes_namespace_directory}/serviceaccount-${local.ira_auto_tuning_vllm_kubernetes_service_account_name}.yaml" +} + +module "kubectl_apply_service_account" { + source = "../../../../modules/kubectl_apply" + depends_on = [ + local_file.serviceaccount_yaml, module.kubectl_apply_namespace + ] + + apply_server_side = true + kubeconfig_file = data.local_file.kubeconfig.filename + manifest = "${local.ira_auto_tuning_vllm_kubernetes_namespace_directory}/serviceaccount-${local.ira_auto_tuning_vllm_kubernetes_service_account_name}.yaml" + manifest_includes_namespace = true +} + +resource "local_file" "secretproviderclass_yaml" { + content = templatefile( + "${path.module}/templates/kubernetes/secretproviderclass.tftpl.yaml", + { + namespace = local.ira_auto_tuning_vllm_kubernetes_namespace_name + project_id = data.google_secret_manager_secret.hub_access_token_read.project + secretproviderclass_name = local.ira_auto_tuning_vllm_secretproviderclass + secret_name = local.huggingface_hub_access_token_read_secret_manager_secret_name + + } + ) + filename = "${local.ira_auto_tuning_vllm_kubernetes_namespace_directory}/secretproviderclass-${local.ira_auto_tuning_vllm_secretproviderclass}.yaml" +} + +module "kubectl_apply_secretproviderclass" { + source = "../../../../modules/kubectl_apply" + depends_on = [ + local_file.secretproviderclass_yaml, module.kubectl_apply_namespace + ] + + apply_server_side = true + kubeconfig_file = data.local_file.kubeconfig.filename + manifest = "${local.ira_auto_tuning_vllm_kubernetes_namespace_directory}/secretproviderclass-${local.ira_auto_tuning_vllm_secretproviderclass}.yaml" + manifest_includes_namespace = true +} diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/project.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/project.tf new file mode 100644 index 
000000000..c6dc19a4a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/project.tf
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+data "google_project" "cluster" {
+  project_id = local.cluster_project_id # this project's number and project_id build local.cluster_wi_principal_prefix
+}
+
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/secret_manager.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/secret_manager.tf
new file mode 100644
index 000000000..94f82d903
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/secret_manager.tf
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+data "google_secret_manager_secret" "hub_access_token_read" { # looked up, not created here; its .project is rendered into the SecretProviderClass manifest
+  project   = var.platform_default_project_id
+  secret_id = local.huggingface_hub_access_token_read_secret_manager_secret_name
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/storage.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/storage.tf
new file mode 100644
index 000000000..1e9cd6cb8
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/storage.tf
@@ -0,0 +1,28 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+### Hugging Face Hub models GCS bucket: looked up by name as a data source, not created by this configuration ###
+data "google_storage_bucket" "hub_models" {
+  name    = local.huggingface_hub_models_bucket_name
+  project = local.huggingface_hub_models_bucket_project_id
+}
+
+### GCS bucket that stores vllm-auto-tuning results; created in the cluster region ###
+resource "google_storage_bucket" "auto_tuning_results" {
+  name                        = local.ira_auto_tuning_vllm_results_bucket
+  location                    = local.cluster_region
+  uniform_bucket_level_access = true
+  force_destroy               = false # per the provider docs, destroy fails while the bucket still holds objects
+  project                     = var.platform_default_project_id
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/namespace.tftpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/namespace.tftpl.yaml
new file mode 100644
index 000000000..0b3559cd2
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/namespace.tftpl.yaml
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ${kubernetes_namespace} # template variable filled in by Terraform templatefile()
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/secretproviderclass.tftpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/secretproviderclass.tftpl.yaml
new file mode 100644
index 000000000..7f381f934
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/secretproviderclass.tftpl.yaml
@@ -0,0 +1,25 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: secrets-store.csi.x-k8s.io/v1
+kind: SecretProviderClass
+metadata:
+  name: ${secretproviderclass_name} # all template variables in this file are filled in by Terraform templatefile()
+  namespace: ${namespace}
+spec:
+  parameters:
+    secrets: |
+      - resourceName: "projects/${project_id}/secrets/${secret_name}/versions/latest"
+        path: "token"
+  provider: gke
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/serviceaccount.tftpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/serviceaccount.tftpl.yaml
new file mode 100644
index 000000000..a0f63c9dc
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/serviceaccount.tftpl.yaml
@@ -0,0 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: ${name} # template variables name/namespace are filled in by Terraform templatefile()
+  namespace: ${namespace}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/versions.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/versions.tf
new file mode 100644
index 000000000..dca89f111
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/versions.tf
@@ -0,0 +1,32 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_version = ">= 1.5.7"
+
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "7.6.0" # no operator: exact version pin
+    }
+    local = {
+      source  = "hashicorp/local"
+      version = "2.5.3" # no operator: exact version pin
+    }
+  }
+
+  provider_meta "google" {
+    module_name = "cloud-solutions/vllm_auto_tuner-v1" # NOTE(review): presumably a usage-attribution label for the google provider; confirm
+  }
+}

From d000695d5afe7325e3b64aacdecb652a580012c0 Mon Sep 17 00:00:00 2001
From: gushob21
Date: Thu, 28 May 2026 16:19:42 +0000
Subject: [PATCH 2/3] fixing end of the file newline

---
 .../_shared_config/vllm-auto-tuning.auto.tfvars  | 14 +++++++++++++-
 .../_shared_config/vllm-auto-tuning_variables.tf |  1 -
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
index 8b1378917..c37e93b74 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
@@ -1 +1,13 @@
-
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf index ccc9e3a25..1824fbf2c 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf +++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf @@ -41,4 +41,3 @@ variable "ira_auto_tuning_vllm_secretproviderclass" { description = "The Secretproviderclass to access huggingface read token." 
type = string } - From c0561904d51bc34f8aba2a130e3cd149d0b097a2 Mon Sep 17 00:00:00 2001 From: gushob21 Date: Fri, 29 May 2026 15:05:10 +0000 Subject: [PATCH 3/3] Add validate_kustomize test for vllm-auto-tuning --- .../gke/base/use-cases/inference-ref-arch/validate_kustomize.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh index 102621046..0fb5a0cae 100755 --- a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh +++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh @@ -49,6 +49,7 @@ export ACCELERATOR_TYPE="l4" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/vllm/configure_vllm.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm/configure_vllm.sh" +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh" export ACCELERATOR_TYPE="v5e" "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/max-diffusion/configure_max_diffusion.sh"