From 619aa5e8777b4f96f08cda2dda9b4bdc35cfdaa0 Mon Sep 17 00:00:00 2001
From: gushob21 <gushob@google.com>
Date: Wed, 15 Apr 2026 21:42:46 +0000
Subject: [PATCH 1/3] Adding vllm auto tuner initial files

fixing  pipeline

Fixing permissions on scripts

adding terraform for vllm auto tuner

Adding a README.md for vLLM-auto-tune and renaming related files and directories

fixing pipelines

fixing linting issues

Refining  README instructions  reducing the auto tuner manifests to only h100 and g4

Making README changes and improving TF code

Adding reference architecture for vllm auto tuner for qwen32B model on
H100 and RTX Pro 6000
---
 .github/workflows/dictionary/vllm.txt         |   1 +
 .../online-inference-gpu/vllm-auto-tuner.md   | 303 ++++++++++++++++++
 .../kustomization.yaml                        |  22 ++
 .../templates/job.tpl.env                     |   3 +
 .../configure_auto_tune_job.sh                |  36 +++
 .../base/configmap-scripts.yaml               |  57 ++++
 .../vllm-auto-tuning/base/job.yaml            | 169 ++++++++++
 .../vllm-auto-tuning/base/kustomization.yaml  |  26 ++
 .../base/templates/vllm.tpl.env               |   2 +
 .../vllm-auto-tuning/configure_vllm.sh        |  29 ++
 .../h100-qwen3-32b/kustomization.yaml         | 134 ++++++++
 .../h100-qwen3-32b/patch-nodeselector.yaml    |  24 ++
 .../h100-qwen3-32b/patch-resources.yaml       |  33 ++
 .../h100-qwen3-32b/runtime.env                |  12 +
 .../rtx-pro-6000-qwen3-32b/kustomization.yaml | 134 ++++++++
 .../patch-nodeselector.yaml                   |  24 ++
 .../patch-resources.yaml                      |  33 ++
 .../rtx-pro-6000-qwen3-32b/runtime.env        |  12 +
 .../terraform/_shared_config/outputs.tf       |   9 +-
 .../vllm-auto-tuning.auto.tfvars              |   1 +
 .../vllm-auto-tuning_variables.tf             |  44 +++
 .../vllm-auto-tuning/.terraform.lock.hcl      |  42 +++
 .../vllm-auto-tuning/_cloudbuild.auto.tfvars  |   1 +
 .../vllm-auto-tuning/_cloudbuild_variables.tf |   1 +
 .../vllm-auto-tuning/_cluster.auto.tfvars     |   1 +
 .../vllm-auto-tuning/_cluster_variables.tf    |   1 +
 .../vllm-auto-tuning/_huggingface.auto.tfvars |   1 +
 .../_huggingface_variables.tf                 |   1 +
 .../_inference-ref-arch.auto.tfvars           |   1 +
 .../_inference-ref-arch_variables.tf          |   1 +
 .../vllm-auto-tuning/_platform.auto.tfvars    |   1 +
 .../vllm-auto-tuning/_platform_variables.tf   |   1 +
 .../_vllm-auto-tuning.auto.tfvars             |   1 +
 .../_vllm-auto-tuning_variables.tf            |   1 +
 .../terraform/vllm-auto-tuning/iam.tf         |  32 ++
 .../terraform/vllm-auto-tuning/kubernetes.tf  | 103 ++++++
 .../terraform/vllm-auto-tuning/project.tf     |  18 ++
 .../vllm-auto-tuning/secret_manager.tf        |  18 ++
 .../terraform/vllm-auto-tuning/storage.tf     |  28 ++
 .../templates/kubernetes/namespace.tftpl.yaml |  18 ++
 .../kubernetes/secretproviderclass.tftpl.yaml |  25 ++
 .../kubernetes/serviceaccount.tftpl.yaml      |  19 ++
 .../terraform/vllm-auto-tuning/versions.tf    |  32 ++
 43 files changed, 1454 insertions(+), 1 deletion(-)
 create mode 100644 docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-auto-tuner.md
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/templates/job.tpl.env
 create mode 100755 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/configure_auto_tune_job.sh
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/configmap-scripts.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/job.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/templates/vllm.tpl.env
 create mode 100755 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/runtime.env
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-nodeselector.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-resources.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/runtime.env
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/.terraform.lock.hcl
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild.auto.tfvars
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild_variables.tf
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster.auto.tfvars
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster_variables.tf
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface.auto.tfvars
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface_variables.tf
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch.auto.tfvars
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch_variables.tf
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform.auto.tfvars
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform_variables.tf
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning.auto.tfvars
 create mode 120000 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning_variables.tf
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/iam.tf
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/kubernetes.tf
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/project.tf
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/secret_manager.tf
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/storage.tf
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/namespace.tftpl.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/secretproviderclass.tftpl.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/serviceaccount.tftpl.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/versions.tf

diff --git a/.github/workflows/dictionary/vllm.txt b/.github/workflows/dictionary/vllm.txt
index 1cb5e4d73..cc03e3b4b 100644
--- a/.github/workflows/dictionary/vllm.txt
+++ b/.github/workflows/dictionary/vllm.txt
@@ -1,3 +1,4 @@
+autotuner
 dailymail
 dtype
 flashinfer
diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-auto-tuner.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-auto-tuner.md
new file mode 100644
index 000000000..250980144
--- /dev/null
+++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-auto-tuner.md
@@ -0,0 +1,303 @@
+# Automated vLLM Server Parameter Tuning
+
+This guide is aimed at finding the maximum throughput that we can get with a
+single nvidia-h100-80gb or nvidia-rtx-pro-6000 chip while running inference of
+Qwen/Qwen3-32B on Google Kubernetes Engine. We will use vLLM as the
+inference server and try to find the parameters that provide us the best
+performance. vLLM provides a script to automate the process of finding the
+optimal server parameter combination (max-num-seqs and max-num-batched-tokens)
+to maximize throughput for a vLLM server. It also supports additional
+constraints such as E2E latency and prefix cache hit rate. More details are on
+the
+[official GitHub repository](https://github.com/vllm-project/vllm/tree/main/benchmarks/auto_tune)
+
+## Before you begin
+
+- Make sure that the
+  [GKE Inference reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md)
+  is deployed and configured.
+
+## Workflow
+
+This example will run through the following steps:
+
+1. Apply the terraform, which will:
+
+   - Create a GCS bucket for storing optimization results.
+   - Create a Kubernetes namespace where the vLLM auto-tune job will run.
+   - Create the Kubernetes service account for the running vLLM auto-tune job.
+   - Grant the required IAM permissions for workload identity KSA.
+
+2. Create the custom kubernetes manifest for the vLLM auto-tune job
+3. Run the vLLM auto-tune job based on an out-of-the-box configuration provided
+   with the reference architecture.
+4. Store the results generated from the vLLM auto-tune job in the GCS bucket.
+
+## Resources Created
+
+- Cloud Storage Bucket
+- Kubernetes namespace
+- Kubernetes Service Account in the namespace
+- IAM Permissions for Kubernetes service account
+  - roles/secretmanager.secretAccessor for Hugging Face token in Secret Manager
+  - roles/storage.bucketViewer for results bucket
+  - roles/storage.objectUser for results bucket
+
+## Pull the source code
+
+- Open [Cloud Shell](https://cloud.google.com/shell).
+
+- Clone the repository and set the repository directory environment variable.
+
+  ```shell
+  git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \
+  cd accelerated-platforms && \
+  export ACP_REPO_DIR="$(pwd)"
+  ```
+
+To set the `ACP_REPO_DIR` value for new shell instances, write the value to your
+shell initialization file.
+
+`bash`
+
+```shell
+sed -n -i -e '/^export ACP_REPO_DIR=/!p' -i -e '$aexport ACP_REPO_DIR="'"${ACP_REPO_DIR}"'"' ${HOME}/.bashrc
+```
+
+`zsh`
+
+```shell
+sed -n -i -e '/^export ACP_REPO_DIR=/!p' -i -e '$aexport ACP_REPO_DIR="'"${ACP_REPO_DIR}"'"' ${HOME}/.zshrc
+```
+
+## Configuration
+
+Terraform loads variables in the following order, with later sources taking
+precedence over earlier ones:
+
+- Environment variables (`TF_VAR_<variable_name>`)
+- Any `*.auto.tfvars` files, processed in lexical order of their filenames.
+- Any `-var` and `-var-file` options on the command line, in the order they are
+  provided.
+
+For more information about providing values for Terraform input variables, see
+[Terraform input variables](https://developer.hashicorp.com/terraform/language/values/variables).
+
+- Set the platform default project ID
+
+  ```shell
+  export TF_VAR_platform_default_project_id="<PROJECT_ID>"
+  ```
+
+  **-- OR --**
+
+  ```shell
+  vi ${ACP_REPO_DIR}/platforms/gke/base/_shared_config/platform.auto.tfvars
+  ```
+
+  ```hcl
+  platform_default_project_id = "<PROJECT_ID>"
+  ```
+
+### Install Terraform 1.8.0+
+
+> [!IMPORTANT]  
+> At the time this guide was written, Cloud Shell had Terraform v1.5.7 installed
+> by default. Terraform version 1.8.0 or later is required for this guide.
+
+- Check the terraform version in your cloud shell
+
+  ```shell
+  terraform version
+  ```
+
+- Run the `install_terraform.sh` script to install Terraform 1.8.0.
+
+  ```shell
+  "${ACP_REPO_DIR}/tools/bin/install_terraform.sh"
+  ```
+
+## Deploy
+
+### Run Terraform to create the resources
+
+```shell
+export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning && \
+rm -rf .terraform/ terraform.tfstate* && \
+terraform init && \
+terraform plan -input=false -out=tfplan && \
+terraform apply -input=false tfplan && \
+rm tfplan
+```
+
+### Configure the environment variables
+
+The goal here is to find the optimal vLLM parameters for serving Qwen/Qwen3-32B
+on a single chip of nvidia-h100-80gb (A3 High) and nvidia-rtx-pro-6000 (G4) in
+order to get the maximum throughput. This reference architecture provides
+templated manifests to serve the following single accelerator chip and model
+combination.
+
+| Model     | h100 | RTX Pro 6000 |
+| --------- | ---- | ------------ |
+| qwen3-32b | ✅   | ✅           |
+
+- Select an accelerator.
+
+  - **NVIDIA H100 80GB**:
+
+    ```shell
+    export ACCELERATOR_TYPE="h100"
+    ```
+
+  - **NVIDIA RTX PRO 6000 96GB**:
+
+    ```shell
+    export ACCELERATOR_TYPE="rtx-pro-6000"
+    ```
+
+    Ensure that you have enough quota in your project to provision the selected
+    accelerator type. For more information about viewing GPU quotas, see
+    [Allocation quotas: GPU quota](https://cloud.google.com/compute/resource-usage#gpu_quota).
+
+- Set the model.
+
+  - **Qwen3-32B**:
+
+    ```shell
+    export HF_MODEL_ID="qwen/qwen3-32b"
+    ```
+
+- Source the environment configuration.
+
+  ```shell
+  source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+  ```
+
+- Check the model name.
+
+  ```shell
+  echo "HF_MODEL_NAME=${HF_MODEL_NAME}"
+  ```
+
+- Export the application label
+
+  ```shell
+  export APP_LABEL="vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
+  ```
+
+### Download the model from HuggingFace to GCS bucket
+
+- [Generate a Hugging Face token](https://huggingface.co/docs/hub/security-tokens)
+  with token type **Read**.
+- Add the token to the secret manager
+
+  ```shell
+  HF_TOKEN_READ=<YOUR_HUGGINGFACE_READ_TOKEN>
+  echo ${HF_TOKEN_READ} | gcloud secrets versions add ${huggingface_hub_access_token_read_secret_manager_secret_name} --data-file=- --project=${huggingface_secret_manager_project_id}
+  ```
+
+- Source the environment configuration.
+
+  ```shell
+  source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
+  ```
+
+- Configure the model download job.
+
+  ```shell
+  "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/configure_huggingface.sh"
+  ```
+
+- Deploy the model download job.
+
+  ```shell
+  kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface"
+  ```
+
+- Watch the model download job until it is complete.
+
+  ```shell
+  watch --color --interval 5 --no-title \
+  "kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-hf-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete'
+  echo '\nLogs(last 10 lines):'
+  kubectl --namespace=${huggingface_hub_downloader_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-hf-model-to-gcs --all-containers --tail 10"
+  ```
+
+  When the job is complete, you will see the following:
+
+  ```text
+  NAME                       STATUS     COMPLETIONS   DURATION   AGE
+  XXXXXXXX-hf-model-to-gcs   Complete   1/1           ###        ###
+  ```
+
+  You can press `CTRL`+`c` to terminate the watch.
+
+- Delete the model download job.
+
+  ```shell
+  kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/huggingface"
+  ```
+
+### Deploy the vLLM auto-tune job
+
+- Configure the vLLM auto-tune job.
+
+  ```shell
+  "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh"
+  ```
+
+- Deploy the vLLM auto-tune job.
+
+  ```shell
+  kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
+  ```
+
+### Check the status of the job
+
+```shell
+  watch --color --interval 5 --no-title \
+  "kubectl --namespace=${ira_auto_tuning_vllm_kubernetes_namespace_name} get job/vllm-auto-tuning-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1     1            1'
+  echo '\nLogs(last 10 lines):'
+  kubectl --namespace=${ira_auto_tuning_vllm_kubernetes_namespace_name} logs job/vllm-auto-tuning-${ACCELERATOR_TYPE}-${HF_MODEL_NAME} --all-containers --tail 10"
+```
+
+When the job is complete, you will see the following:
+
+```text
+NAME                         STATUS     COMPLETIONS   DURATION   AGE
+vllm-auto-tuning-XXXXXX      Complete    1/1           XXX       XXX
+```
+
+## Analyze and Interpret Results
+
+Download the results from the GCS bucket to your
+[Cloud Shell](https://cloud.google.com/shell).
+
+```shell
+gcloud storage cp --recursive "gs://${ira_auto_tuning_vllm_results_bucket}" .
+```
+
+View the files to see the result.
+
+## Clean up
+
+- Delete the vLLM auto-tune job.
+
+  ```shell
+  kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}"
+  ```
+
+- Destroy the vLLM auto-tune resources created via Terraform.
+
+  > Note: This will destroy your benchmarking results GCS bucket only if it is
+  > empty.
+
+  ```shell
+  export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
+  cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning  && \
+  rm -rf .terraform/ terraform.tfstate* && \
+  terraform init &&
+  terraform destroy -auto-approve
+  ```
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/kustomization.yaml
new file mode 100644
index 000000000..ad063239b
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/kustomization.yaml
@@ -0,0 +1,22 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - job.env
+    name: job
+    namespace: replaced-by-kustomize
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/templates/job.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/templates/job.tpl.env
new file mode 100644
index 000000000..d02692bd6
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/base-vllm-auto-tune-job/templates/job.tpl.env
@@ -0,0 +1,3 @@
+INFERENCE_KUBERNETES_NAMESPACE=${ira_auto_tuning_vllm_kubernetes_namespace_name}
+INFERENCE_KUBERNETES_SERVICE_ACCOUNT=${ira_auto_tuning_vllm_kubernetes_service_account_name}
+MODEL_BUCKET_NAME=${huggingface_hub_models_bucket_name}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/configure_auto_tune_job.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/configure_auto_tune_job.sh
new file mode 100755
index 000000000..e802fda1f
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/configure_auto_tune_job.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+# Resolve this script's directory; '&&' makes a failed cd abort under errexit.
+MY_PATH="$(cd "$(dirname "$0")" >/dev/null 2>&1 && pwd -P)"
+ENV_FILE_1="${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh"
+ENV_FILE_2="${MY_PATH}/../../examples/llmd/_shared_config/scripts/set_environment_variables.sh"
+
+if [[ -f "${ENV_FILE_1}" ]]; then
+  # shellcheck source=/dev/null
+  source "${ENV_FILE_1}"
+elif [[ -f "${ENV_FILE_2}" ]]; then
+  # shellcheck source=/dev/null
+  source "${ENV_FILE_2}"
+else
+  echo "Warning: No environment variable file found." >&2
+fi
+
+# Render job.env from its template using variables from the current environment.
+envsubst <"${MY_PATH}/base-vllm-auto-tune-job/templates/job.tpl.env" | sponge "${MY_PATH}/base-vllm-auto-tune-job/job.env"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/configmap-scripts.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/configmap-scripts.yaml
new file mode 100644
index 000000000..5160d39b0
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/configmap-scripts.yaml
@@ -0,0 +1,57 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vllm-auto-tuning-script
+  namespace: replaced-by-kustomize
+data:
+  auto_tune.sh: |
+    #!/bin/bash
+    set -euo pipefail
+
+    # 1. Install missing system tools
+    apt-get update && apt-get install -y bc git curl
+
+    # 2. Clone the repo to a neutral location
+    mkdir -p /tmp/vllm-source
+    git clone --depth 1 https://github.com/vllm-project/vllm.git /tmp/vllm-source
+
+    # 3. Move only the benchmarks to your working directory
+    # and DELETE the source folder that causes the circular import.
+    mkdir -p /vllm-workspace/benchmarks
+    cp -r /tmp/vllm-source/benchmarks/auto_tune /vllm-workspace/benchmarks/
+    rm -rf /tmp/vllm-source  # Get rid of the conflicting source code
+
+    # 4. Fix permissions and create the empty vllm folder the script expects
+    chmod +x /vllm-workspace/benchmarks/auto_tune/auto_tune.sh
+    mkdir -p /vllm-workspace/vllm  # Script needs this to exist, but keep it empty
+
+    # 5. Set paths
+    export BASE="/vllm-workspace"
+    export SCRIPT_DIR="/vllm-workspace/benchmarks/auto_tune"
+
+    # 6. Run the tuner
+    cd /vllm-workspace
+    sed -i 's|"${common_args_array\[@\]}" > "$vllm_log"|"${common_args_array\[@\]}" "--kv-cache-dtype" "fp8" > "$vllm_log"|'  benchmarks/auto_tune/auto_tune.sh
+    bash benchmarks/auto_tune/auto_tune.sh
+
+    # 7. Results handling
+    LATEST_DIR=$(ls -td auto-benchmark/*/ | head -1 || true)
+    if [ -n "$LATEST_DIR" ]; then
+      cp -r "$LATEST_DIR" /output
+    fi
+
+    echo "Tuning complete."
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/job.yaml
new file mode 100644
index 000000000..1835ca994
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/job.yaml
@@ -0,0 +1,169 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vllm-auto-tuning
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    metadata:
+      annotations:
+        gke-gcsfuse/cpu-limit: "0"
+        gke-gcsfuse/ephemeral-storage-limit: "0"
+        gke-gcsfuse/memory-limit: "0"
+        gke-gcsfuse/volumes: "true"
+      labels:
+        ai.gke.io/model: replaced-by-kustomize
+        app: vllm-auto-tuning
+    spec:
+      serviceAccountName: replaced-by-kustomize
+      containers:
+        - args:
+            - |
+              echo "########### $(date) - Starting parallel-fetch-safetensors for model: ${MODEL_ID}"
+              ls -alR /gcs
+              find /gcs/${MODEL_ID}/*safetensors -type f | xargs -I {} -P 15 sh -c 'echo "########### $(date) - Fetching: {}"; dd if={} of=/dev/null'
+              echo "########### $(date) - Finished parallel-fetch-safetensors"
+          command: ["/bin/sh", "-c"]
+          env:
+            - name: MODEL_ID
+              valueFrom:
+                configMapKeyRef:
+                  key: MODEL_ID
+                  name: runtime
+          image: busybox
+          name: fetch-safetensors
+          volumeMounts:
+            - mountPath: /gcs
+              name: huggingface-hub-model-bucket
+              readOnly: true
+        - name: autotuner
+          image: replaced-by-kustomize
+          command: ["/bin/bash", "-c"]
+          env:
+            - name: VLLM_CACHE_ROOT
+              value: /gcs
+            - name: DOWNLOAD_DIR
+              value: /gcs
+            - name: HF_HUB_OFFLINE
+              value: "1"
+            - name: MODEL
+              valueFrom:
+                configMapKeyRef:
+                  key: MODEL_LOCATION
+                  name: runtime
+            - name: TRANSFORMERS_OFFLINE
+              value: "1"
+            - name: VLLM_AUTO_TUNING_RESULTS_BUCKET
+              valueFrom:
+                configMapKeyRef:
+                  key: VLLM_AUTO_TUNING_RESULTS_BUCKET
+                  name: vllm
+            - name: GPU_MEMORY_UTILIZATION
+              valueFrom:
+                configMapKeyRef:
+                  key: GPU_MEMORY_UTILIZATION
+                  name: runtime
+            - name: TP
+              value: "1"
+            - name: INPUT_LEN
+              valueFrom:
+                configMapKeyRef:
+                  key: INPUT_LEN
+                  name: runtime
+            - name: OUTPUT_LEN
+              valueFrom:
+                configMapKeyRef:
+                  key: OUTPUT_LEN
+                  name: runtime
+            - name: MAX_MODEL_LEN
+              valueFrom:
+                configMapKeyRef:
+                  key: MAX_MODEL_LEN
+                  name: runtime
+            - name: NUM_SEQS_LIST
+              valueFrom:
+                configMapKeyRef:
+                  key: NUM_SEQS_LIST
+                  name: runtime
+            - name: NUM_BATCHED_TOKENS_LIST
+              valueFrom:
+                configMapKeyRef:
+                  key: NUM_BATCHED_TOKENS_LIST
+                  name: runtime
+            - name: MIN_CACHE_HIT_PCT
+              valueFrom:
+                configMapKeyRef:
+                  key: MIN_CACHE_HIT_PCT
+                  name: runtime
+            - name: MAX_LATENCY_ALLOWED_MS
+              valueFrom:
+                configMapKeyRef:
+                  key: MAX_LATENCY_ALLOWED_MS
+                  name: runtime
+          args: ["/mnt/config/auto_tune.sh"]
+          volumeMounts:
+            - name: config-volume
+              mountPath: /mnt/config
+            - mountPath: /gcs
+              name: huggingface-hub-model-bucket
+            - mountPath: /output
+              name: vllm-auto-tuner-results
+            - name: dshm
+              mountPath: /dev/shm
+      volumes:
+        - name: config-volume
+          configMap:
+            name: vllm-auto-tuning-script
+            defaultMode: 0755
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: "16Gi"
+        - name: results
+          emptyDir: {}
+        - csi:
+            driver: secrets-store-gke.csi.k8s.io
+            readOnly: true
+            volumeAttributes:
+              secretProviderClass: huggingface-token-read
+          name: huggingface-token
+        - csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: cloud-storage-bucket-name
+              mountOptions: metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1,metadata-cache:negative-ttl-secs:0,file-cache:max-size-mb:-1,file-cache:cache-file-for-range-read:true,file-cache:enable-parallel-downloads:true,implicit-dirs,file-system:kernel-list-cache-ttl-secs:-1,only-dir:replaced-by-kustomize,uid=2000,gid=2000
+              skipCSIBucketAccessCheck: "true"
+          name: huggingface-hub-model-bucket
+        - csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: cloud-storage-bucket-name
+              mountOptions: implicit-dirs,uid=2000,gid=2000
+              skipCSIBucketAccessCheck: "true"
+          name: vllm-auto-tuner-results
+        - emptyDir:
+            medium: Memory
+          name: gke-gcsfuse-cache
+        - emptyDir:
+            medium: Memory
+          name: gke-gcsfuse-tmp
+        - emptyDir:
+            medium: Memory
+          name: gke-gcsfuse-buffer
+      restartPolicy: OnFailure
+      securityContext:
+        fsGroup: 10000
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/kustomization.yaml
new file mode 100644
index 000000000..af66c2d75
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/kustomization.yaml
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - vllm.env # Written by ../configure_vllm.sh from templates/vllm.tpl.env.
+    name: vllm # The overlays' replacements read VLLM_* values from this ConfigMap.
+    namespace: replaced-by-kustomize # Placeholder; the overlays replace metadata.namespace on ConfigMaps.
+resources:
+  - ../../base-vllm-auto-tune-job
+  - configmap-scripts.yaml
+  - job.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/templates/vllm.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/templates/vllm.tpl.env
new file mode 100644
index 000000000..7e2a93553
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/base/templates/vllm.tpl.env
@@ -0,0 +1,2 @@
+VLLM_AUTO_TUNER_IMAGE=vllm/vllm-openai:latest
+VLLM_AUTO_TUNING_RESULTS_BUCKET=${ira_auto_tuning_vllm_results_bucket}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh
new file mode 100755
index 000000000..e2945e271
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+# Absolute directory of this script. The `&&` matters: bash clears errexit inside
+# $(...), so `cd; pwd` would silently yield the caller's cwd if the cd failed.
+MY_PATH="$(cd -- "$(dirname -- "$0")" >/dev/null 2>&1 && pwd -P)"
+
+source "${MY_PATH}/../../../terraform/_shared_config/scripts/set_environment_variables.sh"
+
+# Run the shared job configuration script, then render base/vllm.env from its template.
+"${MY_PATH}/../configure_auto_tune_job.sh"
+
+envsubst <"${MY_PATH}/base/templates/vllm.tpl.env" | sponge "${MY_PATH}/base/vllm.env"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/kustomization.yaml
new file mode 100644
index 000000000..ca23115d9
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/kustomization.yaml
@@ -0,0 +1,134 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - runtime.env # Model and tuning parameters for this overlay.
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -h100-qwen3-32b
+
+patches:
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements:
+  - source:
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.app
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.VLLM_AUTO_TUNER_IMAGE
+      kind: ConfigMap
+      name: vllm # Generated in ../base from vllm.env.
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=autotuner].image
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.VLLM_AUTO_TUNING_RESULTS_BUCKET
+      kind: ConfigMap
+      name: vllm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=vllm-auto-tuner-results].csi.volumeAttributes.bucketName
+        options:
+          delimiter: . # The base placeholder contains no ".", so segment 0 is the whole value.
+          index: 0
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: job # NOTE(review): not defined in this directory; presumably generated by ../../base-vllm-auto-tune-job — confirm.
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Job
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: job
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Job
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: job
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:" # Segment 1 is everything after `only-dir:`; options following it in the base are overwritten.
+          index: 1
+        select:
+          kind: Job
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=autotuner].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: / # NOTE(review): assumes the base mountPath is /<dir>/<placeholder> — confirm against ../base/job.yaml.
+          index: 2
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Job
+
+resources:
+  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-nodeselector.yaml
new file mode 100644
index 000000000..8dfd58e81
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vllm-auto-tuning # NOTE(review): presumably the base Job's name before nameSuffix is applied — confirm against ../base/job.yaml.
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: gpu-h100-80gb-high-x1 # Restricts scheduling to nodes carrying this compute-class label.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-resources.yaml
new file mode 100644
index 000000000..23d3c5f6d
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/patch-resources.yaml
@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vllm-auto-tuning
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: autotuner # Same container the overlay's image replacement targets.
+          resources:
+            limits:
+              cpu: "10"
+              memory: 128G # Decimal suffix: 128 * 10^9 bytes (not 128Gi).
+              nvidia.com/gpu: "1"
+            requests: # Identical to the limits for this container.
+              cpu: "10"
+              memory: 128G
+              nvidia.com/gpu: "1"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/runtime.env
new file mode 100644
index 000000000..f410f3064
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/h100-qwen3-32b/runtime.env
@@ -0,0 +1,12 @@
+APP_LABEL=vllm-h100-qwen3-32b
+GPU_MEMORY_UTILIZATION=0.90
+INPUT_LEN=1028
+MAX_LATENCY_ALLOWED_MS=100000000000
+MAX_MODEL_LEN=32768
+MIN_CACHE_HIT_PCT=0
+MODEL_ID=qwen/qwen3-32b
+MODEL_NAME=qwen3-32b
+MODEL_LOCATION=/gcs/qwen/qwen3-32b
+NUM_BATCHED_TOKENS_LIST=4096 8192 16384 32768 65536
+NUM_SEQS_LIST=64 128 256 512
+OUTPUT_LEN=128
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/kustomization.yaml
new file mode 100644
index 000000000..dc3f3c328
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/kustomization.yaml
@@ -0,0 +1,134 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - runtime.env # Model and tuning parameters for this overlay.
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -rtx-pro-6000-qwen3-32b
+
+patches:
+  - path: patch-nodeselector.yaml
+  - path: patch-resources.yaml
+
+replacements:
+  - source:
+      fieldPath: data.APP_LABEL
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.app
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.VLLM_AUTO_TUNER_IMAGE
+      kind: ConfigMap
+      name: vllm # Generated in ../base from vllm.env.
+    targets:
+      - fieldPaths:
+          - spec.template.spec.containers.[name=autotuner].image
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.VLLM_AUTO_TUNING_RESULTS_BUCKET
+      kind: ConfigMap
+      name: vllm
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=vllm-auto-tuner-results].csi.volumeAttributes.bucketName
+        options:
+          delimiter: . # The base placeholder contains no ".", so segment 0 is the whole value.
+          index: 0
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_NAMESPACE
+      kind: ConfigMap
+      name: job # NOTE(review): not defined in this directory; presumably generated by ../../base-vllm-auto-tune-job — confirm.
+    targets:
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ConfigMap
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: Job
+      - fieldPaths:
+          - metadata.namespace
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.INFERENCE_KUBERNETES_SERVICE_ACCOUNT
+      kind: ConfigMap
+      name: job
+    targets:
+      - fieldPaths:
+          - spec.template.spec.serviceAccountName
+        select:
+          kind: Job
+      - fieldPaths:
+          - metadata.name
+        select:
+          kind: ServiceAccount
+  - source:
+      fieldPath: data.MODEL_BUCKET_NAME
+      kind: ConfigMap
+      name: job
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.bucketName
+        options:
+          delimiter: .
+          index: 0
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.MODEL_ID
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.spec.volumes.[name=huggingface-hub-model-bucket].csi.volumeAttributes.mountOptions
+        options:
+          delimiter: "only-dir:" # Segment 1 is everything after `only-dir:`; options following it in the base are overwritten.
+          index: 1
+        select:
+          kind: Job
+      - fieldPaths:
+          - spec.template.spec.containers.[name=fetch-safetensors].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+          - spec.template.spec.containers.[name=autotuner].volumeMounts.[name=huggingface-hub-model-bucket].mountPath
+        options:
+          delimiter: / # NOTE(review): assumes the base mountPath is /<dir>/<placeholder> — confirm against ../base/job.yaml.
+          index: 2
+        select:
+          kind: Job
+  - source:
+      fieldPath: data.MODEL_NAME
+      kind: ConfigMap
+      name: runtime
+    targets:
+      - fieldPaths:
+          - spec.template.metadata.labels.[ai.gke.io/model]
+        select:
+          kind: Job
+
+resources:
+  - ../base
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-nodeselector.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-nodeselector.yaml
new file mode 100644
index 000000000..53e256d73
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-nodeselector.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vllm-auto-tuning # NOTE(review): presumably the base Job's name before nameSuffix is applied — confirm against ../base/job.yaml.
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      nodeSelector:
+        cloud.google.com/compute-class: gpu-rtx-pro-6000-96gb-x1 # Restricts scheduling to nodes carrying this compute-class label.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-resources.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-resources.yaml
new file mode 100644
index 000000000..23d3c5f6d
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/patch-resources.yaml
@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vllm-auto-tuning
+  namespace: replaced-by-kustomize
+spec:
+  template:
+    spec:
+      containers:
+        - name: autotuner # Same container the overlay's image replacement targets.
+          resources:
+            limits:
+              cpu: "10"
+              memory: 128G # Decimal suffix: 128 * 10^9 bytes (not 128Gi).
+              nvidia.com/gpu: "1"
+            requests: # Identical to the limits for this container.
+              cpu: "10"
+              memory: 128G
+              nvidia.com/gpu: "1"
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/runtime.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/runtime.env
new file mode 100644
index 000000000..cbb6c9c61
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/rtx-pro-6000-qwen3-32b/runtime.env
@@ -0,0 +1,12 @@
+APP_LABEL=vllm-rtx-pro-6000-qwen3-32b
+GPU_MEMORY_UTILIZATION=0.90
+INPUT_LEN=1028
+MAX_LATENCY_ALLOWED_MS=100000000000
+MAX_MODEL_LEN=32768
+MIN_CACHE_HIT_PCT=0
+MODEL_ID=qwen/qwen3-32b
+MODEL_NAME=qwen3-32b
+MODEL_LOCATION=/gcs/qwen/qwen3-32b
+NUM_BATCHED_TOKENS_LIST=4096 8192 16384 32768 65536
+NUM_SEQS_LIST=64 128 256 512
+OUTPUT_LEN=128
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf
index 8f573d8f7..64f7f0dd5 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/outputs.tf
@@ -104,6 +104,14 @@ output "ira_async_pubsub_prompt_messages_topic_name" {
   value = local.ira_async_pubsub_prompt_messages_topic_name
 }
 
+output "ira_auto_tuning_vllm_kubernetes_namespace_name" {
+  value = local.ira_auto_tuning_vllm_kubernetes_namespace_name # Local is defined in vllm-auto-tuning_variables.tf.
+}
+
+output "ira_auto_tuning_vllm_results_bucket" {
+  value = local.ira_auto_tuning_vllm_results_bucket # Same name is referenced as ${ira_auto_tuning_vllm_results_bucket} in vllm.tpl.env.
+}
+
 output "ira_inference_perf_bench_kubernetes_service_account_name" {
   value = local.ira_inference_perf_bench_kubernetes_service_account_name
 }
@@ -239,4 +247,3 @@ output "workflow_api_service_account_oauth_display_name" {
 output "workflow_api_service_account_project_id" {
   value = local.workflow_api_service_account_project_id
 }
-
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
@@ -0,0 +1 @@
+
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf
new file mode 100644
index 000000000..ccc9e3a25
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf
@@ -0,0 +1,44 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+locals { # Each local is its variable when that is non-null, otherwise the derived default on the right.
+  ira_auto_tuning_vllm_kubernetes_namespace_name       = var.ira_auto_tuning_vllm_kubernetes_namespace_name != null ? var.ira_auto_tuning_vllm_kubernetes_namespace_name : "${local.unique_identifier_prefix}-vllm-auto-tuning"
+  ira_auto_tuning_vllm_kubernetes_service_account_name = var.ira_auto_tuning_vllm_kubernetes_service_account_name != null ? var.ira_auto_tuning_vllm_kubernetes_service_account_name : "${local.unique_identifier_prefix}-vllm-auto-tuning-ksa"
+  ira_auto_tuning_vllm_results_bucket                  = var.ira_auto_tuning_vllm_results_bucket != null ? var.ira_auto_tuning_vllm_results_bucket : "${local.cluster_project_id}-${local.unique_identifier_prefix}-vllm-auto-tuning-results"
+  ira_auto_tuning_vllm_secretproviderclass             = var.ira_auto_tuning_vllm_secretproviderclass != null ? var.ira_auto_tuning_vllm_secretproviderclass : "huggingface-token-read"
+}
+
+variable "ira_auto_tuning_vllm_kubernetes_namespace_name" {
+  default     = null
+  description = "The Kubernetes namespace for the vLLM auto-tuning workloads."
+  type        = string
+}
+
+variable "ira_auto_tuning_vllm_kubernetes_service_account_name" {
+  default     = null
+  description = "The Kubernetes service account for the vLLM auto-tuning workloads."
+  type        = string
+}
+
+variable "ira_auto_tuning_vllm_results_bucket" {
+  default     = null
+  description = "The Cloud Storage bucket for storing vLLM auto-tuning results."
+  type        = string
+}
+
+variable "ira_auto_tuning_vllm_secretproviderclass" {
+  default     = null
+  description = "The SecretProviderClass used to access the Hugging Face read token."
+  type        = string
+}
+
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/.terraform.lock.hcl b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/.terraform.lock.hcl
new file mode 100644
index 000000000..27d625960
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/.terraform.lock.hcl
@@ -0,0 +1,42 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/hashicorp/google" {
+  version     = "7.6.0"
+  constraints = "7.6.0"
+  hashes = [
+    "h1:JYsO3fV5OtaNuRTdjGZC1Z3Ku1ZIrRJGwXwsBjtWudk=",
+    "zh:0c70c768b0a34d7a61de70d0e85cf0057820556647bbce2384972a45d7092e4e",
+    "zh:0cb7aab89cd435c5c8dab9231ea176d64fdf1df1125db15a6b9ead978a93c0b2",
+    "zh:32f25c42214bb356bb67cef6057c9904f2878cd053a7760e5ee3737619f28638",
+    "zh:38b05b1171ab086c88b95d379120fb6c28c9e895ae924557c11c35e138319119",
+    "zh:39d8206d453a614fa0be3aeac8ea3921fb3ab7ed122205cbbcc2a41ca6176cb5",
+    "zh:58d9059aa6b4aab5ede4fc173dcdc7b4d042d0b1a1ab55407dd345931d7f4815",
+    "zh:a4bc001c8ac7700d0107155296250c3b8969511e1a488f3b318f3db62362eef2",
+    "zh:cc75e25db4bb672ebc200a89d6cff9ff0b9911e14e188d1b4429bb3511d2b35f",
+    "zh:d7f7639930735f17b2b4f73814204a9a050186ea7e1c2671a52e0fa7ddf7a001",
+    "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
+    "zh:ff1190ae618dae9243de59caf4149abb4a9b775cb6439f119cd32a30f1a21820",
+    "zh:ff15b7b86787f6fd186211e7c37a72f2cc70374b284aaf063e1f989717441161",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/local" {
+  version     = "2.5.3"
+  constraints = "2.5.3"
+  hashes = [
+    "h1:1Nkh16jQJMp0EuDmvP/96f5Unnir0z12WyDuoR6HjMo=",
+    "zh:284d4b5b572eacd456e605e94372f740f6de27b71b4e1fd49b63745d8ecd4927",
+    "zh:40d9dfc9c549e406b5aab73c023aa485633c1b6b730c933d7bcc2fa67fd1ae6e",
+    "zh:6243509bb208656eb9dc17d3c525c89acdd27f08def427a0dce22d5db90a4c8b",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:885d85869f927853b6fe330e235cd03c337ac3b933b0d9ae827ec32fa1fdcdbf",
+    "zh:bab66af51039bdfcccf85b25fe562cbba2f54f6b3812202f4873ade834ec201d",
+    "zh:c505ff1bf9442a889ac7dca3ac05a8ee6f852e0118dd9a61796a2f6ff4837f09",
+    "zh:d36c0b5770841ddb6eaf0499ba3de48e5d4fc99f4829b6ab66b0fab59b1aaf4f",
+    "zh:ddb6a407c7f3ec63efb4dad5f948b54f7f4434ee1a2607a49680d494b1776fe1",
+    "zh:e0dafdd4500bec23d3ff221e3a9b60621c5273e5df867bc59ef6b7e41f5c91f6",
+    "zh:ece8742fd2882a8fc9d6efd20e2590010d43db386b920b2a9c220cfecc18de47",
+    "zh:f4c6b3eb8f39105004cf720e202f04f57e3578441cfb76ca27611139bc116a82",
+  ]
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild.auto.tfvars
new file mode 120000
index 000000000..c730c32e8
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild.auto.tfvars
@@ -0,0 +1 @@
+../../../../_shared_config/cloudbuild.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild_variables.tf
new file mode 120000
index 000000000..5a143590a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cloudbuild_variables.tf
@@ -0,0 +1 @@
+../../../../_shared_config/cloudbuild_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster.auto.tfvars
new file mode 120000
index 000000000..98a694db9
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster.auto.tfvars
@@ -0,0 +1 @@
+../../../../_shared_config/cluster.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster_variables.tf
new file mode 120000
index 000000000..00625515b
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_cluster_variables.tf
@@ -0,0 +1 @@
+../../../../_shared_config/cluster_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface.auto.tfvars
new file mode 120000
index 000000000..276530b81
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface.auto.tfvars
@@ -0,0 +1 @@
+../../../../_shared_config/huggingface.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface_variables.tf
new file mode 120000
index 000000000..f384bc7e1
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_huggingface_variables.tf
@@ -0,0 +1 @@
+../../../../_shared_config/huggingface_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch.auto.tfvars
new file mode 120000
index 000000000..31e8d28d2
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch.auto.tfvars
@@ -0,0 +1 @@
+../_shared_config/inference-ref-arch.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch_variables.tf
new file mode 120000
index 000000000..502fdca32
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_inference-ref-arch_variables.tf
@@ -0,0 +1 @@
+../_shared_config/inference-ref-arch_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform.auto.tfvars
new file mode 120000
index 000000000..125a652cf
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform.auto.tfvars
@@ -0,0 +1 @@
+../../../../_shared_config/platform.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform_variables.tf
new file mode 120000
index 000000000..486b3eaef
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_platform_variables.tf
@@ -0,0 +1 @@
+../../../../_shared_config/platform_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning.auto.tfvars
new file mode 120000
index 000000000..3499ccadc
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning.auto.tfvars
@@ -0,0 +1 @@
+../_shared_config/vllm-auto-tuning.auto.tfvars
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning_variables.tf
new file mode 120000
index 000000000..ba00da1f7
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/_vllm-auto-tuning_variables.tf
@@ -0,0 +1 @@
+../_shared_config/vllm-auto-tuning_variables.tf
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/iam.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/iam.tf
new file mode 100644
index 000000000..6f02f6041
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/iam.tf
@@ -0,0 +1,32 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+resource "google_storage_bucket_iam_member" "auto_tuning_bucket_access_at_ksa" {
+  bucket = google_storage_bucket.auto_tuning_results.name
+  member = local.ira_auto_tuning_vllm_ksa_member # Workload Identity principal of the auto-tuning KSA (built in kubernetes.tf).
+  role   = "roles/storage.objectUser"
+}
+
+resource "google_storage_bucket_iam_member" "hub_models_bucket_access_at_ksa" {
+  bucket = data.google_storage_bucket.hub_models.name # Use the looked-up bucket (storage.tf) so the grant is tied to the data source, like the secret binding.
+  member = local.ira_auto_tuning_vllm_ksa_member
+  role   = "roles/storage.objectUser"
+}
+
+resource "google_secret_manager_secret_iam_member" "hub_token_read_access_at_ksa" { # Grants the auto-tuning KSA principal accessor rights on the hub_access_token_read secret.
+  member    = local.ira_auto_tuning_vllm_ksa_member
+  project   = data.google_secret_manager_secret.hub_access_token_read.project
+  role      = "roles/secretmanager.secretAccessor"
+  secret_id = data.google_secret_manager_secret.hub_access_token_read.secret_id
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/kubernetes.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/kubernetes.tf
new file mode 100644
index 000000000..85bc698ff
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/kubernetes.tf
@@ -0,0 +1,103 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+locals { # Workload Identity principal for the auto-tuning KSA, plus the kubeconfig and manifest paths used below.
+  cluster_wi_principal_prefix                         = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject"
+  ira_auto_tuning_vllm_ksa_member                     = "${local.cluster_wi_principal_prefix}/ns/${local.ira_auto_tuning_vllm_kubernetes_namespace_name}/sa/${local.ira_auto_tuning_vllm_kubernetes_service_account_name}"
+  ira_auto_tuning_vllm_kubernetes_namespace_directory = "${local.namespaces_directory}/${local.ira_auto_tuning_vllm_kubernetes_namespace_name}"
+  kubeconfig_directory                                = "${path.module}/../../../../kubernetes/kubeconfig"
+  kubeconfig_file                                     = "${local.kubeconfig_directory}/${local.kubeconfig_file_name}"
+  manifests_directory_root                            = "${path.module}/../../../../kubernetes/manifests"
+  namespaces_directory                                = "${local.manifests_directory_root}/namespace"
+}
+
+data "local_file" "kubeconfig" { # Only .filename is consumed, by the kubectl_apply modules in this file.
+  filename = local.kubeconfig_file
+}
+
+# Render the Namespace manifest from its template into local.namespaces_directory (applied by kubectl_apply_namespace).
+resource "local_file" "namespace_yaml" {
+  content = templatefile(
+    "${path.module}/templates/kubernetes/namespace.tftpl.yaml",
+    {
+      kubernetes_namespace = local.ira_auto_tuning_vllm_kubernetes_namespace_name
+    }
+  )
+  file_permission = "0644"
+  filename        = "${local.namespaces_directory}/namespace-${local.ira_auto_tuning_vllm_kubernetes_namespace_name}.yaml"
+}
+
+module "kubectl_apply_namespace" {
+  source = "../../../../modules/kubectl_apply"
+  depends_on = [
+    local_file.namespace_yaml,
+  ]
+
+  # Applies the rendered Namespace manifest; the manifest must be written to disk first (see depends_on).
+  delete_timeout              = "60s"
+  error_on_delete_failure     = false
+  kubeconfig_file             = data.local_file.kubeconfig.filename
+  manifest                    = "${local.namespaces_directory}/namespace-${local.ira_auto_tuning_vllm_kubernetes_namespace_name}.yaml"
+  manifest_includes_namespace = true
+}
+
+resource "local_file" "serviceaccount_yaml" {
+  content = templatefile(
+    "${path.module}/templates/kubernetes/serviceaccount.tftpl.yaml",
+    {
+      name      = local.ira_auto_tuning_vllm_kubernetes_service_account_name
+      namespace = local.ira_auto_tuning_vllm_kubernetes_namespace_name
+    }
+  )
+  file_permission = "0644" # Same mode as namespace_yaml; without it the provider default (0777) is used.
+  filename        = "${local.ira_auto_tuning_vllm_kubernetes_namespace_directory}/serviceaccount-${local.ira_auto_tuning_vllm_kubernetes_service_account_name}.yaml"
+}
+
+module "kubectl_apply_service_account" {
+  source = "../../../../modules/kubectl_apply"
+  depends_on = [
+    local_file.serviceaccount_yaml, module.kubectl_apply_namespace
+  ]
+  # Runs after the ServiceAccount manifest is rendered and the namespace module has been applied.
+  apply_server_side           = true
+  kubeconfig_file             = data.local_file.kubeconfig.filename
+  manifest                    = "${local.ira_auto_tuning_vllm_kubernetes_namespace_directory}/serviceaccount-${local.ira_auto_tuning_vllm_kubernetes_service_account_name}.yaml"
+  manifest_includes_namespace = true
+}
+
+resource "local_file" "secretproviderclass_yaml" {
+  content = templatefile(
+    "${path.module}/templates/kubernetes/secretproviderclass.tftpl.yaml",
+    {
+      namespace                = local.ira_auto_tuning_vllm_kubernetes_namespace_name
+      project_id               = data.google_secret_manager_secret.hub_access_token_read.project
+      secretproviderclass_name = local.ira_auto_tuning_vllm_secretproviderclass
+      secret_name              = local.huggingface_hub_access_token_read_secret_manager_secret_name
+    }
+  )
+  file_permission = "0644" # Same mode as namespace_yaml; without it the provider default (0777) is used.
+  filename        = "${local.ira_auto_tuning_vllm_kubernetes_namespace_directory}/secretproviderclass-${local.ira_auto_tuning_vllm_secretproviderclass}.yaml"
+}
+
+module "kubectl_apply_secretproviderclass" {
+  source = "../../../../modules/kubectl_apply"
+  depends_on = [
+    local_file.secretproviderclass_yaml, module.kubectl_apply_namespace
+  ]
+  # Runs after the SecretProviderClass manifest is rendered and the namespace module has been applied.
+  apply_server_side           = true
+  kubeconfig_file             = data.local_file.kubeconfig.filename
+  manifest                    = "${local.ira_auto_tuning_vllm_kubernetes_namespace_directory}/secretproviderclass-${local.ira_auto_tuning_vllm_secretproviderclass}.yaml"
+  manifest_includes_namespace = true
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/project.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/project.tf
new file mode 100644
index 000000000..c6dc19a4a
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/project.tf
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Cluster project lookup; its number and project_id build the Workload Identity principal in kubernetes.tf.
+data "google_project" "cluster" {
+  project_id = local.cluster_project_id
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/secret_manager.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/secret_manager.tf
new file mode 100644
index 000000000..94f82d903
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/secret_manager.tf
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+data "google_secret_manager_secret" "hub_access_token_read" { # Lookup only: the secret is read here, not created by this module.
+  project   = var.platform_default_project_id
+  secret_id = local.huggingface_hub_access_token_read_secret_manager_secret_name
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/storage.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/storage.tf
new file mode 100644
index 000000000..1e9cd6cb8
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/storage.tf
@@ -0,0 +1,28 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+### Existing Hugging Face Hub models GCS bucket (looked up, not managed here) ###
+data "google_storage_bucket" "hub_models" {
+  name    = local.huggingface_hub_models_bucket_name
+  project = local.huggingface_hub_models_bucket_project_id
+}
+
+### vllm-auto-tuning results bucket, created in the cluster's region ###
+resource "google_storage_bucket" "auto_tuning_results" {
+  force_destroy               = false
+  location                    = local.cluster_region
+  name                        = local.ira_auto_tuning_vllm_results_bucket
+  project                     = var.platform_default_project_id
+  uniform_bucket_level_access = true
+}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/namespace.tftpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/namespace.tftpl.yaml
new file mode 100644
index 000000000..0b3559cd2
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/namespace.tftpl.yaml
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ${kubernetes_namespace} # filled in by templatefile() in kubernetes.tf (local_file.namespace_yaml)
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/secretproviderclass.tftpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/secretproviderclass.tftpl.yaml
new file mode 100644
index 000000000..7f381f934
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/secretproviderclass.tftpl.yaml
@@ -0,0 +1,25 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: secrets-store.csi.x-k8s.io/v1
+kind: SecretProviderClass
+metadata:
+  name: ${secretproviderclass_name}
+  namespace: ${namespace}
+spec: # Exposes the "latest" version of the referenced secret under the path "token".
+  parameters:
+    secrets: |
+      - resourceName: "projects/${project_id}/secrets/${secret_name}/versions/latest"
+        path: "token"
+  provider: gke
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/serviceaccount.tftpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/serviceaccount.tftpl.yaml
new file mode 100644
index 000000000..a0f63c9dc
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/templates/kubernetes/serviceaccount.tftpl.yaml
@@ -0,0 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: ${name} # name/namespace are filled in by templatefile() in kubernetes.tf (local_file.serviceaccount_yaml)
+  namespace: ${namespace}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/versions.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/versions.tf
new file mode 100644
index 000000000..dca89f111
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/vllm-auto-tuning/versions.tf
@@ -0,0 +1,32 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_version = ">= 1.5.7"
+
+  required_providers { # Both providers are pinned to an exact version.
+    google = {
+      source  = "hashicorp/google"
+      version = "7.6.0"
+    }
+    local = {
+      source  = "hashicorp/local"
+      version = "2.5.3"
+    }
+  }
+
+  provider_meta "google" { # NOTE(review): module_name is presumably used for usage attribution — confirm.
+    module_name = "cloud-solutions/vllm_auto_tuner-v1"
+  }
+}

From d000695d5afe7325e3b64aacdecb652a580012c0 Mon Sep 17 00:00:00 2001
From: gushob21 <gushob@google.com>
Date: Thu, 28 May 2026 16:19:42 +0000
Subject: [PATCH 2/3] fixing end of the file newline

---
 .../_shared_config/vllm-auto-tuning.auto.tfvars    | 14 +++++++++++++-
 .../_shared_config/vllm-auto-tuning_variables.tf   |  1 -
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
index 8b1378917..c37e93b74 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning.auto.tfvars
@@ -1 +1,13 @@
-
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf
index ccc9e3a25..1824fbf2c 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf
+++ b/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/vllm-auto-tuning_variables.tf
@@ -41,4 +41,3 @@ variable "ira_auto_tuning_vllm_secretproviderclass" {
   description = "The Secretproviderclass to access huggingface read token."
   type        = string
 }
-

From c0561904d51bc34f8aba2a130e3cd149d0b097a2 Mon Sep 17 00:00:00 2001
From: gushob21 <gushob@google.com>
Date: Fri, 29 May 2026 15:05:10 +0000
Subject: [PATCH 3/3] Add validate_kustomize test for vllm-auto-tuning

---
 .../gke/base/use-cases/inference-ref-arch/validate_kustomize.sh  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh
index 102621046..0fb5a0cae 100755
--- a/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh
+++ b/test/ci-cd/scripts/platforms/gke/base/use-cases/inference-ref-arch/validate_kustomize.sh
@@ -49,6 +49,7 @@ export ACCELERATOR_TYPE="l4"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/async-inference-gpu/vllm/configure_vllm.sh"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/diffusers/configure_diffusers.sh"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm/configure_vllm.sh"
+"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-auto-tuning/configure_vllm.sh" # NOTE(review): runs with ACCELERATOR_TYPE="l4"; overlays in this change are h100/rtx-pro-6000 — confirm this is intended.
 
 export ACCELERATOR_TYPE="v5e"
 "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-tpu/max-diffusion/configure_max_diffusion.sh"