
Commit a9577cc

Merge branch 'kubernetes-sigs:main' into main
2 parents 2506b4e + 62f489b commit a9577cc


7 files changed: +109 −57 lines changed


apix/v1alpha2/inferenceobjective_types.go

Lines changed: 0 additions & 15 deletions
@@ -25,7 +25,6 @@ import (
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
 // +kubebuilder:storageversion
-// +kubebuilder:printcolumn:name="Model Name",type=string,JSONPath=`.spec.modelName`
 // +kubebuilder:printcolumn:name="Inference Pool",type=string,JSONPath=`.spec.poolRef.name`
 // +kubebuilder:printcolumn:name="Priority",type=string,JSONPath=`.spec.priority`
 // +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
@@ -56,12 +55,6 @@ type InferenceObjectiveList struct {
 // performance and latency goals for the model. These workloads are
 // expected to operate within an InferencePool sharing compute capacity with other
 // InferenceObjectives, defined by the Inference Platform Admin.
-//
-// InferenceObjective's modelName (not the ObjectMeta name) is unique for a given InferencePool,
-// if the name is reused, an error will be shown on the status of a
-// InferenceObjective that attempted to reuse. The oldest InferenceObjective, based on
-// creation timestamp, will be selected to remain valid. In the event of a race
-// condition, one will be selected at random.
 type InferenceObjectiveSpec struct {
 
 	// Priority defines how important it is to serve the request compared to other requests in the same pool.
@@ -135,10 +128,6 @@ const (
 	//
 	// * "Accepted"
 	//
-	// Possible reasons for this condition to be False are:
-	//
-	// * "ModelNameInUse"
-	//
 	// Possible reasons for this condition to be Unknown are:
 	//
 	// * "Pending"
@@ -148,10 +137,6 @@ const (
 	// ObjectiveReasonAccepted is the desired state. Model conforms to the state of the pool.
 	ObjectiveReasonAccepted InferenceObjectiveConditionReason = "Accepted"
 
-	// ObjectiveReasonNameInUse is used when a given ModelName already exists within the pool.
-	// Details about naming conflict resolution are on the ModelName field itself.
-	ObjectiveReasonNameInUse InferenceObjectiveConditionReason = "ModelNameInUse"
-
 	// ObjectiveReasonPending is the initial state, and indicates that the controller has not yet reconciled the InferenceObjective.
 	ObjectiveReasonPending InferenceObjectiveConditionReason = "Pending"
 )
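With `ModelNameInUse` removed, `Accepted` and `Pending` are the only reasons left on the `Accepted` condition. A minimal Go sketch of how a consumer might interpret the remaining states, assuming the objective's status exposes a standard `[]metav1.Condition` slice (the helper function and sample data below are illustrative, not part of this API):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// describeAcceptance maps the remaining condition reasons to
// human-readable states. "ModelNameInUse" no longer appears.
func describeAcceptance(conds []metav1.Condition) string {
	cond := meta.FindStatusCondition(conds, "Accepted")
	if cond == nil {
		return "no Accepted condition reported yet"
	}
	switch cond.Reason {
	case "Accepted": // ObjectiveReasonAccepted
		return "accepted: objective conforms to the state of the pool"
	case "Pending": // ObjectiveReasonPending
		return "pending: controller has not yet reconciled the objective"
	default:
		return fmt.Sprintf("unexpected reason %q: %s", cond.Reason, cond.Message)
	}
}

func main() {
	// Sample condition as a controller might set it after reconciling.
	conds := []metav1.Condition{{
		Type:   "Accepted",
		Status: metav1.ConditionTrue,
		Reason: "Accepted",
	}}
	fmt.Println(describeAcceptance(conds))
}
```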

config/charts/inferencepool/templates/inferencepool.yaml

Lines changed: 4 additions & 0 deletions
@@ -20,3 +20,7 @@ spec:
 {{- end }}
   endpointPickerRef:
     name: {{ include "gateway-api-inference-extension.name" . }}
+    port:
+      number: {{ .Values.inferenceExtension.extProcPort | default 9002 }}
+
+
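The added lines give the `endpointPickerRef` an explicit port, falling back to 9002 when `inferenceExtension.extProcPort` is not set in the chart values. A minimal Go sketch of the same defaulting rule, useful when building the EPP target address outside of Helm (the function and names are illustrative assumptions, not chart code):

```go
package main

import "fmt"

// eppAddress mirrors the template's `| default 9002` expression:
// a zero (unset) port falls back to the conventional ext-proc port.
func eppAddress(service string, extProcPort int) string {
	if extProcPort == 0 {
		extProcPort = 9002 // chart default
	}
	return fmt.Sprintf("%s:%d", service, extProcPort)
}

func main() {
	fmt.Println(eppAddress("my-pool-epp", 0))    // my-pool-epp:9002
	fmt.Println(eppAddress("my-pool-epp", 9100)) // my-pool-epp:9100
}
```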

config/crd/bases/inference.networking.x-k8s.io_inferenceobjectives.yaml

Lines changed: 0 additions & 9 deletions
@@ -15,9 +15,6 @@ spec:
   scope: Namespaced
   versions:
   - additionalPrinterColumns:
-    - jsonPath: .spec.modelName
-      name: Model Name
-      type: string
     - jsonPath: .spec.poolRef.name
       name: Inference Pool
       type: string
@@ -61,12 +58,6 @@ spec:
           performance and latency goals for the model. These workloads are
           expected to operate within an InferencePool sharing compute capacity with other
           InferenceObjectives, defined by the Inference Platform Admin.
-
-          InferenceObjective's modelName (not the ObjectMeta name) is unique for a given InferencePool,
-          if the name is reused, an error will be shown on the status of a
-          InferenceObjective that attempted to reuse. The oldest InferenceObjective, based on
-          creation timestamp, will be selected to remain valid. In the event of a race
-          condition, one will be selected at random.
         properties:
           poolRef:
             description: PoolRef is a reference to the inference pool, the pool

mkdocs.yml

Lines changed: 3 additions & 2 deletions
@@ -12,6 +12,7 @@ theme:
   logo: images/logo/logo-text-large-horizontal-white.png
   favicon: images/favicon-64.png
   features:
+    - content.code.annotate
     - search.highlight
     - navigation.tabs
     - navigation.top
@@ -55,7 +56,7 @@ nav:
     - Design Principles: concepts/design-principles.md
     - Conformance: concepts/conformance.md
     - Roles and Personas: concepts/roles-and-personas.md
-  - Implementations:
+  - Implementations:
     - Gateways: implementations/gateways.md
     - Model Servers: implementations/model-servers.md
   - FAQ: faq.md
@@ -70,7 +71,7 @@ nav:
     - InferencePool Rollout: guides/inferencepool-rollout.md
     - Metrics and Observability: guides/metrics-and-observability.md
    - Configuration Guide:
-      - Configuring the plugins via configuration files or text: guides/epp-configuration/config-text.md
+      - Configuring the plugins via configuration files or text: guides/epp-configuration/config-text.md
      - Prefix Cache Aware Plugin: guides/epp-configuration/prefix-aware.md
    - Troubleshooting Guide: guides/troubleshooting.md
    - Implementer Guides:

site-src/guides/index.md

Lines changed: 3 additions & 2 deletions
@@ -137,8 +137,9 @@ Tooling:
 
 === "GKE"
 
-    1. Enable the Gateway API and configure proxy-only subnets when necessary. See [Deploy Gateways](https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways)
-       for detailed instructions.
+    1. Enable the Google Kubernetes Engine API, Compute Engine API, the Network Services API and configure proxy-only subnets when necessary.
+       See [Deploy Inference Gateways](https://cloud.google.com/kubernetes-engine/docs/how-to/deploy-gke-inference-gateway)
+       for detailed instructions.
 
     2. Deploy Inference Gateway:
 
Lines changed: 98 additions & 28 deletions
@@ -1,18 +1,53 @@
 # Serve multiple generative AI models
-A company wants to deploy multiple large language models (LLMs) to serve different workloads.
-For example, they might want to deploy a Gemma3 model for a chatbot interface and a Deepseek model for a recommendation application.
+
+A company wants to deploy multiple large language models (LLMs) to a cluster to serve different workloads.
+For example, they might want to deploy a Gemma3 model for a chatbot interface and a DeepSeek model for a recommendation application.
 The company needs to ensure optimal serving performance for these LLMs.
-By using an Inference Gateway, you can deploy these LLMs on your cluster with your chosen accelerator configuration in an `InferencePool`.
-You can then route requests based on the model name (such as "chatbot" and "recommender") and the `Criticality` property.
+By using an Inference Gateway, you can deploy these LLMs on your cluster with your chosen accelerator configuration in an `InferencePool`.
+You can then route requests based on the model name (such as `chatbot` and `recommender`) and the `Criticality` property.
 
 ## How
+
 The following diagram illustrates how an Inference Gateway routes requests to different models based on the model name.
-The model name is extracted by [Body-Based routing](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md)
+The model name is extracted by [Body-Based routing](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) (BBR)
 from the request body to the header. The header is then matched to dispatch
 requests to different `InferencePool` (and their EPPs) instances.
 ![Serving multiple generative AI models](../images/serve-mul-gen-AI-models.png)
 
+### Deploy Body-Based Routing
+
+To enable body-based routing, you need to deploy the Body-Based Routing ExtProc server using Helm. Depending on your Gateway provider, you can use one of the following commands:
+
+=== "GKE"
+
+    ```bash
+    helm install body-based-router \
+      --set provider.name=gke \
+      --version v0.5.1 \
+      oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+    ```
+
+=== "Istio"
+
+    ```bash
+    helm install body-based-router \
+      --set provider.name=istio \
+      --version v0.5.1 \
+      oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+    ```
+
+=== "Other"
+
+    ```bash
+    helm install body-based-router \
+      --version v0.5.1 \
+      oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing
+    ```
+
+### Configure HTTPRoute
+
 This example illustrates a conceptual example regarding how to use the `HTTPRoute` object to route based on model name like “chatbot” or “recommender” to `InferencePool`.
+
 ```yaml
 apiVersion: gateway.networking.k8s.io/v1
 kind: HTTPRoute
@@ -25,8 +60,7 @@ spec:
   - matches:
     - headers:
       - type: Exact
-        #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
-        name: X-Gateway-Model-Name
+        name: X-Gateway-Model-Name # (1)!
         value: chatbot
       path:
         type: PathPrefix
@@ -37,38 +71,74 @@ spec:
   - matches:
     - headers:
       - type: Exact
-        #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
-        name: X-Gateway-Model-Name
+        name: X-Gateway-Model-Name # (2)!
        value: recommender
       path:
         type: PathPrefix
         value: /
     backendRefs:
     - name: deepseek-r1
-      kind: InferencePool
+      kind: InferencePool
 ```
 
+1. [BBR](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header with key `X-Gateway-Model-Name`. The header can then be used in the `HTTPRoute` to route requests to different `InferencePool` instances.
+2. [BBR](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header with key `X-Gateway-Model-Name`. The header can then be used in the `HTTPRoute` to route requests to different `InferencePool` instances.
+
 ## Try it out
 
 1. Get the gateway IP:
 ```bash
 IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80
 ```
-2. Send a few requests to model "chatbot" as follows:
-```bash
-curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "chatbot",
-"prompt": "What is the color of the sky",
-"max_tokens": 100,
-"temperature": 0
-}'
-```
-3. Send a few requests to model "recommender" as follows:
-```bash
-curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "recommender",
-"prompt": "Give me restaurant recommendations in Paris",
-"max_tokens": 100,
-"temperature": 0
-}'
-```
+
+=== "Chat Completions API"
+
+    1. Send a few requests to model `chatbot` as follows:
+    ```bash
+    curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+      -H "Content-Type: application/json" \
+      -d '{
+        "model": "chatbot",
+        "messages": [{"role": "user", "content": "What is the color of the sky?"}],
+        "max_tokens": 100,
+        "temperature": 0
+      }'
+    ```
+
+    2. Send a few requests to model `recommender` as follows:
+    ```bash
+    curl -X POST -i ${IP}:${PORT}/v1/chat/completions \
+      -H "Content-Type: application/json" \
+      -d '{
+        "model": "recommender",
+        "messages": [{"role": "user", "content": "Give me restaurant recommendations in Paris"}],
+        "max_tokens": 100,
+        "temperature": 0
+      }'
+    ```
+
+=== "Completions API"
+
+    1. Send a few requests to model `chatbot` as follows:
+    ```bash
+    curl -X POST -i ${IP}:${PORT}/v1/completions \
+      -H 'Content-Type: application/json' \
+      -d '{
+        "model": "chatbot",
+        "prompt": "What is the color of the sky",
+        "max_tokens": 100,
+        "temperature": 0
+      }'
+    ```
+
+    2. Send a few requests to model `recommender` as follows:
+    ```bash
+    curl -X POST -i ${IP}:${PORT}/v1/completions \
+      -H 'Content-Type: application/json' \
+      -d '{
+        "model": "recommender",
+        "prompt": "Give me restaurant recommendations in Paris",
+        "max_tokens": 100,
+        "temperature": 0
+      }'
+    ```
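The annotations above describe what BBR does at request time: it reads the `model` field from the JSON request body and copies it into the `X-Gateway-Model-Name` header, which the `HTTPRoute` then matches on. A minimal Go sketch of that extraction step, assuming an OpenAI-style request body (an illustration only, not the BBR implementation):

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// modelNameHeader is the header key the HTTPRoute rules match on.
const modelNameHeader = "X-Gateway-Model-Name"

// extractModelName pulls the "model" field out of an OpenAI-style
// request body, mirroring the copy-to-header step BBR performs
// before route matching runs.
func extractModelName(body []byte) (string, error) {
	var req struct {
		Model string `json:"model"`
	}
	if err := json.Unmarshal(body, &req); err != nil {
		return "", fmt.Errorf("parsing request body: %w", err)
	}
	return req.Model, nil
}

func main() {
	body := []byte(`{"model": "chatbot", "prompt": "What is the color of the sky"}`)
	model, err := extractModelName(body)
	if err != nil {
		panic(err)
	}
	// The extracted name is surfaced as a header so the gateway can
	// dispatch the request to the matching InferencePool.
	h := http.Header{}
	h.Set(modelNameHeader, model)
	fmt.Println(h.Get(modelNameHeader)) // chatbot
}
```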

site-src/index.md

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ The following specific terms to this project:
   from [Model Serving](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol/README.md).
 - **Metrics and Capabilities**: Data provided by model serving platforms about
   performance, availability and capabilities to optimize routing. Includes
-  things like [Prefix Cache] status or [LoRA Adapters] availability.
+  things like [Prefix Cache](https://docs.vllm.ai/en/stable/design/v1/prefix_caching.html) status or [LoRA Adapters](https://docs.vllm.ai/en/stable/features/lora.html) availability.
 - **Endpoint Picker(EPP)**: An implementation of an `Inference Scheduler` with additional Routing, Flow, and Request Control layers to allow for sophisticated routing strategies. Additional info on the architecture of the EPP [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal).
 
 [Inference Gateway]:#concepts-and-definitions
