fix: update activator label and var name.

X1aoZEOuO · X1aoZEOuO · commit 43f79d3c906d · 2025-10-30T01:49:02.000+08:00
Signed-off-by: X1aoZEOuO <nizefeng2002@outlook.com>
diff --git a/api/core/v1alpha1/model_types.go b/api/core/v1alpha1/model_types.go
@@ -35,10 +35,10 @@ const (
 	// Once either of them qualified, we'll expose this as a field in Model.
 	ModelPreheatAnnoKey = "llmaz.io/model-preheat"
 
-	// ModelActivatorAnnotationKey is used to indicate whether the model is activated by the activator.
-	ModelActivatorAnnoKey = "activator.llmaz.io/playground"
-	// CachedModelActivatorAnnotationKey is used to cache the activator info of the model.
-	CachedModelActivatorAnnoKey = "cached.activator.llmaz.io"
+	// ModelActivatorAnnoKey is used to indicate the model name activated by the activator.
+	ModelActivatorAnnoKey = "activator.llmaz.io/model-name"
+	// CachedModelActivatorAnnoKey is used to cache the activator state of the model.
+	CachedModelActivatorAnnoKey = "activator.llmaz.io/cached-state"
 
 	HUGGING_FACE = "Huggingface"
 	MODEL_SCOPE  = "ModelScope"
diff --git a/chart/values.yaml b/chart/values.yaml
@@ -5,7 +5,7 @@ controllerManager:
     - --metrics-bind-address=:8443
     - --leader-elect
     - --namespace=llmaz-system
-    - --enable-serverless
+    - --enable-service-activator
     - --pod-ip=$(POD_IP)
     containerSecurityContext:
       allowPrivilegeEscalation: false
diff --git a/cmd/main.go b/cmd/main.go
@@ -64,14 +64,14 @@ func main() {
 	var enableLeaderElection bool
 	var probeAddr string
 	var namespace string
-	var enableServerless bool
+	var enableServiceActivator bool
 	var podIP string
 
 	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
 	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
 	flag.StringVar(&namespace, "namespace", "llmaz-system", "The namespace of the llmaz to deploy")
-	flag.BoolVar(&enableServerless, "enable-serverless", false, "Enable the serverless feature")
-	flag.StringVar(&podIP, "pod-ip", "", "The pod IP of the llmaz controller manager")
+	flag.BoolVar(&enableServiceActivator, "enable-service-activator", false, "Enable the service activator feature. This is an experimental feature.")
+	flag.StringVar(&podIP, "pod-ip", "", "The pod IP of the llmaz controller manager. Only used when service activator is enabled.")
 	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
 		"Enable leader election for controller manager. "+
 			"Enabling this will ensure there is only one active controller manager.")
@@ -125,7 +125,7 @@ func main() {
 	// Cert won't be ready until manager starts, so start a goroutine here which
 	// will block until the cert is ready before setting up the controllers.
 	// Controllers who register after manager starts will start directly.
-	go setupControllers(mgr, certsReady, enableServerless, podIP)
+	go setupControllers(mgr, certsReady, enableServiceActivator, podIP)
 
 	//+kubebuilder:scaffold:builder
 
@@ -145,7 +145,7 @@ func main() {
 	}
 }
 
-func setupControllers(mgr ctrl.Manager, certsReady chan struct{}, enableServerless bool, podIP string) {
+func setupControllers(mgr ctrl.Manager, certsReady chan struct{}, enableServiceActivator bool, podIP string) {
 	// The controllers won't work until the webhooks are operating,
 	// and the webhook won't work until the certs are all in places.
 	setupLog.Info("waiting for the cert generation to complete")
@@ -181,7 +181,7 @@ func setupControllers(mgr ctrl.Manager, certsReady chan struct{}, enableServerle
 		os.Exit(1)
 	}
 
-	if enableServerless {
+	if enableServiceActivator {
 		dynamicClient, err := dynamic.NewForConfig(mgr.GetConfig())
 		if err != nil {
 			setupLog.Error(err, "unable to create dynamic client")
diff --git a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml
@@ -388,6 +388,12 @@ spec:
                         - port
                         type: object
                     type: object
+                  stopSignal:
+                    description: |-
+                      StopSignal defines which signal will be sent to a container when it is being stopped.
+                      If not specified, the default is defined by the container runtime in use.
+                      StopSignal can only be set for Pods with a non-empty .spec.os.name
+                    type: string
                 type: object
               livenessProbe:
                 description: |-
@@ -770,7 +776,9 @@ spec:
                                     policies:
                                       description: |-
                                         policies is a list of potential scaling polices which can be used during scaling.
-                                        At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
+                                        If not set, use the default values:
+                                        - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
+                                        - For scale down: allow all pods to be removed in a 15s window.
                                       items:
                                         description: HPAScalingPolicy is a single
                                           policy which must hold true for a specified
@@ -814,6 +822,24 @@ spec:
                                         - For scale down: 300 (i.e. the stabilization window is 300 seconds long).
                                       format: int32
                                       type: integer
+                                    tolerance:
+                                      anyOf:
+                                      - type: integer
+                                      - type: string
+                                      description: |-
+                                        tolerance is the tolerance on the ratio between the current and desired
+                                        metric value under which no updates are made to the desired number of
+                                        replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
+                                        set, the default cluster-wide tolerance is applied (by default 10%).
+
+                                        For example, if autoscaling is configured with a memory consumption target of 100Mi,
+                                        and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
+                                        triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
+
+                                        This is an alpha field and requires enabling the HPAConfigurableTolerance
+                                        feature gate.
+                                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                      x-kubernetes-int-or-string: true
                                   type: object
                                 scaleUp:
                                   description: |-
@@ -826,7 +852,9 @@ spec:
                                     policies:
                                       description: |-
                                         policies is a list of potential scaling polices which can be used during scaling.
-                                        At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
+                                        If not set, use the default values:
+                                        - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
+                                        - For scale down: allow all pods to be removed in a 15s window.
                                       items:
                                         description: HPAScalingPolicy is a single
                                           policy which must hold true for a specified
@@ -870,6 +898,24 @@ spec:
                                         - For scale down: 300 (i.e. the stabilization window is 300 seconds long).
                                       format: int32
                                       type: integer
+                                    tolerance:
+                                      anyOf:
+                                      - type: integer
+                                      - type: string
+                                      description: |-
+                                        tolerance is the tolerance on the ratio between the current and desired
+                                        metric value under which no updates are made to the desired number of
+                                        replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
+                                        set, the default cluster-wide tolerance is applied (by default 10%).
+
+                                        For example, if autoscaling is configured with a memory consumption target of 100Mi,
+                                        and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
+                                        triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
+
+                                        This is an alpha field and requires enabling the HPAConfigurableTolerance
+                                        feature gate.
+                                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                      x-kubernetes-int-or-string: true
                                   type: object
                               type: object
                             metrics:
diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -295,7 +295,9 @@ spec:
                                   policies:
                                     description: |-
                                       policies is a list of potential scaling polices which can be used during scaling.
-                                      At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
+                                      If not set, use the default values:
+                                      - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
+                                      - For scale down: allow all pods to be removed in a 15s window.
                                     items:
                                       description: HPAScalingPolicy is a single policy
                                         which must hold true for a specified past
@@ -339,6 +341,24 @@ spec:
                                       - For scale down: 300 (i.e. the stabilization window is 300 seconds long).
                                     format: int32
                                     type: integer
+                                  tolerance:
+                                    anyOf:
+                                    - type: integer
+                                    - type: string
+                                    description: |-
+                                      tolerance is the tolerance on the ratio between the current and desired
+                                      metric value under which no updates are made to the desired number of
+                                      replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
+                                      set, the default cluster-wide tolerance is applied (by default 10%).
+
+                                      For example, if autoscaling is configured with a memory consumption target of 100Mi,
+                                      and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
+                                      triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
+
+                                      This is an alpha field and requires enabling the HPAConfigurableTolerance
+                                      feature gate.
+                                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                    x-kubernetes-int-or-string: true
                                 type: object
                               scaleUp:
                                 description: |-
@@ -351,7 +371,9 @@ spec:
                                   policies:
                                     description: |-
                                       policies is a list of potential scaling polices which can be used during scaling.
-                                      At least one policy must be specified, otherwise the HPAScalingRules will be discarded as invalid
+                                      If not set, use the default values:
+                                      - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window.
+                                      - For scale down: allow all pods to be removed in a 15s window.
                                     items:
                                       description: HPAScalingPolicy is a single policy
                                         which must hold true for a specified past
@@ -395,6 +417,24 @@ spec:
                                       - For scale down: 300 (i.e. the stabilization window is 300 seconds long).
                                     format: int32
                                     type: integer
+                                  tolerance:
+                                    anyOf:
+                                    - type: integer
+                                    - type: string
+                                    description: |-
+                                      tolerance is the tolerance on the ratio between the current and desired
+                                      metric value under which no updates are made to the desired number of
+                                      replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not
+                                      set, the default cluster-wide tolerance is applied (by default 10%).
+
+                                      For example, if autoscaling is configured with a memory consumption target of 100Mi,
+                                      and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be
+                                      triggered when the actual consumption falls below 95Mi or exceeds 101Mi.
+
+                                      This is an alpha field and requires enabling the HPAConfigurableTolerance
+                                      feature gate.
+                                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                                    x-kubernetes-int-or-string: true
                                 type: object
                             type: object
                           metrics:
diff --git a/config/crd/bases/inference.llmaz.io_services.yaml b/config/crd/bases/inference.llmaz.io_services.yaml
diff --git a/pkg/controller/inference/activator_controller.go b/pkg/controller/inference/activator_controller.go