diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go index 60bed37533ba..561c4eea4cca 100644 --- a/cluster-autoscaler/config/autoscaling_options.go +++ b/cluster-autoscaler/config/autoscaling_options.go @@ -294,6 +294,8 @@ type AutoscalingOptions struct { DynamicNodeDeleteDelayAfterTaintEnabled bool // BypassedSchedulers are used to specify which schedulers to bypass their processing BypassedSchedulers map[string]bool + // AllowedSchedulers when specified CA will proceed only with pods that are targeting allowed schedulers from unschedulable pods and unprocessed pods by BypassedSchedulers. + AllowedSchedulers map[string]bool // ProvisioningRequestEnabled tells if CA processes ProvisioningRequest. ProvisioningRequestEnabled bool // AsyncNodeGroupsEnabled tells if CA creates/deletes node groups asynchronously. diff --git a/cluster-autoscaler/config/flags/flags.go b/cluster-autoscaler/config/flags/flags.go index 45b86fe90663..6969c2b9dcba 100644 --- a/cluster-autoscaler/config/flags/flags.go +++ b/cluster-autoscaler/config/flags/flags.go @@ -208,6 +208,7 @@ var ( forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.") dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server") bypassedSchedulers = pflag.StringSlice("bypassed-scheduler-names", []string{}, "Names of schedulers to bypass. If set to non-empty value, CA will not wait for pods to reach a certain age before triggering a scale-up.") + allowedSchedulers = pflag.StringSlice("allowed-scheduler-names", []string{}, "If set to non-empty value, CA will proceed only with pods targeting schedulers in the list, from the list of unschedulable and scheduler unprocessed pods") drainPriorityConfig = flag.String("drain-priority-config", "", "List of ',' separated pairs (priority:terminationGracePeriodSeconds) of integers separated by ':' enables priority evictor. Priority evictor groups pods into priority groups based on pod priority and evict pods in the ascending order of group priorities"+ "--max-graceful-termination-sec flag should not be set when this flag is set. Not setting this flag will use unordered evictor by default."+ @@ -395,7 +396,8 @@ func createAutoscalingOptions() config.AutoscalingOptions { MaxFreeDifferenceRatio: *maxFreeDifferenceRatio, }, DynamicNodeDeleteDelayAfterTaintEnabled: *dynamicNodeDeleteDelayAfterTaintEnabled, - BypassedSchedulers: scheduler_util.GetBypassedSchedulersMap(*bypassedSchedulers), + BypassedSchedulers: scheduler_util.SchedulersMap(*bypassedSchedulers), + AllowedSchedulers: parseAllowedSchedulers(*allowedSchedulers, *bypassedSchedulers), ProvisioningRequestEnabled: *provisioningRequestsEnabled, AsyncNodeGroupsEnabled: *asyncNodeGroupsEnabled, ProvisioningRequestInitialBackoffTime: *provisioningRequestInitialBackoffTime, @@ -542,3 +544,18 @@ func parseShutdownGracePeriodsAndPriorities(priorityGracePeriodStr string) []kub } return priorityGracePeriodMap } + +func parseAllowedSchedulers(allowedSchedulers, bypassedSchedulers []string) map[string]bool { + allowedSchedulersMap := scheduler_util.SchedulersMap(allowedSchedulers) + if len(allowedSchedulers) == 0 { + return allowedSchedulersMap + } + bypassedSchedulersMap := scheduler_util.SchedulersMap(bypassedSchedulers) + + for scheduler := range bypassedSchedulersMap { + if found := allowedSchedulersMap[scheduler]; !found { + klog.Fatalf("Invalid configuration. --bypassed-scheduler-names should be a subset of --allowed-scheduler-names. %s not included in --allowed-scheduler-names", scheduler) + } + } + return allowedSchedulersMap +} diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 4512aa5beedf..4791a68900df 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -292,7 +292,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr return err } - podsBySchedulability, err := listPods(podLister, a.BypassedSchedulers) + podsBySchedulability, err := listPods(podLister, a.BypassedSchedulers, a.AllowedSchedulers) if err != nil { return caerrors.ToAutoscalerError(caerrors.ApiCallError, err) } @@ -1156,18 +1156,23 @@ func nodeNames(ns []*apiv1.Node) []string { return names } -func listPods(podLister kube_util.PodLister, bypassedSchedulers map[string]bool) (podsBySchedulability kube_util.PodsBySchedulability, err error) { +func listPods(podLister kube_util.PodLister, bypassedSchedulers, allowedSchedulers map[string]bool) (podsBySchedulability kube_util.PodsBySchedulability, err error) { pods, err := podLister.List() if err != nil { klog.Errorf("Failed to list pods: %v", err) return podsBySchedulability, err } + initialPodCount := len(pods) + if len(allowedSchedulers) > 0 { + pods = kube_util.FilterOutPodsByScheduler(pods, allowedSchedulers) + } podsBySchedulability = kube_util.ArrangePodsBySchedulability(pods, bypassedSchedulers) // Skip logging in case of the boring scenario, when all pods are scheduled. if len(pods) != len(podsBySchedulability.Scheduled) { + ignoredDueToDisallowed := initialPodCount - len(pods) ignored := len(pods) - len(podsBySchedulability.Scheduled) - len(podsBySchedulability.Unschedulable) - len(podsBySchedulability.Unprocessed) - klog.Infof("Found %d pods in the cluster: %d scheduled, %d unschedulable, %d unprocessed by scheduler, %d ignored (most likely using custom scheduler)", - len(pods), len(podsBySchedulability.Scheduled), len(podsBySchedulability.Unschedulable), len(podsBySchedulability.Unprocessed), ignored) + klog.Infof("Found %d pods in the cluster: %d scheduled, %d unschedulable, %d unprocessed by scheduler, %d ignored by allowed schedulers (most likely using custom scheduler), %d ignored due to dissallowed schedulers", + initialPodCount, len(podsBySchedulability.Scheduled), len(podsBySchedulability.Unschedulable), len(podsBySchedulability.Unprocessed), ignored, ignoredDueToDisallowed) } return } diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index 73a8b3ae729f..e1f74e79cd84 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1473,7 +1473,7 @@ func TestStaticAutoscalerRunOnceWithBypassedSchedulers(t *testing.T) { MaxNodesTotal: 10, MaxCoresTotal: 10, MaxMemoryTotal: 100000, - BypassedSchedulers: scheduler.GetBypassedSchedulersMap([]string{ + BypassedSchedulers: scheduler.SchedulersMap([]string{ apiv1.DefaultSchedulerName, bypassedScheduler, }), diff --git a/cluster-autoscaler/utils/kubernetes/listers.go b/cluster-autoscaler/utils/kubernetes/listers.go index 72eee8cf2fbb..b9e9221724ef 100644 --- a/cluster-autoscaler/utils/kubernetes/listers.go +++ b/cluster-autoscaler/utils/kubernetes/listers.go @@ -200,6 +200,17 @@ func ArrangePodsBySchedulability(allPods []*apiv1.Pod, bypassedSchedulers map[st return } +// FilterOutPodsByScheduler is a helper method that filters out pods not in the given set of allowed schedulers. +func FilterOutPodsByScheduler(allPods []*apiv1.Pod, allowedSchedulers map[string]bool) []*apiv1.Pod { + var remainingPods []*apiv1.Pod + for _, pod := range allPods { + if keep := allowedSchedulers[pod.Spec.SchedulerName]; keep { + remainingPods = append(remainingPods, pod) + } + } + return remainingPods +} + // SchedulingGatedPods is a helper method that returns all pods which has scheduling gate // SchedulingGated pods are not scheduled nor deleted by the implementation and are not unschedulable nor unprocessed by definition func SchedulingGatedPods(allPods []*apiv1.Pod) []*apiv1.Pod { diff --git a/cluster-autoscaler/utils/scheduler/scheduler.go b/cluster-autoscaler/utils/scheduler/scheduler.go index cc7070242bc4..3899a1f6fbbf 100644 --- a/cluster-autoscaler/utils/scheduler/scheduler.go +++ b/cluster-autoscaler/utils/scheduler/scheduler.go @@ -126,15 +126,15 @@ func ConfigFromPath(path string) (*scheduler_config.KubeSchedulerConfiguration, return cfgObj, nil } -// GetBypassedSchedulersMap returns a map of scheduler names that should be bypassed as keys, and values are set to true -// Also sets "" (empty string) to true if default scheduler is bypassed -func GetBypassedSchedulersMap(bypassedSchedulers []string) map[string]bool { - bypassedSchedulersMap := make(map[string]bool, len(bypassedSchedulers)) - for _, scheduler := range bypassedSchedulers { - bypassedSchedulersMap[scheduler] = true +// SchedulersMap returns a map of scheduler names as keys, and values are set to true +// Also sets "" (empty string) to true if default scheduler is in the list +func SchedulersMap(schedulers []string) map[string]bool { + schedulersMap := make(map[string]bool, len(schedulers)) + for _, scheduler := range schedulers { + schedulersMap[scheduler] = true } - if canBypass := bypassedSchedulersMap[apiv1.DefaultSchedulerName]; canBypass { - bypassedSchedulersMap[""] = true + if found := schedulersMap[apiv1.DefaultSchedulerName]; found { + schedulersMap[""] = true } - return bypassedSchedulersMap + return schedulersMap }