|
| 1 | +package machineset |
| 2 | + |
| 3 | +import ( |
| 4 | + "bytes" |
| 5 | + "context" |
| 6 | + "encoding/json" |
| 7 | + "fmt" |
| 8 | + "time" |
| 9 | + |
| 10 | + archtranslater "github.com/coreos/stream-metadata-go/arch" |
| 11 | + "github.com/coreos/stream-metadata-go/stream" |
| 12 | + osconfigv1 "github.com/openshift/api/config/v1" |
| 13 | + features "github.com/openshift/api/features" |
| 14 | + machinev1 "github.com/openshift/api/machine/v1" |
| 15 | + opv1 "github.com/openshift/api/operator/v1" |
| 16 | + ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" |
| 17 | + operatorversion "github.com/openshift/machine-config-operator/pkg/version" |
| 18 | + "sigs.k8s.io/yaml" |
| 19 | + |
| 20 | + corev1 "k8s.io/api/core/v1" |
| 21 | + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| 22 | + kruntime "k8s.io/apimachinery/pkg/runtime" |
| 23 | + "k8s.io/apimachinery/pkg/types" |
| 24 | + kubeErrs "k8s.io/apimachinery/pkg/util/errors" |
| 25 | + "k8s.io/apimachinery/pkg/util/jsonmergepatch" |
| 26 | + "k8s.io/apimachinery/pkg/util/wait" |
| 27 | + clientset "k8s.io/client-go/kubernetes" |
| 28 | + "k8s.io/klog/v2" |
| 29 | +) |
| 30 | + |
| 31 | +// syncControlPlaneMachineSets will attempt to enqueue every control plane machineset |
| 32 | +// ControlPlaneMachineSets are singletons, but for the sake of consistency with the other |
| 33 | +// syncs, I chose to keep this function similar. |
| 34 | +// nolint:dupl // I separated these from syncMAPIMachineSets for readability |
| 35 | +func (ctrl *Controller) syncControlPlaneMachineSets(reason string) { |
| 36 | + |
| 37 | + // Check if CPMS feature gate is enabled |
| 38 | + if !ctrl.fgHandler.Enabled(features.FeatureGateManagedBootImagesCPMS) { |
| 39 | + klog.V(4).Infof("ManagedBootImagesCPMS feature gate is not enabled, skipping CPMS sync") |
| 40 | + return |
| 41 | + } |
| 42 | + |
| 43 | + ctrl.cpmsSyncMutex.Lock() |
| 44 | + defer ctrl.cpmsSyncMutex.Unlock() |
| 45 | + |
| 46 | + var mcop *opv1.MachineConfiguration |
| 47 | + var pollError error |
| 48 | + // Wait for mcop.Status to populate, otherwise error out. This shouldn't take very long |
| 49 | + // as this is done by the operator sync loop. |
| 50 | + if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) { |
| 51 | + mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) |
| 52 | + if pollError != nil { |
| 53 | + klog.Errorf("MachineConfiguration/cluster has not been created yet") |
| 54 | + return false, nil |
| 55 | + } |
| 56 | + |
| 57 | + // Ensure status.ObservedGeneration matches the last generation of MachineConfiguration |
| 58 | + if mcop.Generation != mcop.Status.ObservedGeneration { |
| 59 | + klog.Errorf("MachineConfiguration.Status is not up to date.") |
| 60 | + pollError = fmt.Errorf("MachineConfiguration.Status is not up to date") |
| 61 | + return false, nil |
| 62 | + } |
| 63 | + return true, nil |
| 64 | + }); err != nil { |
| 65 | + klog.Errorf("MachineConfiguration was not ready: %v", pollError) |
| 66 | + ctrl.updateConditions(reason, fmt.Errorf("MachineConfiguration was not ready: while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) |
| 67 | + return |
| 68 | + } |
| 69 | + |
| 70 | + machineManagerFound, machineResourceSelector, err := getMachineResourceSelectorFromMachineManagers(mcop.Status.ManagedBootImagesStatus.MachineManagers, opv1.MachineAPI, opv1.ControlPlaneMachineSets) |
| 71 | + if err != nil { |
| 72 | + klog.Errorf("failed to create a machineset selector while enqueueing controlplanemachineset %v", err) |
| 73 | + ctrl.updateConditions(reason, fmt.Errorf("failed to create a machineset selector while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) |
| 74 | + return |
| 75 | + } |
| 76 | + if !machineManagerFound { |
| 77 | + klog.V(4).Infof("No ControlPlaneMachineSet manager was found, so no ControlPlaneMachineSet will be enrolled.") |
| 78 | + // clear out MAPI boot image history |
| 79 | + for k := range ctrl.cpmsBootImageState { |
| 80 | + delete(ctrl.cpmsBootImageState, k) |
| 81 | + } |
| 82 | + } |
| 83 | + |
| 84 | + controlPlaneMachineSets, err := ctrl.cpmsLister.List(machineResourceSelector) |
| 85 | + if err != nil { |
| 86 | + klog.Errorf("failed to fetch ControlPlaneMachineSet list while enqueueing ControlPlaneMachineSet %v", err) |
| 87 | + ctrl.updateConditions(reason, fmt.Errorf("failed to fetch ControlPlaneMachineSet list while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) |
| 88 | + return |
| 89 | + } |
| 90 | + |
| 91 | + // If no machine resources were enrolled; exit the enqueue process without errors. |
| 92 | + if len(controlPlaneMachineSets) == 0 { |
| 93 | + klog.Infof("No ControlPlaneMachineSet was enrolled, so no ControlPlaneMachineSet will be enqueued.") |
| 94 | + // clear out ControlPlaneMachineSet boot image history |
| 95 | + for k := range ctrl.cpmsBootImageState { |
| 96 | + delete(ctrl.cpmsBootImageState, k) |
| 97 | + } |
| 98 | + } |
| 99 | + |
| 100 | + // Reset stats before initiating reconciliation loop |
| 101 | + ctrl.cpmsStats.inProgress = 0 |
| 102 | + ctrl.cpmsStats.totalCount = len(controlPlaneMachineSets) |
| 103 | + ctrl.cpmsStats.erroredCount = 0 |
| 104 | + |
| 105 | + // Signal start of reconciliation process, by setting progressing to true |
| 106 | + var syncErrors []error |
| 107 | + ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing) |
| 108 | + |
| 109 | + for _, controlPlaneMachineSet := range controlPlaneMachineSets { |
| 110 | + err := ctrl.syncControlPlaneMachineSet(controlPlaneMachineSet) |
| 111 | + if err == nil { |
| 112 | + ctrl.cpmsStats.inProgress++ |
| 113 | + } else { |
| 114 | + klog.Errorf("Error syncing ControlPlaneMachineSet %v", err) |
| 115 | + syncErrors = append(syncErrors, fmt.Errorf("error syncing ControlPlaneMachineSet %s: %v", controlPlaneMachineSet.Name, err)) |
| 116 | + ctrl.cpmsStats.erroredCount++ |
| 117 | + } |
| 118 | + // Update progressing conditions every step of the loop |
| 119 | + ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing) |
| 120 | + } |
| 121 | + // Update/Clear degrade conditions based on errors from this loop |
| 122 | + ctrl.updateConditions(reason, kubeErrs.NewAggregate(syncErrors), opv1.MachineConfigurationBootImageUpdateDegraded) |
| 123 | +} |
| 124 | + |
| 125 | +// syncControlPlaneMachineSet will attempt to reconcile the provided ControlPlaneMachineSet |
| 126 | +func (ctrl *Controller) syncControlPlaneMachineSet(controlPlaneMachineSet *machinev1.ControlPlaneMachineSet) error { |
| 127 | + |
| 128 | + startTime := time.Now() |
| 129 | + klog.V(4).Infof("Started syncing ControlPlaneMachineSet %q (%v)", controlPlaneMachineSet.Name, startTime) |
| 130 | + defer func() { |
| 131 | + klog.V(4).Infof("Finished syncing ControlPlaneMachineSet %q (%v)", controlPlaneMachineSet.Name, time.Since(startTime)) |
| 132 | + }() |
| 133 | + |
| 134 | + // If the machineset has an owner reference, exit and report error. This means |
| 135 | + // that the machineset may be managed by another workflow and should not be reconciled. |
| 136 | + if len(controlPlaneMachineSet.GetOwnerReferences()) != 0 { |
| 137 | + klog.Infof("ControlPlaneMachineSet %s has OwnerReference: %v, skipping boot image update", controlPlaneMachineSet.GetOwnerReferences()[0].Kind+"/"+controlPlaneMachineSet.GetOwnerReferences()[0].Name, controlPlaneMachineSet.Name) |
| 138 | + return nil |
| 139 | + } |
| 140 | + |
| 141 | + if os, ok := controlPlaneMachineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.Labels[OSLabelKey]; ok { |
| 142 | + if os == "Windows" { |
| 143 | + klog.Infof("ControlPlaneMachineSet %s has a windows os label, skipping boot image update", controlPlaneMachineSet.Name) |
| 144 | + return nil |
| 145 | + } |
| 146 | + } |
| 147 | + |
| 148 | + // ControlPlaneMachineSets do not normally have an arch annotation, so use the architecture of the node |
| 149 | + // running this pod, which will always be a control plane node. |
| 150 | + arch := archtranslater.CurrentRpmArch() |
| 151 | + |
| 152 | + // Fetch the infra object to determine the platform type |
| 153 | + infra, err := ctrl.infraLister.Get("cluster") |
| 154 | + if err != nil { |
| 155 | + return fmt.Errorf("failed to fetch infra object during ControlPlaneMachineSet sync: %w", err) |
| 156 | + } |
| 157 | + |
| 158 | + // Fetch the bootimage configmap & ensure it has been stamped by the operator. This is done by |
| 159 | + // the operator when a master node successfully updates to a new image. This is |
| 160 | + // to prevent machinesets from being updated before the operator itself has updated. |
| 161 | + // If it hasn't been updated, exit and wait for a resync. |
| 162 | + configMap, err := ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName) |
| 163 | + if err != nil { |
| 164 | + return fmt.Errorf("failed to fetch coreos-bootimages config map duringControlPlaneMachineSet sync: %w", err) |
| 165 | + } |
| 166 | + versionHashFromCM, versionHashFound := configMap.Data[ctrlcommon.MCOVersionHashKey] |
| 167 | + if !versionHashFound { |
| 168 | + klog.Infof("failed to find mco version hash in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName) |
| 169 | + return nil |
| 170 | + } |
| 171 | + if versionHashFromCM != operatorversion.Hash { |
| 172 | + klog.Infof("mismatch between MCO hash version stored in configmap and current MCO version; sync will exit to wait for the MCO upgrade to complete") |
| 173 | + return nil |
| 174 | + } |
| 175 | + releaseVersionFromCM, releaseVersionFound := configMap.Data[ctrlcommon.OCPReleaseVersionKey] |
| 176 | + if !releaseVersionFound { |
| 177 | + klog.Infof("failed to find OCP release version in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName) |
| 178 | + return nil |
| 179 | + } |
| 180 | + if releaseVersionFromCM != operatorversion.ReleaseVersion { |
| 181 | + klog.Infof("mismatch between OCP release version stored in configmap and current MCO release version; sync will exit to wait for the MCO upgrade to complete") |
| 182 | + return nil |
| 183 | + } |
| 184 | + |
| 185 | + // Check if the this ControlPlaneMachineSet requires an update |
| 186 | + patchRequired, newControlPlaneMachineSet, err := checkControlPlaneMachineSet(infra, controlPlaneMachineSet, configMap, arch, ctrl.kubeClient) |
| 187 | + if err != nil { |
| 188 | + return fmt.Errorf("failed to reconcile ControlPlaneMachineSet %s, err: %w", controlPlaneMachineSet.Name, err) |
| 189 | + } |
| 190 | + |
| 191 | + // Patch the machineset if required |
| 192 | + if patchRequired { |
| 193 | + // First, check if we're hot looping |
| 194 | + if ctrl.checkControlPlaneMachineSetHotLoop(newControlPlaneMachineSet) { |
| 195 | + return fmt.Errorf("refusing to reconcile ControlPlaneMachineSet %s, hot loop detected. Please opt-out of boot image updates, adjust your machine provisioning workflow to prevent hot loops and opt back in to resume boot image updates", controlPlaneMachineSet.Name) |
| 196 | + } |
| 197 | + klog.Infof("Patching ControlPlaneMachineSet %s", controlPlaneMachineSet.Name) |
| 198 | + return ctrl.patchControlPlaneMachineSet(controlPlaneMachineSet, newControlPlaneMachineSet) |
| 199 | + } |
| 200 | + klog.Infof("No patching required for ControlPlaneMachineSet %s", controlPlaneMachineSet.Name) |
| 201 | + return nil |
| 202 | +} |
| 203 | + |
| 204 | +// Checks against a local store of boot image updates to detect hot looping |
| 205 | +func (ctrl *Controller) checkControlPlaneMachineSetHotLoop(machineSet *machinev1.ControlPlaneMachineSet) bool { |
| 206 | + bis, ok := ctrl.cpmsBootImageState[machineSet.Name] |
| 207 | + if !ok { |
| 208 | + // If the controlplanemachineset doesn't currently have a record, create a new one. |
| 209 | + ctrl.cpmsBootImageState[machineSet.Name] = BootImageState{ |
| 210 | + value: machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw, |
| 211 | + hotLoopCount: 1, |
| 212 | + } |
| 213 | + } else { |
| 214 | + hotLoopCount := 1 |
| 215 | + // If the controller is updating to a value that was previously updated to, increase the hot loop counter |
| 216 | + if bytes.Equal(bis.value, machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw) { |
| 217 | + hotLoopCount = (bis.hotLoopCount) + 1 |
| 218 | + } |
| 219 | + // Return an error and degrade if the hot loop counter is above threshold |
| 220 | + if hotLoopCount > HotLoopLimit { |
| 221 | + return true |
| 222 | + } |
| 223 | + ctrl.cpmsBootImageState[machineSet.Name] = BootImageState{ |
| 224 | + value: machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw, |
| 225 | + hotLoopCount: hotLoopCount, |
| 226 | + } |
| 227 | + } |
| 228 | + return false |
| 229 | +} |
| 230 | + |
| 231 | +// This function patches the ControlPlaneMachineSet object using the machineClient |
| 232 | +// Returns an error if marshsalling or patching fails. |
| 233 | +func (ctrl *Controller) patchControlPlaneMachineSet(oldControlPlaneMachineSet, newControlPlaneMachineSet *machinev1.ControlPlaneMachineSet) error { |
| 234 | + oldControlPlaneMachineSetMarshal, err := json.Marshal(oldControlPlaneMachineSet) |
| 235 | + if err != nil { |
| 236 | + return fmt.Errorf("unable to marshal old ControlPlaneMachineSet: %w", err) |
| 237 | + } |
| 238 | + newControlPlaneMachineSetMarshal, err := json.Marshal(newControlPlaneMachineSet) |
| 239 | + if err != nil { |
| 240 | + return fmt.Errorf("unable to marshal new ControlPlaneMachineSet: %w", err) |
| 241 | + } |
| 242 | + patchBytes, err := jsonmergepatch.CreateThreeWayJSONMergePatch(oldControlPlaneMachineSetMarshal, newControlPlaneMachineSetMarshal, oldControlPlaneMachineSetMarshal) |
| 243 | + if err != nil { |
| 244 | + return fmt.Errorf("unable to create patch for new ControlPlaneMachineSet: %w", err) |
| 245 | + } |
| 246 | + _, err = ctrl.machineClient.MachineV1().ControlPlaneMachineSets(MachineAPINamespace).Patch(context.TODO(), oldControlPlaneMachineSet.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{}) |
| 247 | + if err != nil { |
| 248 | + return fmt.Errorf("unable to patch new ControlPlaneMachineSet: %w", err) |
| 249 | + } |
| 250 | + klog.Infof("Successfully patched ControlPlaneMachineSet %s", oldControlPlaneMachineSet.Name) |
| 251 | + return nil |
| 252 | +} |
| 253 | + |
| 254 | +// This function calls the appropriate reconcile function based on the infra type |
| 255 | +// On success, it will return a bool indicating if a patch is required, and an updated |
| 256 | +// machineset object if any. It will return an error if any of the above steps fail. |
| 257 | +func checkControlPlaneMachineSet(infra *osconfigv1.Infrastructure, machineSet *machinev1.ControlPlaneMachineSet, configMap *corev1.ConfigMap, arch string, secretClient clientset.Interface) (bool, *machinev1.ControlPlaneMachineSet, error) { |
| 258 | + switch infra.Status.PlatformStatus.Type { |
| 259 | + case osconfigv1.AWSPlatformType: |
| 260 | + return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileAWSProviderSpec) |
| 261 | + case osconfigv1.AzurePlatformType: |
| 262 | + return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileAzureProviderSpec) |
| 263 | + case osconfigv1.GCPPlatformType: |
| 264 | + return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileGCPProviderSpec) |
| 265 | + // TODO: vsphere CPMS template seems to be empty in CI runs, and will need further investigation |
| 266 | + default: |
| 267 | + klog.Infof("Skipping controlplanemachineset %s, unsupported platform %s", machineSet.Name, infra.Status.PlatformStatus.Type) |
| 268 | + return false, nil, nil |
| 269 | + } |
| 270 | +} |
| 271 | + |
| 272 | +// Generic reconcile function that handles the common pattern across all platforms |
| 273 | +// nolint:dupl // I separated this from reconcilePlatform for readability |
| 274 | +func reconcilePlatformCPMS[T any]( |
| 275 | + cpms *machinev1.ControlPlaneMachineSet, |
| 276 | + infra *osconfigv1.Infrastructure, |
| 277 | + configMap *corev1.ConfigMap, |
| 278 | + arch string, |
| 279 | + secretClient clientset.Interface, |
| 280 | + reconcileProviderSpec func(*stream.Stream, string, *osconfigv1.Infrastructure, *T, string, clientset.Interface) (bool, *T, error), |
| 281 | +) (patchRequired bool, newCPMS *machinev1.ControlPlaneMachineSet, err error) { |
| 282 | + klog.Infof("Reconciling controlplanemachineset %s on %s, with arch %s", cpms.Name, string(infra.Status.PlatformStatus.Type), arch) |
| 283 | + |
| 284 | + // Unmarshal the provider spec |
| 285 | + providerSpec := new(T) |
| 286 | + if err := unmarshalProviderSpecCPMS(cpms, providerSpec); err != nil { |
| 287 | + return false, nil, err |
| 288 | + } |
| 289 | + |
| 290 | + // Unmarshal the configmap into a stream object |
| 291 | + streamData := new(stream.Stream) |
| 292 | + if err := unmarshalStreamDataConfigMap(configMap, streamData); err != nil { |
| 293 | + return false, nil, err |
| 294 | + } |
| 295 | + |
| 296 | + // Reconcile the provider spec |
| 297 | + patchRequired, newProviderSpec, err := reconcileProviderSpec(streamData, arch, infra, providerSpec, cpms.Name, secretClient) |
| 298 | + if err != nil { |
| 299 | + return false, nil, err |
| 300 | + } |
| 301 | + |
| 302 | + // If no patch is required, exit early |
| 303 | + if !patchRequired { |
| 304 | + return false, nil, nil |
| 305 | + } |
| 306 | + |
| 307 | + // If patch is required, marshal the new providerspec into the controlplanemachineset |
| 308 | + newCPMS = cpms.DeepCopy() |
| 309 | + if err := marshalProviderSpecCPMS(newCPMS, newProviderSpec); err != nil { |
| 310 | + return false, nil, err |
| 311 | + } |
| 312 | + return patchRequired, newCPMS, nil |
| 313 | +} |
| 314 | + |
| 315 | +// This function unmarshals the controlplanemachineset's provider spec into |
| 316 | +// a ProviderSpec object. Returns an error if providerSpec field is nil, |
| 317 | +// or the unmarshal fails |
| 318 | +func unmarshalProviderSpecCPMS(ms *machinev1.ControlPlaneMachineSet, providerSpec interface{}) error { |
| 319 | + if ms == nil { |
| 320 | + return fmt.Errorf("ControlPlaneMachineSet object was nil") |
| 321 | + } |
| 322 | + if ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value == nil { |
| 323 | + return fmt.Errorf("providerSpec field was empty") |
| 324 | + } |
| 325 | + if err := yaml.Unmarshal(ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw, &providerSpec); err != nil { |
| 326 | + return fmt.Errorf("unmarshal into providerSpec failed %w", err) |
| 327 | + } |
| 328 | + return nil |
| 329 | +} |
| 330 | + |
| 331 | +// This function marshals the ProviderSpec object into a ControlPlaneMachineSet object. |
| 332 | +// Returns an error if ProviderSpec or ControlPlaneMachineSet is nil, or if the marshal fails |
| 333 | +func marshalProviderSpecCPMS(ms *machinev1.ControlPlaneMachineSet, providerSpec interface{}) error { |
| 334 | + if ms == nil { |
| 335 | + return fmt.Errorf("ControlPlaneMachineSet object was nil") |
| 336 | + } |
| 337 | + if providerSpec == nil { |
| 338 | + return fmt.Errorf("ProviderSpec object was nil") |
| 339 | + } |
| 340 | + rawBytes, err := json.Marshal(providerSpec) |
| 341 | + if err != nil { |
| 342 | + return fmt.Errorf("marshal into machineset failed: %w", err) |
| 343 | + } |
| 344 | + ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value = &kruntime.RawExtension{Raw: rawBytes} |
| 345 | + return nil |
| 346 | +} |
0 commit comments