|
| 1 | +package machineset |
| 2 | + |
| 3 | +import ( |
| 4 | + "bytes" |
| 5 | + "context" |
| 6 | + "encoding/json" |
| 7 | + "fmt" |
| 8 | + "time" |
| 9 | + |
| 10 | + archtranslater "github.com/coreos/stream-metadata-go/arch" |
| 11 | + "github.com/coreos/stream-metadata-go/stream" |
| 12 | + osconfigv1 "github.com/openshift/api/config/v1" |
| 13 | + machinev1 "github.com/openshift/api/machine/v1" |
| 14 | + opv1 "github.com/openshift/api/operator/v1" |
| 15 | + ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" |
| 16 | + operatorversion "github.com/openshift/machine-config-operator/pkg/version" |
| 17 | + "sigs.k8s.io/yaml" |
| 18 | + |
| 19 | + corev1 "k8s.io/api/core/v1" |
| 20 | + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| 21 | + kruntime "k8s.io/apimachinery/pkg/runtime" |
| 22 | + "k8s.io/apimachinery/pkg/types" |
| 23 | + kubeErrs "k8s.io/apimachinery/pkg/util/errors" |
| 24 | + "k8s.io/apimachinery/pkg/util/jsonmergepatch" |
| 25 | + "k8s.io/apimachinery/pkg/util/wait" |
| 26 | + clientset "k8s.io/client-go/kubernetes" |
| 27 | + "k8s.io/klog/v2" |
| 28 | +) |
| 29 | + |
| 30 | +// syncControlPlaneMachineSets will attempt to enqueue every control plane machineset |
| 31 | +// ControlPlaneMachineSets are singletons, but for the sake of consistency with the other |
| 32 | +// syncs, I chose to keep this function similar. |
| 33 | +// nolint:dupl // I separated these from syncMAPIMachineSets for readability |
| 34 | +func (ctrl *Controller) syncControlPlaneMachineSets(reason string) { |
| 35 | + |
| 36 | + // TODO: check if CPMS feature gate exists after API lands |
| 37 | + |
| 38 | + ctrl.cpmsSyncMutex.Lock() |
| 39 | + defer ctrl.cpmsSyncMutex.Unlock() |
| 40 | + |
| 41 | + var mcop *opv1.MachineConfiguration |
| 42 | + var pollError error |
| 43 | + // Wait for mcop.Status to populate, otherwise error out. This shouldn't take very long |
| 44 | + // as this is done by the operator sync loop. |
| 45 | + if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) { |
| 46 | + mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) |
| 47 | + if pollError != nil { |
| 48 | + klog.Errorf("MachineConfiguration/cluster has not been created yet") |
| 49 | + return false, nil |
| 50 | + } |
| 51 | + |
| 52 | + // Ensure status.ObservedGeneration matches the last generation of MachineConfiguration |
| 53 | + if mcop.Generation != mcop.Status.ObservedGeneration { |
| 54 | + klog.Errorf("MachineConfiguration.Status is not up to date.") |
| 55 | + pollError = fmt.Errorf("MachineConfiguration.Status is not up to date") |
| 56 | + return false, nil |
| 57 | + } |
| 58 | + return true, nil |
| 59 | + }); err != nil { |
| 60 | + klog.Errorf("MachineConfiguration was not ready: %v", pollError) |
| 61 | + ctrl.updateConditions(reason, fmt.Errorf("MachineConfiguration was not ready: while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) |
| 62 | + return |
| 63 | + } |
| 64 | + |
| 65 | + // TODO: Change this to CPMS type after API lands |
| 66 | + machineManagerFound, machineResourceSelector, err := getMachineResourceSelectorFromMachineManagers(mcop.Status.ManagedBootImagesStatus.MachineManagers, opv1.MachineAPI, opv1.MachineSets) |
| 67 | + if err != nil { |
| 68 | + klog.Errorf("failed to create a machineset selector while enqueueing controlplanemachineset %v", err) |
| 69 | + ctrl.updateConditions(reason, fmt.Errorf("failed to create a machineset selector while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) |
| 70 | + return |
| 71 | + } |
| 72 | + if !machineManagerFound { |
| 73 | + klog.V(4).Infof("No ControlPlaneMachineSet manager was found, so no ControlPlaneMachineSet will be enrolled.") |
| 74 | + // clear out MAPI boot image history |
| 75 | + for k := range ctrl.cpmsBootImageState { |
| 76 | + delete(ctrl.cpmsBootImageState, k) |
| 77 | + } |
| 78 | + } |
| 79 | + |
| 80 | + controlPlaneMachineSets, err := ctrl.cpmsLister.List(machineResourceSelector) |
| 81 | + if err != nil { |
| 82 | + klog.Errorf("failed to fetch ControlPlaneMachineSet list while enqueueing ControlPlaneMachineSet %v", err) |
| 83 | + ctrl.updateConditions(reason, fmt.Errorf("failed to fetch ControlPlaneMachineSet list while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded) |
| 84 | + return |
| 85 | + } |
| 86 | + |
| 87 | + // If no machine resources were enrolled; exit the enqueue process without errors. |
| 88 | + if len(controlPlaneMachineSets) == 0 { |
| 89 | + klog.Infof("No ControlPlaneMachineSet were enrolled, so no MAPI machinesets will be enqueued.") |
| 90 | + // clear out ControlPlaneMachineSet boot image history |
| 91 | + for k := range ctrl.cpmsBootImageState { |
| 92 | + delete(ctrl.cpmsBootImageState, k) |
| 93 | + } |
| 94 | + } |
| 95 | + |
| 96 | + // Reset stats before initiating reconciliation loop |
| 97 | + ctrl.cpmsStats.inProgress = 0 |
| 98 | + ctrl.cpmsStats.totalCount = len(controlPlaneMachineSets) |
| 99 | + ctrl.cpmsStats.erroredCount = 0 |
| 100 | + |
| 101 | + // Signal start of reconciliation process, by setting progressing to true |
| 102 | + var syncErrors []error |
| 103 | + ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing) |
| 104 | + |
| 105 | + for _, controlPlaneMachineSet := range controlPlaneMachineSets { |
| 106 | + err := ctrl.syncControlPlaneMachineSet(controlPlaneMachineSet) |
| 107 | + if err == nil { |
| 108 | + ctrl.cpmsStats.inProgress++ |
| 109 | + } else { |
| 110 | + klog.Errorf("Error syncing ControlPlaneMachineSet %v", err) |
| 111 | + syncErrors = append(syncErrors, fmt.Errorf("error syncing ControlPlaneMachineSet %s: %v", controlPlaneMachineSet.Name, err)) |
| 112 | + ctrl.cpmsStats.erroredCount++ |
| 113 | + } |
| 114 | + // Update progressing conditions every step of the loop |
| 115 | + ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing) |
| 116 | + } |
| 117 | + // Update/Clear degrade conditions based on errors from this loop |
| 118 | + ctrl.updateConditions(reason, kubeErrs.NewAggregate(syncErrors), opv1.MachineConfigurationBootImageUpdateDegraded) |
| 119 | +} |
| 120 | + |
| 121 | +// syncControlPlaneMachineSet will attempt to reconcile the provided ControlPlaneMachineSet |
| 122 | +func (ctrl *Controller) syncControlPlaneMachineSet(controlPlaneMachineSet *machinev1.ControlPlaneMachineSet) error { |
| 123 | + |
| 124 | + startTime := time.Now() |
| 125 | + klog.V(4).Infof("Started syncing ControlPlaneMachineSet %q (%v)", controlPlaneMachineSet.Name, startTime) |
| 126 | + defer func() { |
| 127 | + klog.V(4).Infof("Finished syncing ControlPlaneMachineSet %q (%v)", controlPlaneMachineSet.Name, time.Since(startTime)) |
| 128 | + }() |
| 129 | + |
| 130 | + // If the machineset has an owner reference, exit and report error. This means |
| 131 | + // that the machineset may be managed by another workflow and should not be reconciled. |
| 132 | + if len(controlPlaneMachineSet.GetOwnerReferences()) != 0 { |
| 133 | + klog.Infof("ControlPlaneMachineSet %s has OwnerReference: %v, skipping boot image update", controlPlaneMachineSet.GetOwnerReferences()[0].Kind+"/"+controlPlaneMachineSet.GetOwnerReferences()[0].Name, controlPlaneMachineSet.Name) |
| 134 | + return nil |
| 135 | + } |
| 136 | + |
| 137 | + if os, ok := controlPlaneMachineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.Labels[OSLabelKey]; ok { |
| 138 | + if os == "Windows" { |
| 139 | + klog.Infof("ControlPlaneMachineSet %s has a windows os label, skipping boot image update", controlPlaneMachineSet.Name) |
| 140 | + return nil |
| 141 | + } |
| 142 | + } |
| 143 | + |
| 144 | + // ControlPlaneMachineSets do not normally have an arch annotation, so use the architecture of the node |
| 145 | + // running this pod, which will always be a control plane node. |
| 146 | + arch := archtranslater.CurrentRpmArch() |
| 147 | + |
| 148 | + // Fetch the infra object to determine the platform type |
| 149 | + infra, err := ctrl.infraLister.Get("cluster") |
| 150 | + if err != nil { |
| 151 | + return fmt.Errorf("failed to fetch infra object during ControlPlaneMachineSet sync: %w", err) |
| 152 | + } |
| 153 | + |
| 154 | + // Fetch the bootimage configmap & ensure it has been stamped by the operator. This is done by |
| 155 | + // the operator when a master node successfully updates to a new image. This is |
| 156 | + // to prevent machinesets from being updated before the operator itself has updated. |
| 157 | + // If it hasn't been updated, exit and wait for a resync. |
| 158 | + configMap, err := ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName) |
| 159 | + if err != nil { |
| 160 | + return fmt.Errorf("failed to fetch coreos-bootimages config map duringControlPlaneMachineSet sync: %w", err) |
| 161 | + } |
| 162 | + versionHashFromCM, versionHashFound := configMap.Data[ctrlcommon.MCOVersionHashKey] |
| 163 | + if !versionHashFound { |
| 164 | + klog.Infof("failed to find mco version hash in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName) |
| 165 | + return nil |
| 166 | + } |
| 167 | + if versionHashFromCM != operatorversion.Hash { |
| 168 | + klog.Infof("mismatch between MCO hash version stored in configmap and current MCO version; sync will exit to wait for the MCO upgrade to complete") |
| 169 | + return nil |
| 170 | + } |
| 171 | + releaseVersionFromCM, releaseVersionFound := configMap.Data[ctrlcommon.OCPReleaseVersionKey] |
| 172 | + if !releaseVersionFound { |
| 173 | + klog.Infof("failed to find OCP release version in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName) |
| 174 | + return nil |
| 175 | + } |
| 176 | + if releaseVersionFromCM != operatorversion.ReleaseVersion { |
| 177 | + klog.Infof("mismatch between OCP release version stored in configmap and current MCO release version; sync will exit to wait for the MCO upgrade to complete") |
| 178 | + return nil |
| 179 | + } |
| 180 | + |
| 181 | + // Check if the this ControlPlaneMachineSet requires an update |
| 182 | + patchRequired, newControlPlaneMachineSet, err := checkControlPlaneMachineSet(infra, controlPlaneMachineSet, configMap, arch, ctrl.kubeClient) |
| 183 | + if err != nil { |
| 184 | + return fmt.Errorf("failed to reconcile ControlPlaneMachineSet %s, err: %w", controlPlaneMachineSet.Name, err) |
| 185 | + } |
| 186 | + |
| 187 | + // Patch the machineset if required |
| 188 | + if patchRequired { |
| 189 | + // First, check if we're hot looping |
| 190 | + if ctrl.checkControlPlaneMachineSetHotLoop(newControlPlaneMachineSet) { |
| 191 | + return fmt.Errorf("refusing to reconcile ControlPlaneMachineSet %s, hot loop detected. Please opt-out of boot image updates, adjust your machine provisioning workflow to prevent hot loops and opt back in to resume boot image updates", controlPlaneMachineSet.Name) |
| 192 | + } |
| 193 | + klog.Infof("Patching ControlPlaneMachineSet %s", controlPlaneMachineSet.Name) |
| 194 | + return ctrl.patchControlPlaneMachineSet(controlPlaneMachineSet, newControlPlaneMachineSet) |
| 195 | + } |
| 196 | + klog.Infof("No patching required for ControlPlaneMachineSet %s", controlPlaneMachineSet.Name) |
| 197 | + return nil |
| 198 | +} |
| 199 | + |
| 200 | +// Checks against a local store of boot image updates to detect hot looping |
| 201 | +func (ctrl *Controller) checkControlPlaneMachineSetHotLoop(machineSet *machinev1.ControlPlaneMachineSet) bool { |
| 202 | + bis, ok := ctrl.cpmsBootImageState[machineSet.Name] |
| 203 | + if !ok { |
| 204 | + // If the controlplanemachineset doesn't currently have a record, create a new one. |
| 205 | + ctrl.cpmsBootImageState[machineSet.Name] = BootImageState{ |
| 206 | + value: machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw, |
| 207 | + hotLoopCount: 1, |
| 208 | + } |
| 209 | + } else { |
| 210 | + hotLoopCount := 1 |
| 211 | + // If the controller is updating to a value that was previously updated to, increase the hot loop counter |
| 212 | + if bytes.Equal(bis.value, machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw) { |
| 213 | + hotLoopCount = (bis.hotLoopCount) + 1 |
| 214 | + } |
| 215 | + // Return an error and degrade if the hot loop counter is above threshold |
| 216 | + if hotLoopCount > HotLoopLimit { |
| 217 | + return true |
| 218 | + } |
| 219 | + ctrl.cpmsBootImageState[machineSet.Name] = BootImageState{ |
| 220 | + value: machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw, |
| 221 | + hotLoopCount: hotLoopCount, |
| 222 | + } |
| 223 | + } |
| 224 | + return false |
| 225 | +} |
| 226 | + |
| 227 | +// This function patches the ControlPlaneMachineSet object using the machineClient |
| 228 | +// Returns an error if marshsalling or patching fails. |
| 229 | +func (ctrl *Controller) patchControlPlaneMachineSet(oldControlPlaneMachineSet, newControlPlaneMachineSet *machinev1.ControlPlaneMachineSet) error { |
| 230 | + oldControlPlaneMachineSetMarshal, err := json.Marshal(oldControlPlaneMachineSet) |
| 231 | + if err != nil { |
| 232 | + return fmt.Errorf("unable to marshal old ControlPlaneMachineSet: %w", err) |
| 233 | + } |
| 234 | + newControlPlaneMachineSetMarshal, err := json.Marshal(newControlPlaneMachineSet) |
| 235 | + if err != nil { |
| 236 | + return fmt.Errorf("unable to marshal new ControlPlaneMachineSet: %w", err) |
| 237 | + } |
| 238 | + patchBytes, err := jsonmergepatch.CreateThreeWayJSONMergePatch(oldControlPlaneMachineSetMarshal, newControlPlaneMachineSetMarshal, oldControlPlaneMachineSetMarshal) |
| 239 | + if err != nil { |
| 240 | + return fmt.Errorf("unable to create patch for new ControlPlaneMachineSet: %w", err) |
| 241 | + } |
| 242 | + _, err = ctrl.machineClient.MachineV1().ControlPlaneMachineSets(MachineAPINamespace).Patch(context.TODO(), oldControlPlaneMachineSet.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{}) |
| 243 | + if err != nil { |
| 244 | + return fmt.Errorf("unable to patch new ControlPlaneMachineSet: %w", err) |
| 245 | + } |
| 246 | + klog.Infof("Successfully patched ControlPlaneMachineSet %s", oldControlPlaneMachineSet.Name) |
| 247 | + return nil |
| 248 | +} |
| 249 | + |
| 250 | +// This function calls the appropriate reconcile function based on the infra type |
| 251 | +// On success, it will return a bool indicating if a patch is required, and an updated |
| 252 | +// machineset object if any. It will return an error if any of the above steps fail. |
| 253 | +func checkControlPlaneMachineSet(infra *osconfigv1.Infrastructure, machineSet *machinev1.ControlPlaneMachineSet, configMap *corev1.ConfigMap, arch string, secretClient clientset.Interface) (bool, *machinev1.ControlPlaneMachineSet, error) { |
| 254 | + switch infra.Status.PlatformStatus.Type { |
| 255 | + case osconfigv1.AWSPlatformType: |
| 256 | + return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileAWSProviderSpec) |
| 257 | + case osconfigv1.AzurePlatformType: |
| 258 | + return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileAzureProviderSpec) |
| 259 | + case osconfigv1.GCPPlatformType: |
| 260 | + return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileGCPProviderSpec) |
| 261 | + // TODO: vsphere CPMS template seems to be empty in CI runs, and will need further investigation |
| 262 | + default: |
| 263 | + klog.Infof("Skipping controlplanemachineset %s, unsupported platform %s", machineSet.Name, infra.Status.PlatformStatus.Type) |
| 264 | + return false, nil, nil |
| 265 | + } |
| 266 | +} |
| 267 | + |
| 268 | +// Generic reconcile function that handles the common pattern across all platforms |
| 269 | +// nolint:dupl // I separated this from reconcilePlatform for readability |
| 270 | +func reconcilePlatformCPMS[T any]( |
| 271 | + cpms *machinev1.ControlPlaneMachineSet, |
| 272 | + infra *osconfigv1.Infrastructure, |
| 273 | + configMap *corev1.ConfigMap, |
| 274 | + arch string, |
| 275 | + secretClient clientset.Interface, |
| 276 | + reconcileProviderSpec func(*stream.Stream, string, *osconfigv1.Infrastructure, *T, string, clientset.Interface) (bool, *T, error), |
| 277 | +) (patchRequired bool, newCPMS *machinev1.ControlPlaneMachineSet, err error) { |
| 278 | + klog.Infof("Reconciling controlplanemachineset %s on %s, with arch %s", cpms.Name, string(infra.Status.PlatformStatus.Type), arch) |
| 279 | + |
| 280 | + // Unmarshal the provider spec |
| 281 | + providerSpec := new(T) |
| 282 | + if err := unmarshalProviderSpecCPMS(cpms, providerSpec); err != nil { |
| 283 | + return false, nil, err |
| 284 | + } |
| 285 | + |
| 286 | + // Unmarshal the configmap into a stream object |
| 287 | + streamData := new(stream.Stream) |
| 288 | + if err := unmarshalStreamDataConfigMap(configMap, streamData); err != nil { |
| 289 | + return false, nil, err |
| 290 | + } |
| 291 | + |
| 292 | + // Reconcile the provider spec |
| 293 | + patchRequired, newProviderSpec, err := reconcileProviderSpec(streamData, arch, infra, providerSpec, cpms.Name, secretClient) |
| 294 | + if err != nil { |
| 295 | + return false, nil, err |
| 296 | + } |
| 297 | + |
| 298 | + // If no patch is required, exit early |
| 299 | + if !patchRequired { |
| 300 | + return false, nil, nil |
| 301 | + } |
| 302 | + |
| 303 | + // If patch is required, marshal the new providerspec into the controlplanemachineset |
| 304 | + newCPMS = cpms.DeepCopy() |
| 305 | + if err := marshalProviderSpecCPMS(newCPMS, newProviderSpec); err != nil { |
| 306 | + return false, nil, err |
| 307 | + } |
| 308 | + return patchRequired, newCPMS, nil |
| 309 | +} |
| 310 | + |
| 311 | +// This function unmarshals the controlplanemachineset's provider spec into |
| 312 | +// a ProviderSpec object. Returns an error if providerSpec field is nil, |
| 313 | +// or the unmarshal fails |
| 314 | +func unmarshalProviderSpecCPMS(ms *machinev1.ControlPlaneMachineSet, providerSpec interface{}) error { |
| 315 | + if ms == nil { |
| 316 | + return fmt.Errorf("ControlPlaneMachineSet object was nil") |
| 317 | + } |
| 318 | + if ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value == nil { |
| 319 | + return fmt.Errorf("providerSpec field was empty") |
| 320 | + } |
| 321 | + if err := yaml.Unmarshal(ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw, &providerSpec); err != nil { |
| 322 | + return fmt.Errorf("unmarshal into providerSpec failed %w", err) |
| 323 | + } |
| 324 | + return nil |
| 325 | +} |
| 326 | + |
| 327 | +// This function marshals the ProviderSpec object into a ControlPlaneMachineSet object. |
| 328 | +// Returns an error if ProviderSpec or ControlPlaneMachineSet is nil, or if the marshal fails |
| 329 | +func marshalProviderSpecCPMS(ms *machinev1.ControlPlaneMachineSet, providerSpec interface{}) error { |
| 330 | + if ms == nil { |
| 331 | + return fmt.Errorf("ControlPlaneMachineSet object was nil") |
| 332 | + } |
| 333 | + if providerSpec == nil { |
| 334 | + return fmt.Errorf("ProviderSpec object was nil") |
| 335 | + } |
| 336 | + rawBytes, err := json.Marshal(providerSpec) |
| 337 | + if err != nil { |
| 338 | + return fmt.Errorf("marshal into machineset failed: %w", err) |
| 339 | + } |
| 340 | + ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value = &kruntime.RawExtension{Raw: rawBytes} |
| 341 | + return nil |
| 342 | +} |
0 commit comments