Skip to content

Commit 09b1528

Browse files
committed
msbic: implement CPMS support
1 parent 5e37bd7 commit 09b1528

File tree

6 files changed

+470
-23
lines changed

6 files changed

+470
-23
lines changed

cmd/machine-config-controller/start.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ func runStartCmd(_ *cobra.Command, _ []string) {
134134
ctrlctx.ClientBuilder.MachineClientOrDie("machine-set-boot-image-controller"),
135135
ctrlctx.KubeNamespacedInformerFactory.Core().V1().ConfigMaps(),
136136
ctrlctx.MachineInformerFactory.Machine().V1beta1().MachineSets(),
137+
ctrlctx.MachineInformerFactory.Machine().V1().ControlPlaneMachineSets(),
137138
ctrlctx.ConfigInformerFactory.Config().V1().Infrastructures(),
138139
ctrlctx.ClientBuilder.OperatorClientOrDie(componentName),
139140
ctrlctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(),

manifests/machineconfigcontroller/clusterrole.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ rules:
3737
resources: ["daemonsets"]
3838
verbs: ["get"]
3939
- apiGroups: ["machine.openshift.io"]
40-
resources: ["machinesets","machines"]
40+
resources: ["machinesets","machines","controlplanemachinesets"]
4141
verbs: ["get", "list", "watch", "patch"]
4242
- apiGroups: ["operator.openshift.io"]
4343
resources: ["machineconfigurations/status"]
Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
package machineset
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"encoding/json"
7+
"fmt"
8+
"time"
9+
10+
archtranslater "github.com/coreos/stream-metadata-go/arch"
11+
"github.com/coreos/stream-metadata-go/stream"
12+
osconfigv1 "github.com/openshift/api/config/v1"
13+
features "github.com/openshift/api/features"
14+
machinev1 "github.com/openshift/api/machine/v1"
15+
opv1 "github.com/openshift/api/operator/v1"
16+
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"
17+
operatorversion "github.com/openshift/machine-config-operator/pkg/version"
18+
"sigs.k8s.io/yaml"
19+
20+
corev1 "k8s.io/api/core/v1"
21+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22+
kruntime "k8s.io/apimachinery/pkg/runtime"
23+
"k8s.io/apimachinery/pkg/types"
24+
kubeErrs "k8s.io/apimachinery/pkg/util/errors"
25+
"k8s.io/apimachinery/pkg/util/jsonmergepatch"
26+
"k8s.io/apimachinery/pkg/util/wait"
27+
clientset "k8s.io/client-go/kubernetes"
28+
"k8s.io/klog/v2"
29+
)
30+
31+
// syncControlPlaneMachineSets will attempt to enqueue every control plane machineset
32+
// ControlPlaneMachineSets are singletons, but for the sake of consistency with the other
33+
// syncs, I chose to keep this function similar.
34+
// nolint:dupl // I separated these from syncMAPIMachineSets for readability
35+
func (ctrl *Controller) syncControlPlaneMachineSets(reason string) {
36+
37+
// Check if CPMS feature gate is enabled
38+
if !ctrl.fgHandler.Enabled(features.FeatureGateManagedBootImagesCPMS) {
39+
klog.V(4).Infof("ManagedBootImagesCPMS feature gate is not enabled, skipping CPMS sync")
40+
return
41+
}
42+
43+
ctrl.cpmsSyncMutex.Lock()
44+
defer ctrl.cpmsSyncMutex.Unlock()
45+
46+
var mcop *opv1.MachineConfiguration
47+
var pollError error
48+
// Wait for mcop.Status to populate, otherwise error out. This shouldn't take very long
49+
// as this is done by the operator sync loop.
50+
if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) {
51+
mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName)
52+
if pollError != nil {
53+
klog.Errorf("MachineConfiguration/cluster has not been created yet")
54+
return false, nil
55+
}
56+
57+
// Ensure status.ObservedGeneration matches the last generation of MachineConfiguration
58+
if mcop.Generation != mcop.Status.ObservedGeneration {
59+
klog.Errorf("MachineConfiguration.Status is not up to date.")
60+
pollError = fmt.Errorf("MachineConfiguration.Status is not up to date")
61+
return false, nil
62+
}
63+
return true, nil
64+
}); err != nil {
65+
klog.Errorf("MachineConfiguration was not ready: %v", pollError)
66+
ctrl.updateConditions(reason, fmt.Errorf("MachineConfiguration was not ready: while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded)
67+
return
68+
}
69+
70+
machineManagerFound, machineResourceSelector, err := getMachineResourceSelectorFromMachineManagers(mcop.Status.ManagedBootImagesStatus.MachineManagers, opv1.MachineAPI, opv1.ControlPlaneMachineSets)
71+
if err != nil {
72+
klog.Errorf("failed to create a machineset selector while enqueueing controlplanemachineset %v", err)
73+
ctrl.updateConditions(reason, fmt.Errorf("failed to create a machineset selector while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded)
74+
return
75+
}
76+
if !machineManagerFound {
77+
klog.V(4).Infof("No ControlPlaneMachineSet manager was found, so no ControlPlaneMachineSet will be enrolled.")
78+
// clear out MAPI boot image history
79+
for k := range ctrl.cpmsBootImageState {
80+
delete(ctrl.cpmsBootImageState, k)
81+
}
82+
}
83+
84+
controlPlaneMachineSets, err := ctrl.cpmsLister.List(machineResourceSelector)
85+
if err != nil {
86+
klog.Errorf("failed to fetch ControlPlaneMachineSet list while enqueueing ControlPlaneMachineSet %v", err)
87+
ctrl.updateConditions(reason, fmt.Errorf("failed to fetch ControlPlaneMachineSet list while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded)
88+
return
89+
}
90+
91+
// If no machine resources were enrolled; exit the enqueue process without errors.
92+
if len(controlPlaneMachineSets) == 0 {
93+
klog.Infof("No ControlPlaneMachineSet was enrolled, so no ControlPlaneMachineSet will be enqueued.")
94+
// clear out ControlPlaneMachineSet boot image history
95+
for k := range ctrl.cpmsBootImageState {
96+
delete(ctrl.cpmsBootImageState, k)
97+
}
98+
}
99+
100+
// Reset stats before initiating reconciliation loop
101+
ctrl.cpmsStats.inProgress = 0
102+
ctrl.cpmsStats.totalCount = len(controlPlaneMachineSets)
103+
ctrl.cpmsStats.erroredCount = 0
104+
105+
// Signal start of reconciliation process, by setting progressing to true
106+
var syncErrors []error
107+
ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing)
108+
109+
for _, controlPlaneMachineSet := range controlPlaneMachineSets {
110+
err := ctrl.syncControlPlaneMachineSet(controlPlaneMachineSet)
111+
if err == nil {
112+
ctrl.cpmsStats.inProgress++
113+
} else {
114+
klog.Errorf("Error syncing ControlPlaneMachineSet %v", err)
115+
syncErrors = append(syncErrors, fmt.Errorf("error syncing ControlPlaneMachineSet %s: %v", controlPlaneMachineSet.Name, err))
116+
ctrl.cpmsStats.erroredCount++
117+
}
118+
// Update progressing conditions every step of the loop
119+
ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing)
120+
}
121+
// Update/Clear degrade conditions based on errors from this loop
122+
ctrl.updateConditions(reason, kubeErrs.NewAggregate(syncErrors), opv1.MachineConfigurationBootImageUpdateDegraded)
123+
}
124+
125+
// syncControlPlaneMachineSet will attempt to reconcile the provided ControlPlaneMachineSet
126+
func (ctrl *Controller) syncControlPlaneMachineSet(controlPlaneMachineSet *machinev1.ControlPlaneMachineSet) error {
127+
128+
startTime := time.Now()
129+
klog.V(4).Infof("Started syncing ControlPlaneMachineSet %q (%v)", controlPlaneMachineSet.Name, startTime)
130+
defer func() {
131+
klog.V(4).Infof("Finished syncing ControlPlaneMachineSet %q (%v)", controlPlaneMachineSet.Name, time.Since(startTime))
132+
}()
133+
134+
// If the machineset has an owner reference, exit and report error. This means
135+
// that the machineset may be managed by another workflow and should not be reconciled.
136+
if len(controlPlaneMachineSet.GetOwnerReferences()) != 0 {
137+
klog.Infof("ControlPlaneMachineSet %s has OwnerReference: %v, skipping boot image update", controlPlaneMachineSet.GetOwnerReferences()[0].Kind+"/"+controlPlaneMachineSet.GetOwnerReferences()[0].Name, controlPlaneMachineSet.Name)
138+
return nil
139+
}
140+
141+
if os, ok := controlPlaneMachineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.Labels[OSLabelKey]; ok {
142+
if os == "Windows" {
143+
klog.Infof("ControlPlaneMachineSet %s has a windows os label, skipping boot image update", controlPlaneMachineSet.Name)
144+
return nil
145+
}
146+
}
147+
148+
// ControlPlaneMachineSets do not normally have an arch annotation, so use the architecture of the node
149+
// running this pod, which will always be a control plane node.
150+
arch := archtranslater.CurrentRpmArch()
151+
152+
// Fetch the infra object to determine the platform type
153+
infra, err := ctrl.infraLister.Get("cluster")
154+
if err != nil {
155+
return fmt.Errorf("failed to fetch infra object during ControlPlaneMachineSet sync: %w", err)
156+
}
157+
158+
// Fetch the bootimage configmap & ensure it has been stamped by the operator. This is done by
159+
// the operator when a master node successfully updates to a new image. This is
160+
// to prevent machinesets from being updated before the operator itself has updated.
161+
// If it hasn't been updated, exit and wait for a resync.
162+
configMap, err := ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName)
163+
if err != nil {
164+
return fmt.Errorf("failed to fetch coreos-bootimages config map duringControlPlaneMachineSet sync: %w", err)
165+
}
166+
versionHashFromCM, versionHashFound := configMap.Data[ctrlcommon.MCOVersionHashKey]
167+
if !versionHashFound {
168+
klog.Infof("failed to find mco version hash in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName)
169+
return nil
170+
}
171+
if versionHashFromCM != operatorversion.Hash {
172+
klog.Infof("mismatch between MCO hash version stored in configmap and current MCO version; sync will exit to wait for the MCO upgrade to complete")
173+
return nil
174+
}
175+
releaseVersionFromCM, releaseVersionFound := configMap.Data[ctrlcommon.OCPReleaseVersionKey]
176+
if !releaseVersionFound {
177+
klog.Infof("failed to find OCP release version in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName)
178+
return nil
179+
}
180+
if releaseVersionFromCM != operatorversion.ReleaseVersion {
181+
klog.Infof("mismatch between OCP release version stored in configmap and current MCO release version; sync will exit to wait for the MCO upgrade to complete")
182+
return nil
183+
}
184+
185+
// Check if the this ControlPlaneMachineSet requires an update
186+
patchRequired, newControlPlaneMachineSet, err := checkControlPlaneMachineSet(infra, controlPlaneMachineSet, configMap, arch, ctrl.kubeClient)
187+
if err != nil {
188+
return fmt.Errorf("failed to reconcile ControlPlaneMachineSet %s, err: %w", controlPlaneMachineSet.Name, err)
189+
}
190+
191+
// Patch the machineset if required
192+
if patchRequired {
193+
// First, check if we're hot looping
194+
if ctrl.checkControlPlaneMachineSetHotLoop(newControlPlaneMachineSet) {
195+
return fmt.Errorf("refusing to reconcile ControlPlaneMachineSet %s, hot loop detected. Please opt-out of boot image updates, adjust your machine provisioning workflow to prevent hot loops and opt back in to resume boot image updates", controlPlaneMachineSet.Name)
196+
}
197+
klog.Infof("Patching ControlPlaneMachineSet %s", controlPlaneMachineSet.Name)
198+
return ctrl.patchControlPlaneMachineSet(controlPlaneMachineSet, newControlPlaneMachineSet)
199+
}
200+
klog.Infof("No patching required for ControlPlaneMachineSet %s", controlPlaneMachineSet.Name)
201+
return nil
202+
}
203+
204+
// Checks against a local store of boot image updates to detect hot looping
205+
func (ctrl *Controller) checkControlPlaneMachineSetHotLoop(machineSet *machinev1.ControlPlaneMachineSet) bool {
206+
bis, ok := ctrl.cpmsBootImageState[machineSet.Name]
207+
if !ok {
208+
// If the controlplanemachineset doesn't currently have a record, create a new one.
209+
ctrl.cpmsBootImageState[machineSet.Name] = BootImageState{
210+
value: machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw,
211+
hotLoopCount: 1,
212+
}
213+
} else {
214+
hotLoopCount := 1
215+
// If the controller is updating to a value that was previously updated to, increase the hot loop counter
216+
if bytes.Equal(bis.value, machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw) {
217+
hotLoopCount = (bis.hotLoopCount) + 1
218+
}
219+
// Return an error and degrade if the hot loop counter is above threshold
220+
if hotLoopCount > HotLoopLimit {
221+
return true
222+
}
223+
ctrl.cpmsBootImageState[machineSet.Name] = BootImageState{
224+
value: machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw,
225+
hotLoopCount: hotLoopCount,
226+
}
227+
}
228+
return false
229+
}
230+
231+
// This function patches the ControlPlaneMachineSet object using the machineClient
232+
// Returns an error if marshsalling or patching fails.
233+
func (ctrl *Controller) patchControlPlaneMachineSet(oldControlPlaneMachineSet, newControlPlaneMachineSet *machinev1.ControlPlaneMachineSet) error {
234+
oldControlPlaneMachineSetMarshal, err := json.Marshal(oldControlPlaneMachineSet)
235+
if err != nil {
236+
return fmt.Errorf("unable to marshal old ControlPlaneMachineSet: %w", err)
237+
}
238+
newControlPlaneMachineSetMarshal, err := json.Marshal(newControlPlaneMachineSet)
239+
if err != nil {
240+
return fmt.Errorf("unable to marshal new ControlPlaneMachineSet: %w", err)
241+
}
242+
patchBytes, err := jsonmergepatch.CreateThreeWayJSONMergePatch(oldControlPlaneMachineSetMarshal, newControlPlaneMachineSetMarshal, oldControlPlaneMachineSetMarshal)
243+
if err != nil {
244+
return fmt.Errorf("unable to create patch for new ControlPlaneMachineSet: %w", err)
245+
}
246+
_, err = ctrl.machineClient.MachineV1().ControlPlaneMachineSets(MachineAPINamespace).Patch(context.TODO(), oldControlPlaneMachineSet.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{})
247+
if err != nil {
248+
return fmt.Errorf("unable to patch new ControlPlaneMachineSet: %w", err)
249+
}
250+
klog.Infof("Successfully patched ControlPlaneMachineSet %s", oldControlPlaneMachineSet.Name)
251+
return nil
252+
}
253+
254+
// This function calls the appropriate reconcile function based on the infra type
255+
// On success, it will return a bool indicating if a patch is required, and an updated
256+
// machineset object if any. It will return an error if any of the above steps fail.
257+
func checkControlPlaneMachineSet(infra *osconfigv1.Infrastructure, machineSet *machinev1.ControlPlaneMachineSet, configMap *corev1.ConfigMap, arch string, secretClient clientset.Interface) (bool, *machinev1.ControlPlaneMachineSet, error) {
258+
switch infra.Status.PlatformStatus.Type {
259+
case osconfigv1.AWSPlatformType:
260+
return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileAWSProviderSpec)
261+
case osconfigv1.AzurePlatformType:
262+
return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileAzureProviderSpec)
263+
case osconfigv1.GCPPlatformType:
264+
return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileGCPProviderSpec)
265+
// TODO: vsphere CPMS template seems to be empty in CI runs, and will need further investigation
266+
default:
267+
klog.Infof("Skipping controlplanemachineset %s, unsupported platform %s", machineSet.Name, infra.Status.PlatformStatus.Type)
268+
return false, nil, nil
269+
}
270+
}
271+
272+
// Generic reconcile function that handles the common pattern across all platforms
273+
// nolint:dupl // I separated this from reconcilePlatform for readability
274+
func reconcilePlatformCPMS[T any](
275+
cpms *machinev1.ControlPlaneMachineSet,
276+
infra *osconfigv1.Infrastructure,
277+
configMap *corev1.ConfigMap,
278+
arch string,
279+
secretClient clientset.Interface,
280+
reconcileProviderSpec func(*stream.Stream, string, *osconfigv1.Infrastructure, *T, string, clientset.Interface) (bool, *T, error),
281+
) (patchRequired bool, newCPMS *machinev1.ControlPlaneMachineSet, err error) {
282+
klog.Infof("Reconciling controlplanemachineset %s on %s, with arch %s", cpms.Name, string(infra.Status.PlatformStatus.Type), arch)
283+
284+
// Unmarshal the provider spec
285+
providerSpec := new(T)
286+
if err := unmarshalProviderSpecCPMS(cpms, providerSpec); err != nil {
287+
return false, nil, err
288+
}
289+
290+
// Unmarshal the configmap into a stream object
291+
streamData := new(stream.Stream)
292+
if err := unmarshalStreamDataConfigMap(configMap, streamData); err != nil {
293+
return false, nil, err
294+
}
295+
296+
// Reconcile the provider spec
297+
patchRequired, newProviderSpec, err := reconcileProviderSpec(streamData, arch, infra, providerSpec, cpms.Name, secretClient)
298+
if err != nil {
299+
return false, nil, err
300+
}
301+
302+
// If no patch is required, exit early
303+
if !patchRequired {
304+
return false, nil, nil
305+
}
306+
307+
// If patch is required, marshal the new providerspec into the controlplanemachineset
308+
newCPMS = cpms.DeepCopy()
309+
if err := marshalProviderSpecCPMS(newCPMS, newProviderSpec); err != nil {
310+
return false, nil, err
311+
}
312+
return patchRequired, newCPMS, nil
313+
}
314+
315+
// This function unmarshals the controlplanemachineset's provider spec into
316+
// a ProviderSpec object. Returns an error if providerSpec field is nil,
317+
// or the unmarshal fails
318+
func unmarshalProviderSpecCPMS(ms *machinev1.ControlPlaneMachineSet, providerSpec interface{}) error {
319+
if ms == nil {
320+
return fmt.Errorf("ControlPlaneMachineSet object was nil")
321+
}
322+
if ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value == nil {
323+
return fmt.Errorf("providerSpec field was empty")
324+
}
325+
if err := yaml.Unmarshal(ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw, &providerSpec); err != nil {
326+
return fmt.Errorf("unmarshal into providerSpec failed %w", err)
327+
}
328+
return nil
329+
}
330+
331+
// This function marshals the ProviderSpec object into a ControlPlaneMachineSet object.
332+
// Returns an error if ProviderSpec or ControlPlaneMachineSet is nil, or if the marshal fails
333+
func marshalProviderSpecCPMS(ms *machinev1.ControlPlaneMachineSet, providerSpec interface{}) error {
334+
if ms == nil {
335+
return fmt.Errorf("ControlPlaneMachineSet object was nil")
336+
}
337+
if providerSpec == nil {
338+
return fmt.Errorf("ProviderSpec object was nil")
339+
}
340+
rawBytes, err := json.Marshal(providerSpec)
341+
if err != nil {
342+
return fmt.Errorf("marshal into machineset failed: %w", err)
343+
}
344+
ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value = &kruntime.RawExtension{Raw: rawBytes}
345+
return nil
346+
}

0 commit comments

Comments
 (0)