Skip to content

Commit 4aa7aca

Browse files
committed
msbic: implement CPMS support
1 parent b36147e commit 4aa7aca

File tree

6 files changed

+431
-22
lines changed

6 files changed

+431
-22
lines changed

cmd/machine-config-controller/start.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ func runStartCmd(_ *cobra.Command, _ []string) {
134134
ctrlctx.ClientBuilder.MachineClientOrDie("machine-set-boot-image-controller"),
135135
ctrlctx.KubeNamespacedInformerFactory.Core().V1().ConfigMaps(),
136136
ctrlctx.MachineInformerFactory.Machine().V1beta1().MachineSets(),
137+
ctrlctx.MachineInformerFactory.Machine().V1().ControlPlaneMachineSets(),
137138
ctrlctx.ConfigInformerFactory.Config().V1().Infrastructures(),
138139
ctrlctx.ClientBuilder.OperatorClientOrDie(componentName),
139140
ctrlctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(),

manifests/machineconfigcontroller/clusterrole.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ rules:
3737
resources: ["daemonsets"]
3838
verbs: ["get"]
3939
- apiGroups: ["machine.openshift.io"]
40-
resources: ["machinesets","machines"]
40+
resources: ["machinesets","machines","controlplanemachinesets"]
4141
verbs: ["get", "list", "watch", "patch"]
4242
- apiGroups: ["operator.openshift.io"]
4343
resources: ["machineconfigurations/status"]
Lines changed: 342 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,342 @@
1+
package machineset
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"encoding/json"
7+
"fmt"
8+
"time"
9+
10+
archtranslater "github.com/coreos/stream-metadata-go/arch"
11+
"github.com/coreos/stream-metadata-go/stream"
12+
osconfigv1 "github.com/openshift/api/config/v1"
13+
machinev1 "github.com/openshift/api/machine/v1"
14+
opv1 "github.com/openshift/api/operator/v1"
15+
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"
16+
operatorversion "github.com/openshift/machine-config-operator/pkg/version"
17+
"sigs.k8s.io/yaml"
18+
19+
corev1 "k8s.io/api/core/v1"
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
kruntime "k8s.io/apimachinery/pkg/runtime"
22+
"k8s.io/apimachinery/pkg/types"
23+
kubeErrs "k8s.io/apimachinery/pkg/util/errors"
24+
"k8s.io/apimachinery/pkg/util/jsonmergepatch"
25+
"k8s.io/apimachinery/pkg/util/wait"
26+
clientset "k8s.io/client-go/kubernetes"
27+
"k8s.io/klog/v2"
28+
)
29+
30+
// syncControlPlaneMachineSets will attempt to enqueue every control plane machineset
31+
// ControlPlaneMachineSets are singletons, but for the sake of consistency with the other
32+
// syncs, I chose to keep this function similar.
33+
// nolint:dupl // I separated these from syncMAPIMachineSets for readability
34+
func (ctrl *Controller) syncControlPlaneMachineSets(reason string) {
35+
36+
// TODO: check if CPMS feature gate exists after API lands
37+
38+
ctrl.cpmsSyncMutex.Lock()
39+
defer ctrl.cpmsSyncMutex.Unlock()
40+
41+
var mcop *opv1.MachineConfiguration
42+
var pollError error
43+
// Wait for mcop.Status to populate, otherwise error out. This shouldn't take very long
44+
// as this is done by the operator sync loop.
45+
if err := wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 2*time.Minute, true, func(_ context.Context) (bool, error) {
46+
mcop, pollError = ctrl.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName)
47+
if pollError != nil {
48+
klog.Errorf("MachineConfiguration/cluster has not been created yet")
49+
return false, nil
50+
}
51+
52+
// Ensure status.ObservedGeneration matches the last generation of MachineConfiguration
53+
if mcop.Generation != mcop.Status.ObservedGeneration {
54+
klog.Errorf("MachineConfiguration.Status is not up to date.")
55+
pollError = fmt.Errorf("MachineConfiguration.Status is not up to date")
56+
return false, nil
57+
}
58+
return true, nil
59+
}); err != nil {
60+
klog.Errorf("MachineConfiguration was not ready: %v", pollError)
61+
ctrl.updateConditions(reason, fmt.Errorf("MachineConfiguration was not ready: while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded)
62+
return
63+
}
64+
65+
// TODO: Change this to CPMS type after API lands
66+
machineManagerFound, machineResourceSelector, err := getMachineResourceSelectorFromMachineManagers(mcop.Status.ManagedBootImagesStatus.MachineManagers, opv1.MachineAPI, opv1.MachineSets)
67+
if err != nil {
68+
klog.Errorf("failed to create a machineset selector while enqueueing controlplanemachineset %v", err)
69+
ctrl.updateConditions(reason, fmt.Errorf("failed to create a machineset selector while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded)
70+
return
71+
}
72+
if !machineManagerFound {
73+
klog.V(4).Infof("No ControlPlaneMachineSet manager was found, so no ControlPlaneMachineSet will be enrolled.")
74+
// clear out MAPI boot image history
75+
for k := range ctrl.cpmsBootImageState {
76+
delete(ctrl.cpmsBootImageState, k)
77+
}
78+
}
79+
80+
controlPlaneMachineSets, err := ctrl.cpmsLister.List(machineResourceSelector)
81+
if err != nil {
82+
klog.Errorf("failed to fetch ControlPlaneMachineSet list while enqueueing ControlPlaneMachineSet %v", err)
83+
ctrl.updateConditions(reason, fmt.Errorf("failed to fetch ControlPlaneMachineSet list while enqueueing ControlPlaneMachineSet %v", err), opv1.MachineConfigurationBootImageUpdateDegraded)
84+
return
85+
}
86+
87+
// If no machine resources were enrolled; exit the enqueue process without errors.
88+
if len(controlPlaneMachineSets) == 0 {
89+
klog.Infof("No ControlPlaneMachineSet were enrolled, so no MAPI machinesets will be enqueued.")
90+
// clear out ControlPlaneMachineSet boot image history
91+
for k := range ctrl.cpmsBootImageState {
92+
delete(ctrl.cpmsBootImageState, k)
93+
}
94+
}
95+
96+
// Reset stats before initiating reconciliation loop
97+
ctrl.cpmsStats.inProgress = 0
98+
ctrl.cpmsStats.totalCount = len(controlPlaneMachineSets)
99+
ctrl.cpmsStats.erroredCount = 0
100+
101+
// Signal start of reconciliation process, by setting progressing to true
102+
var syncErrors []error
103+
ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing)
104+
105+
for _, controlPlaneMachineSet := range controlPlaneMachineSets {
106+
err := ctrl.syncControlPlaneMachineSet(controlPlaneMachineSet)
107+
if err == nil {
108+
ctrl.cpmsStats.inProgress++
109+
} else {
110+
klog.Errorf("Error syncing ControlPlaneMachineSet %v", err)
111+
syncErrors = append(syncErrors, fmt.Errorf("error syncing ControlPlaneMachineSet %s: %v", controlPlaneMachineSet.Name, err))
112+
ctrl.cpmsStats.erroredCount++
113+
}
114+
// Update progressing conditions every step of the loop
115+
ctrl.updateConditions(reason, nil, opv1.MachineConfigurationBootImageUpdateProgressing)
116+
}
117+
// Update/Clear degrade conditions based on errors from this loop
118+
ctrl.updateConditions(reason, kubeErrs.NewAggregate(syncErrors), opv1.MachineConfigurationBootImageUpdateDegraded)
119+
}
120+
121+
// syncControlPlaneMachineSet will attempt to reconcile the provided ControlPlaneMachineSet
122+
func (ctrl *Controller) syncControlPlaneMachineSet(controlPlaneMachineSet *machinev1.ControlPlaneMachineSet) error {
123+
124+
startTime := time.Now()
125+
klog.V(4).Infof("Started syncing ControlPlaneMachineSet %q (%v)", controlPlaneMachineSet.Name, startTime)
126+
defer func() {
127+
klog.V(4).Infof("Finished syncing ControlPlaneMachineSet %q (%v)", controlPlaneMachineSet.Name, time.Since(startTime))
128+
}()
129+
130+
// If the machineset has an owner reference, exit and report error. This means
131+
// that the machineset may be managed by another workflow and should not be reconciled.
132+
if len(controlPlaneMachineSet.GetOwnerReferences()) != 0 {
133+
klog.Infof("ControlPlaneMachineSet %s has OwnerReference: %v, skipping boot image update", controlPlaneMachineSet.GetOwnerReferences()[0].Kind+"/"+controlPlaneMachineSet.GetOwnerReferences()[0].Name, controlPlaneMachineSet.Name)
134+
return nil
135+
}
136+
137+
if os, ok := controlPlaneMachineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.Labels[OSLabelKey]; ok {
138+
if os == "Windows" {
139+
klog.Infof("ControlPlaneMachineSet %s has a windows os label, skipping boot image update", controlPlaneMachineSet.Name)
140+
return nil
141+
}
142+
}
143+
144+
// ControlPlaneMachineSets do not normally have an arch annotation, so use the architecture of the node
145+
// running this pod, which will always be a control plane node.
146+
arch := archtranslater.CurrentRpmArch()
147+
148+
// Fetch the infra object to determine the platform type
149+
infra, err := ctrl.infraLister.Get("cluster")
150+
if err != nil {
151+
return fmt.Errorf("failed to fetch infra object during ControlPlaneMachineSet sync: %w", err)
152+
}
153+
154+
// Fetch the bootimage configmap & ensure it has been stamped by the operator. This is done by
155+
// the operator when a master node successfully updates to a new image. This is
156+
// to prevent machinesets from being updated before the operator itself has updated.
157+
// If it hasn't been updated, exit and wait for a resync.
158+
configMap, err := ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName)
159+
if err != nil {
160+
return fmt.Errorf("failed to fetch coreos-bootimages config map duringControlPlaneMachineSet sync: %w", err)
161+
}
162+
versionHashFromCM, versionHashFound := configMap.Data[ctrlcommon.MCOVersionHashKey]
163+
if !versionHashFound {
164+
klog.Infof("failed to find mco version hash in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName)
165+
return nil
166+
}
167+
if versionHashFromCM != operatorversion.Hash {
168+
klog.Infof("mismatch between MCO hash version stored in configmap and current MCO version; sync will exit to wait for the MCO upgrade to complete")
169+
return nil
170+
}
171+
releaseVersionFromCM, releaseVersionFound := configMap.Data[ctrlcommon.OCPReleaseVersionKey]
172+
if !releaseVersionFound {
173+
klog.Infof("failed to find OCP release version in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName)
174+
return nil
175+
}
176+
if releaseVersionFromCM != operatorversion.ReleaseVersion {
177+
klog.Infof("mismatch between OCP release version stored in configmap and current MCO release version; sync will exit to wait for the MCO upgrade to complete")
178+
return nil
179+
}
180+
181+
// Check if the this ControlPlaneMachineSet requires an update
182+
patchRequired, newControlPlaneMachineSet, err := checkControlPlaneMachineSet(infra, controlPlaneMachineSet, configMap, arch, ctrl.kubeClient)
183+
if err != nil {
184+
return fmt.Errorf("failed to reconcile ControlPlaneMachineSet %s, err: %w", controlPlaneMachineSet.Name, err)
185+
}
186+
187+
// Patch the machineset if required
188+
if patchRequired {
189+
// First, check if we're hot looping
190+
if ctrl.checkControlPlaneMachineSetHotLoop(newControlPlaneMachineSet) {
191+
return fmt.Errorf("refusing to reconcile ControlPlaneMachineSet %s, hot loop detected. Please opt-out of boot image updates, adjust your machine provisioning workflow to prevent hot loops and opt back in to resume boot image updates", controlPlaneMachineSet.Name)
192+
}
193+
klog.Infof("Patching ControlPlaneMachineSet %s", controlPlaneMachineSet.Name)
194+
return ctrl.patchControlPlaneMachineSet(controlPlaneMachineSet, newControlPlaneMachineSet)
195+
}
196+
klog.Infof("No patching required for ControlPlaneMachineSet %s", controlPlaneMachineSet.Name)
197+
return nil
198+
}
199+
200+
// Checks against a local store of boot image updates to detect hot looping
201+
func (ctrl *Controller) checkControlPlaneMachineSetHotLoop(machineSet *machinev1.ControlPlaneMachineSet) bool {
202+
bis, ok := ctrl.cpmsBootImageState[machineSet.Name]
203+
if !ok {
204+
// If the controlplanemachineset doesn't currently have a record, create a new one.
205+
ctrl.cpmsBootImageState[machineSet.Name] = BootImageState{
206+
value: machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw,
207+
hotLoopCount: 1,
208+
}
209+
} else {
210+
hotLoopCount := 1
211+
// If the controller is updating to a value that was previously updated to, increase the hot loop counter
212+
if bytes.Equal(bis.value, machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw) {
213+
hotLoopCount = (bis.hotLoopCount) + 1
214+
}
215+
// Return an error and degrade if the hot loop counter is above threshold
216+
if hotLoopCount > HotLoopLimit {
217+
return true
218+
}
219+
ctrl.cpmsBootImageState[machineSet.Name] = BootImageState{
220+
value: machineSet.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw,
221+
hotLoopCount: hotLoopCount,
222+
}
223+
}
224+
return false
225+
}
226+
227+
// This function patches the ControlPlaneMachineSet object using the machineClient
228+
// Returns an error if marshsalling or patching fails.
229+
func (ctrl *Controller) patchControlPlaneMachineSet(oldControlPlaneMachineSet, newControlPlaneMachineSet *machinev1.ControlPlaneMachineSet) error {
230+
oldControlPlaneMachineSetMarshal, err := json.Marshal(oldControlPlaneMachineSet)
231+
if err != nil {
232+
return fmt.Errorf("unable to marshal old ControlPlaneMachineSet: %w", err)
233+
}
234+
newControlPlaneMachineSetMarshal, err := json.Marshal(newControlPlaneMachineSet)
235+
if err != nil {
236+
return fmt.Errorf("unable to marshal new ControlPlaneMachineSet: %w", err)
237+
}
238+
patchBytes, err := jsonmergepatch.CreateThreeWayJSONMergePatch(oldControlPlaneMachineSetMarshal, newControlPlaneMachineSetMarshal, oldControlPlaneMachineSetMarshal)
239+
if err != nil {
240+
return fmt.Errorf("unable to create patch for new ControlPlaneMachineSet: %w", err)
241+
}
242+
_, err = ctrl.machineClient.MachineV1().ControlPlaneMachineSets(MachineAPINamespace).Patch(context.TODO(), oldControlPlaneMachineSet.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{})
243+
if err != nil {
244+
return fmt.Errorf("unable to patch new ControlPlaneMachineSet: %w", err)
245+
}
246+
klog.Infof("Successfully patched ControlPlaneMachineSet %s", oldControlPlaneMachineSet.Name)
247+
return nil
248+
}
249+
250+
// This function calls the appropriate reconcile function based on the infra type
251+
// On success, it will return a bool indicating if a patch is required, and an updated
252+
// machineset object if any. It will return an error if any of the above steps fail.
253+
func checkControlPlaneMachineSet(infra *osconfigv1.Infrastructure, machineSet *machinev1.ControlPlaneMachineSet, configMap *corev1.ConfigMap, arch string, secretClient clientset.Interface) (bool, *machinev1.ControlPlaneMachineSet, error) {
254+
switch infra.Status.PlatformStatus.Type {
255+
case osconfigv1.AWSPlatformType:
256+
return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileAWSProviderSpec)
257+
case osconfigv1.AzurePlatformType:
258+
return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileAzureProviderSpec)
259+
case osconfigv1.GCPPlatformType:
260+
return reconcilePlatformCPMS(machineSet, infra, configMap, arch, secretClient, reconcileGCPProviderSpec)
261+
// TODO: vsphere CPMS template seems to be empty in CI runs, and will need further investigation
262+
default:
263+
klog.Infof("Skipping controlplanemachineset %s, unsupported platform %s", machineSet.Name, infra.Status.PlatformStatus.Type)
264+
return false, nil, nil
265+
}
266+
}
267+
268+
// Generic reconcile function that handles the common pattern across all platforms
269+
// nolint:dupl // I separated this from reconcilePlatform for readability
270+
func reconcilePlatformCPMS[T any](
271+
cpms *machinev1.ControlPlaneMachineSet,
272+
infra *osconfigv1.Infrastructure,
273+
configMap *corev1.ConfigMap,
274+
arch string,
275+
secretClient clientset.Interface,
276+
reconcileProviderSpec func(*stream.Stream, string, *osconfigv1.Infrastructure, *T, string, clientset.Interface) (bool, *T, error),
277+
) (patchRequired bool, newCPMS *machinev1.ControlPlaneMachineSet, err error) {
278+
klog.Infof("Reconciling controlplanemachineset %s on %s, with arch %s", cpms.Name, string(infra.Status.PlatformStatus.Type), arch)
279+
280+
// Unmarshal the provider spec
281+
providerSpec := new(T)
282+
if err := unmarshalProviderSpecCPMS(cpms, providerSpec); err != nil {
283+
return false, nil, err
284+
}
285+
286+
// Unmarshal the configmap into a stream object
287+
streamData := new(stream.Stream)
288+
if err := unmarshalStreamDataConfigMap(configMap, streamData); err != nil {
289+
return false, nil, err
290+
}
291+
292+
// Reconcile the provider spec
293+
patchRequired, newProviderSpec, err := reconcileProviderSpec(streamData, arch, infra, providerSpec, cpms.Name, secretClient)
294+
if err != nil {
295+
return false, nil, err
296+
}
297+
298+
// If no patch is required, exit early
299+
if !patchRequired {
300+
return false, nil, nil
301+
}
302+
303+
// If patch is required, marshal the new providerspec into the controlplanemachineset
304+
newCPMS = cpms.DeepCopy()
305+
if err := marshalProviderSpecCPMS(newCPMS, newProviderSpec); err != nil {
306+
return false, nil, err
307+
}
308+
return patchRequired, newCPMS, nil
309+
}
310+
311+
// This function unmarshals the controlplanemachineset's provider spec into
312+
// a ProviderSpec object. Returns an error if providerSpec field is nil,
313+
// or the unmarshal fails
314+
func unmarshalProviderSpecCPMS(ms *machinev1.ControlPlaneMachineSet, providerSpec interface{}) error {
315+
if ms == nil {
316+
return fmt.Errorf("ControlPlaneMachineSet object was nil")
317+
}
318+
if ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value == nil {
319+
return fmt.Errorf("providerSpec field was empty")
320+
}
321+
if err := yaml.Unmarshal(ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value.Raw, &providerSpec); err != nil {
322+
return fmt.Errorf("unmarshal into providerSpec failed %w", err)
323+
}
324+
return nil
325+
}
326+
327+
// This function marshals the ProviderSpec object into a ControlPlaneMachineSet object.
328+
// Returns an error if ProviderSpec or ControlPlaneMachineSet is nil, or if the marshal fails
329+
func marshalProviderSpecCPMS(ms *machinev1.ControlPlaneMachineSet, providerSpec interface{}) error {
330+
if ms == nil {
331+
return fmt.Errorf("ControlPlaneMachineSet object was nil")
332+
}
333+
if providerSpec == nil {
334+
return fmt.Errorf("ProviderSpec object was nil")
335+
}
336+
rawBytes, err := json.Marshal(providerSpec)
337+
if err != nil {
338+
return fmt.Errorf("marshal into machineset failed: %w", err)
339+
}
340+
ms.Spec.Template.OpenShiftMachineV1Beta1Machine.Spec.ProviderSpec.Value = &kruntime.RawExtension{Raw: rawBytes}
341+
return nil
342+
}

0 commit comments

Comments
 (0)