137 changes: 91 additions & 46 deletions pkg/controller/build/reconciler.go
@@ -266,56 +266,20 @@ func (b *buildReconciler) updateMachineOSBuild(ctx context.Context, old, current
 	}
 
 	if !oldState.IsBuildFailure() && curState.IsBuildFailure() {
-		klog.Infof("MachineOSBuild %s failed, leaving ephemeral objects in place for inspection and setting BuildDegraded condition", current.Name)
-
-		// Before setting BuildDegraded, check if another MachineOSBuild is active or if this one has been superseded
-		mosbList, err := b.getMachineOSBuildsForMachineOSConfig(mosc)
-		if err != nil {
-			return fmt.Errorf("could not get MachineOSBuilds for MachineOSConfig %q: %w", mosc.Name, err)
-		}
-
-		// Check if there are any newer or active builds that would supersede this failure
-		hasActiveBuild := false
-		isCurrentBuildStale := false
-		for _, mosb := range mosbList {
-			// Skip the current failed build
-			if mosb.Name == current.Name {
-				continue
-			}
-			mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
-			// Check if there's an active build (building, prepared, or succeeded)
-			if mosbState.IsBuilding() || mosbState.IsBuildPrepared() || mosbState.IsBuildSuccess() {
-				hasActiveBuild = true
-				klog.Infof("Found active MachineOSBuild %s, skipping BuildDegraded condition for failed build %s", mosb.Name, current.Name)
-				break
-			}
-		}
-
-		// Also check if the failed MachineOSBuild is no longer referenced by the MachineOSConfig
-		if mosc.Status.CurrentImagePullSpec != "" && !isMachineOSBuildCurrentForMachineOSConfigWithPullspec(mosc, current) {
-			isCurrentBuildStale = true
-			klog.Infof("Failed MachineOSBuild %s is no longer current for MachineOSConfig %s, skipping BuildDegraded condition", current.Name, mosc.Name)
-		}
-
-		// Only set BuildDegraded if there are no active builds and this build is still current
-		if hasActiveBuild || isCurrentBuildStale {
-			klog.Infof("Skipping BuildDegraded condition for failed MachineOSBuild %s (hasActiveBuild=%v, isStale=%v)", current.Name, hasActiveBuild, isCurrentBuildStale)
-			return nil
-		}
+		klog.Infof("MachineOSBuild %s failed, leaving ephemeral objects in place for inspection", current.Name)
 
 		mcp, err := b.machineConfigPoolLister.Get(mosc.Spec.MachineConfigPool.Name)
 		if err != nil {
 			return fmt.Errorf("could not get MachineConfigPool from MachineOSConfig %q: %w", mosc.Name, err)
 		}
 
-		// Set BuildDegraded condition
-		buildError := getBuildErrorFromMOSB(current)
-		return b.syncBuildFailureStatus(ctx, mcp, buildError, current.Name)
+		// Always update ImageBuildDegraded condition based on current active build status
+		return b.updateImageBuildDegradedCondition(ctx, mcp, mosc)
 	}
 
 	// If the build was successful, clean up the build objects and propagate the
 	// final image pushspec onto the MachineOSConfig object.
-	// Also clear BuildDegraded condition if it was set due to a previously failed build
+	// Also update BuildDegraded condition based on current active build status
 	if !oldState.IsBuildSuccess() && curState.IsBuildSuccess() {
 		klog.Infof("MachineOSBuild %s succeeded, cleaning up all ephemeral objects used for the build", current.Name)
 
@@ -324,9 +288,9 @@ func (b *buildReconciler) updateMachineOSBuild(ctx context.Context, old, current
return fmt.Errorf("could not get MachineConfigPool from MachineOSConfig %q: %w", mosc.Name, err)
}

// Clear BuildDegraded condition if set
if err := b.syncBuildSuccessStatus(ctx, mcp); err != nil {
klog.Errorf("Failed to clear BuildDegraded condition for pool %s: %v", mcp.Name, err)
// Update BuildDegraded condition based on current active build status
if err := b.updateImageBuildDegradedCondition(ctx, mcp, mosc); err != nil {
klog.Errorf("Failed to update ImageBuildDegraded condition for pool %s: %v", mcp.Name, err)
}

// Clean up ephemeral objects
@@ -372,6 +336,18 @@ func (b *buildReconciler) updateMachineOSConfigStatus(ctx context.Context, mosc
klog.Infof("Updated annotations on MachineOSConfig %q", mosc.Name)

mosc = updatedMosc

// When annotations are updated (new build starts), also update observedGeneration
// to signal that the controller is processing the current generation
if mosc.Status.ObservedGeneration != mosc.GetGeneration() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thought (non-blocking): I feel like we need a few more of these checks throughout the build controller. However, I'm not sure where / when to use them.

mosc.Status.ObservedGeneration = mosc.GetGeneration()
_, err = b.mcfgclient.MachineconfigurationV1().MachineOSConfigs().UpdateStatus(ctx, mosc, metav1.UpdateOptions{})
if err != nil {
klog.Errorf("Failed to update observedGeneration on MachineOSConfig %q: %v", mosc.Name, err)
} else {
klog.Infof("Updated observedGeneration on MachineOSConfig %q to %d", mosc.Name, mosc.GetGeneration())
}
}
}

// Skip the status update if digest image pushspec hasn't been set yet.
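One way to act on that thought without repeating the pattern at every call site would be a small helper that each reconcile path calls once it has processed a MachineOSConfig. A minimal sketch, reusing the mcfgclient call shape from the diff above; the helper name syncObservedGeneration is hypothetical and not part of this PR:

func (b *buildReconciler) syncObservedGeneration(ctx context.Context, mosc *mcfgv1.MachineOSConfig) error {
	// No-op when the controller has already observed this generation.
	if mosc.Status.ObservedGeneration == mosc.GetGeneration() {
		return nil
	}
	mosc.Status.ObservedGeneration = mosc.GetGeneration()
	updated, err := b.mcfgclient.MachineconfigurationV1().MachineOSConfigs().UpdateStatus(ctx, mosc, metav1.UpdateOptions{})
	if err != nil {
		return fmt.Errorf("could not update observedGeneration on MachineOSConfig %q: %w", mosc.Name, err)
	}
	klog.Infof("Updated observedGeneration on MachineOSConfig %q to %d", updated.Name, updated.Status.ObservedGeneration)
	return nil
}

Call sites that treat the update as non-fatal (as the inline block above does) could then log the returned error instead of propagating it.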
@@ -460,14 +436,14 @@ func (b *buildReconciler) startBuild(ctx context.Context, mosb *mcfgv1.MachineOS
return fmt.Errorf("could not update MachineOSConfig %q status for MachineOSBuild %q: %w", mosc.Name, mosb.Name, err)
}

// Initialize BuildDegraded condition to False when build starts
// Update BuildDegraded condition based on current active build status when build starts
mcp, err := b.machineConfigPoolLister.Get(mosc.Spec.MachineConfigPool.Name)
if err != nil {
return fmt.Errorf("could not get MachineConfigPool from MachineOSConfig %q: %w", mosc.Name, err)
}

if err := b.initializeBuildDegradedCondition(ctx, mcp); err != nil {
klog.Errorf("Failed to initialize BuildDegraded condition for pool %s: %v", mcp.Name, err)
if err := b.updateImageBuildDegradedCondition(ctx, mcp, mosc); err != nil {
klog.Errorf("Failed to update ImageBuildDegraded condition for pool %s: %v", mcp.Name, err)
}

return nil
@@ -1713,3 +1689,72 @@ func (b *buildReconciler) syncBuildFailureStatus(ctx context.Context, pool *mcfg
 	}
 	return buildErr
 }
+
+// updateImageBuildDegradedCondition examines all MachineOSBuilds for the MachineOSConfig
+// and sets the ImageBuildDegraded condition based on the status of the currently active build
+func (b *buildReconciler) updateImageBuildDegradedCondition(ctx context.Context, pool *mcfgv1.MachineConfigPool, mosc *mcfgv1.MachineOSConfig) error {
+	mosbList, err := b.getMachineOSBuildsForMachineOSConfig(mosc)
+	if err != nil {
+		return fmt.Errorf("could not get MachineOSBuilds for MachineOSConfig %q: %w", mosc.Name, err)
+	}
+
+	// Find the currently active (most relevant) build.
+	// Priority: 1) currently referenced by the MOSC, 2) building/prepared, 3) most recent
+	var activeBuild *mcfgv1.MachineOSBuild
+	var mostRecentBuild *mcfgv1.MachineOSBuild
+
+	// First, look for the build currently referenced by the MachineOSConfig
+	for _, mosb := range mosbList {
+		if isMachineOSBuildCurrentForMachineOSConfig(mosc, mosb) {
+			activeBuild = mosb
+			break
+		}
+	}
+
+	// If no current build was found, look for active builds (building/prepared)
+	if activeBuild == nil {
+		for _, mosb := range mosbList {
+			mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
+			if mosbState.IsBuilding() || mosbState.IsBuildPrepared() {
+				activeBuild = mosb
+				break
+			}
+		}
+	}
+
+	// Keep track of the most recent build regardless
+	for _, mosb := range mosbList {
+		if mostRecentBuild == nil || mosb.CreationTimestamp.After(mostRecentBuild.CreationTimestamp.Time) {
+			mostRecentBuild = mosb
+		}
+	}
+
+	// If there is still no active build, use the most recent one
+	if activeBuild == nil {
+		activeBuild = mostRecentBuild
+	}
+
+	// If no builds exist at all, clear any existing BuildDegraded condition
+	if activeBuild == nil {
+		return b.syncBuildSuccessStatus(ctx, pool)
+	}
+
+	// Update the condition based on the active build's status
+	activeState := ctrlcommon.NewMachineOSBuildState(activeBuild)
+
+	switch {
+	case activeState.IsBuildFailure():
+		// Set BuildDegraded=True for failed builds
+		buildError := getBuildErrorFromMOSB(activeBuild)
+		return b.syncBuildFailureStatus(ctx, pool, buildError, activeBuild.Name)
+	case activeState.IsBuildSuccess():
+		// Set BuildDegraded=False for successful builds
+		return b.syncBuildSuccessStatus(ctx, pool)
+	case activeState.IsBuilding(), activeState.IsBuildPrepared():
+		// Set BuildDegraded=False for builds in progress (allow retry after a previous failure)
+		return b.initializeBuildDegradedCondition(ctx, pool)
+	}
+
+	// For any other states, don't change the condition
+	return nil
+}

Review comment (Member) on lines +1696 to +1736 — suggestion (non-blocking): Would it make sense to refactor this logic into a separate function called getActiveBuild() or getMostRelevantBuild()? (A sketch of that refactor follows below.)
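Following that suggestion, the selection logic could be lifted out so that updateImageBuildDegradedCondition reduces to "pick a build, then switch on its state". A minimal sketch under the hypothetical name getMostRelevantBuild (not part of this PR), built only from identifiers already present in the diff:

func (b *buildReconciler) getMostRelevantBuild(mosc *mcfgv1.MachineOSConfig, mosbList []*mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild {
	// Priority 1: the build the MachineOSConfig currently references.
	for _, mosb := range mosbList {
		if isMachineOSBuildCurrentForMachineOSConfig(mosc, mosb) {
			return mosb
		}
	}
	// Priority 2: any build that is currently building or prepared.
	for _, mosb := range mosbList {
		state := ctrlcommon.NewMachineOSBuildState(mosb)
		if state.IsBuilding() || state.IsBuildPrepared() {
			return mosb
		}
	}
	// Priority 3: fall back to the most recently created build, if any.
	var mostRecent *mcfgv1.MachineOSBuild
	for _, mosb := range mosbList {
		if mostRecent == nil || mosb.CreationTimestamp.After(mostRecent.CreationTimestamp.Time) {
			mostRecent = mosb
		}
	}
	return mostRecent
}

A nil return would then map directly onto the existing "no builds exist at all" branch that clears the condition.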
35 changes: 34 additions & 1 deletion pkg/controller/node/status.go
@@ -259,7 +259,8 @@ func (ctrl *Controller) calculateStatus(mcs []*mcfgv1.MachineConfigNode, cconfig

 	allUpdated := updatedMachineCount == machineCount &&
 		readyMachineCount == machineCount &&
-		unavailableMachineCount == 0
+		unavailableMachineCount == 0 &&
+		!isLayeredPoolBuilding(isLayeredPool, mosc, mosb)
 
 	if allUpdated {
 		//TODO: update api to only have one condition regarding status of update.
@@ -563,3 +564,35 @@ func isPinnedImageSetsInProgressForPool(pool *mcfgv1.MachineConfigPool) bool {
 	}
 	return false
 }
+
+// isLayeredPoolBuilding checks if a layered pool has an active build or failed build that would
+// make nodes not truly "updated" even if they have the current machine config
+func isLayeredPoolBuilding(isLayeredPool bool, mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) bool {
+	if !isLayeredPool || mosc == nil || mosb == nil {
+		return false
+	}
+
+	mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
+
+	// Check if there's an active build (building or prepared)
+	if mosbState.IsBuilding() || mosbState.IsBuildPrepared() {
+		return true
+	}
+
+	// Check if there's a failed build - this means the update attempt failed,
+	// so nodes should not be considered "updated"
+	if mosbState.IsBuildFailure() {
+		return true
+	}
+
+	// Check if there's a successful build that nodes haven't applied yet.
+	// This happens when a build completes but the MOSC status hasn't been updated
+	// or nodes haven't picked up the new image yet.
+	if mosbState.IsBuildSuccess() && mosb.Status.DigestedImagePushSpec != "" {
+		// If the successful build's image differs from what MOSC thinks is current,
+		// then nodes are not truly updated to the latest successful build
+		return string(mosb.Status.DigestedImagePushSpec) != string(mosc.Status.CurrentImagePullSpec)
+	}
+
+	return false
+}
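The guard clauses at the top of isLayeredPoolBuilding are easy to pin down in a unit test. A minimal sketch, assuming only the standard testing package; exercising the build-state branches would additionally require seeding MachineOSBuild status conditions, which is omitted here:

func TestIsLayeredPoolBuildingGuards(t *testing.T) {
	mosc := &mcfgv1.MachineOSConfig{}
	mosb := &mcfgv1.MachineOSBuild{}

	// A non-layered pool never reports building, regardless of build state.
	if isLayeredPoolBuilding(false, mosc, mosb) {
		t.Error("non-layered pool should not report building")
	}
	// A missing MachineOSConfig or MachineOSBuild short-circuits to false.
	if isLayeredPoolBuilding(true, nil, mosb) {
		t.Error("nil MachineOSConfig should not report building")
	}
	if isLayeredPoolBuilding(true, mosc, nil) {
		t.Error("nil MachineOSBuild should not report building")
	}
}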
129 changes: 129 additions & 0 deletions test/e2e-ocl/onclusterlayering_test.go
@@ -1362,11 +1362,15 @@ func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {

 	cs := framework.NewClientSet("")
 
+	// Get a random worker node to add to the layered pool
+	targetNode := helpers.GetRandomNode(t, cs, "worker")
+
 	mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{
 		poolName: layeredMCPName,
 		customDockerfiles: map[string]string{
 			layeredMCPName: cowsayDockerfile,
 		},
+		targetNode: &targetNode,
 	})
 
 	// First, add a bad containerfile to cause a build failure

Review comment (Member) on the targetNode addition — question: Do we need to roll the image out to the node every time we do a build in this test? I ask because each rollout takes some time to do, and with three image builds, this test could take a long time overall. (One possible answer is sketched below.)
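If the rollouts do prove too slow, one option would be to attach a node only for the phases that verify rollout. This is a sketch of a possible answer to the question above, not the test as written; it assumes targetNode is genuinely optional in onClusterLayeringTestOpts, as its pointer type suggests:

mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{
	poolName: layeredMCPName,
	customDockerfiles: map[string]string{
		layeredMCPName: cowsayDockerfile,
	},
	// targetNode omitted: exercise the build/degrade condition transitions
	// without waiting for an image rollout after every build.
})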
@@ -1419,10 +1423,135 @@

t.Logf("Second build completed successfully: %s", finishedBuild.Name)

// Wait for the MachineOSConfig to get the new pullspec, which indicates full reconciliation
waitForMOSCToGetNewPullspec(ctx, t, cs, mosc.Name, string(finishedBuild.Status.DigestedImagePushSpec))

// Wait for and verify ImageBuildDegraded condition is now False
degradedCondition = waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionFalse)
require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should still be present")
assert.Equal(t, "BuildSucceeded", degradedCondition.Reason, "ImageBuildDegraded reason should be BuildSucceeded")

t.Logf("ImageBuildDegraded condition correctly cleared to False with message: %s", degradedCondition.Message)

// Verify MCP status is correct after successful build and full reconciliation
successMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{})
require.NoError(t, err)

// After successful build completion and full reconciliation, MCP should show:
// Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False
kubeassert.Eventually().MachineConfigPoolReachesState(successMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) {
if err != nil {
return false, err
}
// Return false (keep polling) if conditions don't match expected state
if !apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) ||
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) ||
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) ||
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) {
return false, nil
}

// Return true when expected state is reached
t.Logf("MCP status after successful build - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v",
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded))

return true, nil
}, "MCP should reach correct state after successful build (Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False)")

// Now trigger another build to test MCP status transitions when a new build starts
t.Logf("Triggering a third build to test MCP status transitions")

// Modify the containerfile slightly to trigger a new build
apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{})
require.NoError(t, err)

// Add a comment to the containerfile to change it and trigger a new build
modifiedDockerfile := cowsayDockerfile + "\n# Comment to trigger new build"
apiMosc.Spec.Containerfile = []mcfgv1.MachineOSContainerfile{
{
ContainerfileArch: mcfgv1.NoArch,
Content: modifiedDockerfile,
},
}

updated, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{})
require.NoError(t, err)

t.Logf("Modified containerfile, waiting for third build to start")

// Get the updated MCP to compute the new build
mcp, err = cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{})
require.NoError(t, err)

// Compute the new MachineOSBuild name for the third build
thirdMoscMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updated, mcp)

// Wait for the third build to start
thirdMosb := waitForBuildToStart(t, cs, thirdMoscMosb)
t.Logf("Third build started: %s", thirdMosb.Name)

// Verify MCP status during active build:
// Updated=False, Updating=True, Degraded=False, ImageBuildDegraded=False
buildingMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{})
require.NoError(t, err)

kubeassert.Eventually().MachineConfigPoolReachesState(buildingMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) {
if err != nil {
return false, err
}
// During build, MCP should show: Updated=False, Updating=True
if apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) ||
!apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) ||
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) ||
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) {
return false, nil
}

t.Logf("MCP status during active build - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v",
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded))

return true, nil
}, "MCP should reach correct state during active build (Updated=False, Updating=True, Degraded=False, ImageBuildDegraded=False)")

// Wait for the third build to complete successfully
finalBuild := waitForBuildToComplete(t, cs, thirdMosb)
t.Logf("Third build completed successfully: %s", finalBuild.Name)

// Wait for the MachineOSConfig to get the new pullspec, which indicates full reconciliation
waitForMOSCToGetNewPullspec(ctx, t, cs, mosc.Name, string(finalBuild.Status.DigestedImagePushSpec))

// Final verification: MCP status should return to:
// Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False
finalMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{})
require.NoError(t, err)

kubeassert.Eventually().MachineConfigPoolReachesState(finalMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) {
if err != nil {
return false, err
}
// Return false (keep polling) if conditions don't match expected state
if !apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) ||
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) ||
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) ||
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) {
return false, nil
}

// Return true when expected state is reached
t.Logf("Final MCP status after third build completion - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v",
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded),
apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded))

return true, nil
}, "MCP should return to correct state after final build completion (Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False)")

t.Logf("All MCP status transitions verified successfully across build failure, success, and subsequent new build")
}
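The three polling closures in this test differ only in the expected Updated/Updating values. A small predicate could remove the duplication; a minimal sketch under the hypothetical name mcpConditionsMatch (not part of this PR), built only from calls already used in the test:

func mcpConditionsMatch(mcp *mcfgv1.MachineConfigPool, wantUpdated, wantUpdating bool) bool {
	conds := mcp.Status.Conditions
	// Degraded and ImageBuildDegraded are expected to be False at every
	// checkpoint in this test, so they are hard-coded here.
	return apihelpers.IsMachineConfigPoolConditionTrue(conds, mcfgv1.MachineConfigPoolUpdated) == wantUpdated &&
		apihelpers.IsMachineConfigPoolConditionTrue(conds, mcfgv1.MachineConfigPoolUpdating) == wantUpdating &&
		!apihelpers.IsMachineConfigPoolConditionTrue(conds, mcfgv1.MachineConfigPoolDegraded) &&
		!apihelpers.IsMachineConfigPoolConditionTrue(conds, mcfgv1.MachineConfigPoolImageBuildDegraded)
}

Each closure body would then reduce to roughly: if err != nil { return false, err }; return mcpConditionsMatch(mcp, true, false), nil (with the booleans flipped for the during-build check).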