Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
df39156
:seedling: Remove usage of FailureReason and FailureMessage (hcloud)
guettli Nov 17, 2025
0b37365
Merge branch 'tg/remove-failure-reason--baremetal' into tg/remove-fai…
guettli Nov 17, 2025
8b5367c
do not overwrite the message.
guettli Nov 17, 2025
03a156b
Merge branch 'tg/remove-failure-reason--baremetal' into tg/remove-fai…
guettli Nov 25, 2025
f02c1a2
add deprecation message to FailureReason and FailureMessage.
guettli Nov 25, 2025
9667883
removed block to skip reconcile. Not needed.
guettli Nov 25, 2025
a619922
use skip reconciling.
guettli Nov 25, 2025
eb84b90
fix creation of event, extend docstring of SetErrorAndRemediate
guettli Nov 25, 2025
e771a19
align code to BM.
guettli Nov 25, 2025
e76972a
fixed typo.
guettli Nov 25, 2025
26cc5f1
tests are working.
guettli Nov 25, 2025
5d90102
fixed test.
guettli Nov 25, 2025
743c994
fix linters.
guettli Nov 25, 2025
1c2d5ec
fix linter
guettli Nov 25, 2025
69ce8c3
fix typos.
guettli Nov 25, 2025
9855c72
Merge branch 'tg/remove-failure-reason--baremetal' into tg/remove-fai…
guettli Nov 26, 2025
8f600f9
Merge branch 'tg/remove-failure-reason--baremetal' into tg/remove-fai…
guettli Dec 2, 2025
722b496
typo.
guettli Dec 2, 2025
b0c140d
use RemediationSucceededCondition
guettli Dec 2, 2025
1f98712
Merge branch 'tg/remove-failure-reason--baremetal' into tg/remove-fai…
guettli Dec 3, 2025
7e3595f
Merge branch 'tg/remove-failure-reason--baremetal' into tg/remove-fai…
guettli Dec 3, 2025
e8221a7
Merge branch 'tg/remove-failure-reason--baremetal' into tg/remove-fai…
guettli Dec 4, 2025
96af210
amend to last merge commit.
guettli Dec 4, 2025
df963a2
fixed tests.
guettli Dec 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/v1beta1/hcloudmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,18 @@ type HCloudMachineStatus struct {
// FailureReason will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a succinct value suitable
// for machine interpretation.
//
// Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
//
// +optional
FailureReason *capierrors.MachineStatusError `json:"failureReason,omitempty"`

// FailureMessage will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a more verbose string suitable
// for logging and human consumption.
//
// Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
//
// +optional
FailureMessage *string `json:"failureMessage,omitempty"`

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,12 +262,16 @@ spec:
FailureMessage will be set in the event that there is a terminal problem
reconciling the Machine and will contain a more verbose string suitable
for logging and human consumption.

Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
type: string
failureReason:
description: |-
FailureReason will be set in the event that there is a terminal problem
reconciling the Machine and will contain a succinct value suitable
for machine interpretation.

Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
type: string
instanceState:
description: InstanceState is the state of the server for this machine.
Expand Down
7 changes: 5 additions & 2 deletions controllers/hcloudmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,13 @@ func (r *HCloudMachineReconciler) Reconcile(ctx context.Context, req reconcile.R
return r.reconcileDelete(ctx, machineScope)
}

if hcloudMachine.Status.FailureReason != nil {
// This machine will be removed.
_, exists := machine.Annotations[clusterv1.RemediateMachineAnnotation]
if exists {
// This hcloud machine will be removed.
log.Info("CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine.")
return reconcile.Result{}, nil
}

return r.reconcileNormal(ctx, machineScope)
}

Expand Down
118 changes: 109 additions & 9 deletions controllers/hcloudremediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,14 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
"github.com/syself/cluster-api-provider-hetzner/pkg/scope"
hcloudutil "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/util"
"github.com/syself/cluster-api-provider-hetzner/pkg/utils"
)
Expand Down Expand Up @@ -147,9 +151,8 @@ var _ = Describe("HCloudRemediationReconciler", func() {
},
},
Spec: infrav1.HCloudMachineSpec{
ImageName: "my-control-plane",
Type: "cpx31",
PlacementGroupName: &defaultPlacementGroupName,
ImageName: "my-control-plane",
Type: "cpx31",
},
}
Expect(testEnv.Create(ctx, hcloudMachine)).To(Succeed())
Expand Down Expand Up @@ -227,14 +230,18 @@ var _ = Describe("HCloudRemediationReconciler", func() {
Expect(testEnv.Create(ctx, hcloudRemediation)).To(Succeed())

By("checking if hcloudRemediation is in deleting phase and capiMachine has the MachineOwnerRemediatedCondition")
Eventually(func() bool {
Eventually(func() error {
if err := testEnv.Get(ctx, hcloudRemediationkey, hcloudRemediation); err != nil {
return false
return err
}

return hcloudRemediation.Status.Phase == infrav1.PhaseDeleting &&
isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason)
}, timeout).Should(BeTrue())
if hcloudRemediation.Status.Phase != infrav1.PhaseDeleting {
return fmt.Errorf("hcloudRemediation.Status.Phase is not infrav1.PhaseDeleting")
}
if !isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason) {
return fmt.Errorf("MachineOwnerRemediatedCondition not set")
}
return nil
}, timeout).Should(Succeed())
})

It("checks that, under normal conditions, a reboot is carried out and retryCount and lastRemediated are set", func() {
Expand Down Expand Up @@ -318,5 +325,98 @@ var _ = Describe("HCloudRemediationReconciler", func() {
isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason)
}, timeout).Should(BeTrue())
})
// End-to-end check of the SetErrorAndRemediate flow: mark the HCloudMachine as
// needing remediation, then verify that the remediation controller drives the
// CAPI Machine to MachineOwnerRemediatedCondition=False so CAPI deletes it.
It("should delete machine if SetErrorAndRemediate() was called", func() {
	By("Checking Environment")
	// Sanity check: the owner Machine resolved from the HCloudMachine must be
	// the same object created in the test setup.
	capiMachineAgain, err := util.GetOwnerMachine(ctx, testEnv, hcloudMachine.ObjectMeta)
	Expect(err).ShouldNot(HaveOccurred())
	Expect(capiMachineAgain).ToNot(BeNil())
	Expect(capiMachine.UID).To(Equal(capiMachineAgain.UID))
	hcloudClient := testEnv.HCloudClientFactory.NewClient("dummy-token")

	// Create a fake hcloud server and wire its provider ID into the
	// HCloudMachine spec so the machine looks provisioned.
	server, err := hcloudClient.CreateServer(ctx, hcloud.ServerCreateOpts{
		Name: "myserver",
	})
	Expect(err).ShouldNot(HaveOccurred())
	providerID := hcloudutil.ProviderIDFromServerID(int(server.ID))
	hcloudMachine.Spec.ProviderID = &providerID
	err = testEnv.Update(ctx, hcloudMachine)
	Expect(err).ShouldNot(HaveOccurred())

	By("Call SetErrorAndRemediateMachine")
	// Retried via Eventually because Get/Update can hit transient conflicts.
	Eventually(func() error {
		err = testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine)
		if err != nil {
			return err
		}
		err = scope.SetErrorAndRemediateMachine(ctx, testEnv, capiMachine, hcloudMachine, "test-of-set-error-and-remediate")
		if err != nil {
			return err
		}
		// SetErrorAndRemediateMachine only sets the condition in memory;
		// persist it with a status update.
		err = testEnv.Status().Update(ctx, hcloudMachine)
		if err != nil {
			return err
		}
		return nil
	}).Should(Succeed())

	By("Wait until hcloud has condition set.")
	Eventually(func() error {
		err := testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine)
		if err != nil {
			return err
		}
		c := conditions.Get(hcloudMachine, infrav1.DeleteMachineSucceededCondition)
		if c == nil {
			return fmt.Errorf("not set: DeleteMachineSucceededCondition")
		}
		if c.Status != corev1.ConditionFalse {
			return fmt.Errorf("status not set yet")
		}
		return nil
	}).Should(Succeed())

	By("Do the job of CAPI: Create a HCloudRemediation")
	// In production CAPI creates this object; the test creates it directly.
	rem := &infrav1.HCloudRemediation{
		ObjectMeta: metav1.ObjectMeta{
			Name:      hcloudMachine.Name,
			Namespace: hcloudMachine.Namespace,
		},
		Spec: infrav1.HCloudRemediationSpec{
			Strategy: &infrav1.RemediationStrategy{
				Type:       infrav1.RemediationTypeReboot,
				RetryLimit: 5,
				Timeout: &metav1.Duration{
					Duration: time.Minute,
				},
			},
		},
	}

	// Owner reference is required so the remediation controller can find
	// the CAPI Machine belonging to this remediation.
	err = controllerutil.SetOwnerReference(capiMachine, rem, testEnv.GetScheme())
	Expect(err).Should(Succeed())

	err = testEnv.Create(ctx, rem)
	Expect(err).ShouldNot(HaveOccurred())

	By("Wait until remediation is done")
	Eventually(func() error {
		err := testEnv.Get(ctx, client.ObjectKeyFromObject(capiMachine), capiMachine)
		if err != nil {
			return err
		}

		c := conditions.Get(capiMachine, clusterv1.MachineOwnerRemediatedCondition)
		if c == nil {
			return fmt.Errorf("not set: MachineOwnerRemediatedCondition")
		}
		if c.Status != corev1.ConditionFalse {
			return fmt.Errorf("status not set yet")
		}
		// The exact message proves remediation exited because of the
		// DeleteMachineInProgress condition set by SetErrorAndRemediateMachine.
		if c.Message != "Remediation finished (machine will be deleted): exit remediation because infra machine has condition set: DeleteMachineInProgress: test-of-set-error-and-remediate" {
			// NOTE(review): error string starts with a capital letter; Go
			// convention is lowercase — consider fixing in a follow-up.
			return fmt.Errorf("Message is not as expected: %q", c.Message)
		}
		return nil
	}).Should(Succeed())
})
})
})
2 changes: 1 addition & 1 deletion controllers/hetznerbaremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl
msg := "hbmm has DeleteMachineSucceededCondition=False."
log.Info(msg)
conditions.MarkFalse(bmHost, infrav1.DeleteMachineSucceededCondition,
deleteMachineCondition.Reason, deleteMachineCondition.Severity,
deleteMachineCondition.Reason, clusterv1.ConditionSeverityInfo,
"%s", deleteMachineCondition.Message)
} else {
conditions.MarkTrue(bmHost, infrav1.DeleteMachineSucceededCondition)
Expand Down
38 changes: 13 additions & 25 deletions pkg/scope/hcloudremediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,33 +72,26 @@ func NewHCloudRemediationScope(params HCloudRemediationScopeParams) (*HCloudReme
return nil, fmt.Errorf("failed to init patch helper: %w", err)
}

machinePatchHelper, err := patch.NewHelper(params.Machine, params.Client)
if err != nil {
return nil, fmt.Errorf("failed to init machine patch helper: %w", err)
}

return &HCloudRemediationScope{
Logger: params.Logger,
Client: params.Client,
HCloudClient: params.HCloudClient,
patchHelper: patchHelper,
machinePatchHelper: machinePatchHelper,
Machine: params.Machine,
HCloudMachine: params.HCloudMachine,
HCloudRemediation: params.HCloudRemediation,
Logger: params.Logger,
Client: params.Client,
HCloudClient: params.HCloudClient,
patchHelper: patchHelper,
Machine: params.Machine,
HCloudMachine: params.HCloudMachine,
HCloudRemediation: params.HCloudRemediation,
}, nil
}

// HCloudRemediationScope defines the basic context for an actuator to operate upon.
type HCloudRemediationScope struct {
logr.Logger
Client client.Client
patchHelper *patch.Helper
machinePatchHelper *patch.Helper
HCloudClient hcloudclient.Client
Machine *clusterv1.Machine
HCloudMachine *infrav1.HCloudMachine
HCloudRemediation *infrav1.HCloudRemediation
Client client.Client
patchHelper *patch.Helper
HCloudClient hcloudclient.Client
Machine *clusterv1.Machine
HCloudMachine *infrav1.HCloudMachine
HCloudRemediation *infrav1.HCloudRemediation
}

// Close closes the current scope persisting the cluster configuration and status.
Expand Down Expand Up @@ -126,8 +119,3 @@ func (m *HCloudRemediationScope) ServerIDFromProviderID() (int64, error) {
// PatchObject persists the HCloudRemediation object (spec and status) via the
// scope's patch helper, forwarding any patch options to it.
func (m *HCloudRemediationScope) PatchObject(ctx context.Context, opts ...patch.Option) error {
	return m.patchHelper.Patch(ctx, m.HCloudRemediation, opts...)
}

// PatchMachine persists the machine spec and status.
func (m *HCloudRemediationScope) PatchMachine(ctx context.Context, opts ...patch.Option) error {
return m.machinePatchHelper.Patch(ctx, m.Machine, opts...)
}
45 changes: 37 additions & 8 deletions pkg/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ import (

"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/cluster-api/util/record"
"sigs.k8s.io/controller-runtime/pkg/client"

infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets"
Expand Down Expand Up @@ -126,13 +127,41 @@ func (m *MachineScope) PatchObject(ctx context.Context) error {
return m.patchHelper.Patch(ctx, m.HCloudMachine)
}

// SetError sets the ErrorMessage and ErrorReason fields on the machine and logs
// the message. It assumes the reason is invalid configuration, since that is
// currently the only relevant MachineStatusError choice.
// CAPI will delete the machine and create a new one.
func (m *MachineScope) SetError(message string, reason capierrors.MachineStatusError) {
m.HCloudMachine.Status.FailureMessage = &message
m.HCloudMachine.Status.FailureReason = &reason
// SetErrorAndRemediate sets the "cluster.x-k8s.io/remediate-machine" annotation on the
// corresponding CAPI machine. CAPI will remediate that machine. Additionally, an event of
// type Warning is created, and the DeleteMachineSucceededCondition is set to False on the
// hcloud-machine. It is used when a non-recoverable error happens. Example: the hcloud
// server was deleted by hand in the hcloud UI.
func (m *MachineScope) SetErrorAndRemediate(ctx context.Context, message string) error {
	return SetErrorAndRemediateMachine(ctx, m.Client, m.Machine, m.HCloudMachine, message)
}

// SetErrorAndRemediateMachine implements SetErrorAndRemediate. It is exported so that other
// code (for example in tests) can call it without creating a MachineScope.
//
// Note: the annotation is patched to the API server immediately, but the
// DeleteMachineSucceededCondition is only set on the in-memory hcloudMachine —
// persisting it (e.g. via a status update) is the caller's responsibility.
func SetErrorAndRemediateMachine(ctx context.Context, crClient client.Client, capiMachine *clusterv1.Machine, hcloudMachine *infrav1.HCloudMachine, message string) error {
	// Create a patch base so that only the diff against this snapshot is sent.
	patch := client.MergeFrom(capiMachine.DeepCopy())

	// Modify only annotations on the in-memory copy.
	if capiMachine.Annotations == nil {
		capiMachine.Annotations = map[string]string{}
	}
	capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] = ""

	// Apply patch – only the diff (annotations) is sent to the API server.
	if err := crClient.Patch(ctx, capiMachine, patch); err != nil {
		return fmt.Errorf("patch failed in SetErrorAndRemediate: %w", err)
	}

	// Emit a Warning event so the reason for remediation is visible on the object.
	record.Warnf(hcloudMachine,
		"HCloudMachineWillBeRemediated",
		"HCloudMachine will be remediated: %s", message)

	// Record why the machine is going away; the remediation flow checks this
	// condition (see the hcloudremediation controller tests).
	conditions.MarkFalse(hcloudMachine, infrav1.DeleteMachineSucceededCondition,
		infrav1.DeleteMachineInProgressReason, clusterv1.ConditionSeverityInfo, "%s",
		message)

	return nil
}

// SetRegion sets the region field on the machine.
Expand Down
Loading