diff --git a/api/v1beta1/hcloudmachine_types.go b/api/v1beta1/hcloudmachine_types.go index 25ab0b10e..8add4e17e 100644 --- a/api/v1beta1/hcloudmachine_types.go +++ b/api/v1beta1/hcloudmachine_types.go @@ -120,12 +120,18 @@ type HCloudMachineStatus struct { // FailureReason will be set in the event that there is a terminal problem // reconciling the Machine and will contain a succinct value suitable // for machine interpretation. + // + // Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details. + // // +optional FailureReason *capierrors.MachineStatusError `json:"failureReason,omitempty"` // FailureMessage will be set in the event that there is a terminal problem // reconciling the Machine and will contain a more verbose string suitable // for logging and human consumption. + // + // Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details. + // // +optional FailureMessage *string `json:"failureMessage,omitempty"` diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml index b8767595c..6ff4bb61a 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml @@ -262,12 +262,16 @@ spec: FailureMessage will be set in the event that there is a terminal problem reconciling the Machine and will contain a more verbose string suitable for logging and human consumption. + + Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details. type: string failureReason: description: |- FailureReason will be set in the event that there is a terminal problem reconciling the Machine and will contain a succinct value suitable for machine interpretation. + + Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details. type: string instanceState: description: InstanceState is the state of the server for this machine. diff --git a/controllers/hcloudmachine_controller.go b/controllers/hcloudmachine_controller.go index a791abc06..b21640c57 100644 --- a/controllers/hcloudmachine_controller.go +++ b/controllers/hcloudmachine_controller.go @@ -241,10 +241,13 @@ func (r *HCloudMachineReconciler) Reconcile(ctx context.Context, req reconcile.R return r.reconcileDelete(ctx, machineScope) } - if hcloudMachine.Status.FailureReason != nil { - // This machine will be removed. + _, exists := machine.Annotations[clusterv1.RemediateMachineAnnotation] + if exists { + // This hcloud machine will be removed. + log.Info("CAPI Machine has RemediateMachineAnnotation. 
Not reconciling this machine.") return reconcile.Result{}, nil } + return r.reconcileNormal(ctx, machineScope) } diff --git a/controllers/hcloudremediation_controller_test.go b/controllers/hcloudremediation_controller_test.go index 2ddd2a6b5..1ef5c3672 100644 --- a/controllers/hcloudremediation_controller_test.go +++ b/controllers/hcloudremediation_controller_test.go @@ -27,10 +27,14 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" + "github.com/syself/cluster-api-provider-hetzner/pkg/scope" hcloudutil "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/util" "github.com/syself/cluster-api-provider-hetzner/pkg/utils" ) @@ -147,9 +151,8 @@ var _ = Describe("HCloudRemediationReconciler", func() { }, }, Spec: infrav1.HCloudMachineSpec{ - ImageName: "my-control-plane", - Type: "cpx31", - PlacementGroupName: &defaultPlacementGroupName, + ImageName: "my-control-plane", + Type: "cpx31", }, } Expect(testEnv.Create(ctx, hcloudMachine)).To(Succeed()) @@ -227,14 +230,18 @@ var _ = Describe("HCloudRemediationReconciler", func() { Expect(testEnv.Create(ctx, hcloudRemediation)).To(Succeed()) By("checking if hcloudRemediation is in deleting phase and capiMachine has the MachineOwnerRemediatedCondition") - Eventually(func() bool { + Eventually(func() error { if err := testEnv.Get(ctx, hcloudRemediationkey, hcloudRemediation); err != nil { - return false + return err } - - return hcloudRemediation.Status.Phase == infrav1.PhaseDeleting && - isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason) - }, timeout).Should(BeTrue()) + if hcloudRemediation.Status.Phase != infrav1.PhaseDeleting { + return fmt.Errorf("hcloudRemediation.Status.Phase is not infrav1.PhaseDeleting") + } + if !isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason) { + return fmt.Errorf("MachineOwnerRemediatedCondition not set") + } + return nil + }, timeout).Should(Succeed()) }) It("checks that, under normal conditions, a reboot is carried out and retryCount and lastRemediated are set", func() { @@ -318,5 +325,98 @@ var _ = Describe("HCloudRemediationReconciler", func() { isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason) }, timeout).Should(BeTrue()) }) + It("should delete machine if SetErrorAndRemediate() was called", func() { + By("Checking Environment") + capiMachineAgain, err := util.GetOwnerMachine(ctx, testEnv, hcloudMachine.ObjectMeta) + Expect(err).ShouldNot(HaveOccurred()) + Expect(capiMachineAgain).ToNot(BeNil()) + Expect(capiMachine.UID).To(Equal(capiMachineAgain.UID)) + hcloudClient := testEnv.HCloudClientFactory.NewClient("dummy-token") + + server, err := hcloudClient.CreateServer(ctx, hcloud.ServerCreateOpts{ + Name: "myserver", + }) + Expect(err).ShouldNot(HaveOccurred()) + providerID := hcloudutil.ProviderIDFromServerID(int(server.ID)) + hcloudMachine.Spec.ProviderID = &providerID + err = testEnv.Update(ctx, hcloudMachine) + Expect(err).ShouldNot(HaveOccurred()) + + By("Call 
SetErrorAndRemediateMachine") + Eventually(func() error { + err = testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine) + if err != nil { + return err + } + err = scope.SetErrorAndRemediateMachine(ctx, testEnv, capiMachine, hcloudMachine, "test-of-set-error-and-remediate") + if err != nil { + return err + } + err = testEnv.Status().Update(ctx, hcloudMachine) + if err != nil { + return err + } + return nil + }).Should(Succeed()) + + By("Wait until hcloud has condition set.") + Eventually(func() error { + err := testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine) + if err != nil { + return err + } + c := conditions.Get(hcloudMachine, infrav1.DeleteMachineSucceededCondition) + if c == nil { + return fmt.Errorf("not set: DeleteMachineSucceededCondition") + } + if c.Status != corev1.ConditionFalse { + return fmt.Errorf("status not set yet") + } + return nil + }).Should(Succeed()) + + By("Do the job of CAPI: Create a HCloudRemediation") + rem := &infrav1.HCloudRemediation{ + ObjectMeta: metav1.ObjectMeta{ + Name: hcloudMachine.Name, + Namespace: hcloudMachine.Namespace, + }, + Spec: infrav1.HCloudRemediationSpec{ + Strategy: &infrav1.RemediationStrategy{ + Type: infrav1.RemediationTypeReboot, + RetryLimit: 5, + Timeout: &metav1.Duration{ + Duration: time.Minute, + }, + }, + }, + } + + err = controllerutil.SetOwnerReference(capiMachine, rem, testEnv.GetScheme()) + Expect(err).Should(Succeed()) + + err = testEnv.Create(ctx, rem) + Expect(err).ShouldNot(HaveOccurred()) + + By("Wait until remediation is done") + Eventually(func() error { + err := testEnv.Get(ctx, client.ObjectKeyFromObject(capiMachine), capiMachine) + if err != nil { + return err + } + + c := conditions.Get(capiMachine, clusterv1.MachineOwnerRemediatedCondition) + if c == nil { + return fmt.Errorf("not set: MachineOwnerRemediatedCondition") + } + if c.Status != corev1.ConditionFalse { + return fmt.Errorf("status not set yet") + } + if c.Message != "Remediation finished (machine will be deleted): exit remediation because infra machine has condition set: DeleteMachineInProgress: test-of-set-error-and-remediate" { + return fmt.Errorf("Message is not as expected: %q", c.Message) + } + return nil + }).Should(Succeed()) + }) }) }) diff --git a/controllers/hetznerbaremetalhost_controller.go b/controllers/hetznerbaremetalhost_controller.go index a4df04c17..bd562e8aa 100644 --- a/controllers/hetznerbaremetalhost_controller.go +++ b/controllers/hetznerbaremetalhost_controller.go @@ -258,7 +258,7 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl msg := "hbmm has DeleteMachineSucceededCondition=False." 
log.Info(msg) conditions.MarkFalse(bmHost, infrav1.DeleteMachineSucceededCondition, - deleteMachineCondition.Reason, deleteMachineCondition.Severity, + deleteMachineCondition.Reason, clusterv1.ConditionSeverityInfo, "%s", deleteMachineCondition.Message) } else { conditions.MarkTrue(bmHost, infrav1.DeleteMachineSucceededCondition) diff --git a/pkg/scope/hcloudremediation.go b/pkg/scope/hcloudremediation.go index de0a2c0ff..921fafbe6 100644 --- a/pkg/scope/hcloudremediation.go +++ b/pkg/scope/hcloudremediation.go @@ -72,33 +72,26 @@ func NewHCloudRemediationScope(params HCloudRemediationScopeParams) (*HCloudReme return nil, fmt.Errorf("failed to init patch helper: %w", err) } - machinePatchHelper, err := patch.NewHelper(params.Machine, params.Client) - if err != nil { - return nil, fmt.Errorf("failed to init machine patch helper: %w", err) - } - return &HCloudRemediationScope{ - Logger: params.Logger, - Client: params.Client, - HCloudClient: params.HCloudClient, - patchHelper: patchHelper, - machinePatchHelper: machinePatchHelper, - Machine: params.Machine, - HCloudMachine: params.HCloudMachine, - HCloudRemediation: params.HCloudRemediation, + Logger: params.Logger, + Client: params.Client, + HCloudClient: params.HCloudClient, + patchHelper: patchHelper, + Machine: params.Machine, + HCloudMachine: params.HCloudMachine, + HCloudRemediation: params.HCloudRemediation, }, nil } // HCloudRemediationScope defines the basic context for an actuator to operate upon. type HCloudRemediationScope struct { logr.Logger - Client client.Client - patchHelper *patch.Helper - machinePatchHelper *patch.Helper - HCloudClient hcloudclient.Client - Machine *clusterv1.Machine - HCloudMachine *infrav1.HCloudMachine - HCloudRemediation *infrav1.HCloudRemediation + Client client.Client + patchHelper *patch.Helper + HCloudClient hcloudclient.Client + Machine *clusterv1.Machine + HCloudMachine *infrav1.HCloudMachine + HCloudRemediation *infrav1.HCloudRemediation } // Close closes the current scope persisting the cluster configuration and status. @@ -126,8 +119,3 @@ func (m *HCloudRemediationScope) ServerIDFromProviderID() (int64, error) { func (m *HCloudRemediationScope) PatchObject(ctx context.Context, opts ...patch.Option) error { return m.patchHelper.Patch(ctx, m.HCloudRemediation, opts...) } - -// PatchMachine persists the machine spec and status. -func (m *HCloudRemediationScope) PatchMachine(ctx context.Context, opts ...patch.Option) error { - return m.machinePatchHelper.Patch(ctx, m.Machine, opts...) -} diff --git a/pkg/scope/machine.go b/pkg/scope/machine.go index 49a003bff..9eaf642e7 100644 --- a/pkg/scope/machine.go +++ b/pkg/scope/machine.go @@ -28,10 +28,11 @@ import ( "k8s.io/apimachinery/pkg/types" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11 "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" + "sigs.k8s.io/cluster-api/util/record" + "sigs.k8s.io/controller-runtime/pkg/client" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets" @@ -126,13 +127,41 @@ func (m *MachineScope) PatchObject(ctx context.Context) error { return m.patchHelper.Patch(ctx, m.HCloudMachine) } -// SetError sets the ErrorMessage and ErrorReason fields on the machine and logs -// the message. 
It assumes the reason is invalid configuration, since that is
-// currently the only relevant MachineStatusError choice.
-// CAPI will delete the machine and create a new one.
-func (m *MachineScope) SetError(message string, reason capierrors.MachineStatusError) {
-	m.HCloudMachine.Status.FailureMessage = &message
-	m.HCloudMachine.Status.FailureReason = &reason
+// SetErrorAndRemediate sets the "cluster.x-k8s.io/remediate-machine" annotation on the corresponding
+// CAPI machine. CAPI will then remediate that machine. Additionally, an event of type Warning is
+// created, and the DeleteMachineSucceededCondition is set to False on the HCloudMachine. It is
+// used when a non-recoverable error happens, for example when the hcloud server was deleted by
+// hand in the hcloud UI.
+func (m *MachineScope) SetErrorAndRemediate(ctx context.Context, message string) error {
+	return SetErrorAndRemediateMachine(ctx, m.Client, m.Machine, m.HCloudMachine, message)
+}
+
+// SetErrorAndRemediateMachine implements SetErrorAndRemediate. It is exported so that other code
+// (for example tests) can call it without creating a MachineScope.
+func SetErrorAndRemediateMachine(ctx context.Context, crClient client.Client, capiMachine *clusterv1.Machine, hcloudMachine *infrav1.HCloudMachine, message string) error {
+	// Create a patch base
+	patch := client.MergeFrom(capiMachine.DeepCopy())
+
+	// Modify only annotations on the in-memory copy
+	if capiMachine.Annotations == nil {
+		capiMachine.Annotations = map[string]string{}
+	}
+	capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] = ""
+
+	// Apply patch – only the diff (annotations) is sent to the API server
+	if err := crClient.Patch(ctx, capiMachine, patch); err != nil {
+		return fmt.Errorf("patch failed in SetErrorAndRemediate: %w", err)
+	}
+
+	record.Warnf(hcloudMachine,
+		"HCloudMachineWillBeRemediated",
+		"HCloudMachine will be remediated: %s", message)
+
+	conditions.MarkFalse(hcloudMachine, infrav1.DeleteMachineSucceededCondition,
+		infrav1.DeleteMachineInProgressReason, clusterv1.ConditionSeverityInfo, "%s",
+		message)
+
+	return nil
 }
 
 // SetRegion sets the region field on the machine.
diff --git a/pkg/services/hcloud/remediation/remediation.go b/pkg/services/hcloud/remediation/remediation.go
index aaead6c81..c37328578 100644
--- a/pkg/services/hcloud/remediation/remediation.go
+++ b/pkg/services/hcloud/remediation/remediation.go
@@ -23,9 +23,11 @@ import (
 	"time"
 
 	"github.com/hetznercloud/hcloud-go/v2/hcloud"
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 	"sigs.k8s.io/cluster-api/util/conditions"
+	"sigs.k8s.io/cluster-api/util/patch"
 	"sigs.k8s.io/cluster-api/util/record"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
@@ -47,9 +49,23 @@ func NewService(scope *scope.HCloudRemediationScope) *Service {
 }
 
 // Reconcile implements reconcilement of HCloudRemediation.
-func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err error) { +func (s *Service) Reconcile(ctx context.Context) (reconcile.Result, error) { + // if SetErrorAndRemediate() was used to stop provisioning, do not try to reboot server + infraMachineCondition := conditions.Get(s.scope.HCloudMachine, infrav1.DeleteMachineSucceededCondition) + if infraMachineCondition != nil && infraMachineCondition.Status == corev1.ConditionFalse { + err := s.setOwnerRemediatedConditionToFailed(ctx, + fmt.Sprintf("exit remediation because infra machine has condition set: %s: %s", + infraMachineCondition.Reason, + infraMachineCondition.Message)) + if err != nil { + return reconcile.Result{}, fmt.Errorf("setOwnerRemediatedConditionToFailed failed: %w", err) + } + return reconcile.Result{}, nil + } + var server *hcloud.Server if s.scope.HCloudMachine.Spec.ProviderID != nil { + var err error server, err = s.findServer(ctx) if err != nil { return reconcile.Result{}, fmt.Errorf("failed to find the server of unhealthy machine: %w", err) @@ -59,21 +75,24 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro // stop remediation if server does not exist or ProviderID is nil (in this case the server // cannot exist). if server == nil { - s.scope.HCloudRemediation.Status.Phase = infrav1.PhaseDeleting + msg := "ProviderID is not set" + if s.scope.HCloudMachine.Spec.ProviderID != nil { + msg = fmt.Sprintf("No server found via hcloud api for providerID %q", + *s.scope.HCloudMachine.Spec.ProviderID) + } - if err := s.setOwnerRemediatedCondition(ctx); err != nil { + if err := s.setOwnerRemediatedConditionToFailed(ctx, msg); err != nil { record.Warn(s.scope.HCloudRemediation, "FailedSettingConditionOnMachine", err.Error()) return reconcile.Result{}, fmt.Errorf("failed to set conditions on CAPI machine: %w", err) } - providerID := "nil" - if s.scope.HCloudMachine.Spec.ProviderID != nil { - providerID = *s.scope.HCloudMachine.Spec.ProviderID - } - msg := fmt.Sprintf("exit remediation because hcloud server (providerID=%s) does not exist", - providerID) - s.scope.Logger.Error(nil, msg) - record.Warn(s.scope.HCloudRemediation, "ExitRemediation", msg) - return res, nil + return reconcile.Result{}, nil + } + + if s.scope.HCloudRemediation.Spec.Strategy == nil { + s.scope.Info("remediation strategy is nil") + record.Warn(s.scope.HCloudRemediation, "UnsupportedRemdiationStrategy", + "remediation strategy is nil") + return reconcile.Result{}, nil } remediationType := s.scope.HCloudRemediation.Spec.Strategy.Type @@ -81,7 +100,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro if remediationType != infrav1.RemediationTypeReboot { s.scope.Info("unsupported remediation strategy") record.Warnf(s.scope.HCloudRemediation, "UnsupportedRemdiationStrategy", "remediation strategy %q is unsupported", remediationType) - return res, nil + return reconcile.Result{}, nil } // If no phase set, default to running @@ -96,7 +115,7 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro return s.handlePhaseWaiting(ctx) } - return res, nil + return reconcile.Result{}, nil } func (s *Service) handlePhaseRunning(ctx context.Context, server *hcloud.Server) (res reconcile.Result, err error) { @@ -145,7 +164,7 @@ func (s *Service) handlePhaseRunning(ctx context.Context, server *hcloud.Server) return res, nil } -func (s *Service) handlePhaseWaiting(ctx context.Context) (res reconcile.Result, err error) { +func (s *Service) handlePhaseWaiting(ctx 
context.Context) (reconcile.Result, error) {
 	nextCheck := s.timeUntilNextRemediation(time.Now())
 
 	if nextCheck > 0 {
@@ -153,19 +172,15 @@ func (s *Service) handlePhaseWaiting(ctx context.Context) (res reconcile.Result,
 		return reconcile.Result{RequeueAfter: nextCheck}, nil
 	}
 
-	// When machine is still unhealthy after remediation, setting of OwnerRemediatedCondition
-	// moves control to CAPI machine controller. The owning controller will do
-	// preflight checks and handles the Machine deletion
-
-	s.scope.HCloudRemediation.Status.Phase = infrav1.PhaseDeleting
-
-	if err := s.setOwnerRemediatedCondition(ctx); err != nil {
+	err := s.setOwnerRemediatedConditionToFailed(ctx,
+		"exit remediation because retryLimit is reached and reboot timed out")
+	if err != nil {
 		record.Warn(s.scope.HCloudRemediation, "FailedSettingConditionOnMachine", err.Error())
 		return reconcile.Result{}, fmt.Errorf("failed to set conditions on CAPI machine: %w", err)
 	}
-	record.Event(s.scope.HCloudRemediation, "SetOwnerRemediatedCondition", "exit remediation because because retryLimit is reached and reboot timed out")
-	return res, nil
+
+	// do not reconcile again.
+	return reconcile.Result{}, nil
 }
 
 func (s *Service) findServer(ctx context.Context) (*hcloud.Server, error) {
@@ -183,19 +198,30 @@ func (s *Service) findServer(ctx context.Context) (*hcloud.Server, error) {
 	return server, nil
 }
 
-// setOwnerRemediatedCondition sets MachineOwnerRemediatedCondition on CAPI machine object
+// setOwnerRemediatedConditionToFailed sets MachineOwnerRemediatedCondition on CAPI machine object
 // that have failed a healthcheck.
-func (s *Service) setOwnerRemediatedCondition(ctx context.Context) error {
+func (s *Service) setOwnerRemediatedConditionToFailed(ctx context.Context, msg string) error {
+	patchHelper, err := patch.NewHelper(s.scope.Machine, s.scope.Client)
+	if err != nil {
+		return fmt.Errorf("failed to init patch helper: %w", err)
+	}
+
+	// Move control to CAPI machine controller. CAPI will delete the machine.
conditions.MarkFalse( s.scope.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, - "", + "Remediation finished (machine will be deleted): %s", msg, ) - if err := s.scope.PatchMachine(ctx); err != nil { - return fmt.Errorf("failed to patch machine: %w", err) + + if err := patchHelper.Patch(ctx, s.scope.Machine); err != nil { + return fmt.Errorf("failed to patch: %w", err) } + + record.Event(s.scope.HCloudRemediation, "ExitRemediation", msg) + + s.scope.HCloudRemediation.Status.Phase = infrav1.PhaseDeleting return nil } diff --git a/pkg/services/hcloud/server/server.go b/pkg/services/hcloud/server/server.go index 00fd0ecd6..0355fcd30 100644 --- a/pkg/services/hcloud/server/server.go +++ b/pkg/services/hcloud/server/server.go @@ -32,7 +32,6 @@ import ( "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" - capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11 "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/record" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -122,7 +121,10 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro "BootStateSince", s.scope.HCloudMachine.Status.BootStateSince, ) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, fmt.Errorf("SetErrorAndRemediate failed: %w", err) + } s.scope.HCloudMachine.SetBootState(infrav1.HCloudBootStateUnset) record.Warn(s.scope.HCloudMachine, "NoHCloudServerFound", msg) conditions.MarkFalse(s.scope.HCloudMachine, infrav1.ServerAvailableCondition, @@ -166,7 +168,10 @@ func (s *Service) handleBootStateUnset(ctx context.Context) (reconcile.Result, e // timeout. Something has failed. msg := fmt.Sprintf("handleBootStateUnset timed out after %s. Deleting machine", durationOfState.Round(time.Second).String()) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } s.scope.Logger.Error(nil, msg) conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "HandleBootStateUnsetTimedOut", clusterv1.ConditionSeverityWarning, @@ -235,7 +240,10 @@ func (s *Service) handleBootStateInitializing(ctx context.Context, server *hclou // timeout. Something has failed. msg := fmt.Sprintf("handleBootStateInitializing timed out after %s. Deleting machine", durationOfState.Round(time.Second).String()) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } s.scope.Logger.Error(nil, msg) conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "BootStateInitializingTimedOut", clusterv1.ConditionSeverityWarning, @@ -327,7 +335,10 @@ func (s *Service) handleBootStateEnablingRescue(ctx context.Context, server *hcl msg := fmt.Sprintf("handleBootStateEnablingRescue timed out after %s. 
Deleting machine", durationOfState.Round(time.Second).String()) s.scope.Logger.Error(nil, msg) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "EnablingRescueTimedOut", clusterv1.ConditionSeverityWarning, "%s", msg) return reconcile.Result{}, nil @@ -338,7 +349,10 @@ func (s *Service) handleBootStateEnablingRescue(ctx context.Context, server *hcl if hm.Status.ExternalIDs.ActionIDEnableRescueSystem == 0 { msg := "handleBootStateEnablingRescue ActionIdEnableRescueSystem not set? Can not continue. Provisioning Failed" s.scope.Logger.Error(nil, msg) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "ActionIDForEnablingRescueSystemNotSet", clusterv1.ConditionSeverityWarning, "%s", msg) return reconcile.Result{}, nil @@ -369,7 +383,10 @@ func (s *Service) handleBootStateEnablingRescue(ctx context.Context, server *hcl if err != nil { err = fmt.Errorf("action %+v failed (wait for rescue enabled): %w", action, err) s.scope.Logger.Error(err, "") - s.scope.SetError(err.Error(), capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, err.Error()) + if err != nil { + return reconcile.Result{}, err + } conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "EnablingRescueActionFailed", clusterv1.ConditionSeverityWarning, "%s", err.Error()) @@ -451,7 +468,10 @@ func (s *Service) handleBootStateBootingToRescue(ctx context.Context, server *hc // timeout. Something has failed. msg := fmt.Sprintf("reaching rescue system has timed out after %s. Deleting machine", durationOfState.Round(time.Second).String()) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } s.scope.Logger.Error(nil, msg) conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "BootingToRescueTimedOut", clusterv1.ConditionSeverityWarning, @@ -500,7 +520,10 @@ func (s *Service) handleBootStateBootingToRescue(ctx context.Context, server *hc if remoteHostName != "rescue" { msg := fmt.Sprintf("Remote hostname (via ssh) of hcloud server is %q. Expected 'rescue'. 
Deleting hcloud machine", remoteHostName) s.scope.Logger.Error(nil, msg) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "UnexpectedHostname", clusterv1.ConditionSeverityWarning, "%s", msg) @@ -534,7 +557,10 @@ func (s *Service) handleBootStateBootingToRescue(ctx context.Context, server *hc "ImageURLCommand", s.scope.ImageURLCommand, "exitStatus", exitStatus, "stdoutStderr", stdoutStderr) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "StartImageURLCommandNoZeroExitCode", clusterv1.ConditionSeverityWarning, "%s", msg) @@ -571,7 +597,10 @@ func (s *Service) handleBootStateRunningImageCommand(ctx context.Context, server durationOfState.Round(time.Second).String()) err = errors.New(msg) s.scope.Logger.Error(err, "", "logFile", logFile) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } record.Warn(hm, "ImageURLCommandFailed", logFile) conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "RunningImageCommandTimedOut", clusterv1.ConditionSeverityWarning, @@ -607,7 +636,10 @@ func (s *Service) handleBootStateRunningImageCommand(ctx context.Context, server msg := "ImageURLCommand failed. Deleting machine" err = errors.New(msg) s.scope.Logger.Error(err, "", "logFile", logFile) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "ImageCommandFailed", clusterv1.ConditionSeverityWarning, "%s", msg) @@ -632,7 +664,10 @@ func (s *Service) handleBootingToRealOS(ctx context.Context, server *hcloud.Serv // timeout. Something has failed. msg := fmt.Sprintf("handleBootingToRealOS timed out after %s. Deleting machine", durationOfState.Round(time.Second).String()) - s.scope.SetError(msg, capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, msg) + if err != nil { + return reconcile.Result{}, err + } s.scope.Logger.Error(nil, msg) conditions.MarkFalse(hm, infrav1.ServerAvailableCondition, "BootingToRealOSTimedOut", clusterv1.ConditionSeverityWarning, @@ -736,7 +771,7 @@ func handleRateLimit(hm *infrav1.HCloudMachine, err error, functionName string, } // Delete implements delete method of server. -func (s *Service) Delete(ctx context.Context) (res reconcile.Result, err error) { +func (s *Service) Delete(ctx context.Context) (reconcile.Result, error) { server, err := s.findServer(ctx) if err != nil { return reconcile.Result{}, fmt.Errorf("failed to find server: %w", err) @@ -751,7 +786,7 @@ func (s *Service) Delete(ctx context.Context) (res reconcile.Result, err error) msg := fmt.Sprintf("Unable to delete HCloud server. Could not find matching server for %s. 
ProviderID: %q", s.scope.Name(), providerID) s.scope.V(1).Info(msg) record.Warn(s.scope.HCloudMachine, "NoInstanceFound", msg) - return res, nil + return reconcile.Result{}, nil } // control planes have to be deleted as targets of server @@ -983,14 +1018,15 @@ func (s *Service) createServer(ctx context.Context, userData []byte, image *hclo } } if !foundPlacementGroupInStatus { + msg := fmt.Sprintf("Placement group %q does not exist in cluster", + *hm.Spec.PlacementGroupName) conditions.MarkFalse(hm, infrav1.ServerCreateSucceededCondition, infrav1.InstanceHasNonExistingPlacementGroupReason, clusterv1.ConditionSeverityError, - "Placement group %q does not exist in cluster", - *hm.Spec.PlacementGroupName, + "%s", msg, ) - return nil, errServerCreateNotPossible + return nil, fmt.Errorf("%s: %w", msg, errServerCreateNotPossible) } } @@ -1106,7 +1142,7 @@ func (s *Service) getSSHKeys(ctx context.Context) ( infrav1.SSHKeyNotFoundReason, clusterv1.ConditionSeverityError, "%s", msg) - return nil, nil, errServerCreateNotPossible + return nil, nil, fmt.Errorf("%s: %w", msg, errServerCreateNotPossible) } hcloudSSHKeys = append(hcloudSSHKeys, sshKey) } @@ -1123,15 +1159,15 @@ func (s *Service) getServerImage(ctx context.Context, imageName string) (*hcloud return nil, handleRateLimit(s.scope.HCloudMachine, err, "GetServerType", "failed to get server type in HCloud") } if serverType == nil { + msg := fmt.Sprintf("failed to get server type %q", string(s.scope.HCloudMachine.Spec.Type)) conditions.MarkFalse( s.scope.HCloudMachine, infrav1.ServerCreateSucceededCondition, infrav1.ServerTypeNotFoundReason, clusterv1.ConditionSeverityError, - "failed to get server type %q", - string(s.scope.HCloudMachine.Spec.Type), + "%s", msg, ) - return nil, errServerCreateNotPossible + return nil, fmt.Errorf("%s: %w", msg, errServerCreateNotPossible) } // query for an existing image by label @@ -1161,29 +1197,27 @@ func (s *Service) getServerImage(ctx context.Context, imageName string) (*hcloud images = append(images, imagesByName...) if len(images) > 1 { - err := fmt.Errorf("image is ambiguous - %d images have name %s", + msg := fmt.Sprintf("image is ambiguous - %d images have name %s", len(images), imageName) - record.Warn(s.scope.HCloudMachine, "ImageNameAmbiguous", err.Error()) + record.Warn(s.scope.HCloudMachine, "ImageNameAmbiguous", msg) conditions.MarkFalse(s.scope.HCloudMachine, infrav1.ServerCreateSucceededCondition, infrav1.ImageAmbiguousReason, clusterv1.ConditionSeverityError, - "%s", - err.Error(), + "%s", msg, ) - return nil, errServerCreateNotPossible + return nil, fmt.Errorf("%s: %w", msg, errServerCreateNotPossible) } if len(images) == 0 { - err := fmt.Errorf("no image found with name %s", s.scope.HCloudMachine.Spec.ImageName) - record.Warn(s.scope.HCloudMachine, "ImageNotFound", err.Error()) + msg := fmt.Sprintf("no image found with name %s", s.scope.HCloudMachine.Spec.ImageName) + record.Warn(s.scope.HCloudMachine, "ImageNotFound", msg) conditions.MarkFalse(s.scope.HCloudMachine, infrav1.ServerCreateSucceededCondition, infrav1.ImageNotFoundReason, clusterv1.ConditionSeverityError, - "%s", - err.Error(), + "%s", msg, ) - return nil, errServerCreateNotPossible + return nil, fmt.Errorf("%s: %w", msg, errServerCreateNotPossible) } return images[0], nil @@ -1212,7 +1246,10 @@ func (s *Service) handleServerStatusOff(ctx context.Context, server *hcloud.Serv } } else { // Timed out. 
Set failure reason - s.scope.SetError("reached timeout of waiting for machines that are switched off", capierrors.CreateMachineError) + err := s.scope.SetErrorAndRemediate(ctx, "reached timeout of waiting for machines that are switched off") + if err != nil { + return reconcile.Result{}, err + } return res, nil } } else { diff --git a/pkg/services/hcloud/server/server_suite_test.go b/pkg/services/hcloud/server/server_suite_test.go index d7b867294..bf131efaf 100644 --- a/pkg/services/hcloud/server/server_suite_test.go +++ b/pkg/services/hcloud/server/server_suite_test.go @@ -18,6 +18,7 @@ package server import ( "bytes" + "context" "encoding/json" "testing" @@ -26,10 +27,13 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/kubectl/pkg/scheme" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" ctrl "sigs.k8s.io/controller-runtime" + fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" "github.com/syself/cluster-api-provider-hetzner/pkg/scope" @@ -259,12 +263,30 @@ func newTestServer() *hcloud.Server { } func newTestService(hcloudMachine *infrav1.HCloudMachine, hcloudClient hcloudclient.Client) *Service { + scheme := runtime.NewScheme() + utilruntime.Must(infrav1.AddToScheme(scheme)) + utilruntime.Must(clusterv1.AddToScheme(scheme)) + client := fakeclient.NewClientBuilder().WithScheme(scheme).Build() + machine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: hcloudMachine.Name, + Namespace: hcloudMachine.Namespace, + }, + Spec: clusterv1.MachineSpec{}, + Status: clusterv1.MachineStatus{}, + } + err := client.Create(context.Background(), machine) + if err != nil { + panic(err) + } return &Service{ &scope.MachineScope{ HCloudMachine: hcloudMachine, ClusterScope: scope.ClusterScope{ HCloudClient: hcloudClient, + Client: client, }, + Machine: machine, }, } } diff --git a/pkg/services/hcloud/server/server_test.go b/pkg/services/hcloud/server/server_test.go index 6d7f1658d..189d02186 100644 --- a/pkg/services/hcloud/server/server_test.go +++ b/pkg/services/hcloud/server/server_test.go @@ -33,8 +33,10 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11 + "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/reconcile" infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1" @@ -138,7 +140,7 @@ var _ = Describe("handleServerStatusOff", func() { hcloudMachine = &infrav1.HCloudMachine{ ObjectMeta: metav1.ObjectMeta{ - Name: "hcloudMachineName", + Name: "my-machine", Namespace: "default", }, Spec: infrav1.HCloudMachineSpec{ @@ -198,7 +200,9 @@ var _ = Describe("handleServerStatusOff", func() { Expect(res).Should(Equal(emptyResult)) Expect(server.Status).To(Equal(hcloud.ServerStatusOff)) - Expect(hcloudMachine.Status.FailureMessage).Should(Equal(ptr.To("reached timeout of waiting for machines that are switched off"))) + + _, exists := service.scope.Machine.Annotations[clusterv1.RemediateMachineAnnotation] + Expect(exists).To(BeTrue()) }) It("tries to power on server and sets new 
condition if different one is set", func() { @@ -363,7 +367,7 @@ var _ = Describe("getSSHKeys", func() { ClusterScope: *clusterScope, HCloudMachine: &infrav1.HCloudMachine{ ObjectMeta: metav1.ObjectMeta{ - Name: "hcloudmachinename", + Name: "my-machine", Namespace: "default", }, }, @@ -651,39 +655,67 @@ var _ = Describe("Reconcile", func() { err = testEnv.Create(ctx, helpers.GetDefaultSSHSecret("rescue-ssh-secret", testNs.Name)) Expect(err).To(BeNil()) - service = &Service{ - scope: &scope.MachineScope{ - ClusterScope: *clusterScope, - - Machine: &clusterv1.Machine{ - ObjectMeta: metav1.ObjectMeta{ - Name: "machineName", - Namespace: testNs.Name, - }, - Spec: clusterv1.MachineSpec{ - FailureDomain: ptr.To("nbg1"), + hcloudMachine := &infrav1.HCloudMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-machine", + Namespace: testNs.Name, + }, + Spec: infrav1.HCloudMachineSpec{ + Type: "cpx11", + ImageName: "ubuntu-24.04", + SSHKeys: []infrav1.SSHKey{ + { + Name: "sshKey1", + Fingerprint: "b7:2f:30:a0:2f:6c:58:6c:21:04:58:61:ba:06:3b:1f", }, }, + }, + } + Expect(testEnv.Create(ctx, hcloudMachine)).ShouldNot(HaveOccurred()) + Eventually(func() error { + return testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine) + }).Should(Succeed()) + Expect(hcloudMachine.Kind).To(Equal("HCloudMachine")) - HCloudMachine: &infrav1.HCloudMachine{ - ObjectMeta: metav1.ObjectMeta{ - Name: "hcloudmachinename", - Namespace: testNs.Name, - }, - Spec: infrav1.HCloudMachineSpec{ - Type: "cpx11", - ImageName: "ubuntu-24.04", - SSHKeys: []infrav1.SSHKey{ - { - Name: "sshKey1", - Fingerprint: "b7:2f:30:a0:2f:6c:58:6c:21:04:58:61:ba:06:3b:1f", - }, - }, - }, + capiMachine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-machine", + Namespace: testNs.Name, + }, + Spec: clusterv1.MachineSpec{ + InfrastructureRef: corev1.ObjectReference{ + Name: hcloudMachine.Name, + Kind: hcloudMachine.Kind, + APIVersion: hcloudMachine.APIVersion, }, + ClusterName: "clustername", + FailureDomain: ptr.To("nbg1"), + }, + } + Expect(testEnv.Create(ctx, capiMachine)).ShouldNot(HaveOccurred()) + + err = controllerutil.SetOwnerReference(capiMachine, hcloudMachine, testEnv.Scheme()) + Expect(err).ShouldNot(HaveOccurred()) + + service = &Service{ + scope: &scope.MachineScope{ + ClusterScope: *clusterScope, + Machine: capiMachine, + HCloudMachine: hcloudMachine, SSHClientFactory: testEnv.HCloudSSHClientFactory, }, } + service.scope.Client = testEnv.Client + + Eventually(func() error { + capiMachine, err = util.GetOwnerMachine(ctx, service.scope.Client, + service.scope.HCloudMachine.ObjectMeta) + if err != nil { + return err + } + Expect(capiMachine).NotTo(BeNil()) + return nil + }) }) AfterEach(func() { @@ -733,7 +765,7 @@ var _ = Describe("Reconcile", func() { By("setting the ProviderID on the HCloudMachine") service.scope.HCloudMachine.Spec.ProviderID = ptr.To("hcloud://1234567") - err = testEnv.Create(ctx, service.scope.HCloudMachine) + err = testEnv.Update(ctx, service.scope.HCloudMachine) Expect(err).To(BeNil()) service.scope.Machine.Spec.Bootstrap.DataSecretName = ptr.To("bootstrapsecret") @@ -743,12 +775,14 @@ var _ = Describe("Reconcile", func() { hcloudClient.On("ListServers", mock.Anything, mock.Anything).Return(nil, nil) By("calling reconcile") - _, err := service.Reconcile(ctx) + _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) By("validating if CreateMachineError was set on HCloudMachine object") - 
Expect(*service.scope.HCloudMachine.Status.FailureReason).To(Equal(capierrors.CreateMachineError)) - Expect(*service.scope.HCloudMachine.Status.FailureMessage).To(Equal("hcloud server (\"hcloud://1234567\") no longer available. Setting MachineError.")) + c := conditions.Get(service.scope.HCloudMachine, infrav1.DeleteMachineSucceededCondition) + Expect(c).NotTo(BeNil()) + Expect(c.Status).To(Equal(corev1.ConditionFalse)) + Expect(c.Message).To(Equal(`hcloud server ("hcloud://1234567") no longer available. Setting MachineError.`)) }) It("transitions the BootStrate from BootStateUnset -> BootStateBootingToRealOS -> BootStateOperatingSystemRunning (imageName)", func() { @@ -764,9 +798,6 @@ var _ = Describe("Reconcile", func() { }) Expect(err).To(BeNil()) - err = testEnv.Create(ctx, service.scope.HCloudMachine) - Expect(err).To(BeNil()) - service.scope.Machine.Spec.Bootstrap.DataSecretName = ptr.To("bootstrapsecret") hcloudClient.On("GetServerType", mock.Anything, mock.Anything).Return(&hcloud.ServerType{ @@ -800,14 +831,14 @@ var _ = Describe("Reconcile", func() { hcloudClient.On("CreateServer", mock.Anything, mock.Anything).Return(&hcloud.Server{ ID: 1, - Name: "hcloudmachinename", + Name: "my-machine", Status: hcloud.ServerStatusInitializing, }, nil) By("calling reconcile") _, err := service.Reconcile(ctx) Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to BootStateBootingToRealOS") @@ -816,13 +847,13 @@ var _ = Describe("Reconcile", func() { By("reconciling again") hcloudClient.On("GetServer", mock.Anything, mock.Anything).Return(&hcloud.Server{ ID: 1, - Name: "hcloudmachinename", + Name: "my-machine", Status: hcloud.ServerStatusRunning, }, nil) _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to BootStateOperatingSystemRunning once the server's status changes to running") Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateOperatingSystemRunning)) @@ -843,8 +874,6 @@ var _ = Describe("Reconcile", func() { service.scope.ImageURLCommand = "dummy-image-url-command.sh" service.scope.HCloudMachine.Spec.ImageName = "" service.scope.HCloudMachine.Spec.ImageURL = "oci://example.com/repo/image:v1" - err = testEnv.Create(ctx, service.scope.HCloudMachine) - Expect(err).To(BeNil()) service.scope.Machine.Spec.Bootstrap.DataSecretName = ptr.To("bootstrapsecret") @@ -879,14 +908,14 @@ var _ = Describe("Reconcile", func() { hcloudClient.On("CreateServer", mock.Anything, mock.Anything).Return(&hcloud.Server{ ID: 1, - Name: "hcloudmachinename", + Name: "my-machine", Status: hcloud.ServerStatusInitializing, }, nil) By("calling reconcile") _, err := service.Reconcile(ctx) Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to Initializing") @@ -895,7 +924,7 @@ var _ = Describe("Reconcile", func() { By("reconciling again") hcloudClient.On("GetServer", mock.Anything, mock.Anything).Return(&hcloud.Server{ ID: 1, - Name: "hcloudmachinename", + Name: "my-machine", Status: hcloud.ServerStatusRunning, }, nil).Once() @@ -917,7 +946,7 @@ var _ = Describe("Reconcile", func() { }, nil).Once() _, err = service.Reconcile(ctx) 
Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to EnablingRescue") Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateEnablingRescue)) @@ -925,7 +954,7 @@ var _ = Describe("Reconcile", func() { By("reconcile again --------------------------------------------------------") hcloudClient.On("GetServer", mock.Anything, mock.Anything).Return(&hcloud.Server{ ID: 1, - Name: "hcloudmachinename", + Name: "my-machine", RescueEnabled: true, Status: hcloud.ServerStatusRunning, }, nil).Once() @@ -944,7 +973,7 @@ var _ = Describe("Reconcile", func() { ) _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to EnablingRescue") Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateEnablingRescue)) @@ -964,7 +993,7 @@ var _ = Describe("Reconcile", func() { }) _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to BootingToRescue") Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateBootingToRescue)) @@ -980,10 +1009,10 @@ var _ = Describe("Reconcile", func() { StdErr: "", Err: nil, }) - startImageURLCommandMock := testEnv.HCloudSSHClient.On("StartImageURLCommand", mock.Anything, mock.Anything, mock.Anything, mock.Anything, "hcloudmachinename", []string{"sda"}).Return(0, "", nil) + startImageURLCommandMock := testEnv.HCloudSSHClient.On("StartImageURLCommand", mock.Anything, mock.Anything, mock.Anything, mock.Anything, "my-machine", []string{"sda"}).Return(0, "", nil) _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to RunningImageCommand") Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateRunningImageCommand)) @@ -1004,7 +1033,7 @@ var _ = Describe("Reconcile", func() { }, nil).Once() _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to BootingToRealOS") Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateBootingToRealOS)) @@ -1018,7 +1047,7 @@ var _ = Describe("Reconcile", func() { }, nil).Once() _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) - Expect(service.scope.HCloudMachine.Status.FailureReason).To(BeNil()) + Expect(noErrorOccured(service.scope)).To(BeNil()) By("ensuring the bootstate has transitioned to OperatingSystemRunning") Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateOperatingSystemRunning)) }) @@ -1032,3 +1061,14 @@ func isPresentAndFalseWithReason(getter conditions.Getter, condition clusterv1.C return objectCondition.Status == corev1.ConditionFalse && objectCondition.Reason == reason } + +func noErrorOccured(s *scope.MachineScope) error { + c := conditions.Get(s.HCloudMachine, infrav1.DeleteMachineSucceededCondition) + if c == nil { + return nil + } + if c.Status == 
corev1.ConditionTrue { + return nil + } + return fmt.Errorf("Error on HCloudMachine: %s: %s", c.Reason, c.Message) +} diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index b468818cc..2ab73d8c4 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -401,15 +401,9 @@ func logHCloudMachineStatus(ctx context.Context, c client.Client) error { } log("HCloudMachine: " + hm.Name + " " + id + " " + strings.Join(addresses, " ")) log(" ProvisioningState: " + string(*hm.Status.InstanceState)) - l := make([]string, 0) - if hm.Status.FailureMessage != nil { - l = append(l, *hm.Status.FailureMessage) - } - if hm.Status.FailureMessage != nil { - l = append(l, *hm.Status.FailureMessage) - } - if len(l) > 0 { - log(" Error: " + strings.Join(l, ", ")) + c := conditions.Get(hm, infrav1.DeleteMachineSucceededCondition) + if c != nil && c.Status != corev1.ConditionTrue { + log(fmt.Sprintf(" Error: %s: %s", c.Reason, c.Message)) } readyC := conditions.Get(hm, clusterv1.ReadyCondition) msg := ""