From 280b7b53d200dc2be24609acfa7096dd05d3e828 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Fri, 17 May 2024 19:29:18 +0200 Subject: [PATCH] Improve KCP remediation re-entrancy --- .../internal/controllers/remediation.go | 26 ++++++++++-- .../internal/controllers/remediation_test.go | 41 +++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 28f54b5538a3..bddaf6a4db2a 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -99,9 +99,29 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Returns if another remediation is in progress but the new Machine is not yet created. // Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated // is being deleted to avoid unnecessary logs if no further remediation should be done. - if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { - log.Info("Another remediation is already in progress. Skipping remediation.") - return ctrl.Result{}, nil + if v, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok { + // Check if the annotation is stale; this might happen in case there is a crash in the controller in between + // when a new Machine is created and the annotation is eventually removed from KCP via defer patch at the end + // of KCP reconcile. + remediationData, err := RemediationDataFromAnnotation(v) + if err != nil { + return ctrl.Result{}, err + } + + staleAnnotation := false + for _, m := range controlPlane.Machines.UnsortedList() { + if m.CreationTimestamp.After(remediationData.Timestamp.Time) { + // Remove the annotation tracking that a remediation is in progress (the annotation is stale). + delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation) + staleAnnotation = true + break + } + } + + if !staleAnnotation { + log.Info("Another remediation is already in progress. Skipping remediation.") + return ctrl.Result{}, nil + } } patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client) diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index 463b62f3b141..ab4abc7bd220 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -166,6 +166,47 @@ func TestReconcileUnhealthyMachines(t *testing.T) { g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped g.Expect(err).ToNot(HaveOccurred()) }) + t.Run("remediation in progress is ignored when stale", func(t *testing.T) { + g := NewWithT(t) + + m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation(), withWaitBeforeDeleteFinalizer()) + conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "") + conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") + controlPlane := &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + controlplanev1.RemediationInProgressAnnotation: MustMarshalRemediationData(&RemediationData{ + Machine: "foo", + Timestamp: metav1.Time{Time: time.Now().Add(-1 * time.Hour).UTC()}, + RetryCount: 0, + }), + }, + }, + }, + Cluster: &clusterv1.Cluster{}, + Machines: collections.FromMachines(m), + } + ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane) + + g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue + g.Expect(err).ToNot(HaveOccurred()) + + g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation)) + remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(remediationData.Machine).To(Equal(m.Name)) + g.Expect(remediationData.RetryCount).To(Equal(0)) + + assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "") + + err = env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(m.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse()) + + removeFinalizer(g, m) + g.Expect(env.Cleanup(ctx, m)).To(Succeed()) + }) t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is already being deleted", func(t *testing.T) { g := NewWithT(t)