Skip to content

Commit

Permalink
Improve KCP remediation re-entrancy
Browse files Browse the repository at this point in the history
  • Loading branch information
fabriziopandini committed May 17, 2024
1 parent fd328d7 commit 280b7b5
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 3 deletions.
26 changes: 23 additions & 3 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,29 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
// Returns if another remediation is in progress but the new Machine is not yet created.
// Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated
// is being deleted to avoid unnecessary logs if no further remediation should be done.
if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
log.Info("Another remediation is already in progress. Skipping remediation.")
return ctrl.Result{}, nil
if v, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
// Check if the annotation is stale; this might happen in case there is a crash in the controller in between
// when a new Machine is created and the annotation is eventually removed from KCP via defer patch at the end
// of KCP reconcile.
remediationData, err := RemediationDataFromAnnotation(v)
if err != nil {
return ctrl.Result{}, err
}

staleAnnotation := false
for _, m := range controlPlane.Machines.UnsortedList() {
if m.CreationTimestamp.After(remediationData.Timestamp.Time) {
// Remove the annotation tracking that a remediation is in progress (the annotation is stale).
delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
staleAnnotation = true
break
}
}

if !staleAnnotation {
log.Info("Another remediation is already in progress. Skipping remediation.")
return ctrl.Result{}, nil
}
}

patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
Expand Down
41 changes: 41 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,47 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
g.Expect(err).ToNot(HaveOccurred())
})
t.Run("remediation in progress is ignored when stale", func(t *testing.T) {
g := NewWithT(t)

m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation(), withWaitBeforeDeleteFinalizer())
conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
controlPlane := &internal.ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
controlplanev1.RemediationInProgressAnnotation: MustMarshalRemediationData(&RemediationData{
Machine: "foo",
Timestamp: metav1.Time{Time: time.Now().Add(-1 * time.Hour).UTC()},
RetryCount: 0,
}),
},
},
},
Cluster: &clusterv1.Cluster{},
Machines: collections.FromMachines(m),
}
ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)

g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
g.Expect(err).ToNot(HaveOccurred())

g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
g.Expect(err).ToNot(HaveOccurred())
g.Expect(remediationData.Machine).To(Equal(m.Name))
g.Expect(remediationData.RetryCount).To(Equal(0))

assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")

err = env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m)
g.Expect(err).ToNot(HaveOccurred())
g.Expect(m.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())

removeFinalizer(g, m)
g.Expect(env.Cleanup(ctx, m)).To(Succeed())
})
t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is already being deleted", func(t *testing.T) {
g := NewWithT(t)

Expand Down

0 comments on commit 280b7b5

Please sign in to comment.