Skip to content

Commit

Permalink
KCP: remove etcd member in pre-terminate hook
Browse files Browse the repository at this point in the history
  • Loading branch information
sbueringer committed Sep 5, 2024
1 parent 50f3604 commit 8295a4c
Show file tree
Hide file tree
Showing 8 changed files with 82 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ const (
// failures in updating remediation retry (the counter restarts from zero).
RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"

// PreTerminateDeleteHookAnnotation is the annotation KCP sets on Machines to ensure it can later remove the
// etcd member right before Machine termination (i.e. before InfraMachine deletion).
PreTerminateDeleteHookAnnotation = clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/kubeadmcontrolplane"

// DefaultMinHealthyPeriod defines the default minimum period that must elapse before a remediation on a
// machine is considered unrelated to the previous remediation.
DefaultMinHealthyPeriod = 1 * time.Hour
Expand Down
5 changes: 5 additions & 0 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,11 @@ func (c *ControlPlane) HasDeletingMachine() bool {
return len(c.Machines.Filter(collections.HasDeletionTimestamp)) > 0
}

// DeletingMachines returns the subset of the control plane's Machines that match the
// collections.HasDeletionTimestamp filter, i.e. the Machines currently being deleted.
func (c *ControlPlane) DeletingMachines() collections.Machines {
	deleting := c.Machines.Filter(collections.HasDeletionTimestamp)
	return deleting
}

// GetKubeadmConfig returns the KubeadmConfig of a given machine.
func (c *ControlPlane) GetKubeadmConfig(machineName string) (*bootstrapv1.KubeadmConfig, bool) {
kubeadmConfig, ok := c.KubeadmConfigs[machineName]
Expand Down
59 changes: 59 additions & 0 deletions controlplane/kubeadm/internal/controllers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,10 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
return ctrl.Result{}, err
}

if result, err := r.reconcilePreTerminateHook(ctx, controlPlane); err != nil || !result.IsZero() {
return result, err
}

// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
// otherwise continue with the other KCP operations.
if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
Expand Down Expand Up @@ -768,6 +772,61 @@ func (r *KubeadmControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context
return nil
}

// reconcilePreTerminateHook removes the KCP pre-terminate hook annotation from control plane Machines
// that are being deleted, after taking care of the etcd member hosted on them.
//
// A deleting Machine is processed only if its PreTerminateDeleteHookSucceeded condition is False with
// reason WaitingExternalHook and it still carries the PreTerminateDeleteHookAnnotation. For such a Machine:
//   - if the KubeadmControlPlane itself is not being deleted, etcd leadership is forwarded away from the
//     Machine and its etcd member is removed;
//   - the annotation is then removed from the Machine via a merge patch.
//
// It returns an empty result when there is nothing to do, a result with RequeueAfter set to
// deleteRequeueAfter when at least one hook was removed, and an error as soon as any step fails
// (Machines later in the iteration are then left for a subsequent reconcile).
func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
	if !controlPlane.HasDeletingMachine() {
		return ctrl.Result{}, nil
	}

	log := ctrl.LoggerFrom(ctx)

	removedPreTerminateHook := false
	for _, deletingMachine := range controlPlane.DeletingMachines() {
		// Shadow log and ctx so that everything below is logged with the Machine as context.
		log := log.WithValues("Machine", klog.KObj(deletingMachine))
		ctx := ctrl.LoggerInto(ctx, log)

		// Only act on Machines that are currently waiting for a pre-terminate hook.
		c := conditions.Get(deletingMachine, clusterv1.PreTerminateDeleteHookSucceededCondition)
		if c == nil || c.Status != corev1.ConditionFalse || c.Reason != clusterv1.WaitingExternalHookReason {
			continue
		}
		// Nothing to do if the KCP hook is not (or no longer) set on this Machine.
		if _, exists := deletingMachine.Annotations[controlplanev1.PreTerminateDeleteHookAnnotation]; !exists {
			continue
		}

		// Forward etcd leader and remove member only if KCP is not in deletion.
		if controlPlane.KCP.DeletionTimestamp.IsZero() {
			workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
			if err != nil {
				return ctrl.Result{}, errors.Wrapf(err, "failed to remove etcd member for deleting Machine %s: failed to create client to workload cluster", klog.KObj(deletingMachine))
			}

			// Note: In regular deletion cases (remediation, scale down) the leader should have been already moved.
			// We're doing this again here in case the Machine became leader again or the Machine deletion was
			// triggered in another way (e.g. a user running kubectl delete machine).
			//
			// The candidate must be a Machine that is not being deleted: picking the newest of all Machines
			// could select the deleting Machine itself (or another Machine about to go away), which would
			// leave leadership on a member that is removed right afterwards.
			etcdLeaderCandidate := controlPlane.Machines.Filter(collections.Not(collections.HasDeletionTimestamp)).Newest()
			if etcdLeaderCandidate != nil {
				if err := workloadCluster.ForwardEtcdLeadership(ctx, deletingMachine, etcdLeaderCandidate); err != nil {
					return ctrl.Result{}, errors.Wrapf(err, "failed to move leadership to candidate Machine %s", etcdLeaderCandidate.Name)
				}
			} else {
				log.Info("Skip forwarding etcd leadership, because there is no other control plane Machine without a deletionTimestamp")
			}

			if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, deletingMachine); err != nil {
				return ctrl.Result{}, errors.Wrapf(err, "failed to remove etcd member for deleting Machine %s", klog.KObj(deletingMachine))
			}
		}

		log.Info("Removing pre-terminate hook from control plane machine")
		// Keep an unmodified copy so the patch only contains the annotation removal.
		deletingMachineOriginal := deletingMachine.DeepCopy()
		delete(deletingMachine.Annotations, controlplanev1.PreTerminateDeleteHookAnnotation)
		if err := r.Client.Patch(ctx, deletingMachine, client.MergeFrom(deletingMachineOriginal)); err != nil {
			return ctrl.Result{}, errors.Wrapf(err, "failed to remove pre-terminate hook from control plane machine %s", klog.KObj(deletingMachine))
		}
		removedPreTerminateHook = true
	}

	if removedPreTerminateHook {
		// Requeue so the remaining reconcile steps run only after the Machines are gone.
		log.Info("Waiting for Machines to be deleted", "machines", strings.Join(controlPlane.DeletingMachines().Names(), ", "))
		return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
	}

	return ctrl.Result{}, nil
}

func (r *KubeadmControlPlaneReconciler) reconcileCertificateExpiries(ctx context.Context, controlPlane *internal.ControlPlane) error {
log := ctrl.LoggerFrom(ctx)

Expand Down
8 changes: 7 additions & 1 deletion controlplane/kubeadm/internal/controllers/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1748,7 +1748,13 @@ func TestKubeadmControlPlaneReconciler_syncMachines(t *testing.T) {
// Verify Labels
g.Expect(updatedInplaceMutatingMachine.Labels).Should(Equal(expectedLabels))
// Verify Annotations
g.Expect(updatedInplaceMutatingMachine.Annotations).Should(Equal(kcp.Spec.MachineTemplate.ObjectMeta.Annotations))
expectedAnnotations := map[string]string{}
for k, v := range kcp.Spec.MachineTemplate.ObjectMeta.Annotations {
expectedAnnotations[k] = v
}
// The pre-terminate annotation should always be added
expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = ""
g.Expect(updatedInplaceMutatingMachine.Annotations).Should(Equal(expectedAnnotations))
// Verify Node timeout values
g.Expect(updatedInplaceMutatingMachine.Spec.NodeDrainTimeout).Should(And(
Not(BeNil()),
Expand Down
3 changes: 3 additions & 0 deletions controlplane/kubeadm/internal/controllers/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,9 @@ func (r *KubeadmControlPlaneReconciler) computeDesiredMachine(kcp *controlplanev
annotations[controlplanev1.RemediationForAnnotation] = remediationData
}
}
// Setting pre-terminate hook so we can later remove the etcd member right before Machine termination
// (i.e. before InfraMachine deletion).
annotations[controlplanev1.PreTerminateDeleteHookAnnotation] = ""

// Construct the basic Machine.
desiredMachine := &clusterv1.Machine{
Expand Down
4 changes: 4 additions & 0 deletions controlplane/kubeadm/internal/controllers/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,8 @@ func TestKubeadmControlPlaneReconciler_computeDesiredMachine(t *testing.T) {
expectedAnnotations[k] = v
}
expectedAnnotations[controlplanev1.KubeadmClusterConfigurationAnnotation] = clusterConfigurationString
// The pre-terminate annotation should always be added
expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = ""
g.Expect(createdMachine.Annotations).To(Equal(expectedAnnotations))

// Verify that machineTemplate.ObjectMeta in KCP has not been modified.
Expand Down Expand Up @@ -646,6 +648,8 @@ func TestKubeadmControlPlaneReconciler_computeDesiredMachine(t *testing.T) {
}
expectedAnnotations[controlplanev1.KubeadmClusterConfigurationAnnotation] = existingClusterConfigurationString
expectedAnnotations[controlplanev1.RemediationForAnnotation] = remediationData
// The pre-terminate annotation should always be added
expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = ""
g.Expect(updatedMachine.Annotations).To(Equal(expectedAnnotations))

// Verify that machineTemplate.ObjectMeta in KCP has not been modified.
Expand Down
5 changes: 0 additions & 5 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,6 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, err
}
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
log.Error(err, "Failed to remove etcd member for machine")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, err
}
}

parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
Expand Down
4 changes: 0 additions & 4 deletions controlplane/kubeadm/internal/controllers/scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
return ctrl.Result{}, err
}
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToDelete); err != nil {
logger.Error(err, "Failed to remove etcd member for machine")
return ctrl.Result{}, err
}
}

parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
Expand Down

0 comments on commit 8295a4c

Please sign in to comment.