Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Remove etcd membership when doing a KubeadmControlPlane scale down #2382

Merged
merged 1 commit into from
Feb 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ import (
)

const (
KubeadmControlPlaneFinalizer = "kubeadm.controlplane.cluster.x-k8s.io"
KubeadmControlPlaneHashLabelKey = "kubeadm.controlplane.cluster.x-k8s.io/hash"
SelectedForUpgradeAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/selected-for-upgrade"
DeleteForScaleDownAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/delete-for-scale-down"
KubeadmControlPlaneFinalizer = "kubeadm.controlplane.cluster.x-k8s.io"
KubeadmControlPlaneHashLabelKey = "kubeadm.controlplane.cluster.x-k8s.io/hash"
SelectedForUpgradeAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/selected-for-upgrade"
DeleteForScaleDownAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/delete-for-scale-down"
ScaleDownEtcdMemberRemovedAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/scale-down-etcd-member-removed"
)

// KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ type managementCluster interface {
GetMachinesForCluster(ctx context.Context, cluster types.NamespacedName, filters ...internal.MachineFilter) (internal.FilterableMachineCollection, error)
TargetClusterControlPlaneIsHealthy(ctx context.Context, clusterKey types.NamespacedName, controlPlaneName string) error
TargetClusterEtcdIsHealthy(ctx context.Context, clusterKey types.NamespacedName, controlPlaneName string) error
RemoveEtcdMemberForMachine(ctx context.Context, clusterKey types.NamespacedName, machine *clusterv1.Machine) error
}

// +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
Expand Down Expand Up @@ -436,18 +437,28 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(ctx context.Contex
return ctrl.Result{}, errors.New("failed to pick control plane Machine to delete")
}

if !internal.HasAnnotationKey(controlplanev1.ScaleDownEtcdMemberRemovedAnnotation)(machineToDelete) {
// Ensure etcd is healthy prior to attempting to remove the member
if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
logger.Error(err, "waiting for control plane to pass etcd health check before adding removing a control plane machine")
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass etcd health check before removing a control plane machine: %v", err)
return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, nil
}
if err := r.managementCluster.RemoveEtcdMemberForMachine(ctx, util.ObjectKey(cluster), machineToDelete); err != nil {
logger.Error(err, "failed to remove etcd member for machine")
return ctrl.Result{}, err
}
if err := r.markWithAnnotationKey(ctx, machineToDelete, controlplanev1.ScaleDownEtcdMemberRemovedAnnotation); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "failed to mark machine %s as having etcd membership removed", machineToDelete.Name)
}
}

// Do a final health check of the Control Plane components prior to actually deleting the machine
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
logger.Error(err, "waiting for control plane to pass control plane health check before removing a control plane machine")
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err)
return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, nil
}

if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
logger.Error(err, "waiting for control plane to pass etcd health check before adding removing a control plane machine")
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass etcd health check before removing a control plane machine: %v", err)
return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, nil
}

logger = logger.WithValues("machine", machineToDelete)
if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "failed to delete control plane machine")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,10 @@ func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(ctx context.Context,
return nil
}

// RemoveEtcdMemberForMachine is the fake implementation used by tests: it
// ignores all of its arguments and always reports success by returning nil.
func (f *fakeManagementCluster) RemoveEtcdMemberForMachine(ctx context.Context, clusterKey types.NamespacedName, machine *clusterv1.Machine) error {
return nil
}

func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
t.Run("creates a control plane Machine if health checks pass", func(t *testing.T) {
g := NewWithT(t)
Expand Down
46 changes: 46 additions & 0 deletions controlplane/kubeadm/internal/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,21 @@ func (m *ManagementCluster) TargetClusterEtcdIsHealthy(ctx context.Context, clus
return m.healthCheck(ctx, cluster.etcdIsHealthy, clusterKey, controlPlaneName)
}

// RemoveEtcdMemberForMachine removes the etcd member associated with the
// Machine's Node from the target cluster's etcd cluster.
//
// It returns nil without contacting the target cluster when machine is nil or
// the Machine has no NodeRef, since there is then no Node to look a member up by.
// Errors from resolving the target cluster or from the removal are returned as-is.
func (m *ManagementCluster) RemoveEtcdMemberForMachine(ctx context.Context, clusterKey types.NamespacedName, machine *clusterv1.Machine) error {
	// No Machine means there is nothing to remove.
	if machine == nil {
		return nil
	}

	// No Node for the Machine means there is nothing to remove either.
	nodeRef := machine.Status.NodeRef
	if nodeRef == nil {
		return nil
	}

	targetCluster, err := m.getCluster(ctx, clusterKey)
	if err != nil {
		return err
	}

	return targetCluster.removeMemberForNode(ctx, nodeRef.Name)
}

// cluster are operations on target clusters.
type cluster struct {
client ctrlclient.Client
Expand Down Expand Up @@ -251,6 +266,37 @@ func (c *cluster) controlPlaneIsHealthy(ctx context.Context) (healthCheckResult,
return response, nil
}

func (c *cluster) removeMemberForNode(ctx context.Context, nodeName string) error {
tlsConfig, err := c.generateEtcdTLSClientBundle()
if err != nil {
return err
}

// Create the etcd client for the etcd Pod scheduled on the Node
etcdClient, err := c.getEtcdClientForNode(nodeName, tlsConfig)
detiber marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return errors.Wrap(err, "failed to create etcd client")
}

// List etcd members. This checks that the member is healthy, because the request goes through consensus.
members, err := etcdClient.Members(ctx)
if err != nil {
return errors.Wrap(err, "failed to list etcd members using etcd client")
}
member := etcdutil.MemberForName(members, nodeName)

// The member has already been removed, return immediately
if member == nil {
return nil
}

if err := etcdClient.RemoveMember(ctx, member.ID); err != nil {
return errors.Wrap(err, "failed to remove member from etcd")
}

return nil
}

// etcdIsHealthy runs checks for every etcd member in the cluster to satisfy our definition of healthy.
// This is a best effort check and nodes can become unhealthy after the check is complete. It is not a guarantee.
// It's used as a signal for whether we should allow a target cluster to scale up, scale down or upgrade.
Expand Down