From 4cfa74a33caefaa5510f51ed64b6306e61f1b8f7 Mon Sep 17 00:00:00 2001 From: sethp-nr <30441101+sethp-nr@users.noreply.github.com> Date: Wed, 26 Feb 2020 08:48:05 -0800 Subject: [PATCH] Report all healthcheck errors Before: ``` E0225 23:47:06.883670 8 kubeadm_control_plane_controller.go:577] controllers/KubeadmControlPlane "msg"="waiting for control plane to pass etcd health check before adding an additional control plane machine" "error"="there are 3 control plane nodes, but 0 etcd members" "cluster"="test" "kubeadmControlPlane"="test" "namespace"="test" ``` After: ``` E0226 01:48:29.726200 166 kubeadm_control_plane_controller.go:577] controllers/KubeadmControlPlane "msg"="waiting for control plane to pass etcd health check before adding an additional control plane machine" "error"="[could not check etcd member health, node \"ip-10-0-0-3.ec2.internal\": failed to create etcd client: unable to create etcd client: context deadline exceeded, node \"ip-10-0-0-197.ec2.internal\": failed to create etcd client: unable to create etcd client: context deadline exceeded, node \"ip-10-0-0-59.ec2.internal\": failed to create etcd client: unable to create etcd client: context deadline exceeded]" "cluster"="test" "kubeadmControlPlane"="test" "namespace"="test" ``` --- controlplane/kubeadm/internal/cluster.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/controlplane/kubeadm/internal/cluster.go b/controlplane/kubeadm/internal/cluster.go index 18571b31cabc..5d365798d3cc 100644 --- a/controlplane/kubeadm/internal/cluster.go +++ b/controlplane/kubeadm/internal/cluster.go @@ -121,11 +121,11 @@ type healthCheck func(context.Context) (healthCheckResult, error) // healthCheck will run a generic health check function and report any errors discovered. // It does some additional validation to make sure there is a 1;1 match between nodes and machines. func (m *ManagementCluster) healthCheck(ctx context.Context, check healthCheck, clusterKey types.NamespacedName, controlPlaneName string) error { + var errorList []error nodeChecks, err := check(ctx) if err != nil { - return err + errorList = append(errorList, err) } - errorList := []error{} for nodeName, err := range nodeChecks { if err != nil { errorList = append(errorList, fmt.Errorf("node %q: %v", nodeName, err)) @@ -388,6 +388,10 @@ func (c *cluster) etcdIsHealthy(ctx context.Context) (healthCheckResult, error) } } + if len(response) > 0 { + return response, errors.New("could not check etcd member health") + } + // Check that there is exactly one etcd member for every control plane machine. // There should be no etcd members added "out of band."" if len(controlPlaneNodes.Items) != len(knownMemberIDSet) {