From 5862edd60ef05c0101cc30bdf7d6a25cb874ae63 Mon Sep 17 00:00:00 2001 From: Rafal Korepta Date: Sat, 26 Nov 2022 23:58:33 +0100 Subject: [PATCH] operator: Get cluster health before an update As per https://github.com/redpanda-data/redpanda/issues/3023, the cluster should be healthy before a node is put into maintenance mode and after the pod is restarted. --- controllers/redpanda/suite_test.go | 6 ++++ pkg/admin/admin.go | 2 ++ pkg/resources/featuregates/featuregates.go | 6 ++++ pkg/resources/statefulset_update.go | 37 ++++++++++++++++++++++ 4 files changed, 51 insertions(+) diff --git a/controllers/redpanda/suite_test.go b/controllers/redpanda/suite_test.go index ddac7a8b1d00..c709a408b24e 100644 --- a/controllers/redpanda/suite_test.go +++ b/controllers/redpanda/suite_test.go @@ -558,6 +558,12 @@ func (m *mockAdminAPI) DisableMaintenanceMode(_ context.Context, _ int) error { return nil } +func (m *mockAdminAPI) GetHealthOverview(_ context.Context) (admin.ClusterHealthOverview, error) { + return admin.ClusterHealthOverview{ + IsHealthy: true, + }, nil +} + //nolint:goerr113 // test code func (m *mockAdminAPI) SetBrokerStatus( id int, status admin.MembershipStatus, diff --git a/pkg/admin/admin.go b/pkg/admin/admin.go index 728b8db67ca4..4962683bcddd 100644 --- a/pkg/admin/admin.go +++ b/pkg/admin/admin.go @@ -97,6 +97,8 @@ type AdminAPIClient interface { EnableMaintenanceMode(ctx context.Context, node int) error DisableMaintenanceMode(ctx context.Context, node int) error + + GetHealthOverview(ctx context.Context) (admin.ClusterHealthOverview, error) } var _ AdminAPIClient = &admin.AdminAPI{} diff --git a/pkg/resources/featuregates/featuregates.go b/pkg/resources/featuregates/featuregates.go index 794cf5b22fd6..69b146096508 100644 --- a/pkg/resources/featuregates/featuregates.go +++ b/pkg/resources/featuregates/featuregates.go @@ -38,6 +38,12 @@ func CentralizedConfiguration(version string) bool { return atLeastVersion(V22_1, version) } +// ClusterHealth feature gate 
should be removed when the operator +// will no longer support 21.x or older versions +func ClusterHealth(version string) bool { + return atLeastVersion(V22_1, version) +} + // MaintenanceMode feature gate should be removed when the operator // will no longer support 21.x or older versions func MaintenanceMode(version string) bool { diff --git a/pkg/resources/statefulset_update.go b/pkg/resources/statefulset_update.go index 8200463031d7..618cf33199a0 100644 --- a/pkg/resources/statefulset_update.go +++ b/pkg/resources/statefulset_update.go @@ -23,6 +23,7 @@ import ( "github.com/banzaicloud/k8s-objectmatcher/patch" "github.com/redpanda-data/redpanda/src/go/k8s/pkg/labels" + "github.com/redpanda-data/redpanda/src/go/k8s/pkg/resources/featuregates" "github.com/redpanda-data/redpanda/src/go/k8s/pkg/utils" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -78,10 +79,15 @@ func (r *StatefulSetResource) runUpdate( if err = r.updateRestartingStatus(ctx, true); err != nil { return fmt.Errorf("unable to turn on restarting status in cluster custom resource: %w", err) } + if err = r.updateStatefulSet(ctx, current, modified); err != nil { return err } + if err = r.isClusterHealthy(ctx); err != nil { + return err + } + if err = r.rollingUpdate(ctx, &modified.Spec.Template); err != nil { return err } @@ -94,6 +100,37 @@ func (r *StatefulSetResource) runUpdate( return nil } +func (r *StatefulSetResource) isClusterHealthy(ctx context.Context) error { + if !featuregates.ClusterHealth(r.pandaCluster.Status.Version) { + r.logger.V(debugLogLevel).Info("Cluster health endpoint is not available", "version", r.pandaCluster.Spec.Version) + return nil + } + + adminAPIClient, err := r.getAdminAPIClient(ctx) + if err != nil { + return fmt.Errorf("creating admin API client: %w", err) + } + + health, err := adminAPIClient.GetHealthOverview(ctx) + if err != nil { + return fmt.Errorf("getting cluster health overview: %w", err) + } + + restarting := "not restarting" + if 
r.pandaCluster.Status.IsRestarting() { + restarting = "restarting" + } + + if !health.IsHealthy { + return &RequeueAfterError{ + RequeueAfter: RequeueDuration, + Msg: fmt.Sprintf("wait for cluster to become healthy (cluster %s)", restarting), + } + } + + return nil +} + func (r *StatefulSetResource) rollingUpdate( ctx context.Context, template *corev1.PodTemplateSpec, ) error {