From cf9d1db5450886aeaa99a2298aa91f4da45f1c7f Mon Sep 17 00:00:00 2001 From: Rafal Korepta Date: Sat, 26 Nov 2022 23:58:33 +0100 Subject: [PATCH] Get cluster health before an update As per https://github.com/redpanda-data/redpanda/issues/3023 the cluster should be healthy before a node is put into maintenance mode and after a pod is restarted. --- src/go/k8s/pkg/admin/admin.go | 2 ++ .../k8s/pkg/resources/statefulset_update.go | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/go/k8s/pkg/admin/admin.go b/src/go/k8s/pkg/admin/admin.go index 728b8db67ca4c..4962683bcddda 100644 --- a/src/go/k8s/pkg/admin/admin.go +++ b/src/go/k8s/pkg/admin/admin.go @@ -97,6 +97,8 @@ type AdminAPIClient interface { EnableMaintenanceMode(ctx context.Context, node int) error DisableMaintenanceMode(ctx context.Context, node int) error + + GetHealthOverview(ctx context.Context) (admin.ClusterHealthOverview, error) } var _ AdminAPIClient = &admin.AdminAPI{} diff --git a/src/go/k8s/pkg/resources/statefulset_update.go b/src/go/k8s/pkg/resources/statefulset_update.go index 8200463031d70..dfbc22168856e 100644 --- a/src/go/k8s/pkg/resources/statefulset_update.go +++ b/src/go/k8s/pkg/resources/statefulset_update.go @@ -82,6 +82,10 @@ func (r *StatefulSetResource) runUpdate( return err } + if err = r.isClusterHealthy(ctx); err != nil { + return err + } + if err = r.rollingUpdate(ctx, &modified.Spec.Template); err != nil { return err } @@ -94,6 +98,32 @@ func (r *StatefulSetResource) runUpdate( return nil } +func (r *StatefulSetResource) isClusterHealthy(ctx context.Context) error { + adminAPIClient, err := r.getAdminAPIClient(ctx) + if err != nil { + return fmt.Errorf("creating admin API client: %w", err) + } + + health, err := adminAPIClient.GetHealthOverview(ctx) + if err != nil { + return fmt.Errorf("getting cluster health overview: %w", err) + } + + restarting := "not restarting" + if r.pandaCluster.Status.IsRestarting() { + restarting = "restarting" + } + + if 
!health.IsHealthy { + return &RequeueAfterError{ + RequeueAfter: RequeueDuration, + Msg: fmt.Sprintf("wait for cluster to become healthy (cluster %s)", restarting), + } + } + + return nil +} + func (r *StatefulSetResource) rollingUpdate( ctx context.Context, template *corev1.PodTemplateSpec, ) error {