Skip to content

Commit

Permalink
operator: fix maintenance mode activation on decommissioning node (wo…
Browse files Browse the repository at this point in the history
…rkaround for #4999)
  • Loading branch information
nicolaferraro committed Jun 13, 2022
1 parent 49302f5 commit 15adb01
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 0 deletions.
8 changes: 8 additions & 0 deletions src/go/k8s/controllers/redpanda/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,14 @@ func (m *mockAdminAPI) RecommissionBroker(_ context.Context, id int) error {
return m.SetBrokerStatus(id, admin.MembershipStatusActive)
}

// EnableMaintenanceMode is a no-op test stub: it ignores the context and the
// node ID and always reports success by returning nil. It does not record any
// state on the mock.
func (m *mockAdminAPI) EnableMaintenanceMode(_ context.Context, _ int) error {
return nil
}

// DisableMaintenanceMode is a no-op test stub: it ignores the context and the
// node ID and always reports success by returning nil. It does not record any
// state on the mock.
func (m *mockAdminAPI) DisableMaintenanceMode(_ context.Context, _ int) error {
return nil
}

// nolint:goerr113 // test code
func (m *mockAdminAPI) SetBrokerStatus(
id int, status admin.MembershipStatus,
Expand Down
3 changes: 3 additions & 0 deletions src/go/k8s/pkg/admin/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ type AdminAPIClient interface {
Brokers(ctx context.Context) ([]admin.Broker, error)
DecommissionBroker(ctx context.Context, node int) error
RecommissionBroker(ctx context.Context, node int) error

EnableMaintenanceMode(ctx context.Context, node int) error
DisableMaintenanceMode(ctx context.Context, node int) error
}

var _ AdminAPIClient = &admin.AdminAPI{}
Expand Down
6 changes: 6 additions & 0 deletions src/go/k8s/pkg/resources/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,12 @@ func (r *StatefulSetResource) Ensure(ctx context.Context) error {
}
r.LastObservedState = &sts

// Hack for: https://github.com/redpanda-data/redpanda/issues/4999
err = r.disableMaintenanceModeOnDecommissionedNodes(ctx)
if err != nil {
return err
}

r.logger.Info("Running update", "resource name", r.Key().Name)
err = r.runUpdate(ctx, &sts, obj.(*appsv1.StatefulSet))
if err != nil {
Expand Down
53 changes: 53 additions & 0 deletions src/go/k8s/pkg/resources/statefulset_scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@ package resources

import (
"context"
"errors"
"fmt"

"github.com/go-logr/logr"
redpandav1alpha1 "github.com/redpanda-data/redpanda/src/go/k8s/apis/redpanda/v1alpha1"
adminutils "github.com/redpanda-data/redpanda/src/go/k8s/pkg/admin"
"github.com/redpanda-data/redpanda/src/go/k8s/pkg/labels"
"github.com/redpanda-data/redpanda/src/go/k8s/pkg/resources/featuregates"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/api/admin"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -261,6 +263,56 @@ func (r *StatefulSetResource) isClusterFormed(
return len(brokers) > 0, nil
}

// disableMaintenanceModeOnDecommissionedNodes brings the cluster back to a
// consistent state by switching maintenance mode off for a node that has been
// decommissioned.
//
// While shutting down, a decommissioned node may turn maintenance mode on
// through its shutdown hooks. That can leave the cluster in an inconsistent
// state and prevent the remaining pods from shutting down cleanly.
//
// The call is a no-op (returns nil) when the maintenance mode feature gate is
// off for the current cluster version, when no decommissioning node is
// recorded in the status, when the current replica count is still greater
// than the decommissioning ordinal, or when the running pod count does not
// yet match the target. A 4xx response from the admin API is logged and
// treated as success; any other failure is returned wrapped.
//
// See: https://github.com/redpanda-data/redpanda/issues/4999
func (r *StatefulSetResource) disableMaintenanceModeOnDecommissionedNodes(
	ctx context.Context,
) error {
	status := &r.pandaCluster.Status

	if !featuregates.MaintenanceMode(status.Version) {
		return nil
	}

	// Nothing to do unless a decommissioning phase is actually in progress.
	if status.DecommissioningNode == nil {
		return nil
	}
	nodeID := *status.DecommissioningNode
	if status.CurrentReplicas > nodeID {
		return nil
	}

	// The ordinal of the decommissioning node is also the replica count we
	// expect once its pod has disappeared from the cluster; wait for that.
	podGone, err := r.verifyRunningCount(ctx, nodeID)
	if err != nil {
		return err
	}
	if !podGone {
		return nil
	}

	adminAPI, err := r.getAdminAPIClient(ctx)
	if err != nil {
		return err
	}

	r.logger.Info("Forcing deletion of maintenance mode for the decommissioned node", "node_id", nodeID)
	if err = adminAPI.DisableMaintenanceMode(ctx, int(nodeID)); err == nil {
		r.logger.Info("Maintenance mode disabled for the decommissioned node", "node_id", nodeID)
		return nil
	}

	// A client-side (4xx) status means the cluster says there is nothing to disable.
	var respErr *admin.HTTPResponseError
	if errors.As(err, &respErr) && respErr.Response != nil {
		if code := respErr.Response.StatusCode; code >= 400 && code <= 499 {
			r.logger.Info("No need to disable maintenance mode on the decommissioned node", "node_id", nodeID, "status_code", respErr.Response.StatusCode)
			return nil
		}
	}
	return fmt.Errorf("could not disable maintenance mode on decommissioning node %d: %w", nodeID, err)
}

// verifyRunningCount checks whether the statefulset is configured to run the given number of replicas and whether the number of pods matches it
func (r *StatefulSetResource) verifyRunningCount(
ctx context.Context, replicas int32,
Expand All @@ -281,6 +333,7 @@ func (r *StatefulSetResource) verifyRunningCount(
if err != nil {
return false, fmt.Errorf("could not list pods for checking replicas: %w", err)
}

return len(podList.Items) == int(replicas), nil
}

Expand Down

0 comments on commit 15adb01

Please sign in to comment.