Skip to content

Commit

Permalink
k8s: Put brokers in maintenance mode before deleting orphan pod
Browse files Browse the repository at this point in the history
Before this change, during a rolling update the Redpanda operator calculated
the difference between the running pod specification and the stateful set pod
template. If the specification did not match, the pod was deleted. Since release
v22.1.1 the operator configures each broker with pod lifecycle hooks. In the
PreStop hook, the script tries for 120 seconds to put the broker into
maintenance mode before the pod is terminated. Redpanda could not finish putting
one broker into maintenance mode within those 120 seconds.

This PR improves the situation by putting the broker into maintenance mode
before the pod is deleted. The `EnableMaintenanceMode` function is called
repeatedly until the `Broker` function returns the correct status. The
assumption is that the REST admin API maintenance mode endpoint is idempotent.

When the pod is successfully deleted, the statefulset reschedules the pod with
the correct pod specification.

redpanda-data#4125
redpanda-data#3023
  • Loading branch information
Rafal Korepta committed Nov 29, 2022
1 parent 7beaf40 commit 11470aa
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/go/k8s/pkg/admin/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ type AdminAPIClient interface {
GetLicenseInfo(ctx context.Context) (admin.License, error)

Brokers(ctx context.Context) ([]admin.Broker, error)
Broker(ctx context.Context, nodeID int) (admin.Broker, error)
DecommissionBroker(ctx context.Context, node int) error
RecommissionBroker(ctx context.Context, node int) error

Expand Down
20 changes: 20 additions & 0 deletions src/go/k8s/pkg/admin/mock_admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,26 @@ func (m *MockAdminAPI) SetBrokerStatus(
return fmt.Errorf("unknown broker %d", id)
}

// Broker returns a canned description of the broker identified by nodeID.
// The context is ignored and the error is always nil. The reported broker is
// alive and carries a maintenance status that is finished, not draining, and
// free of errors or failed partitions.
func (m *MockAdminAPI) Broker(_ context.Context, nodeID int) (admin.Broker, error) {
	alive := true

	// Maintenance status of a broker whose drain has completed cleanly.
	maintenance := admin.MaintenanceStatus{
		Draining:     false,
		Finished:     true,
		Errors:       false,
		Partitions:   0,
		Eligible:     0,
		Transferring: 0,
		Failed:       0,
	}

	broker := admin.Broker{
		NodeID:           nodeID,
		NumCores:         2,
		MembershipStatus: "",
		IsAlive:          &alive,
		Version:          "unversioned",
		Maintenance:      &maintenance,
	}
	return broker, nil
}

func makeCopy(input, output interface{}) {
ser, err := json.Marshal(input)
if err != nil {
Expand Down
49 changes: 49 additions & 0 deletions src/go/k8s/pkg/resources/statefulset_update.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"net/url"
"reflect"
"sort"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -173,6 +174,22 @@ func (r *StatefulSetResource) rollingUpdate(
}

if !patchResult.IsEmpty() {
var ordinal int64
ordinal, err = strconv.ParseInt(pod.Name[len(r.pandaCluster.Name)+1:], 10, 32)
if err != nil {
return fmt.Errorf("cluster %s: cannot convert pod name (%s) to ordinal: %w", r.pandaCluster.Name, pod.Name, err)
}

if err = r.putInMaintenanceMode(ctx, int32(ordinal)); err != nil {
// Maintenance mode cannot easily be watched using controller runtime, so a requeue error
// is always returned here. That way the rolling update does not proceed while the operator
// is still waiting for maintenance mode to finish.
return &RequeueAfterError{
RequeueAfter: RequeueDuration,
Msg: fmt.Sprintf("putting node (%s) into maintenance mode: %v", pod.Name, err),
}
}

r.logger.Info("Changes in Pod definition other than activeDeadlineSeconds, configurator and Redpanda container name. Deleting pod",
"pod-name", pod.Name,
"patch", patchResult.Patch)
Expand Down Expand Up @@ -207,6 +224,38 @@ func (r *StatefulSetResource) rollingUpdate(
return nil
}

// putInMaintenanceMode asks the broker running in the pod with the given
// ordinal to enter maintenance mode and then reads the broker status back
// through the admin API.
//
// It returns nil only when the broker reports a maintenance status that is
// finished, no longer draining, and free of errors and failed partitions.
// In every other case (admin API failure, missing maintenance status, drain
// still in progress, drain errors) an error is returned, so the caller does
// not delete the pod yet and can retry later. The function is meant to be
// called repeatedly; this presumes that enabling maintenance mode on a broker
// that is already in maintenance mode is harmless — TODO confirm against the
// admin API contract.
//
//nolint:goerr113 // out of scope for this PR
func (r *StatefulSetResource) putInMaintenanceMode(ctx context.Context, ordinal int32) error {
	adminAPIClient, err := r.getAdminAPIClient(ctx, ordinal)
	if err != nil {
		return fmt.Errorf("creating admin API client: %w", err)
	}

	// The node ID used by the admin API is read from the node itself rather
	// than derived from the pod ordinal.
	nodeConf, err := adminAPIClient.GetNodeConfig(ctx)
	if err != nil {
		return fmt.Errorf("getting node config: %w", err)
	}

	if err = adminAPIClient.EnableMaintenanceMode(ctx, nodeConf.NodeID); err != nil {
		return fmt.Errorf("enabling maintenance mode: %w", err)
	}

	br, err := adminAPIClient.Broker(ctx, nodeConf.NodeID)
	if err != nil {
		return fmt.Errorf("getting broker information: %w", err)
	}

	// The broker is safe to restart only once the drain has completed cleanly.
	// A nil status is treated as "not ready" because nothing confirms that the
	// drain happened.
	m := br.Maintenance
	drained := m != nil &&
		m.Finished &&
		!m.Draining &&
		!m.Errors &&
		m.Failed == 0
	if !drained {
		return fmt.Errorf("maintenance mode is not valid to do rolling update: %v", br.Maintenance)
	}

	return nil
}

func (r *StatefulSetResource) updateStatefulSet(
ctx context.Context,
current *appsv1.StatefulSet,
Expand Down

0 comments on commit 11470aa

Please sign in to comment.