From 49302f5752b6a698f15953c7d6932a191a84d0a2 Mon Sep 17 00:00:00 2001
From: nicolaferraro
Date: Thu, 9 Jun 2022 16:41:08 +0200
Subject: [PATCH] operator: add documentation to the scale handler

---
 src/go/k8s/pkg/resources/statefulset_scale.go | 57 ++++++++++++++++++-
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/src/go/k8s/pkg/resources/statefulset_scale.go b/src/go/k8s/pkg/resources/statefulset_scale.go
index 7b059be1b16a1..0f62b324f2ed7 100644
--- a/src/go/k8s/pkg/resources/statefulset_scale.go
+++ b/src/go/k8s/pkg/resources/statefulset_scale.go
@@ -23,7 +23,40 @@ import (
 	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-// handleScaling is called to detect cases of replicas change and apply them to the cluster
+// handleScaling is responsible for managing the current number of replicas running for a cluster.
+//
+// Replicas are controlled via the field `status.currentReplicas`, which is set by this method and should be
+// respected by all other external reconcile functions, including the one that sets the actual number of replicas in the StatefulSet.
+// External functions should use `cluster.getCurrentReplicas()` to get the number of expected replicas for a cluster,
+// since it deals with cases where the status is not initialized yet.
+//
+// When users change the value of `spec.replicas` for a cluster, this function progressively changes `status.currentReplicas` to match that value.
+// When a cluster is downscaled by one replica (i.e. `spec.replicas` is lower than the current value of `status.currentReplicas` by `1`), before lowering the
+// value of `status.currentReplicas`, this handler first decommissions the last node, using the admin API of the cluster.
+//
+// There are cases where the cluster will not decommission a node (e.g. the user reduces `spec.replicas` to `2`, but some topics have `3` partition replicas),
+// in which case the draining phase hangs indefinitely in Redpanda. When this happens, the controller will not downscale the cluster and
+// users will see in `status.decommissioningNode` that a node is constantly being decommissioned.
+//
+// When the decommissioning process hangs, users can increase the value of `spec.replicas` again and the handler will contact the admin API
+// to recommission the last node, ensuring that the number of replicas matches the expected value and that the node joins the cluster again.
+//
+// Users can change the value of `spec.replicas` freely (except it cannot be `0`). When downscaling by multiple replicas, the handler
+// decommissions one node at a time, until the desired number of replicas is met.
+//
+// When a new cluster is created, the value of `status.currentReplicas` is initially set to `1`, no matter what the user sets in `spec.replicas`.
+// This handler first ensures that the initial node forms a cluster, by retrieving the list of brokers using the admin API.
+// After the cluster is formed, the handler increases `status.currentReplicas` as requested.
+//
+// This is needed because Redpanda is currently unable to initialize a cluster if each node is given the full list of seed servers: https://github.com/redpanda-data/redpanda/issues/333.
+// Previous versions of the operator used to hack the list of seed servers (in the configurator pod) of the node
+// with ordinal 0, always setting it to an empty list, allowing that node to create the initial cluster.
+// That strategy worked because the list of seed servers is not read again from the `redpanda.yaml` file once the cluster is initialized.
+// The drawback was that, on an existing cluster, when node 0 lost its data directory (e.g. because it used local storage and the underlying k8s node was upgraded),
+// node 0 (having no data and an empty seed server list in `redpanda.yaml`) would create a brand new cluster, ignoring the other nodes (split-brain).
+// The strategy implemented here (initialize the cluster with `1` replica, then upscale to the desired number, without touching the seed server list)
+// fixes this problem, since the list of seed servers is the same on all nodes once the cluster is created.
+//
 // nolint:nestif // for clarity
 func (r *StatefulSetResource) handleScaling(ctx context.Context) error {
 	if r.pandaCluster.Status.DecommissioningNode != nil {
@@ -76,7 +109,16 @@ func (r *StatefulSetResource) handleScaling(ctx context.Context) error {
 	return r.Status().Update(ctx, r.pandaCluster)
 }
 
-// handleDecommission manages cases of node decommissioning
+// handleDecommission manages the decommissioning of the last node of a cluster.
+//
+// When this handler is called, `status.decommissioningNode` is populated with the pod ordinal (== node ID) of the
+// node that needs to be decommissioned.
+//
+// The handler verifies that the node is no longer present in the list of brokers registered in the cluster, via the admin API,
+// then downscales the StatefulSet by decreasing `status.currentReplicas`.
+//
+// Before completing the process, it double-checks that the node is still not registered, to handle cases where the node was
+// about to start when the decommissioning process began. If the broker is found, the process is restarted.
 func (r *StatefulSetResource) handleDecommission(ctx context.Context) error {
 	targetReplicas := *r.pandaCluster.Status.DecommissioningNode
 	r.logger.Info("Handling cluster in decommissioning phase", "target replicas", targetReplicas)
@@ -151,7 +193,15 @@ func (r *StatefulSetResource) handleDecommission(ctx context.Context) error {
 	return r.Status().Update(ctx, r.pandaCluster)
 }
 
-// handleRecommission manages cases of nodes being recommissioned after a failed/wrong decommission
+// handleRecommission manages the case of a node being recommissioned after a failed/wrong decommission.
+//
+// Recommissioning can only work for nodes that are still in the "draining" phase according to Redpanda.
+//
+// When this handler is triggered, `status.decommissioningNode` is populated with the node that was being decommissioned and
+// `spec.replicas` reports a value that includes that node, indicating the user's intention to recommission it.
+//
+// The handler ensures that the node is running and calls the admin API to recommission it.
+// The process finishes when the node is registered among the brokers and the StatefulSet is correctly scaled.
 func (r *StatefulSetResource) handleRecommission(ctx context.Context) error {
 	r.logger.Info("Handling cluster in recommissioning phase")
 
@@ -278,6 +328,7 @@ type NodeReappearingError struct {
 	NodeID int
 }
 
+// Error implements the error interface for NodeReappearingError.
 func (e *NodeReappearingError) Error() string {
 	return fmt.Sprintf("node has appeared in the cluster with id=%d", e.NodeID)
 }
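
The progressive scaling contract documented on `handleScaling` can be summarized with a minimal, self-contained sketch. The `Cluster` and `ClusterStatus` types and the `step` function below are illustrative stand-ins, not the operator's actual code; only the `getCurrentReplicas()` fallback to `1` and the one-node-at-a-time convergence come from the doc comment above.

package main

import "fmt"

// Illustrative stand-ins for the operator's types; names are hypothetical.
type ClusterStatus struct {
	CurrentReplicas     int32
	DecommissioningNode *int32
}

type Cluster struct {
	SpecReplicas int32
	Status       ClusterStatus
}

// getCurrentReplicas mirrors the documented contract: external reconcile
// functions rely on it because the status may not be initialized yet.
func (c *Cluster) getCurrentReplicas() int32 {
	if c.Status.CurrentReplicas <= 0 {
		// New cluster: start from a single replica so the initial node
		// can form the cluster before any upscaling happens.
		return 1
	}
	return c.Status.CurrentReplicas
}

// step performs one reconcile iteration of the progressive scaling logic:
// it moves currentReplicas one node at a time toward spec.replicas and
// marks the last node for decommissioning when downscaling.
func step(c *Cluster) {
	current := c.getCurrentReplicas()
	switch {
	case c.SpecReplicas > current:
		// Upscaling: add one node per iteration.
		c.Status.CurrentReplicas = current + 1
	case c.SpecReplicas < current:
		// Downscaling: mark the last node (ordinal == current-1); the
		// real handler lowers currentReplicas only after the admin API
		// confirms the node has been drained.
		target := current - 1
		c.Status.DecommissioningNode = &target
	}
}

func main() {
	c := &Cluster{SpecReplicas: 3}
	for i := 0; i < 3; i++ {
		step(c)
		fmt.Printf("iteration %d: currentReplicas=%d\n", i, c.getCurrentReplicas())
	}
}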
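The verification step in `handleDecommission` (node absent from the broker list before lowering `status.currentReplicas`, then a final re-check) can be sketched as follows. The `Broker` type and `AdminAPI` interface are hypothetical simplifications, not the operator's real admin client.

package main

import (
	"context"
	"fmt"
)

// Hypothetical simplifications of the operator's admin API client.
type Broker struct{ NodeID int32 }

type AdminAPI interface {
	Brokers(ctx context.Context) ([]Broker, error)
}

// brokerAbsent reports whether the given node is no longer registered in
// the cluster, mirroring the check handleDecommission performs both before
// downscaling the StatefulSet and again before completing the process.
func brokerAbsent(ctx context.Context, api AdminAPI, nodeID int32) (bool, error) {
	brokers, err := api.Brokers(ctx)
	if err != nil {
		return false, fmt.Errorf("listing brokers: %w", err)
	}
	for _, b := range brokers {
		if b.NodeID == nodeID {
			return false, nil
		}
	}
	return true, nil
}

// fakeAPI is a canned broker list for demonstration purposes.
type fakeAPI struct{ brokers []Broker }

func (f fakeAPI) Brokers(ctx context.Context) ([]Broker, error) { return f.brokers, nil }

func main() {
	api := fakeAPI{brokers: []Broker{{NodeID: 0}, {NodeID: 1}}}
	gone, err := brokerAbsent(context.Background(), api, 2)
	fmt.Println(gone, err) // true <nil>: node 2 has left, safe to downscale
}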
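The trigger condition for `handleRecommission` (a decommission in flight, plus `spec.replicas` raised back to a value that includes the draining node) reduces to a small predicate. This sketch assumes, per the doc comments, that `status.decommissioningNode` holds the target replica count, which equals the ordinal of the draining node; the function name is illustrative.

package main

import "fmt"

// shouldRecommission reflects when the operator switches from
// decommissioning to recommissioning: a decommission is in flight and
// spec.replicas again includes the draining node. Assumes
// decommissioningNode stores the target replica count (== ordinal of the
// node being removed), as described above.
func shouldRecommission(specReplicas int32, decommissioningNode *int32) bool {
	return decommissioningNode != nil && specReplicas > *decommissioningNode
}

func main() {
	target := int32(2) // node 2 is draining; target replica count is 2
	fmt.Println(shouldRecommission(2, &target)) // false: downscale proceeds
	fmt.Println(shouldRecommission(3, &target)) // true: user restored spec.replicas
}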
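Since `NodeReappearingError` implements the error interface, callers can detect the re-appearing node condition with `errors.As` and restart the decommissioning process, as the doc comment describes. A minimal usage sketch; the wrapping call in `main` stands in for whatever the real handler returns.

package main

import (
	"errors"
	"fmt"
)

// NodeReappearingError and its Error method are as defined in the patch.
type NodeReappearingError struct {
	NodeID int
}

// Error implements the error interface for NodeReappearingError.
func (e *NodeReappearingError) Error() string {
	return fmt.Sprintf("node has appeared in the cluster with id=%d", e.NodeID)
}

func main() {
	// A wrapped error, as a caller might receive it from the handler.
	err := fmt.Errorf("decommission failed: %w", &NodeReappearingError{NodeID: 2})

	// errors.As unwraps the chain and recovers the node ID, so the caller
	// can restart the decommissioning process for that node.
	var reappeared *NodeReappearingError
	if errors.As(err, &reappeared) {
		fmt.Printf("node %d reappeared, restarting decommission\n", reappeared.NodeID)
	}
}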