Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hooks for rolling restarts and upgrades #4125

Merged
merged 9 commits into from
Apr 21, 2022
42 changes: 40 additions & 2 deletions src/go/k8s/apis/redpanda/v1alpha1/cluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,15 @@ type ClusterSpec struct {
// DNS name.
// http://www.dns-sd.org/trailingdotsindomainnames.html
DNSTrailingDotDisabled bool `json:"dnsTrailingDotDisabled,omitempty"`
// RestartConfig allows to control the behavior of the cluster when restarting
RestartConfig *RestartConfig `json:"restartConfig,omitempty"`
}

// RestartConfig contains strategies to configure how the cluster behaves when restarting, because of upgrades
// or other lifecycle events.
//
// Every field is optional: leaving DisableMaintenanceModeHooks unset (nil) is
// treated by Cluster.IsUsingMaintenanceModeHooks the same as setting it to
// false, i.e. the hooks stay enabled.
type RestartConfig struct {
	// DisableMaintenanceModeHooks deactivates the preStop and postStart hooks that force nodes to enter maintenance mode when stopping and exit maintenance mode when up again
	DisableMaintenanceModeHooks *bool `json:"disableMaintenanceModeHooks,omitempty"`
}

// PDBConfig specifies how the PodDisruptionBudget should be created for the
Expand Down Expand Up @@ -283,9 +292,13 @@ type ClusterStatus struct {
// Nodes of the provisioned redpanda nodes
// +optional
Nodes NodesList `json:"nodes,omitempty"`
// Indicates cluster is upgrading
// Indicates cluster is upgrading.
// +optional
// Deprecated: replaced by "restarting"
DeprecatedUpgrading bool `json:"upgrading"`
// Indicates that a cluster is restarting due to an upgrade or a different reason
// +optional
Upgrading bool `json:"upgrading"`
Restarting bool `json:"restarting"`
// Current version of the cluster.
// +optional
Version string `json:"version"`
Expand Down Expand Up @@ -789,6 +802,31 @@ func (r *Cluster) IsSchemaRegistryMutualTLSEnabled() bool {
r.Spec.Configuration.SchemaRegistry.TLS.RequireClientAuth
}

// IsUsingMaintenanceModeHooks reports whether the pods of this cluster should
// be given the maintenance mode lifecycle hooks.
//
// The hooks are opt-out: the result is true unless
// Spec.RestartConfig.DisableMaintenanceModeHooks is explicitly set to true.
// The answer only matters when the maintenance mode feature itself is enabled.
func (r *Cluster) IsUsingMaintenanceModeHooks() bool {
	restartCfg := r.Spec.RestartConfig
	if restartCfg == nil || restartCfg.DisableMaintenanceModeHooks == nil {
		// nothing was stated explicitly, so fall back to the default
		return true
	}
	return !*restartCfg.DisableMaintenanceModeHooks
}

// ClusterStatus

// IsRestarting reports whether the cluster is currently restarting, be it
// because of a configuration change or an upgrade in progress.
func (s *ClusterStatus) IsRestarting() bool {
	if s.Restarting {
		return true
	}
	// During the transition period the deprecated field is still honored.
	return s.DeprecatedUpgrading
}

// SetRestarting records whether the cluster is restarting.
//
// The deprecated upgrading field is written together with the new one,
// because some external tools may still rely on it.
func (s *ClusterStatus) SetRestarting(restarting bool) {
	s.Restarting, s.DeprecatedUpgrading = restarting, restarting
}

// TLSConfig is a generic TLS configuration
type TLSConfig struct {
Enabled bool `json:"enabled,omitempty"`
Expand Down
25 changes: 25 additions & 0 deletions src/go/k8s/apis/redpanda/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,16 @@ spec:
to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/'
type: object
type: object
restartConfig:
description: RestartConfig allows to control the behavior of the cluster
when restarting
properties:
disableMaintenanceModeHooks:
description: DisableMaintenanceModeHooks deactivates the preStop
and postStart hooks that force nodes to enter maintenance mode
when stopping and exit maintenance mode when up again
type: boolean
type: object
sidecars:
description: Sidecars is list of sidecars run alongside redpanda container
properties:
Expand Down Expand Up @@ -902,8 +912,13 @@ spec:
description: Replicas show how many nodes are working in the cluster
format: int32
type: integer
restarting:
description: Indicates that a cluster is restarting due to an upgrade
or a different reason
type: boolean
upgrading:
description: Indicates cluster is upgrading
description: 'Indicates cluster is upgrading. Deprecated: replaced
by "restarting"'
type: boolean
version:
description: Current version of the cluster.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,16 +303,16 @@ func (r *ClusterReconciler) synchronizeStatusWithCluster(
conditionData := mapStatusToCondition(status)
conditionChanged := redpandaCluster.Status.SetCondition(conditionData.Type, conditionData.Status, conditionData.Reason, conditionData.Message)
stsNeedsRestart := needsRestart(status)
if conditionChanged || (stsNeedsRestart && !redpandaCluster.Status.Upgrading) {
if conditionChanged || (stsNeedsRestart && !redpandaCluster.Status.IsRestarting()) {
// Trigger restart here if needed
if stsNeedsRestart {
redpandaCluster.Status.Upgrading = true
redpandaCluster.Status.SetRestarting(true)
}
log.Info("Updating configuration state for cluster",
"status", conditionData.Status,
"reason", conditionData.Reason,
"message", conditionData.Message,
"upgrading", redpandaCluster.Status.Upgrading,
"restarting", redpandaCluster.Status.IsRestarting(),
)
if err := r.Status().Update(ctx, redpandaCluster); err != nil {
return nil, errorWithContext(err, "could not update condition on cluster")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const (
// CentralizedConfiguration feature gate should be removed when the operator
// will no longer support 21.x or older versions
func CentralizedConfiguration(version string) bool {
if version == "dev" {
if version == devVersion {
// development version contains this feature
return true
}
Expand Down
14 changes: 14 additions & 0 deletions src/go/k8s/pkg/resources/featuregates/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Copyright 2022 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

package featuregates

const (
	// devVersion is the version string of a development build. Feature gates
	// compare against it before attempting to parse the version.
	devVersion = "dev"
)
32 changes: 32 additions & 0 deletions src/go/k8s/pkg/resources/featuregates/maintenance_mode.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright 2022 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

package featuregates

import "github.com/Masterminds/semver/v3"

const (
	// maintenanceModeMajor and maintenanceModeMinor are the major and minor
	// numbers of the first release for which MaintenanceMode reports true.
	// They are typed uint64 to match the values compared against them.
	maintenanceModeMajor = uint64(22)
	maintenanceModeMinor = uint64(1)
)

// MaintenanceMode reports whether the given version includes the maintenance
// mode feature: either it is the development version, or it parses as a
// semantic version that is at least
// maintenanceModeMajor.maintenanceModeMinor. A version string that cannot be
// parsed yields false.
//
// This feature gate should be removed when the operator will no longer
// support 21.x or older versions.
func MaintenanceMode(version string) bool {
	if version == devVersion {
		// development version contains this feature
		return true
	}

	parsed, err := semver.NewVersion(version)
	if err != nil {
		return false
	}

	switch major := parsed.Major(); {
	case major > maintenanceModeMajor:
		return true
	case major == maintenanceModeMajor:
		return parsed.Minor() >= maintenanceModeMinor
	default:
		return false
	}
}
83 changes: 83 additions & 0 deletions src/go/k8s/pkg/resources/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ var (
ConfigMapHashAnnotationKey = redpandav1alpha1.GroupVersion.Group + "/configmap-hash"
// CentralizedConfigurationHashAnnotationKey contains the hash of the centralized configuration properties that require a restart when changed
CentralizedConfigurationHashAnnotationKey = redpandav1alpha1.GroupVersion.Group + "/centralized-configuration-hash"

// terminationGracePeriodSeconds should account for additional delay introduced by hooks
terminationGracePeriodSeconds int64 = 120
)

// ConfiguratorSettings holds settings related to configurator container and deployment
Expand Down Expand Up @@ -345,6 +348,7 @@ func (r *StatefulSetResource) obj(
},
},
}, r.secretVolumes()...),
TerminationGracePeriodSeconds: &terminationGracePeriodSeconds,
InitContainers: []corev1.Container{
{
Name: configuratorContainerName,
Expand Down Expand Up @@ -512,6 +516,15 @@ func (r *StatefulSetResource) obj(
},
}

// Only multi-replica clusters should use maintenance mode. See: https://github.com/redpanda-data/redpanda/issues/4338
multiReplica := r.pandaCluster.Spec.Replicas != nil && *r.pandaCluster.Spec.Replicas > 1
if featuregates.MaintenanceMode(r.pandaCluster.Spec.Version) && r.pandaCluster.IsUsingMaintenanceModeHooks() && multiReplica {
ss.Spec.Template.Spec.Containers[0].Lifecycle = &corev1.Lifecycle{
PreStop: r.getPreStopHook(),
PostStart: r.getPostStartHook(),
}
}

if featuregates.CentralizedConfiguration(r.pandaCluster.Spec.Version) {
ss.Spec.Template.Spec.Containers[0].VolumeMounts = append(ss.Spec.Template.Spec.Containers[0].VolumeMounts, corev1.VolumeMount{
Name: "configmap-dir",
Expand All @@ -535,6 +548,76 @@ func (r *StatefulSetResource) obj(
return ss, nil
}

// getPreStopHook creates a hook that drains the node before shutting down.
//
// The handler runs a bash script made of two polling loops joined by "&&":
// first a PUT on the broker's maintenance endpoint is repeated every 0.5s
// until it answers 200, then /v1/maintenance is polled every 0.5s until the
// "finished" field extracted from its response is "true".
func (r *StatefulSetResource) getPreStopHook() *corev1.Handler {
	// TODO replace scripts with proper RPK calls
	enableCmd := r.composeCURLMaintenanceCommand(`-X PUT --silent -o /dev/null -w "%{http_code}"`, nil)
	statusPath := "/v1/maintenance"
	statusCmd := r.composeCURLMaintenanceCommand(`--silent`, &statusPath)

	steps := []string{
		// step 1: ask for maintenance mode until the request is accepted
		fmt.Sprintf(`until [ "${status:-}" = "200" ]; do status=$(%s); sleep 0.5; done`, enableCmd),
		// step 2: wait until the reported status says the operation finished
		fmt.Sprintf(`until [ "${finished:-}" = "true" ]; do finished=$(%s | grep -o '\"finished\":[^,}]*' | grep -o '[^: ]*$'); sleep 0.5; done`, statusCmd),
	}
	script := strings.Join(steps, " && ")

	hook := &corev1.Handler{
		Exec: &corev1.ExecAction{
			Command: []string{"/bin/bash", "-c", script},
		},
	}
	return hook
}

// getPostStartHook creates a hook that removes maintenance mode after startup.
//
// The handler runs a bash loop that repeats a DELETE on the broker's
// maintenance endpoint every 0.5s until the HTTP status is 200 or 400.
func (r *StatefulSetResource) getPostStartHook() *corev1.Handler {
	// TODO replace scripts with proper RPK calls
	disableCmd := r.composeCURLMaintenanceCommand(`-X DELETE --silent -o /dev/null -w "%{http_code}"`, nil)

	// HTTP code 400 is returned by v22 nodes during an upgrade from v21 until
	// the new version reaches quorum and the maintenance mode feature is
	// enabled, so it is accepted as a terminal status too.
	script := fmt.Sprintf(`until [ "${status:-}" = "200" ] || [ "${status:-}" = "400" ]; do status=$(%s); sleep 0.5; done`, disableCmd)

	hook := &corev1.Handler{
		Exec: &corev1.ExecAction{
			Command: []string{"/bin/bash", "-c", script},
		},
	}
	return hook
}

// nolint:goconst // no need
func (r *StatefulSetResource) composeCURLMaintenanceCommand(
options string, urlOverwrite *string,
) string {
adminAPI := r.pandaCluster.AdminAPIInternal()

cmd := fmt.Sprintf(`curl %s `, options)

tlsConfig := adminAPI.GetTLS()
proto := "http"
if tlsConfig != nil && tlsConfig.Enabled {
proto = "https"
if tlsConfig.RequireClientAuth {
cmd += "--cacert /etc/tls/certs/admin/ca/ca.crt --cert /etc/tls/certs/admin/tls.crt --key /etc/tls/certs/admin/tls.key "
} else {
cmd += "--cacert /etc/tls/certs/admin/tls.crt "
RafalKorepta marked this conversation as resolved.
Show resolved Hide resolved
}
}
cmd += fmt.Sprintf("%s://${POD_NAME}.%s.%s.svc.cluster.local:%d", proto, r.pandaCluster.Name, r.pandaCluster.Namespace, adminAPI.Port)

if urlOverwrite == nil {
prefixLen := len(r.pandaCluster.Name) + 1
cmd += fmt.Sprintf("/v1/brokers/${POD_NAME:%d}/maintenance", prefixLen)
RafalKorepta marked this conversation as resolved.
Show resolved Hide resolved
} else {
cmd += *urlOverwrite
}
return cmd
}

// setCloudStorage manipulates v1.StatefulSet object in order to add cloud storage specific
// properties to Redpanda pod.
func setCloudStorage(
Expand Down
Loading