Skip to content

Commit

Permalink
Merge pull request #4125 from nicolaferraro/rolling-restarts
Browse files Browse the repository at this point in the history
Add hooks for rolling restarts and upgrades
  • Loading branch information
nicolaferraro committed Apr 21, 2022
2 parents 7c96fab + f461fc2 commit 7bcb4c7
Show file tree
Hide file tree
Showing 45 changed files with 280 additions and 73 deletions.
42 changes: 40 additions & 2 deletions src/go/k8s/apis/redpanda/v1alpha1/cluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,15 @@ type ClusterSpec struct {
// DNS name.
// http://www.dns-sd.org/trailingdotsindomainnames.html
DNSTrailingDotDisabled bool `json:"dnsTrailingDotDisabled,omitempty"`
// RestartConfig allows to control the behavior of the cluster when restarting
RestartConfig *RestartConfig `json:"restartConfig,omitempty"`
}

// RestartConfig contains strategies to configure how the cluster behaves when
// restarting because of upgrades or other lifecycle events.
//
// Every field is optional: Cluster.IsUsingMaintenanceModeHooks treats a nil
// DisableMaintenanceModeHooks as "hooks enabled".
type RestartConfig struct {
// DisableMaintenanceModeHooks deactivates the preStop and postStart hooks that force nodes to enter maintenance mode when stopping and exit maintenance mode when up again
DisableMaintenanceModeHooks *bool `json:"disableMaintenanceModeHooks,omitempty"`
}

// PDBConfig specifies how the PodDisruptionBudget should be created for the
Expand Down Expand Up @@ -283,9 +292,13 @@ type ClusterStatus struct {
// Nodes of the provisioned redpanda nodes
// +optional
Nodes NodesList `json:"nodes,omitempty"`
// Indicates cluster is upgrading
// Indicates cluster is upgrading.
// +optional
// Deprecated: replaced by "restarting"
DeprecatedUpgrading bool `json:"upgrading"`
// Indicates that a cluster is restarting due to an upgrade or a different reason
// +optional
Upgrading bool `json:"upgrading"`
Restarting bool `json:"restarting"`
// Current version of the cluster.
// +optional
Version string `json:"version"`
Expand Down Expand Up @@ -789,6 +802,31 @@ func (r *Cluster) IsSchemaRegistryMutualTLSEnabled() bool {
r.Spec.Configuration.SchemaRegistry.TLS.RequireClientAuth
}

// IsUsingMaintenanceModeHooks reports whether the cluster is configured to
// install the maintenance mode hooks on its pods.
//
// The hooks are on by default: they are only turned off when the spec carries
// a RestartConfig whose DisableMaintenanceModeHooks is explicitly set to true.
// The result is only relevant when the maintenance mode feature is enabled.
func (r *Cluster) IsUsingMaintenanceModeHooks() bool {
	restartCfg := r.Spec.RestartConfig
	if restartCfg == nil || restartCfg.DisableMaintenanceModeHooks == nil {
		// Nothing was stated explicitly, so fall back to the default.
		return true
	}
	return !*restartCfg.DisableMaintenanceModeHooks
}

// ClusterStatus

// IsRestarting reports whether the cluster is restarting, either because of a
// configuration change or because an upgrade is in progress.
func (s *ClusterStatus) IsRestarting() bool {
	if s.Restarting {
		return true
	}
	// During the transition period the deprecated "upgrading" field is still
	// honored, so a status that only has the old flag set counts as restarting.
	return s.DeprecatedUpgrading
}

// SetRestarting records whether the cluster is restarting.
//
// The deprecated "upgrading" field is always written together with the new
// one, because some external tools may still rely on it.
func (s *ClusterStatus) SetRestarting(restarting bool) {
	s.Restarting, s.DeprecatedUpgrading = restarting, restarting
}

// TLSConfig is a generic TLS configuration
type TLSConfig struct {
Enabled bool `json:"enabled,omitempty"`
Expand Down
25 changes: 25 additions & 0 deletions src/go/k8s/apis/redpanda/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 16 additions & 1 deletion src/go/k8s/config/crd/bases/redpanda.vectorized.io_clusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,16 @@ spec:
to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/'
type: object
type: object
restartConfig:
description: RestartConfig allows to control the behavior of the cluster
when restarting
properties:
disableMaintenanceModeHooks:
description: DisableMaintenanceModeHooks deactivates the preStop
and postStart hooks that force nodes to enter maintenance mode
when stopping and exit maintenance mode when up again
type: boolean
type: object
sidecars:
description: Sidecars is list of sidecars run alongside redpanda container
properties:
Expand Down Expand Up @@ -902,8 +912,13 @@ spec:
description: Replicas show how many nodes are working in the cluster
format: int32
type: integer
restarting:
description: Indicates that a cluster is restarting due to an upgrade
or a different reason
type: boolean
upgrading:
description: Indicates cluster is upgrading
description: 'Indicates cluster is upgrading. Deprecated: replaced
by "restarting"'
type: boolean
version:
description: Current version of the cluster.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,16 +303,16 @@ func (r *ClusterReconciler) synchronizeStatusWithCluster(
conditionData := mapStatusToCondition(status)
conditionChanged := redpandaCluster.Status.SetCondition(conditionData.Type, conditionData.Status, conditionData.Reason, conditionData.Message)
stsNeedsRestart := needsRestart(status)
if conditionChanged || (stsNeedsRestart && !redpandaCluster.Status.Upgrading) {
if conditionChanged || (stsNeedsRestart && !redpandaCluster.Status.IsRestarting()) {
// Trigger restart here if needed
if stsNeedsRestart {
redpandaCluster.Status.Upgrading = true
redpandaCluster.Status.SetRestarting(true)
}
log.Info("Updating configuration state for cluster",
"status", conditionData.Status,
"reason", conditionData.Reason,
"message", conditionData.Message,
"upgrading", redpandaCluster.Status.Upgrading,
"restarting", redpandaCluster.Status.IsRestarting(),
)
if err := r.Status().Update(ctx, redpandaCluster); err != nil {
return nil, errorWithContext(err, "could not update condition on cluster")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const (
// CentralizedConfiguration feature gate should be removed when the operator
// will no longer support 21.x or older versions
func CentralizedConfiguration(version string) bool {
if version == "dev" {
if version == devVersion {
// development version contains this feature
return true
}
Expand Down
14 changes: 14 additions & 0 deletions src/go/k8s/pkg/resources/featuregates/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Copyright 2022 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

package featuregates

const (
// devVersion is the version string used by development builds. Feature
// gates compare against it before attempting to parse the version, and
// report their feature as available when it matches.
devVersion = "dev"
)
32 changes: 32 additions & 0 deletions src/go/k8s/pkg/resources/featuregates/maintenance_mode.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright 2022 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

package featuregates

import "github.com/Masterminds/semver/v3"

const (
// maintenanceModeMajor and maintenanceModeMinor identify the first release
// (22.1) for which MaintenanceMode reports the feature as available.
maintenanceModeMajor = uint64(22)
maintenanceModeMinor = uint64(1)
)

// MaintenanceMode reports whether the given Redpanda version supports the
// maintenance mode feature.
//
// The development version always has the feature. Any other version must parse
// as a semantic version and be at least maintenanceModeMajor.maintenanceModeMinor;
// a version that cannot be parsed is reported as not supporting the feature.
//
// This feature gate should be removed when the operator will no longer support
// 21.x or older versions.
func MaintenanceMode(version string) bool {
	if version == devVersion {
		// development version contains this feature
		return true
	}

	parsed, err := semver.NewVersion(version)
	if err != nil {
		return false
	}

	switch major := parsed.Major(); {
	case major > maintenanceModeMajor:
		// Any later major release has the feature regardless of its minor.
		return true
	case major == maintenanceModeMajor:
		return parsed.Minor() >= maintenanceModeMinor
	default:
		return false
	}
}
83 changes: 83 additions & 0 deletions src/go/k8s/pkg/resources/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ var (
ConfigMapHashAnnotationKey = redpandav1alpha1.GroupVersion.Group + "/configmap-hash"
// CentralizedConfigurationHashAnnotationKey contains the hash of the centralized configuration properties that require a restart when changed
CentralizedConfigurationHashAnnotationKey = redpandav1alpha1.GroupVersion.Group + "/centralized-configuration-hash"

// terminationGracePeriodSeconds should account for additional delay introduced by hooks
terminationGracePeriodSeconds int64 = 120
)

// ConfiguratorSettings holds settings related to configurator container and deployment
Expand Down Expand Up @@ -345,6 +348,7 @@ func (r *StatefulSetResource) obj(
},
},
}, r.secretVolumes()...),
TerminationGracePeriodSeconds: &terminationGracePeriodSeconds,
InitContainers: []corev1.Container{
{
Name: configuratorContainerName,
Expand Down Expand Up @@ -512,6 +516,15 @@ func (r *StatefulSetResource) obj(
},
}

// Only multi-replica clusters should use maintenance mode. See: https://github.com/redpanda-data/redpanda/issues/4338
multiReplica := r.pandaCluster.Spec.Replicas != nil && *r.pandaCluster.Spec.Replicas > 1
if featuregates.MaintenanceMode(r.pandaCluster.Spec.Version) && r.pandaCluster.IsUsingMaintenanceModeHooks() && multiReplica {
ss.Spec.Template.Spec.Containers[0].Lifecycle = &corev1.Lifecycle{
PreStop: r.getPreStopHook(),
PostStart: r.getPostStartHook(),
}
}

if featuregates.CentralizedConfiguration(r.pandaCluster.Spec.Version) {
ss.Spec.Template.Spec.Containers[0].VolumeMounts = append(ss.Spec.Template.Spec.Containers[0].VolumeMounts, corev1.VolumeMount{
Name: "configmap-dir",
Expand All @@ -535,6 +548,76 @@ func (r *StatefulSetResource) obj(
return ss, nil
}

// getPreStopHook creates a hook that drains the node before shutting down.
//
// The hook runs a bash script made of two polling loops chained with "&&":
// the first repeats a PUT on the broker's maintenance endpoint until it
// answers 200, the second polls /v1/maintenance until the response reports
// "finished" as true.
func (r *StatefulSetResource) getPreStopHook() *corev1.Handler {
	// TODO replace scripts with proper RPK calls
	enableMaintenance := r.composeCURLMaintenanceCommand(`-X PUT --silent -o /dev/null -w "%{http_code}"`, nil)
	maintenanceStatusPath := "/v1/maintenance"
	queryMaintenance := r.composeCURLMaintenanceCommand(`--silent`, &maintenanceStatusPath)

	steps := []string{
		fmt.Sprintf(`until [ "${status:-}" = "200" ]; do status=$(%s); sleep 0.5; done`, enableMaintenance),
		fmt.Sprintf(`until [ "${finished:-}" = "true" ]; do finished=$(%s | grep -o '\"finished\":[^,}]*' | grep -o '[^: ]*$'); sleep 0.5; done`, queryMaintenance),
	}
	script := strings.Join(steps, " && ")

	action := &corev1.ExecAction{
		Command: []string{"/bin/bash", "-c", script},
	}
	return &corev1.Handler{Exec: action}
}

// getPostStartHook creates a hook that removes maintenance mode after startup.
//
// The hook runs a bash loop that repeats a DELETE on the broker's maintenance
// endpoint until the admin API answers with 200 or 400.
func (r *StatefulSetResource) getPostStartHook() *corev1.Handler {
	// TODO replace scripts with proper RPK calls
	disableMaintenance := r.composeCURLMaintenanceCommand(`-X DELETE --silent -o /dev/null -w "%{http_code}"`, nil)

	// HTTP code 400 is returned by v22 nodes during an upgrade from v21 until
	// the new version reaches quorum and the maintenance mode feature is
	// enabled, so it is accepted as a terminal status too.
	script := fmt.Sprintf(`until [ "${status:-}" = "200" ] || [ "${status:-}" = "400" ]; do status=$(%s); sleep 0.5; done`, disableMaintenance)

	action := &corev1.ExecAction{
		Command: []string{"/bin/bash", "-c", script},
	}
	return &corev1.Handler{Exec: action}
}

// composeCURLMaintenanceCommand builds the curl command line used by the
// lifecycle hooks to talk to the internal admin API of the local pod.
//
// options is inserted verbatim right after "curl". When urlOverwrite is nil
// the request targets the maintenance endpoint of the local broker; otherwise
// the pointed-to path is appended to the base URL as is.
//
// The result still contains ${POD_NAME} references, which are left for the
// shell running the hook to expand.
// nolint:goconst // no need
func (r *StatefulSetResource) composeCURLMaintenanceCommand(
	options string, urlOverwrite *string,
) string {
	adminAPI := r.pandaCluster.AdminAPIInternal()

	// Pick the scheme and the certificate flags from the admin API TLS settings.
	scheme, tlsFlags := "http", ""
	if tlsConfig := adminAPI.GetTLS(); tlsConfig != nil && tlsConfig.Enabled {
		scheme = "https"
		tlsFlags = "--cacert /etc/tls/certs/admin/tls.crt "
		if tlsConfig.RequireClientAuth {
			tlsFlags = "--cacert /etc/tls/certs/admin/ca/ca.crt --cert /etc/tls/certs/admin/tls.crt --key /etc/tls/certs/admin/tls.key "
		}
	}

	var path string
	if urlOverwrite != nil {
		path = *urlOverwrite
	} else {
		// Skip the cluster name plus one separator character in POD_NAME;
		// presumably what remains is the pod ordinal, used as the broker ID
		// (verify against the pod naming scheme).
		path = fmt.Sprintf("/v1/brokers/${POD_NAME:%d}/maintenance", len(r.pandaCluster.Name)+1)
	}

	baseURL := fmt.Sprintf("%s://${POD_NAME}.%s.%s.svc.cluster.local:%d", scheme, r.pandaCluster.Name, r.pandaCluster.Namespace, adminAPI.Port)

	return fmt.Sprintf(`curl %s `, options) + tlsFlags + baseURL + path
}

// setCloudStorage manipulates v1.StatefulSet object in order to add cloud storage specific
// properties to Redpanda pod.
func setCloudStorage(
Expand Down
Loading

0 comments on commit 7bcb4c7

Please sign in to comment.