Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 topology controller should avoid unnecessary rollouts during upgrades #8628

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/v1beta1/condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,11 @@ const (
// not yet completed because Control Plane is not yet updated to match the desired topology spec.
TopologyReconciledControlPlaneUpgradePendingReason = "ControlPlaneUpgradePending"

// TopologyReconciledMachineDeploymentsCreatePendingReason (Severity=Info) documents reconciliation of a Cluster topology
// not yet completed because at least one of the MachineDeployments is yet to be created.
// This generally happens because new MachineDeployment creations are held off while the ControlPlane is not stable.
TopologyReconciledMachineDeploymentsCreatePendingReason = "MachineDeploymentsCreatePending"

// TopologyReconciledMachineDeploymentsUpgradePendingReason (Severity=Info) documents reconciliation of a Cluster topology
// not yet completed because at least one of the MachineDeployments is not yet updated to match the desired topology spec.
TopologyReconciledMachineDeploymentsUpgradePendingReason = "MachineDeploymentsUpgradePending"
Expand Down
46 changes: 30 additions & 16 deletions internal/controllers/topology/cluster/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
)
return nil
}

// If an error occurred during reconciliation set the TopologyReconciled condition to false.
// Add the error message from the reconcile function to the message of the condition.
if reconcileErr != nil {
Expand Down Expand Up @@ -108,25 +109,37 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
// * either the Control Plane or any of the MachineDeployments are still pending to pick up the new version
// (generally happens when upgrading the cluster)
// * when there are MachineDeployments for which the upgrade has been deferred
if s.UpgradeTracker.ControlPlane.PendingUpgrade ||
s.UpgradeTracker.MachineDeployments.PendingUpgrade() ||
// * when new MachineDeployments are pending to be created
// (generally happens when upgrading the cluster)
if s.UpgradeTracker.ControlPlane.IsPendingUpgrade ||
s.UpgradeTracker.MachineDeployments.IsAnyPendingCreate() ||
s.UpgradeTracker.MachineDeployments.IsAnyPendingUpgrade() ||
s.UpgradeTracker.MachineDeployments.DeferredUpgrade() {
msgBuilder := &strings.Builder{}
var reason string

// TODO(ykakarap): Evaluate potential improvements to building the condition. Multiple causes can trigger the
// condition to be false at the same time (Example: ControlPlane.IsPendingUpgrade and MachineDeployments.IsAnyPendingCreate can
// occur at the same time). Find better wording and `Reason` for the condition so that the condition can be rich
// with all the relevant information.
switch {
case s.UpgradeTracker.ControlPlane.PendingUpgrade:
fmt.Fprintf(msgBuilder, "Control plane upgrade to %s on hold.", s.Blueprint.Topology.Version)
case s.UpgradeTracker.ControlPlane.IsPendingUpgrade:
fmt.Fprintf(msgBuilder, "Control plane rollout and upgrade to version %s on hold.", s.Blueprint.Topology.Version)
reason = clusterv1.TopologyReconciledControlPlaneUpgradePendingReason
case s.UpgradeTracker.MachineDeployments.PendingUpgrade():
fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s upgrade to version %s on hold.",
computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.PendingUpgradeNames()),
case s.UpgradeTracker.MachineDeployments.IsAnyPendingUpgrade():
fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s rollout and upgrade to version %s on hold.",
computeNameList(s.UpgradeTracker.MachineDeployments.PendingUpgradeNames()),
s.Blueprint.Topology.Version,
)
reason = clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason
case s.UpgradeTracker.MachineDeployments.IsAnyPendingCreate():
fmt.Fprintf(msgBuilder, "MachineDeployment(s) for Topologies %s creation on hold.",
computeNameList(s.UpgradeTracker.MachineDeployments.PendingCreateTopologyNames()),
)
reason = clusterv1.TopologyReconciledMachineDeploymentsCreatePendingReason
case s.UpgradeTracker.MachineDeployments.DeferredUpgrade():
fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s upgrade to version %s deferred.",
computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.DeferredUpgradeNames()),
fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s rollout and upgrade to version %s deferred.",
computeNameList(s.UpgradeTracker.MachineDeployments.DeferredUpgradeNames()),
s.Blueprint.Topology.Version,
)
reason = clusterv1.TopologyReconciledMachineDeploymentsUpgradeDeferredReason
Expand All @@ -148,7 +161,7 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste

case len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) > 0:
fmt.Fprintf(msgBuilder, " MachineDeployment(s) %s are upgrading",
computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.UpgradingNames()),
computeNameList(s.UpgradeTracker.MachineDeployments.UpgradingNames()),
)
}

Expand All @@ -175,12 +188,13 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
return nil
}

// computeMachineDeploymentNameList computes the MD name list to be shown in conditions.
// It shortens the list to at most 5 MachineDeployment names.
func computeMachineDeploymentNameList(mdList []string) any {
if len(mdList) > 5 {
mdList = append(mdList[:5], "...")
// computeNameList computes list of names form the given list to be shown in conditions.
// It shortens the list to at most 5 names and adds an ellipsis at the end if the list
// has more than 5 elements.
func computeNameList(list []string) any {
if len(list) > 5 {
list = append(list[:5], "...")
}

return strings.Join(mdList, ", ")
return strings.Join(list, ", ")
}
72 changes: 52 additions & 20 deletions internal/controllers/topology/cluster/conditions_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,15 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = true
ut.ControlPlane.IsPendingUpgrade = true
ut.ControlPlane.IsProvisioning = true
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledControlPlaneUpgradePendingReason,
wantConditionMessage: "Control plane upgrade to v1.22.0 on hold. Control plane is completing initial provisioning",
wantConditionMessage: "Control plane rollout and upgrade to version v1.22.0 on hold. Control plane is completing initial provisioning",
},
{
name: "should set the condition to false if new version is not picked up because control plane is upgrading",
Expand All @@ -154,15 +154,15 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = true
ut.ControlPlane.IsPendingUpgrade = true
ut.ControlPlane.IsUpgrading = true
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledControlPlaneUpgradePendingReason,
wantConditionMessage: "Control plane upgrade to v1.22.0 on hold. Control plane is upgrading to version v1.21.2",
wantConditionMessage: "Control plane rollout and upgrade to version v1.22.0 on hold. Control plane is upgrading to version v1.21.2",
},
{
name: "should set the condition to false if new version is not picked up because control plane is scaling",
Expand All @@ -185,15 +185,15 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = true
ut.ControlPlane.IsPendingUpgrade = true
ut.ControlPlane.IsScaling = true
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledControlPlaneUpgradePendingReason,
wantConditionMessage: "Control plane upgrade to v1.22.0 on hold. Control plane is reconciling desired replicas",
wantConditionMessage: "Control plane rollout and upgrade to version v1.22.0 on hold. Control plane is reconciling desired replicas",
},
{
name: "should set the condition to false if new version is not picked up because at least one of the machine deployment is upgrading",
Expand Down Expand Up @@ -230,15 +230,15 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = true
ut.ControlPlane.IsPendingUpgrade = true
ut.MachineDeployments.MarkUpgrading("md0-abc123")
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledControlPlaneUpgradePendingReason,
wantConditionMessage: "Control plane upgrade to v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading",
wantConditionMessage: "Control plane rollout and upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading",
},
{
name: "should set the condition to false if control plane picked the new version but machine deployments did not because control plane is upgrading",
Expand Down Expand Up @@ -275,7 +275,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.ControlPlane.IsPendingUpgrade = false
ut.ControlPlane.IsUpgrading = true
ut.MachineDeployments.MarkPendingUpgrade("md0-abc123")
return ut
Expand All @@ -284,7 +284,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason,
wantConditionMessage: "MachineDeployment(s) md0-abc123 upgrade to version v1.22.0 on hold. Control plane is upgrading to version v1.22.0",
wantConditionMessage: "MachineDeployment(s) md0-abc123 rollout and upgrade to version v1.22.0 on hold. Control plane is upgrading to version v1.22.0",
},
{
name: "should set the condition to false if control plane picked the new version but machine deployments did not because control plane is scaling",
Expand Down Expand Up @@ -321,7 +321,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.ControlPlane.IsPendingUpgrade = false
ut.ControlPlane.IsScaling = true
ut.MachineDeployments.MarkPendingUpgrade("md0-abc123")
return ut
Expand All @@ -330,7 +330,39 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason,
wantConditionMessage: "MachineDeployment(s) md0-abc123 upgrade to version v1.22.0 on hold. Control plane is reconciling desired replicas",
wantConditionMessage: "MachineDeployment(s) md0-abc123 rollout and upgrade to version v1.22.0 on hold. Control plane is reconciling desired replicas",
},
{
name: "should set the condition to false if control plane picked the new version but there are machine deployments pending create because control plane is scaling",
reconcileErr: nil,
cluster: &clusterv1.Cluster{},
s: &scope.Scope{
Blueprint: &scope.ClusterBlueprint{
Topology: &clusterv1.Topology{
Version: "v1.22.0",
},
},
Current: &scope.ClusterState{
Cluster: &clusterv1.Cluster{},
ControlPlane: &scope.ControlPlaneState{
Object: builder.ControlPlane("ns1", "controlplane1").
WithVersion("v1.22.0").
WithReplicas(3).
Build(),
},
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.IsPendingUpgrade = false
ut.ControlPlane.IsScaling = true
ut.MachineDeployments.MarkPendingCreate("md0")
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsCreatePendingReason,
wantConditionMessage: "MachineDeployment(s) for Topologies md0 creation on hold. Control plane is reconciling desired replicas",
},
{
name: "should set the condition to true if control plane picked the new version and is upgrading but there are no machine deployments",
Expand All @@ -353,7 +385,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.ControlPlane.IsPendingUpgrade = false
ut.ControlPlane.IsUpgrading = true
return ut
}(),
Expand Down Expand Up @@ -382,7 +414,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.ControlPlane.IsPendingUpgrade = false
ut.ControlPlane.IsScaling = true
return ut
}(),
Expand Down Expand Up @@ -450,7 +482,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.ControlPlane.IsPendingUpgrade = false
ut.MachineDeployments.MarkUpgrading("md0-abc123")
ut.MachineDeployments.MarkPendingUpgrade("md1-abc123")
return ut
Expand All @@ -469,7 +501,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason,
wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading",
wantConditionMessage: "MachineDeployment(s) md1-abc123 rollout and upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading",
},
{
name: "should set the condition to false if some machine deployments have not picked the new version because their upgrade has been deferred",
Expand Down Expand Up @@ -520,15 +552,15 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.ControlPlane.IsPendingUpgrade = false
ut.MachineDeployments.MarkDeferredUpgrade("md1-abc123")
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
},
wantConditionStatus: corev1.ConditionFalse,
wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradeDeferredReason,
wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 deferred.",
wantConditionMessage: "MachineDeployment(s) md1-abc123 rollout and upgrade to version v1.22.0 deferred.",
},
{
name: "should set the condition to true if there are no reconcile errors and control plane and all machine deployments picked up the new version",
Expand Down Expand Up @@ -579,7 +611,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
},
UpgradeTracker: func() *scope.UpgradeTracker {
ut := scope.NewUpgradeTracker()
ut.ControlPlane.PendingUpgrade = false
ut.ControlPlane.IsPendingUpgrade = false
return ut
}(),
HookResponseTracker: scope.NewHookResponseTracker(),
Expand Down Expand Up @@ -656,7 +688,7 @@ func TestComputeMachineDeploymentNameList(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)
g.Expect(computeMachineDeploymentNameList(tt.mdList)).To(Equal(tt.expected))
g.Expect(computeNameList(tt.mdList)).To(Equal(tt.expected))
})
}
}
Loading