diff --git a/api/v1beta1/condition_consts.go b/api/v1beta1/condition_consts.go index c0c5de162efa..c33a38f793e6 100644 --- a/api/v1beta1/condition_consts.go +++ b/api/v1beta1/condition_consts.go @@ -282,6 +282,11 @@ const ( // not yet completed because Control Plane is not yet updated to match the desired topology spec. TopologyReconciledControlPlaneUpgradePendingReason = "ControlPlaneUpgradePending" + // TopologyReconciledMachineDeploymentsCreatePendingReason (Severity=Info) documents reconciliation of a Cluster topology + // not yet completed because at least one of the MachineDeployments is yet to be created. + // This generally happens because new MachineDeployment creations are held off while the ControlPlane is not stable. + TopologyReconciledMachineDeploymentsCreatePendingReason = "MachineDeploymentsCreatePending" + // TopologyReconciledMachineDeploymentsUpgradePendingReason (Severity=Info) documents reconciliation of a Cluster topology // not yet completed because at least one of the MachineDeployments is not yet updated to match the desired topology spec. TopologyReconciledMachineDeploymentsUpgradePendingReason = "MachineDeploymentsUpgradePending" diff --git a/internal/controllers/topology/cluster/conditions.go b/internal/controllers/topology/cluster/conditions.go index 20066d133268..ed0140ed66d6 100644 --- a/internal/controllers/topology/cluster/conditions.go +++ b/internal/controllers/topology/cluster/conditions.go @@ -56,6 +56,7 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste ) return nil } + // If an error occurred during reconciliation set the TopologyReconciled condition to false. // Add the error message from the reconcile function to the message of the condition. 
if reconcileErr != nil { @@ -108,25 +109,37 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste // * either the Control Plane or any of the MachineDeployments are still pending to pick up the new version // (generally happens when upgrading the cluster) // * when there are MachineDeployments for which the upgrade has been deferred - if s.UpgradeTracker.ControlPlane.PendingUpgrade || - s.UpgradeTracker.MachineDeployments.PendingUpgrade() || + // * when new MachineDeployments are pending to be created + // (generally happens when upgrading the cluster) + if s.UpgradeTracker.ControlPlane.IsPendingUpgrade || + s.UpgradeTracker.MachineDeployments.IsAnyPendingCreate() || + s.UpgradeTracker.MachineDeployments.IsAnyPendingUpgrade() || s.UpgradeTracker.MachineDeployments.DeferredUpgrade() { msgBuilder := &strings.Builder{} var reason string + // TODO(ykakarap): Evaluate potential improvements to building the condition. Multiple causes can trigger the + // condition to be false at the same time (Example: ControlPlane.IsPendingUpgrade and MachineDeployments.IsAnyPendingCreate can + // occur at the same time). Find better wording and `Reason` for the condition so that the condition can be rich + // with all the relevant information. 
switch { - case s.UpgradeTracker.ControlPlane.PendingUpgrade: - fmt.Fprintf(msgBuilder, "Control plane upgrade to %s on hold.", s.Blueprint.Topology.Version) + case s.UpgradeTracker.ControlPlane.IsPendingUpgrade: + fmt.Fprintf(msgBuilder, "Control plane rollout and upgrade to version %s on hold.", s.Blueprint.Topology.Version) reason = clusterv1.TopologyReconciledControlPlaneUpgradePendingReason - case s.UpgradeTracker.MachineDeployments.PendingUpgrade(): - fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s upgrade to version %s on hold.", - computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.PendingUpgradeNames()), + case s.UpgradeTracker.MachineDeployments.IsAnyPendingUpgrade(): + fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s rollout and upgrade to version %s on hold.", + computeNameList(s.UpgradeTracker.MachineDeployments.PendingUpgradeNames()), s.Blueprint.Topology.Version, ) reason = clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason + case s.UpgradeTracker.MachineDeployments.IsAnyPendingCreate(): + fmt.Fprintf(msgBuilder, "MachineDeployment(s) for Topologies %s creation on hold.", + computeNameList(s.UpgradeTracker.MachineDeployments.PendingCreateTopologyNames()), + ) + reason = clusterv1.TopologyReconciledMachineDeploymentsCreatePendingReason case s.UpgradeTracker.MachineDeployments.DeferredUpgrade(): - fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s upgrade to version %s deferred.", - computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.DeferredUpgradeNames()), + fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s rollout and upgrade to version %s deferred.", + computeNameList(s.UpgradeTracker.MachineDeployments.DeferredUpgradeNames()), s.Blueprint.Topology.Version, ) reason = clusterv1.TopologyReconciledMachineDeploymentsUpgradeDeferredReason @@ -148,7 +161,7 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste case len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) > 0: 
fmt.Fprintf(msgBuilder, " MachineDeployment(s) %s are upgrading", - computeMachineDeploymentNameList(s.UpgradeTracker.MachineDeployments.UpgradingNames()), + computeNameList(s.UpgradeTracker.MachineDeployments.UpgradingNames()), ) } @@ -175,12 +188,13 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste return nil } -// computeMachineDeploymentNameList computes the MD name list to be shown in conditions. -// It shortens the list to at most 5 MachineDeployment names. -func computeMachineDeploymentNameList(mdList []string) any { - if len(mdList) > 5 { - mdList = append(mdList[:5], "...") +// computeNameList computes list of names from the given list to be shown in conditions. +// It shortens the list to at most 5 names and adds an ellipsis at the end if the list +// has more than 5 elements. +func computeNameList(list []string) any { + if len(list) > 5 { + list = append(list[:5], "...") } - return strings.Join(mdList, ", ") + return strings.Join(list, ", ") } diff --git a/internal/controllers/topology/cluster/conditions_test.go b/internal/controllers/topology/cluster/conditions_test.go index 5b18fbe90b13..d2b21aa3af55 100644 --- a/internal/controllers/topology/cluster/conditions_test.go +++ b/internal/controllers/topology/cluster/conditions_test.go @@ -123,7 +123,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = true + ut.ControlPlane.IsPendingUpgrade = true ut.ControlPlane.IsProvisioning = true return ut }(), @@ -131,7 +131,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, wantConditionStatus: corev1.ConditionFalse, wantConditionReason: clusterv1.TopologyReconciledControlPlaneUpgradePendingReason, - wantConditionMessage: "Control plane upgrade to v1.22.0 on hold. 
Control plane is completing initial provisioning", + wantConditionMessage: "Control plane rollout and upgrade to version v1.22.0 on hold. Control plane is completing initial provisioning", }, { name: "should set the condition to false if new version is not picked up because control plane is upgrading", @@ -154,7 +154,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = true + ut.ControlPlane.IsPendingUpgrade = true ut.ControlPlane.IsUpgrading = true return ut }(), @@ -162,7 +162,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, wantConditionStatus: corev1.ConditionFalse, wantConditionReason: clusterv1.TopologyReconciledControlPlaneUpgradePendingReason, - wantConditionMessage: "Control plane upgrade to v1.22.0 on hold. Control plane is upgrading to version v1.21.2", + wantConditionMessage: "Control plane rollout and upgrade to version v1.22.0 on hold. Control plane is upgrading to version v1.21.2", }, { name: "should set the condition to false if new version is not picked up because control plane is scaling", @@ -185,7 +185,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = true + ut.ControlPlane.IsPendingUpgrade = true ut.ControlPlane.IsScaling = true return ut }(), @@ -193,7 +193,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, wantConditionStatus: corev1.ConditionFalse, wantConditionReason: clusterv1.TopologyReconciledControlPlaneUpgradePendingReason, - wantConditionMessage: "Control plane upgrade to v1.22.0 on hold. Control plane is reconciling desired replicas", + wantConditionMessage: "Control plane rollout and upgrade to version v1.22.0 on hold. 
Control plane is reconciling desired replicas", }, { name: "should set the condition to false if new version is not picked up because at least one of the machine deployment is upgrading", @@ -230,7 +230,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = true + ut.ControlPlane.IsPendingUpgrade = true ut.MachineDeployments.MarkUpgrading("md0-abc123") return ut }(), @@ -238,7 +238,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, wantConditionStatus: corev1.ConditionFalse, wantConditionReason: clusterv1.TopologyReconciledControlPlaneUpgradePendingReason, - wantConditionMessage: "Control plane upgrade to v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading", + wantConditionMessage: "Control plane rollout and upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading", }, { name: "should set the condition to false if control plane picked the new version but machine deployments did not because control plane is upgrading", @@ -275,7 +275,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.ControlPlane.IsUpgrading = true ut.MachineDeployments.MarkPendingUpgrade("md0-abc123") return ut @@ -284,7 +284,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, wantConditionStatus: corev1.ConditionFalse, wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason, - wantConditionMessage: "MachineDeployment(s) md0-abc123 upgrade to version v1.22.0 on hold. Control plane is upgrading to version v1.22.0", + wantConditionMessage: "MachineDeployment(s) md0-abc123 rollout and upgrade to version v1.22.0 on hold. 
Control plane is upgrading to version v1.22.0", }, { name: "should set the condition to false if control plane picked the new version but machine deployments did not because control plane is scaling", @@ -321,7 +321,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.ControlPlane.IsScaling = true ut.MachineDeployments.MarkPendingUpgrade("md0-abc123") return ut @@ -330,7 +330,39 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, wantConditionStatus: corev1.ConditionFalse, wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason, - wantConditionMessage: "MachineDeployment(s) md0-abc123 upgrade to version v1.22.0 on hold. Control plane is reconciling desired replicas", + wantConditionMessage: "MachineDeployment(s) md0-abc123 rollout and upgrade to version v1.22.0 on hold. Control plane is reconciling desired replicas", + }, + { + name: "should set the condition to false if control plane picked the new version but there are machine deployments pending create because control plane is scaling", + reconcileErr: nil, + cluster: &clusterv1.Cluster{}, + s: &scope.Scope{ + Blueprint: &scope.ClusterBlueprint{ + Topology: &clusterv1.Topology{ + Version: "v1.22.0", + }, + }, + Current: &scope.ClusterState{ + Cluster: &clusterv1.Cluster{}, + ControlPlane: &scope.ControlPlaneState{ + Object: builder.ControlPlane("ns1", "controlplane1"). + WithVersion("v1.22.0"). + WithReplicas(3). 
+ Build(), + }, + }, + UpgradeTracker: func() *scope.UpgradeTracker { + ut := scope.NewUpgradeTracker() + ut.ControlPlane.IsPendingUpgrade = false + ut.ControlPlane.IsScaling = true + ut.MachineDeployments.MarkPendingCreate("md0") + return ut + }(), + HookResponseTracker: scope.NewHookResponseTracker(), + }, + wantConditionStatus: corev1.ConditionFalse, + wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsCreatePendingReason, + wantConditionMessage: "MachineDeployment(s) for Topologies md0 creation on hold. Control plane is reconciling desired replicas", }, { name: "should set the condition to true if control plane picked the new version and is upgrading but there are no machine deployments", @@ -353,7 +385,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.ControlPlane.IsUpgrading = true return ut }(), @@ -382,7 +414,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.ControlPlane.IsScaling = true return ut }(), @@ -450,7 +482,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.MachineDeployments.MarkUpgrading("md0-abc123") ut.MachineDeployments.MarkPendingUpgrade("md1-abc123") return ut @@ -469,7 +501,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, wantConditionStatus: corev1.ConditionFalse, wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason, - wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 on hold. 
MachineDeployment(s) md0-abc123 are upgrading", + wantConditionMessage: "MachineDeployment(s) md1-abc123 rollout and upgrade to version v1.22.0 on hold. MachineDeployment(s) md0-abc123 are upgrading", }, { name: "should set the condition to false if some machine deployments have not picked the new version because their upgrade has been deferred", @@ -520,7 +552,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.MachineDeployments.MarkDeferredUpgrade("md1-abc123") return ut }(), @@ -528,7 +560,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, wantConditionStatus: corev1.ConditionFalse, wantConditionReason: clusterv1.TopologyReconciledMachineDeploymentsUpgradeDeferredReason, - wantConditionMessage: "MachineDeployment(s) md1-abc123 upgrade to version v1.22.0 deferred.", + wantConditionMessage: "MachineDeployment(s) md1-abc123 rollout and upgrade to version v1.22.0 deferred.", }, { name: "should set the condition to true if there are no reconcile errors and control plane and all machine deployments picked up the new version", @@ -579,7 +611,7 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) { }, UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false return ut }(), HookResponseTracker: scope.NewHookResponseTracker(), @@ -656,7 +688,7 @@ func TestComputeMachineDeploymentNameList(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - g.Expect(computeMachineDeploymentNameList(tt.mdList)).To(Equal(tt.expected)) + g.Expect(computeNameList(tt.mdList)).To(Equal(tt.expected)) }) } } diff --git a/internal/controllers/topology/cluster/desired_state.go b/internal/controllers/topology/cluster/desired_state.go index 
821d42ef0cbb..ce222170c28a 100644 --- a/internal/controllers/topology/cluster/desired_state.go +++ b/internal/controllers/topology/cluster/desired_state.go @@ -100,7 +100,7 @@ func (r *Reconciler) computeDesiredState(ctx context.Context, s *scope.Scope) (* // If required, compute the desired state of the MachineDeployments from the list of MachineDeploymentTopologies // defined in the cluster. if s.Blueprint.HasMachineDeployments() { - desiredState.MachineDeployments, err = r.computeMachineDeployments(ctx, s, desiredState.ControlPlane) + desiredState.MachineDeployments, err = r.computeMachineDeployments(ctx, s) if err != nil { return nil, errors.Wrapf(err, "failed to compute MachineDeployments") } @@ -343,12 +343,12 @@ func (r *Reconciler) computeControlPlaneVersion(ctx context.Context, s *scope.Sc return "", errors.Wrap(err, "failed to get the version from control plane spec") } - s.UpgradeTracker.ControlPlane.PendingUpgrade = true + s.UpgradeTracker.ControlPlane.IsPendingUpgrade = true if *currentVersion == desiredVersion { // Mark that the control plane spec is already at the desired version. // This information is used to show the appropriate message for the TopologyReconciled // condition. - s.UpgradeTracker.ControlPlane.PendingUpgrade = false + s.UpgradeTracker.ControlPlane.IsPendingUpgrade = false } // Check if the control plane is being created for the first time. @@ -380,7 +380,7 @@ func (r *Reconciler) computeControlPlaneVersion(ctx context.Context, s *scope.Sc } // Return here if the control plane is already at the desired version - if !s.UpgradeTracker.ControlPlane.PendingUpgrade { + if !s.UpgradeTracker.ControlPlane.IsPendingUpgrade { // At this stage the control plane is not upgrading and is already at the desired version. // We can return. // Nb. 
We do not return early in the function if the control plane is already at the desired version so as @@ -409,7 +409,6 @@ func (r *Reconciler) computeControlPlaneVersion(ctx context.Context, s *scope.Sc // can remove this hook from the list of pending-hooks. if hookResponse.RetryAfterSeconds != 0 { log.Infof("MachineDeployments upgrade to version %q are blocked by %q hook", desiredVersion, runtimecatalog.HookName(runtimehooksv1.AfterControlPlaneUpgrade)) - s.UpgradeTracker.MachineDeployments.HoldUpgrades(true) } else { if err := hooks.MarkAsDone(ctx, r.Client, s.Current.Cluster, runtimehooksv1.AfterControlPlaneUpgrade); err != nil { return "", err @@ -471,6 +470,8 @@ func (r *Reconciler) computeControlPlaneVersion(ctx context.Context, s *scope.Sc // Control plane and machine deployments are stable. All the required hook are called. // Ready to pick up the topology version. + s.UpgradeTracker.ControlPlane.IsPendingUpgrade = false + s.UpgradeTracker.ControlPlane.IsStartingUpgrade = true return desiredVersion, nil } @@ -534,10 +535,10 @@ func calculateRefDesiredAPIVersion(currentRef *corev1.ObjectReference, desiredRe } // computeMachineDeployments computes the desired state of the list of MachineDeployments. 
-func (r *Reconciler) computeMachineDeployments(ctx context.Context, s *scope.Scope, desiredControlPlaneState *scope.ControlPlaneState) (scope.MachineDeploymentsStateMap, error) { +func (r *Reconciler) computeMachineDeployments(ctx context.Context, s *scope.Scope) (scope.MachineDeploymentsStateMap, error) { machineDeploymentsStateMap := make(scope.MachineDeploymentsStateMap) for _, mdTopology := range s.Blueprint.Topology.Workers.MachineDeployments { - desiredMachineDeployment, err := computeMachineDeployment(ctx, s, desiredControlPlaneState, mdTopology) + desiredMachineDeployment, err := computeMachineDeployment(ctx, s, mdTopology) if err != nil { return nil, errors.Wrapf(err, "failed to compute MachineDepoyment for topology %q", mdTopology.Name) } @@ -549,7 +550,7 @@ func (r *Reconciler) computeMachineDeployments(ctx context.Context, s *scope.Sco // computeMachineDeployment computes the desired state for a MachineDeploymentTopology. // The generated machineDeployment object is calculated using the values from the machineDeploymentTopology and // the machineDeployment class. -func computeMachineDeployment(_ context.Context, s *scope.Scope, desiredControlPlaneState *scope.ControlPlaneState, machineDeploymentTopology clusterv1.MachineDeploymentTopology) (*scope.MachineDeploymentState, error) { +func computeMachineDeployment(_ context.Context, s *scope.Scope, machineDeploymentTopology clusterv1.MachineDeploymentTopology) (*scope.MachineDeploymentState, error) { desiredMachineDeployment := &scope.MachineDeploymentState{} // Gets the blueprint for the MachineDeployment class. 
@@ -621,10 +622,7 @@ func computeMachineDeployment(_ context.Context, s *scope.Scope, desiredControlP // Add ClusterTopologyMachineDeploymentLabel to the generated InfrastructureMachine template infraMachineTemplateLabels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] = machineDeploymentTopology.Name desiredMachineDeployment.InfrastructureMachineTemplate.SetLabels(infraMachineTemplateLabels) - version, err := computeMachineDeploymentVersion(s, machineDeploymentTopology, desiredControlPlaneState, currentMachineDeployment) - if err != nil { - return nil, errors.Wrapf(err, "failed to compute version for %s", machineDeploymentTopology.Name) - } + version := computeMachineDeploymentVersion(s, machineDeploymentTopology, currentMachineDeployment) // Compute values that can be set both in the MachineDeploymentClass and in the MachineDeploymentTopology minReadySeconds := machineDeploymentClass.MinReadySeconds @@ -754,16 +752,17 @@ func computeMachineDeployment(_ context.Context, s *scope.Scope, desiredControlP // computeMachineDeploymentVersion calculates the version of the desired machine deployment. // The version is calculated using the state of the current machine deployments, // the current control plane and the version defined in the topology. -// Nb: No MachineDeployment upgrades will be triggered while any MachineDeployment is in the middle -// of an upgrade. Even if the number of MachineDeployments that are being upgraded is less -// than the number of allowed concurrent upgrades. 
-func computeMachineDeploymentVersion(s *scope.Scope, machineDeploymentTopology clusterv1.MachineDeploymentTopology, desiredControlPlaneState *scope.ControlPlaneState, currentMDState *scope.MachineDeploymentState) (string, error) { +func computeMachineDeploymentVersion(s *scope.Scope, machineDeploymentTopology clusterv1.MachineDeploymentTopology, currentMDState *scope.MachineDeploymentState) string { desiredVersion := s.Blueprint.Topology.Version - // If creating a new machine deployment, we can pick up the desired version - // Note: We are not blocking the creation of new machine deployments when - // the control plane or any of the machine deployments are upgrading/scaling. + // If creating a new machine deployment, mark it as pending if the control plane is not + // yet stable. Creating a new MD while the control plane is upgrading can lead to unexpected race conditions. + // Example: join could fail if the load balancers are slow in detecting when CP machines are + // being deleted. if currentMDState == nil || currentMDState.Object == nil { - return desiredVersion, nil + if !isControlPlaneStable(s) || s.HookResponseTracker.IsBlocking(runtimehooksv1.AfterControlPlaneUpgrade) { + s.UpgradeTracker.MachineDeployments.MarkPendingCreate(machineDeploymentTopology.Name) + } + return desiredVersion } // Get the current version of the machine deployment. @@ -772,86 +771,68 @@ func computeMachineDeploymentVersion(s *scope.Scope, machineDeploymentTopology c // Return early if the currentVersion is already equal to the desiredVersion // no further checks required. if currentVersion == desiredVersion { - return currentVersion, nil + return currentVersion } // Return early if the upgrade for the MachineDeployment is deferred. 
if isMachineDeploymentDeferred(s.Blueprint.Topology, machineDeploymentTopology) { s.UpgradeTracker.MachineDeployments.MarkDeferredUpgrade(currentMDState.Object.Name) - return currentVersion, nil + s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name) + return currentVersion } - // Return early if we are not allowed to upgrade the machine deployment. - if !s.UpgradeTracker.MachineDeployments.AllowUpgrade() { + // Return early if the AfterControlPlaneUpgrade hook returns a blocking response. + if s.HookResponseTracker.IsBlocking(runtimehooksv1.AfterControlPlaneUpgrade) { s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name) - return currentVersion, nil + return currentVersion } - // If the control plane is being created (current control plane is nil), do not perform - // any machine deployment upgrade in this case. - // Return the current version of the machine deployment. - // NOTE: this case should never happen (upgrading a MachineDeployment) before creating a CP, - // but we are implementing this check for extra safety. - if s.Current.ControlPlane == nil || s.Current.ControlPlane.Object == nil { + // Return early if the upgrade concurrency is reached. + if s.UpgradeTracker.MachineDeployments.UpgradeConcurrencyReached() { s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name) - return currentVersion, nil + return currentVersion } - // If the current control plane is upgrading, then do not pick up the desiredVersion yet. + // Return early if the Control Plane is not stable. Do not pick up the desiredVersion yet. // Return the current version of the machine deployment. We will pick up the new version after the control // plane is stable. 
- cpUpgrading, err := contract.ControlPlane().IsUpgrading(s.Current.ControlPlane.Object) - if err != nil { - return "", errors.Wrap(err, "failed to check if control plane is upgrading") - } - if cpUpgrading { + if !isControlPlaneStable(s) { s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name) - return currentVersion, nil + return currentVersion } - // If control plane supports replicas, check if the control plane is in the middle of a scale operation. - // If the current control plane is scaling, then do not pick up the desiredVersion yet. - // Return the current version of the machine deployment. We will pick up the new version after the control - // plane is stable. - if s.Blueprint.Topology.ControlPlane.Replicas != nil { - cpScaling, err := contract.ControlPlane().IsScaling(s.Current.ControlPlane.Object) - if err != nil { - return "", errors.Wrap(err, "failed to check if the control plane is scaling") - } - if cpScaling { - s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name) - return currentVersion, nil - } - } + // Control plane and machine deployments are stable. + // Ready to pick up the topology version. + s.UpgradeTracker.MachineDeployments.MarkUpgrading(currentMDState.Object.Name) + return desiredVersion +} - // Check if we are about to upgrade the control plane. In that case, do not upgrade the machine deployment yet. - // Wait for the new upgrade operation on the control plane to finish before picking up the new version for the - // machine deployment. - currentCPVersion, err := contract.ControlPlane().Version().Get(s.Current.ControlPlane.Object) - if err != nil { - return "", errors.Wrap(err, "failed to get version of current control plane") +// isControlPlaneStable returns true is the ControlPlane is stable. +func isControlPlaneStable(s *scope.Scope) bool { + // If the current control plane is upgrading it is not considered stable. 
+ if s.UpgradeTracker.ControlPlane.IsUpgrading { + return false } - desiredCPVersion, err := contract.ControlPlane().Version().Get(desiredControlPlaneState.Object) - if err != nil { - return "", errors.Wrap(err, "failed to get version of desired control plane") + + // If control plane supports replicas, check if the control plane is in the middle of a scale operation. + // If the current control plane is scaling then it is not considered stable. + if s.UpgradeTracker.ControlPlane.IsScaling { + return false } - if *currentCPVersion != *desiredCPVersion { - // The versions of the current and desired control planes do no match, - // implies we are about to upgrade the control plane. - s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name) - return currentVersion, nil + + // Check if we are about to upgrade the control plane. Since the control plane is about to start its upgrade process + // it cannot be considered stable. + if s.UpgradeTracker.ControlPlane.IsStartingUpgrade { + return false } - // If the ControlPlane is pending picking up an upgrade then do not pick up the new version yet. - if s.UpgradeTracker.ControlPlane.PendingUpgrade { - s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name) - return currentVersion, nil + // If the ControlPlane is pending picking up an upgrade then it is not yet at the desired state and + // cannot be considered stable. + if s.UpgradeTracker.ControlPlane.IsPendingUpgrade { + return false } - // Control plane and machine deployments are stable. - // Ready to pick up the topology version. - s.UpgradeTracker.MachineDeployments.MarkUpgrading(currentMDState.Object.Name) - return desiredVersion, nil + return true } // isMachineDeploymentDeferred returns true if the upgrade for the mdTopology is deferred. 
diff --git a/internal/controllers/topology/cluster/desired_state_test.go b/internal/controllers/topology/cluster/desired_state_test.go index 57fc5f167de5..684d13b0f72c 100644 --- a/internal/controllers/topology/cluster/desired_state_test.go +++ b/internal/controllers/topology/cluster/desired_state_test.go @@ -867,6 +867,9 @@ func TestComputeControlPlaneVersion(t *testing.T) { } else { g.Expect(err).To(BeNil()) g.Expect(version).To(Equal(tt.expectedVersion)) + // Verify that if the upgrade is pending it is captured in the upgrade tracker. + upgradePending := tt.expectedVersion != tt.topologyVersion + g.Expect(s.UpgradeTracker.ControlPlane.IsPendingUpgrade).To(Equal(upgradePending)) } }) } @@ -946,13 +949,13 @@ func TestComputeControlPlaneVersion(t *testing.T) { Build() tests := []struct { - name string - s *scope.Scope - hookResponse *runtimehooksv1.AfterControlPlaneUpgradeResponse - wantIntentToCall bool - wantHookToBeCalled bool - wantAllowMDUpgrades bool - wantErr bool + name string + s *scope.Scope + hookResponse *runtimehooksv1.AfterControlPlaneUpgradeResponse + wantIntentToCall bool + wantHookToBeCalled bool + wantHookToBlock bool + wantErr bool }{ { name: "should not call hook if it is not marked", @@ -1071,11 +1074,11 @@ func TestComputeControlPlaneVersion(t *testing.T) { UpgradeTracker: scope.NewUpgradeTracker(), HookResponseTracker: scope.NewHookResponseTracker(), }, - hookResponse: nonBlockingResponse, - wantIntentToCall: false, - wantHookToBeCalled: true, - wantAllowMDUpgrades: true, - wantErr: false, + hookResponse: nonBlockingResponse, + wantIntentToCall: false, + wantHookToBeCalled: true, + wantHookToBlock: false, + wantErr: false, }, { name: "should call hook if the control plane is at desired version - blocking response should leave the hook in pending hooks list and block MD upgrades", @@ -1104,11 +1107,11 @@ func TestComputeControlPlaneVersion(t *testing.T) { UpgradeTracker: scope.NewUpgradeTracker(), HookResponseTracker: 
scope.NewHookResponseTracker(), }, - hookResponse: blockingResponse, - wantIntentToCall: true, - wantHookToBeCalled: true, - wantAllowMDUpgrades: false, - wantErr: false, + hookResponse: blockingResponse, + wantIntentToCall: true, + wantHookToBeCalled: true, + wantHookToBlock: true, + wantErr: false, }, { name: "should call hook if the control plane is at desired version - failure response should leave the hook in pending hooks list", @@ -1168,7 +1171,7 @@ func TestComputeControlPlaneVersion(t *testing.T) { g.Expect(hooks.IsPending(runtimehooksv1.AfterControlPlaneUpgrade, tt.s.Current.Cluster)).To(Equal(tt.wantIntentToCall)) g.Expect(err != nil).To(Equal(tt.wantErr)) if tt.wantHookToBeCalled && !tt.wantErr { - g.Expect(tt.s.UpgradeTracker.MachineDeployments.AllowUpgrade()).To(Equal(tt.wantAllowMDUpgrades)) + g.Expect(tt.s.HookResponseTracker.IsBlocking(runtimehooksv1.AfterControlPlaneUpgrade)).To(Equal(tt.wantHookToBlock)) } }) } @@ -1408,7 +1411,7 @@ func TestComputeMachineDeployment(t *testing.T) { scope := scope.New(cluster) scope.Blueprint = blueprint - actual, err := computeMachineDeployment(ctx, scope, nil, mdTopology) + actual, err := computeMachineDeployment(ctx, scope, mdTopology) g.Expect(err).ToNot(HaveOccurred()) g.Expect(actual.BootstrapTemplate.GetLabels()).To(HaveKeyWithValue(clusterv1.ClusterTopologyMachineDeploymentNameLabel, "big-pool-of-machines")) @@ -1477,7 +1480,7 @@ func TestComputeMachineDeployment(t *testing.T) { // missing FailureDomain, NodeDrainTimeout, NodeVolumeDetachTimeout, NodeDeletionTimeout, MinReadySeconds, Strategy } - actual, err := computeMachineDeployment(ctx, scope, nil, mdTopology) + actual, err := computeMachineDeployment(ctx, scope, mdTopology) g.Expect(err).ToNot(HaveOccurred()) // checking only values from CC defaults @@ -1521,7 +1524,7 @@ func TestComputeMachineDeployment(t *testing.T) { }, } - actual, err := computeMachineDeployment(ctx, s, nil, mdTopology) + actual, err := computeMachineDeployment(ctx, s, mdTopology) 
g.Expect(err).ToNot(HaveOccurred()) actualMd := actual.Object @@ -1569,7 +1572,7 @@ func TestComputeMachineDeployment(t *testing.T) { Name: "big-pool-of-machines", } - _, err := computeMachineDeployment(ctx, scope, nil, mdTopology) + _, err := computeMachineDeployment(ctx, scope, mdTopology) g.Expect(err).To(HaveOccurred()) }) @@ -1674,9 +1677,6 @@ func TestComputeMachineDeployment(t *testing.T) { s.Current.ControlPlane = &scope.ControlPlaneState{ Object: controlPlaneStable123, } - desiredControlPlaneState := &scope.ControlPlaneState{ - Object: controlPlaneStable123, - } mdTopology := clusterv1.MachineDeploymentTopology{ Class: "linux-worker", @@ -1684,7 +1684,7 @@ func TestComputeMachineDeployment(t *testing.T) { Replicas: pointer.Int32(2), } s.UpgradeTracker.MachineDeployments.MarkUpgrading(tt.upgradingMachineDeployments...) - obj, err := computeMachineDeployment(ctx, s, desiredControlPlaneState, mdTopology) + obj, err := computeMachineDeployment(ctx, s, mdTopology) g.Expect(err).NotTo(HaveOccurred()) g.Expect(*obj.Object.Spec.Template.Spec.Version).To(Equal(tt.expectedVersion)) }) @@ -1700,7 +1700,7 @@ func TestComputeMachineDeployment(t *testing.T) { Name: "big-pool-of-machines", } - actual, err := computeMachineDeployment(ctx, scope, nil, mdTopology) + actual, err := computeMachineDeployment(ctx, scope, mdTopology) g.Expect(err).To(BeNil()) // Check that the ClusterName and selector are set properly for the MachineHealthCheck. g.Expect(actual.MachineHealthCheck.Spec.ClusterName).To(Equal(cluster.Name)) @@ -1718,75 +1718,56 @@ func TestComputeMachineDeployment(t *testing.T) { } func TestComputeMachineDeploymentVersion(t *testing.T) { - controlPlaneStable122 := builder.ControlPlane("test1", "cp1"). - WithSpecFields(map[string]interface{}{ - "spec.version": "v1.2.2", - "spec.replicas": int64(2), - }). 
- WithStatusFields(map[string]interface{}{ - "status.version": "v1.2.2", - "status.replicas": int64(2), - "status.updatedReplicas": int64(2), - "status.readyReplicas": int64(2), - "status.unavailableReplicas": int64(0), - }). - Build() - controlPlaneStable123 := builder.ControlPlane("test1", "cp1"). - WithSpecFields(map[string]interface{}{ - "spec.version": "v1.2.3", - "spec.replicas": int64(2), - }). - WithStatusFields(map[string]interface{}{ - "status.version": "v1.2.3", - "status.replicas": int64(2), - "status.updatedReplicas": int64(2), - "status.readyReplicas": int64(2), - "status.unavailableReplicas": int64(0), - }). - Build() - controlPlaneUpgrading := builder.ControlPlane("test1", "cp1"). - WithSpecFields(map[string]interface{}{ - "spec.version": "v1.2.3", - }). - WithStatusFields(map[string]interface{}{ - "status.version": "v1.2.1", - }). - Build() - controlPlaneScaling := builder.ControlPlane("test1", "cp1"). - WithSpecFields(map[string]interface{}{ - "spec.version": "v1.2.3", - "spec.replicas": int64(2), - }). - WithStatusFields(map[string]interface{}{ - "status.version": "v1.2.3", - "status.replicas": int64(1), - "status.updatedReplicas": int64(1), - "status.readyReplicas": int64(1), - "status.unavailableReplicas": int64(0), - }). - Build() - controlPlaneDesired := builder.ControlPlane("test1", "cp1"). - WithSpecFields(map[string]interface{}{ - "spec.version": "v1.2.3", - }). + controlPlaneObj := builder.ControlPlane("test1", "cp1"). 
Build() + mdName := "md-1" + currentMachineDeploymentState := &scope.MachineDeploymentState{Object: builder.MachineDeployment("test1", mdName).WithVersion("v1.2.2").Build()} + tests := []struct { - name string - machineDeploymentTopology clusterv1.MachineDeploymentTopology - currentMachineDeploymentState *scope.MachineDeploymentState - upgradingMachineDeployments []string - upgradeConcurrency int - currentControlPlane *unstructured.Unstructured - desiredControlPlane *unstructured.Unstructured - topologyVersion string - expectedVersion string + name string + machineDeploymentTopology clusterv1.MachineDeploymentTopology + currentMachineDeploymentState *scope.MachineDeploymentState + upgradingMachineDeployments []string + upgradeConcurrency int + controlPlaneStartingUpgrade bool + controlPlaneUpgrading bool + controlPlaneScaling bool + controlPlaneProvisioning bool + afterControlPlaneUpgradeHookBlocking bool + topologyVersion string + expectedVersion string + expectPendingCreate bool + expectPendingUpgrade bool }{ { - name: "should return cluster.spec.topology.version if creating a new machine deployment", + name: "should return cluster.spec.topology.version if creating a new machine deployment and if control plane is stable", currentMachineDeploymentState: nil, - topologyVersion: "v1.2.3", - expectedVersion: "v1.2.3", + machineDeploymentTopology: clusterv1.MachineDeploymentTopology{ + Name: "md-topology-1", + }, + topologyVersion: "v1.2.3", + expectedVersion: "v1.2.3", + expectPendingCreate: false, + }, + { + name: "should return cluster.spec.topology.version if creating a new machine deployment and if control plane is not stable - marked as pending create", + controlPlaneScaling: true, + machineDeploymentTopology: clusterv1.MachineDeploymentTopology{ + Name: "md-topology-1", + }, + topologyVersion: "v1.2.3", + expectedVersion: "v1.2.3", + expectPendingCreate: true, + }, + { + name: "should return cluster.spec.topology.version if creating a new machine deployment and 
if control plane is stable - not marked as pending create", + machineDeploymentTopology: clusterv1.MachineDeploymentTopology{ + Name: "md-topology-1", + }, + topologyVersion: "v1.2.3", + expectedVersion: "v1.2.3", + expectPendingCreate: false, }, { name: "should return machine deployment's spec.template.spec.version if upgrade is deferred", @@ -1797,69 +1778,77 @@ func TestComputeMachineDeploymentVersion(t *testing.T) { }, }, }, - currentMachineDeploymentState: &scope.MachineDeploymentState{Object: builder.MachineDeployment("test1", "md-current").WithVersion("v1.2.2").Build()}, + currentMachineDeploymentState: currentMachineDeploymentState, upgradingMachineDeployments: []string{}, - currentControlPlane: controlPlaneStable123, - desiredControlPlane: controlPlaneDesired, topologyVersion: "v1.2.3", expectedVersion: "v1.2.2", + expectPendingUpgrade: true, }, { // Control plane is considered upgrading if the control plane's spec.version and status.version is not equal. name: "should return machine deployment's spec.template.spec.version if control plane is upgrading", - currentMachineDeploymentState: &scope.MachineDeploymentState{Object: builder.MachineDeployment("test1", "md-current").WithVersion("v1.2.2").Build()}, + currentMachineDeploymentState: currentMachineDeploymentState, upgradingMachineDeployments: []string{}, - currentControlPlane: controlPlaneUpgrading, + controlPlaneUpgrading: true, topologyVersion: "v1.2.3", expectedVersion: "v1.2.2", + expectPendingUpgrade: true, }, { // Control plane is considered ready to upgrade if spec.version of current and desired control planes are not equal. 
- name: "should return machine deployment's spec.template.spec.version if control plane is ready to upgrade", - currentMachineDeploymentState: &scope.MachineDeploymentState{Object: builder.MachineDeployment("test1", "md-current").WithVersion("v1.2.2").Build()}, + name: "should return machine deployment's spec.template.spec.version if control plane is starting upgrade", + currentMachineDeploymentState: currentMachineDeploymentState, upgradingMachineDeployments: []string{}, - currentControlPlane: controlPlaneStable122, - desiredControlPlane: controlPlaneDesired, + controlPlaneStartingUpgrade: true, topologyVersion: "v1.2.3", expectedVersion: "v1.2.2", + expectPendingUpgrade: true, }, { // Control plane is considered scaling if its spec.replicas is not equal to any of status.replicas, status.readyReplicas or status.updatedReplicas. name: "should return machine deployment's spec.template.spec.version if control plane is scaling", - currentMachineDeploymentState: &scope.MachineDeploymentState{Object: builder.MachineDeployment("test1", "md-current").WithVersion("v1.2.2").Build()}, + currentMachineDeploymentState: currentMachineDeploymentState, upgradingMachineDeployments: []string{}, - currentControlPlane: controlPlaneScaling, + controlPlaneScaling: true, topologyVersion: "v1.2.3", expectedVersion: "v1.2.2", + expectPendingUpgrade: true, }, { name: "should return cluster.spec.topology.version if the control plane is not upgrading, not scaling, not ready to upgrade and none of the machine deployments are upgrading", - currentMachineDeploymentState: &scope.MachineDeploymentState{Object: builder.MachineDeployment("test1", "md-current").WithVersion("v1.2.2").Build()}, + currentMachineDeploymentState: currentMachineDeploymentState, upgradingMachineDeployments: []string{}, - currentControlPlane: controlPlaneStable123, - desiredControlPlane: controlPlaneDesired, topologyVersion: "v1.2.3", expectedVersion: "v1.2.3", + expectPendingUpgrade: false, + }, + { + name: "should return 
machine deployment's spec.template.spec.version if control plane is stable, other machine deployments are upgrading, concurrency limit not reached but AfterControlPlaneUpgrade hook is blocking", + currentMachineDeploymentState: currentMachineDeploymentState, + upgradingMachineDeployments: []string{"upgrading-md1"}, + upgradeConcurrency: 2, + afterControlPlaneUpgradeHookBlocking: true, + topologyVersion: "v1.2.3", + expectedVersion: "v1.2.2", + expectPendingUpgrade: true, }, { name: "should return cluster.spec.topology.version if control plane is stable, other machine deployments are upgrading, concurrency limit not reached", - currentMachineDeploymentState: &scope.MachineDeploymentState{Object: builder.MachineDeployment("test1", "md-current").WithVersion("v1.2.2").Build()}, + currentMachineDeploymentState: currentMachineDeploymentState, upgradingMachineDeployments: []string{"upgrading-md1"}, upgradeConcurrency: 2, - currentControlPlane: controlPlaneStable123, - desiredControlPlane: controlPlaneDesired, topologyVersion: "v1.2.3", expectedVersion: "v1.2.3", + expectPendingUpgrade: false, }, { name: "should return machine deployment's spec.template.spec.version if control plane is stable, other machine deployments are upgrading, concurrency limit reached", - currentMachineDeploymentState: &scope.MachineDeploymentState{Object: builder.MachineDeployment("test1", "md-current").WithVersion("v1.2.2").Build()}, + currentMachineDeploymentState: currentMachineDeploymentState, upgradingMachineDeployments: []string{"upgrading-md1", "upgrading-md2"}, upgradeConcurrency: 2, - currentControlPlane: controlPlaneStable123, - desiredControlPlane: controlPlaneDesired, topologyVersion: "v1.2.3", expectedVersion: "v1.2.2", + expectPendingUpgrade: true, }, } @@ -1876,15 +1865,41 @@ func TestComputeMachineDeploymentVersion(t *testing.T) { Workers: &clusterv1.WorkersTopology{}, }}, Current: &scope.ClusterState{ - ControlPlane: &scope.ControlPlaneState{Object: tt.currentControlPlane}, + 
ControlPlane: &scope.ControlPlaneState{Object: controlPlaneObj}, }, - UpgradeTracker: scope.NewUpgradeTracker(scope.MaxMDUpgradeConcurrency(tt.upgradeConcurrency)), + UpgradeTracker: scope.NewUpgradeTracker(scope.MaxMDUpgradeConcurrency(tt.upgradeConcurrency)), + HookResponseTracker: scope.NewHookResponseTracker(), } - desiredControlPlaneState := &scope.ControlPlaneState{Object: tt.desiredControlPlane} + if tt.afterControlPlaneUpgradeHookBlocking { + s.HookResponseTracker.Add(runtimehooksv1.AfterControlPlaneUpgrade, &runtimehooksv1.AfterControlPlaneUpgradeResponse{ + CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ + RetryAfterSeconds: 10, + }, + }) + } + s.UpgradeTracker.ControlPlane.IsStartingUpgrade = tt.controlPlaneStartingUpgrade + s.UpgradeTracker.ControlPlane.IsUpgrading = tt.controlPlaneUpgrading + s.UpgradeTracker.ControlPlane.IsScaling = tt.controlPlaneScaling + s.UpgradeTracker.ControlPlane.IsProvisioning = tt.controlPlaneProvisioning s.UpgradeTracker.MachineDeployments.MarkUpgrading(tt.upgradingMachineDeployments...) - version, err := computeMachineDeploymentVersion(s, tt.machineDeploymentTopology, desiredControlPlaneState, tt.currentMachineDeploymentState) - g.Expect(err).NotTo(HaveOccurred()) + version := computeMachineDeploymentVersion(s, tt.machineDeploymentTopology, tt.currentMachineDeploymentState) g.Expect(version).To(Equal(tt.expectedVersion)) + + if tt.currentMachineDeploymentState != nil { + // Verify that if the upgrade is pending it is captured in the upgrade tracker. + if tt.expectPendingUpgrade { + g.Expect(s.UpgradeTracker.MachineDeployments.IsPendingUpgrade(mdName)).To(BeTrue(), "MachineDeployment should be marked as pending upgrade") + } else { + g.Expect(s.UpgradeTracker.MachineDeployments.IsPendingUpgrade(mdName)).To(BeFalse(), "MachineDeployment should not be marked as pending upgrade") + } + } else { + // Verify that if the create is pending it is captured in the tracker.
+ if tt.expectPendingCreate { + g.Expect(s.UpgradeTracker.MachineDeployments.IsPendingCreate(tt.machineDeploymentTopology.Name)).To(BeTrue(), "MachineDeployment topology should be marked as pending create") + } else { + g.Expect(s.UpgradeTracker.MachineDeployments.IsPendingCreate(tt.machineDeploymentTopology.Name)).To(BeFalse(), "MachineDeployment topology should not be marked as pending create") + } + } }) } } diff --git a/internal/controllers/topology/cluster/reconcile_state.go b/internal/controllers/topology/cluster/reconcile_state.go index b6dc1e8cc8a1..ef4b11a39d14 100644 --- a/internal/controllers/topology/cluster/reconcile_state.go +++ b/internal/controllers/topology/cluster/reconcile_state.go @@ -240,33 +240,15 @@ func (r *Reconciler) callAfterClusterUpgrade(ctx context.Context, s *scope.Scope if hooks.IsPending(runtimehooksv1.AfterClusterUpgrade, s.Current.Cluster) { // Call the registered extensions for the hook after the cluster is fully upgraded. // A clusters is considered fully upgraded if: - // - Control plane is not upgrading - // - Control plane is not scaling - // - Control plane is not pending an upgrade - // - MachineDeployments are not currently rolling out - // - MAchineDeployments are not about to roll out + // - Control plane is stable (not upgrading, not scaling, not about to upgrade) + // - MachineDeployments are not currently upgrading // - MachineDeployments are not pending an upgrade - - // Check if the control plane is upgrading. - cpUpgrading, err := contract.ControlPlane().IsUpgrading(s.Current.ControlPlane.Object) - if err != nil { - return errors.Wrap(err, "failed to check if control plane is upgrading") - } - - // Check if the control plane is scaling. If the control plane does not support replicas - // it will be considered as not scaling. 
- var cpScaling bool - if s.Blueprint.Topology.ControlPlane.Replicas != nil { - cpScaling, err = contract.ControlPlane().IsScaling(s.Current.ControlPlane.Object) - if err != nil { - return errors.Wrap(err, "failed to check if the control plane is scaling") - } - } - - if !cpUpgrading && !cpScaling && !s.UpgradeTracker.ControlPlane.PendingUpgrade && // Control Plane checks + // - MachineDeployments are not pending create + if isControlPlaneStable(s) && // Control Plane stable checks len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) == 0 && // Machine deployments are not upgrading or not about to upgrade - !s.UpgradeTracker.MachineDeployments.PendingUpgrade() && // No MachineDeployments have an upgrade pending - !s.UpgradeTracker.MachineDeployments.DeferredUpgrade() { // No MachineDeployments have an upgrade deferred + !s.UpgradeTracker.MachineDeployments.IsAnyPendingCreate() && // No MachineDeployments are pending create + !s.UpgradeTracker.MachineDeployments.IsAnyPendingUpgrade() && // No MachineDeployments are pending an upgrade + !s.UpgradeTracker.MachineDeployments.DeferredUpgrade() { // No MachineDeployments have deferred an upgrade // Everything is stable and the cluster can be considered fully upgraded. hookRequest := &runtimehooksv1.AfterClusterUpgradeRequest{ Cluster: *s.Current.Cluster, @@ -307,6 +289,22 @@ func (r *Reconciler) reconcileInfrastructureCluster(ctx context.Context, s *scop // reconcileControlPlane works to bring the current state of a managed topology in line with the desired state. This involves // updating the cluster where needed. func (r *Reconciler) reconcileControlPlane(ctx context.Context, s *scope.Scope) error { + // If the ControlPlane has defined a current or desired MachineHealthCheck attempt to reconcile it. + // MHC changes are not Kubernetes version dependent, therefore proceed with MHC reconciliation + // even if the Control Plane is pending an upgrade. 
+ if s.Desired.ControlPlane.MachineHealthCheck != nil || s.Current.ControlPlane.MachineHealthCheck != nil { + // Reconcile the current and desired state of the MachineHealthCheck. + if err := r.reconcileMachineHealthCheck(ctx, s.Current.ControlPlane.MachineHealthCheck, s.Desired.ControlPlane.MachineHealthCheck); err != nil { + return err + } + } + + // Return early if the control plane is pending an upgrade. + // Do not reconcile the control plane yet to avoid updating the control plane while it is still pending a + // version upgrade. This will prevent the control plane from performing a double rollout. + if s.UpgradeTracker.ControlPlane.IsPendingUpgrade { + return nil + } // If the clusterClass mandates the controlPlane has infrastructureMachines, reconcile it. if s.Blueprint.HasControlPlaneInfrastructureMachine() { ctx, _ := tlog.LoggerFrom(ctx).WithObject(s.Desired.ControlPlane.InfrastructureMachineTemplate).Into(ctx) @@ -361,13 +359,6 @@ func (r *Reconciler) reconcileControlPlane(ctx context.Context, s *scope.Scope) } } - // If the ControlPlane has defined a current or desired MachineHealthCheck attempt to reconcile it. - if s.Desired.ControlPlane.MachineHealthCheck != nil || s.Current.ControlPlane.MachineHealthCheck != nil { - // Reconcile the current and desired state of the MachineHealthCheck. - if err := r.reconcileMachineHealthCheck(ctx, s.Current.ControlPlane.MachineHealthCheck, s.Desired.ControlPlane.MachineHealthCheck); err != nil { - return err - } - } return nil } @@ -457,7 +448,7 @@ func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.S // Create MachineDeployments. 
for _, mdTopologyName := range diff.toCreate { md := s.Desired.MachineDeployments[mdTopologyName] - if err := r.createMachineDeployment(ctx, s.Current.Cluster, md); err != nil { + if err := r.createMachineDeployment(ctx, s, md); err != nil { return err } } @@ -466,7 +457,7 @@ func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.S for _, mdTopologyName := range diff.toUpdate { currentMD := s.Current.MachineDeployments[mdTopologyName] desiredMD := s.Desired.MachineDeployments[mdTopologyName] - if err := r.updateMachineDeployment(ctx, s.Current.Cluster, mdTopologyName, currentMD, desiredMD); err != nil { + if err := r.updateMachineDeployment(ctx, s, mdTopologyName, currentMD, desiredMD); err != nil { return err } } @@ -482,9 +473,23 @@ func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.S } // createMachineDeployment creates a MachineDeployment and the corresponding Templates. -func (r *Reconciler) createMachineDeployment(ctx context.Context, cluster *clusterv1.Cluster, md *scope.MachineDeploymentState) error { - log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object) +func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope, md *scope.MachineDeploymentState) error { + // Do not create the MachineDeployment if it is marked as pending create. + // This will also block MHC creation because creating the MHC without the corresponding + // MachineDeployment is unnecessary. + mdTopologyName, ok := md.Object.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] + if !ok || mdTopologyName == "" { + // Note: This is only an additional safety check and should not happen. The label will always be added when computing + // the desired MachineDeployment. + return errors.Errorf("new MachineDeployment is missing the %q label", clusterv1.ClusterTopologyMachineDeploymentNameLabel) + } + // Return early if the MachineDeployment is pending create. 
+ if s.UpgradeTracker.MachineDeployments.IsPendingCreate(mdTopologyName) { + return nil + } + log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object) + cluster := s.Current.Cluster infraCtx, _ := log.WithObject(md.InfrastructureMachineTemplate).Into(ctx) if err := r.reconcileReferencedTemplate(infraCtx, reconcileReferencedTemplateInput{ cluster: cluster, @@ -522,9 +527,26 @@ func (r *Reconciler) createMachineDeployment(ctx context.Context, cluster *clust } // updateMachineDeployment updates a MachineDeployment. Also rotates the corresponding Templates if necessary. -func (r *Reconciler) updateMachineDeployment(ctx context.Context, cluster *clusterv1.Cluster, mdTopologyName string, currentMD, desiredMD *scope.MachineDeploymentState) error { +func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope, mdTopologyName string, currentMD, desiredMD *scope.MachineDeploymentState) error { log := tlog.LoggerFrom(ctx).WithMachineDeployment(desiredMD.Object) + // Patch MachineHealthCheck for the MachineDeployment. + // MHC changes are not Kubernetes version dependent, therefore proceed with MHC reconciliation + // even if the MachineDeployment is pending an upgrade. + if desiredMD.MachineHealthCheck != nil || currentMD.MachineHealthCheck != nil { + if err := r.reconcileMachineHealthCheck(ctx, currentMD.MachineHealthCheck, desiredMD.MachineHealthCheck); err != nil { + return err + } + } + + // Return early if the MachineDeployment is pending an upgrade. + // Do not reconcile the MachineDeployment yet to avoid updating the MachineDeployment while it is still pending a + // version upgrade. This will prevent the MachineDeployment from performing a double rollout. 
+ if s.UpgradeTracker.MachineDeployments.IsPendingUpgrade(currentMD.Object.Name) { + return nil + } + + cluster := s.Current.Cluster infraCtx, _ := log.WithObject(desiredMD.InfrastructureMachineTemplate).Into(ctx) if err := r.reconcileReferencedTemplate(infraCtx, reconcileReferencedTemplateInput{ cluster: cluster, @@ -549,13 +571,6 @@ func (r *Reconciler) updateMachineDeployment(ctx context.Context, cluster *clust return errors.Wrapf(err, "failed to reconcile %s", tlog.KObj{Obj: currentMD.Object}) } - // Patch MachineHealthCheck for the MachineDeployment. - if desiredMD.MachineHealthCheck != nil || currentMD.MachineHealthCheck != nil { - if err := r.reconcileMachineHealthCheck(ctx, currentMD.MachineHealthCheck, desiredMD.MachineHealthCheck); err != nil { - return err - } - } - // Check differences between current and desired MachineDeployment, and eventually patch the current object. log = log.WithObject(desiredMD.Object) patchHelper, err := r.patchHelperFactory(ctx, currentMD.Object, desiredMD.Object) diff --git a/internal/controllers/topology/cluster/reconcile_state_test.go b/internal/controllers/topology/cluster/reconcile_state_test.go index b1e9cc830bd1..32363fc62abd 100644 --- a/internal/controllers/topology/cluster/reconcile_state_test.go +++ b/internal/controllers/topology/cluster/reconcile_state_test.go @@ -492,58 +492,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { } topologyVersion := "v1.2.3" - lowerVersion := "v1.2.2" - controlPlaneStableAtTopologyVersion := builder.ControlPlane("test1", "cp1"). - WithSpecFields(map[string]interface{}{ - "spec.version": topologyVersion, - "spec.replicas": int64(2), - }). - WithStatusFields(map[string]interface{}{ - "status.version": topologyVersion, - "status.replicas": int64(2), - "status.updatedReplicas": int64(2), - "status.readyReplicas": int64(2), - "status.unavailableReplicas": int64(0), - }). - Build() - controlPlaneStableAtLowerVersion := builder.ControlPlane("test1", "cp1"). 
- WithSpecFields(map[string]interface{}{ - "spec.version": lowerVersion, - "spec.replicas": int64(2), - }). - WithStatusFields(map[string]interface{}{ - "status.version": lowerVersion, - "status.replicas": int64(2), - "status.updatedReplicas": int64(2), - "status.readyReplicas": int64(2), - "status.unavailableReplicas": int64(0), - }). - Build() - controlPlaneUpgrading := builder.ControlPlane("test1", "cp1"). - WithSpecFields(map[string]interface{}{ - "spec.version": topologyVersion, - "spec.replicas": int64(2), - }). - WithStatusFields(map[string]interface{}{ - "status.version": lowerVersion, - "status.replicas": int64(2), - "status.updatedReplicas": int64(2), - "status.readyReplicas": int64(2), - "status.unavailableReplicas": int64(0), - }). - Build() - controlPlaneScaling := builder.ControlPlane("test1", "cp1"). - WithSpecFields(map[string]interface{}{ - "spec.version": topologyVersion, - "spec.replicas": int64(2), - }). - WithStatusFields(map[string]interface{}{ - "status.version": topologyVersion, - "status.replicas": int64(1), - "status.updatedReplicas": int64(1), - "status.readyReplicas": int64(1), - "status.unavailableReplicas": int64(0), - }). + controlPlaneObj := builder.ControlPlane("test1", "cp1"). 
Build() tests := []struct { @@ -572,6 +521,9 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { }, Spec: clusterv1.ClusterSpec{}, }, + ControlPlane: &scope.ControlPlaneState{ + Object: controlPlaneObj, + }, }, HookResponseTracker: scope.NewHookResponseTracker(), UpgradeTracker: scope.NewUpgradeTracker(), @@ -581,6 +533,43 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { wantHookToBeCalled: false, wantError: false, }, + { + name: "hook should not be called if the control plane is starting a new upgrade - hook is marked", + s: &scope.Scope{ + Blueprint: &scope.ClusterBlueprint{ + Topology: &clusterv1.Topology{ + ControlPlane: clusterv1.ControlPlaneTopology{ + Replicas: pointer.Int32(2), + }, + }, + }, + Current: &scope.ClusterState{ + Cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "test-ns", + Annotations: map[string]string{ + runtimev1.PendingHooksAnnotation: "AfterClusterUpgrade", + }, + }, + Spec: clusterv1.ClusterSpec{}, + }, + ControlPlane: &scope.ControlPlaneState{ + Object: controlPlaneObj, + }, + }, + HookResponseTracker: scope.NewHookResponseTracker(), + UpgradeTracker: func() *scope.UpgradeTracker { + ut := scope.NewUpgradeTracker() + ut.ControlPlane.IsStartingUpgrade = true + return ut + }(), + }, + wantMarked: true, + hookResponse: successResponse, + wantHookToBeCalled: false, + wantError: false, + }, { name: "hook should not be called if the control plane is upgrading - hook is marked", s: &scope.Scope{ @@ -603,11 +592,15 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { Spec: clusterv1.ClusterSpec{}, }, ControlPlane: &scope.ControlPlaneState{ - Object: controlPlaneUpgrading, + Object: controlPlaneObj, }, }, HookResponseTracker: scope.NewHookResponseTracker(), - UpgradeTracker: scope.NewUpgradeTracker(), + UpgradeTracker: func() *scope.UpgradeTracker { + ut := scope.NewUpgradeTracker() + ut.ControlPlane.IsUpgrading = true + return ut + }(), }, wantMarked: true, 
hookResponse: successResponse, @@ -636,11 +629,15 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { Spec: clusterv1.ClusterSpec{}, }, ControlPlane: &scope.ControlPlaneState{ - Object: controlPlaneScaling, + Object: controlPlaneObj, }, }, HookResponseTracker: scope.NewHookResponseTracker(), - UpgradeTracker: scope.NewUpgradeTracker(), + UpgradeTracker: func() *scope.UpgradeTracker { + ut := scope.NewUpgradeTracker() + ut.ControlPlane.IsScaling = true + return ut + }(), }, wantMarked: true, hookResponse: successResponse, @@ -648,7 +645,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { wantError: false, }, { - name: "hook should not be called if the control plane is stable at a lower version and is pending an upgrade - hook is marked", + name: "hook should not be called if the control plane is pending an upgrade - hook is marked", s: &scope.Scope{ Blueprint: &scope.ClusterBlueprint{ Topology: &clusterv1.Topology{ @@ -669,13 +666,13 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { Spec: clusterv1.ClusterSpec{}, }, ControlPlane: &scope.ControlPlaneState{ - Object: controlPlaneStableAtLowerVersion, + Object: controlPlaneObj, }, }, HookResponseTracker: scope.NewHookResponseTracker(), UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = true + ut.ControlPlane.IsPendingUpgrade = true return ut }(), }, @@ -706,13 +703,13 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { Spec: clusterv1.ClusterSpec{}, }, ControlPlane: &scope.ControlPlaneState{ - Object: controlPlaneStableAtTopologyVersion, + Object: controlPlaneObj, }, }, HookResponseTracker: scope.NewHookResponseTracker(), UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.MachineDeployments.MarkUpgrading("md1") return ut }(), @@ -723,7 +720,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) 
{ wantError: false, }, { - name: "hook should not be called if the control plane is stable at desired version but MDs are pending upgrade - hook is marked", + name: "hook should not be called if the control plane is stable at desired version but MDs are pending create - hook is marked", s: &scope.Scope{ Blueprint: &scope.ClusterBlueprint{ Topology: &clusterv1.Topology{ @@ -744,13 +741,49 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { Spec: clusterv1.ClusterSpec{}, }, ControlPlane: &scope.ControlPlaneState{ - Object: controlPlaneStableAtTopologyVersion, + Object: controlPlaneObj, + }}, + HookResponseTracker: scope.NewHookResponseTracker(), + UpgradeTracker: func() *scope.UpgradeTracker { + ut := scope.NewUpgradeTracker() + ut.ControlPlane.IsPendingUpgrade = false + ut.MachineDeployments.MarkPendingCreate("md-topology-1") + return ut + }(), + }, + wantMarked: true, + hookResponse: successResponse, + wantHookToBeCalled: false, + wantError: false, + }, + { + name: "hook should not be called if the control plane is stable at desired version but MDs are pending upgrade - hook is marked", + s: &scope.Scope{ + Blueprint: &scope.ClusterBlueprint{ + Topology: &clusterv1.Topology{ + ControlPlane: clusterv1.ControlPlaneTopology{ + Replicas: pointer.Int32(2), + }, }, }, + Current: &scope.ClusterState{ + Cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "test-ns", + Annotations: map[string]string{ + runtimev1.PendingHooksAnnotation: "AfterClusterUpgrade", + }, + }, + Spec: clusterv1.ClusterSpec{}, + }, + ControlPlane: &scope.ControlPlaneState{ + Object: controlPlaneObj, + }}, HookResponseTracker: scope.NewHookResponseTracker(), UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.MachineDeployments.MarkPendingUpgrade("md1") return ut }(), @@ -782,13 +815,13 @@ func TestReconcile_callAfterClusterUpgrade(t 
*testing.T) { Spec: clusterv1.ClusterSpec{}, }, ControlPlane: &scope.ControlPlaneState{ - Object: controlPlaneStableAtTopologyVersion, + Object: controlPlaneObj, }, }, HookResponseTracker: scope.NewHookResponseTracker(), UpgradeTracker: func() *scope.UpgradeTracker { ut := scope.NewUpgradeTracker() - ut.ControlPlane.PendingUpgrade = false + ut.ControlPlane.IsPendingUpgrade = false ut.MachineDeployments.MarkDeferredUpgrade("md1") return ut }(), @@ -824,7 +857,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { }, }, ControlPlane: &scope.ControlPlaneState{ - Object: controlPlaneStableAtTopologyVersion, + Object: controlPlaneObj, }, }, HookResponseTracker: scope.NewHookResponseTracker(), @@ -861,7 +894,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { }, }, ControlPlane: &scope.ControlPlaneState{ - Object: controlPlaneStableAtTopologyVersion, + Object: controlPlaneObj, }, }, HookResponseTracker: scope.NewHookResponseTracker(), @@ -1182,12 +1215,16 @@ func TestReconcileControlPlane(t *testing.T) { gvk.Kind = "KindChanged" infrastructureMachineTemplateWithIncompatibleChanges.SetGroupVersionKind(gvk) + upgradeTrackerWithControlPlanePendingUpgrade := scope.NewUpgradeTracker() + upgradeTrackerWithControlPlanePendingUpgrade.ControlPlane.IsPendingUpgrade = true + tests := []struct { name string class *scope.ControlPlaneBlueprint original *scope.ControlPlaneState controlPlaneExternalChanges string machineInfrastructureExternalChanges string + upgradeTracker *scope.UpgradeTracker desired *scope.ControlPlaneState want *scope.ControlPlaneState wantRotation bool @@ -1210,6 +1247,15 @@ func TestReconcileControlPlane(t *testing.T) { want: &scope.ControlPlaneState{Object: controlPlaneWithoutInfrastructureWithChanges.DeepCopy()}, wantErr: false, }, + { + name: "Should not update the ControlPlane if ControlPlane is pending upgrade", + class: ccWithoutControlPlaneInfrastructure, + upgradeTracker: upgradeTrackerWithControlPlanePendingUpgrade, + original: 
&scope.ControlPlaneState{Object: controlPlaneWithoutInfrastructure.DeepCopy()}, + desired: &scope.ControlPlaneState{Object: controlPlaneWithoutInfrastructureWithChanges.DeepCopy()}, + want: &scope.ControlPlaneState{Object: controlPlaneWithoutInfrastructure.DeepCopy()}, + wantErr: false, + }, { name: "Should preserve external changes to ControlPlane without machine infrastructure", class: ccWithoutControlPlaneInfrastructure, @@ -1311,6 +1357,9 @@ func TestReconcileControlPlane(t *testing.T) { Ref: contract.ObjToRef(tt.class.InfrastructureMachineTemplate), } } + if tt.upgradeTracker != nil { + s.UpgradeTracker = tt.upgradeTracker + } s.Current.ControlPlane = &scope.ControlPlaneState{} if tt.original != nil { @@ -1644,12 +1693,17 @@ func TestReconcileMachineDeployments(t *testing.T) { bootstrapTemplate1 := builder.TestBootstrapTemplate(metav1.NamespaceDefault, "bootstrap-config-1").Build() md1 := newFakeMachineDeploymentTopologyState("md-1", infrastructureMachineTemplate1, bootstrapTemplate1, nil) + upgradeTrackerWithMD1PendingCreate := scope.NewUpgradeTracker() + upgradeTrackerWithMD1PendingCreate.MachineDeployments.MarkPendingCreate("md-1-topology") + infrastructureMachineTemplate2 := builder.TestInfrastructureMachineTemplate(metav1.NamespaceDefault, "infrastructure-machine-2").Build() bootstrapTemplate2 := builder.TestBootstrapTemplate(metav1.NamespaceDefault, "bootstrap-config-2").Build() md2 := newFakeMachineDeploymentTopologyState("md-2", infrastructureMachineTemplate2, bootstrapTemplate2, nil) infrastructureMachineTemplate2WithChanges := infrastructureMachineTemplate2.DeepCopy() g.Expect(unstructured.SetNestedField(infrastructureMachineTemplate2WithChanges.Object, "foo", "spec", "template", "spec", "foo")).To(Succeed()) md2WithRotatedInfrastructureMachineTemplate := newFakeMachineDeploymentTopologyState("md-2", infrastructureMachineTemplate2WithChanges, bootstrapTemplate2, nil) + upgradeTrackerWithMD2PendingUpgrade := scope.NewUpgradeTracker() + 
upgradeTrackerWithMD2PendingUpgrade.MachineDeployments.MarkPendingUpgrade("md-2") infrastructureMachineTemplate3 := builder.TestInfrastructureMachineTemplate(metav1.NamespaceDefault, "infrastructure-machine-3").Build() bootstrapTemplate3 := builder.TestBootstrapTemplate(metav1.NamespaceDefault, "bootstrap-config-3").Build() @@ -1725,6 +1779,7 @@ func TestReconcileMachineDeployments(t *testing.T) { name string current []*scope.MachineDeploymentState desired []*scope.MachineDeploymentState + upgradeTracker *scope.UpgradeTracker want []*scope.MachineDeploymentState wantInfrastructureMachineTemplateRotation map[string]bool wantBootstrapTemplateRotation map[string]bool @@ -1737,6 +1792,14 @@ func TestReconcileMachineDeployments(t *testing.T) { want: []*scope.MachineDeploymentState{md1}, wantErr: false, }, + { + name: "Should not create desired MachineDeployment if the current does not exists yet and it marked as pending create", + current: nil, + upgradeTracker: upgradeTrackerWithMD1PendingCreate, + desired: []*scope.MachineDeploymentState{md1}, + want: nil, + wantErr: false, + }, { name: "No-op if current MachineDeployment is equal to desired", current: []*scope.MachineDeploymentState{md1}, @@ -1752,6 +1815,15 @@ func TestReconcileMachineDeployments(t *testing.T) { wantInfrastructureMachineTemplateRotation: map[string]bool{"md-2": true}, wantErr: false, }, + { + name: "Should not update MachineDeployment if MachineDeployment is pending upgrade", + current: []*scope.MachineDeploymentState{md2}, + desired: []*scope.MachineDeploymentState{md2WithRotatedInfrastructureMachineTemplate}, + upgradeTracker: upgradeTrackerWithMD2PendingUpgrade, + want: []*scope.MachineDeploymentState{md2}, + wantInfrastructureMachineTemplateRotation: map[string]bool{"md-2": false}, + wantErr: false, + }, { name: "Should update MachineDeployment with BootstrapTemplate rotation", current: []*scope.MachineDeploymentState{md3}, @@ -1849,6 +1921,10 @@ func TestReconcileMachineDeployments(t 
*testing.T) { s.Desired = &scope.ClusterState{MachineDeployments: toMachineDeploymentTopologyStateMap(tt.desired)} + if tt.upgradeTracker != nil { + s.UpgradeTracker = tt.upgradeTracker + } + r := Reconciler{ Client: env, patchHelperFactory: serverSideApplyPatchHelperFactory(env, ssa.NewCache()), @@ -1865,6 +1941,11 @@ func TestReconcileMachineDeployments(t *testing.T) { g.Expect(env.GetAPIReader().List(ctx, &gotMachineDeploymentList, &client.ListOptions{Namespace: namespace.GetName()})).To(Succeed()) g.Expect(gotMachineDeploymentList.Items).To(HaveLen(len(tt.want))) + if tt.want == nil { + // No machine deployments should exist. + g.Expect(gotMachineDeploymentList.Items).To(BeEmpty()) + } + for _, wantMachineDeploymentState := range tt.want { for _, gotMachineDeployment := range gotMachineDeploymentList.Items { if wantMachineDeploymentState.Object.Name != gotMachineDeployment.Name { diff --git a/internal/controllers/topology/cluster/scope/hookresponsetracker.go b/internal/controllers/topology/cluster/scope/hookresponsetracker.go index 92e26115ed4c..e66a3ac718ec 100644 --- a/internal/controllers/topology/cluster/scope/hookresponsetracker.go +++ b/internal/controllers/topology/cluster/scope/hookresponsetracker.go @@ -44,6 +44,26 @@ func (h *HookResponseTracker) Add(hook runtimecatalog.Hook, response runtimehook h.responses[hookName] = response } +// IsBlocking returns true if the hook returned a blocking response. +// If the hook is not called or did not return a blocking response it returns false. +func (h *HookResponseTracker) IsBlocking(hook runtimecatalog.Hook) bool { + hookName := runtimecatalog.HookName(hook) + response, ok := h.responses[hookName] + if !ok { + return false + } + retryableResponse, ok := response.(runtimehooksv1.RetryResponseObject) + if !ok { + // Not a retryable response. Cannot be blocking. + return false + } + if retryableResponse.GetRetryAfterSeconds() == 0 { + // Not a blocking response. 
+ return false + } + return true +} + // AggregateRetryAfter calculates the lowest non-zero retryAfterSeconds time from all the tracked responses. func (h *HookResponseTracker) AggregateRetryAfter() time.Duration { res := int32(0) diff --git a/internal/controllers/topology/cluster/scope/hookresponsetracker_test.go b/internal/controllers/topology/cluster/scope/hookresponsetracker_test.go index c4e5e06d3667..6ced86ba9189 100644 --- a/internal/controllers/topology/cluster/scope/hookresponsetracker_test.go +++ b/internal/controllers/topology/cluster/scope/hookresponsetracker_test.go @@ -21,6 +21,7 @@ import ( "time" . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog" runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1" @@ -145,3 +146,59 @@ func TestHookResponseTracker_AggregateMessage(t *testing.T) { g.Expect(hrt.AggregateMessage()).To(Equal("")) }) } + +func TestHookResponseTracker_IsBlocking(t *testing.T) { + nonBlockingBeforeClusterCreateResponse := &runtimehooksv1.BeforeClusterCreateResponse{ + CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ + RetryAfterSeconds: int32(0), + CommonResponse: runtimehooksv1.CommonResponse{ + Status: runtimehooksv1.ResponseStatusSuccess, + }, + }, + } + blockingBeforeClusterCreateResponse := &runtimehooksv1.BeforeClusterCreateResponse{ + CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ + RetryAfterSeconds: int32(10), + CommonResponse: runtimehooksv1.CommonResponse{ + Status: runtimehooksv1.ResponseStatusSuccess, + }, + }, + } + + afterClusterUpgradeResponse := &runtimehooksv1.AfterClusterUpgradeResponse{ + TypeMeta: metav1.TypeMeta{}, + CommonResponse: runtimehooksv1.CommonResponse{}, + } + + t.Run("should return true if the tracker received a blocking response for the hook", func(t *testing.T) { + g := NewWithT(t) + + hrt := NewHookResponseTracker() + hrt.Add(runtimehooksv1.BeforeClusterCreate, 
blockingBeforeClusterCreateResponse) + + g.Expect(hrt.IsBlocking(runtimehooksv1.BeforeClusterCreate)).To(BeTrue()) + }) + + t.Run("should return false if the tracker received a non blocking response for the hook", func(t *testing.T) { + g := NewWithT(t) + + hrt := NewHookResponseTracker() + hrt.Add(runtimehooksv1.BeforeClusterCreate, nonBlockingBeforeClusterCreateResponse) + + g.Expect(hrt.IsBlocking(runtimehooksv1.BeforeClusterCreate)).To(BeFalse()) + }) + + t.Run("should return false if the tracker did not receive a response for the hook", func(t *testing.T) { + g := NewWithT(t) + hrt := NewHookResponseTracker() + g.Expect(hrt.IsBlocking(runtimehooksv1.BeforeClusterCreate)).To(BeFalse()) + }) + + t.Run("should return false if the hook is non-blocking", func(t *testing.T) { + g := NewWithT(t) + hrt := NewHookResponseTracker() + // AfterClusterUpgradeHook is non-blocking. + hrt.Add(runtimehooksv1.AfterClusterUpgrade, afterClusterUpgradeResponse) + g.Expect(hrt.IsBlocking(runtimehooksv1.AfterClusterUpgrade)).To(BeFalse()) + }) +} diff --git a/internal/controllers/topology/cluster/scope/upgradetracker.go b/internal/controllers/topology/cluster/scope/upgradetracker.go index dae15e67c101..3ac1590365a9 100644 --- a/internal/controllers/topology/cluster/scope/upgradetracker.go +++ b/internal/controllers/topology/cluster/scope/upgradetracker.go @@ -26,28 +26,76 @@ type UpgradeTracker struct { // ControlPlaneUpgradeTracker holds the current upgrade status of the Control Plane. type ControlPlaneUpgradeTracker struct { - // PendingUpgrade is true if the control plane version needs to be updated. False otherwise. - PendingUpgrade bool - - // IsProvisioning is true if the control plane is being provisioned for the first time. False otherwise. + // IsPendingUpgrade is true if the Control Plane version needs to be updated. False otherwise. + // If IsPendingUpgrade is true it also means the Control Plane is not going to pick up the new version + // in the current reconcile loop. 
+ // Example cases when IsPendingUpgrade is set to true: + // - Upgrade is blocked by BeforeClusterUpgrade hook + // - Upgrade is blocked because the current ControlPlane is not stable (provisioning OR scaling OR upgrading) + // - Upgrade is blocked because any of the current MachineDeployments are upgrading. + IsPendingUpgrade bool + + // IsProvisioning is true if the current Control Plane is being provisioned for the first time. False otherwise. IsProvisioning bool - // IsUpgrading is true if the control plane is in the middle of an upgrade. - // Note: Refer to control plane contract for definition of upgrading. + // IsUpgrading is true if the Control Plane is in the middle of an upgrade. + // Note: Refer to Control Plane contract for definition of upgrading. + // IsUpgrading is set to true if the current ControlPlane (ControlPlane at the beginning of the reconcile) + // is upgrading. + // Note: IsUpgrading only represents the current ControlPlane state. If the Control Plane is about to pick up the + // version in the reconcile loop IsUpgrading will not be true, because the current ControlPlane is not upgrading, + // the desired ControlPlane is. + // Also look at: IsStartingUpgrade. IsUpgrading bool - // IsScaling is true if the control plane is in the middle of a scale operation. + // IsStartingUpgrade is true if the Control Plane is picking up the new version in the current reconcile loop. + // If IsStartingUpgrade is true it implies that the desired Control Plane version and the current Control Plane + // versions are different. + IsStartingUpgrade bool + + // IsScaling is true if the current Control Plane is scaling. False otherwise. + // IsScaling only represents the state of the current Control Plane. IsScaling does not represent the state + // of the desired Control Plane. + // Example: + // - IsScaling will be true if the current ControlPlane is scaling. 
+ // - IsScaling will not be true if the current Control Plane is stable and the reconcile loop is going to scale the Control Plane. // Note: Refer to control plane contract for definition of scaling. + // Note: IsScaling will be false if the Control Plane does not support replicas. IsScaling bool } -// MachineDeploymentUpgradeTracker holds the current upgrade status and makes upgrade -// decisions for MachineDeployments. +// MachineDeploymentUpgradeTracker holds the current upgrade status of MachineDeployments. type MachineDeploymentUpgradeTracker struct { - pendingNames sets.Set[string] - deferredNames sets.Set[string] - upgradingNames sets.Set[string] - holdUpgrades bool + // pendingCreateTopologyNames is the set of MachineDeployment topology names that are newly added to the + // Cluster Topology but will not be created in the current reconcile loop. + // By marking a MachineDeployment topology as pendingCreate we skip creating the MachineDeployment. + // Nb. We use MachineDeployment topology names instead of MachineDeployment names because the new MachineDeployment + // names can keep changing for each reconcile loop leading to continuous updates to the TopologyReconciled condition. + pendingCreateTopologyNames sets.Set[string] + + // pendingUpgradeNames is the set of MachineDeployment names that are not going to pick up the new version + // in the current reconcile loop. + // By marking a MachineDeployment as pendingUpgrade we skip reconciling the MachineDeployment. + pendingUpgradeNames sets.Set[string] + + // deferredNames is the set of MachineDeployment names that are not going to pick up the new version + // in the current reconcile loop because they are deferred by the user. + // Note: If a MachineDeployment is marked as deferred it should also be marked as pendingUpgrade. + deferredNames sets.Set[string] + + // upgradingNames is the set of MachineDeployment names that are upgrading. 
This set contains the names of + // MachineDeployments that are currently upgrading and the names of MachineDeployments that will pick up the upgrade + // in the current reconcile loop. + // Note: This information is used to: + // - decide if ControlPlane can be upgraded. + // - calculate MachineDeployment upgrade concurrency. + // - update TopologyReconciled Condition. + // - decide if the AfterClusterUpgrade hook can be called. + upgradingNames sets.Set[string] + + // maxMachineDeploymentUpgradeConcurrency defines the maximum number of MachineDeployments that should be in an + // upgrading state. This includes the MachineDeployments that are currently upgrading and the MachineDeployments that + // will start the upgrade after the current reconcile loop. maxMachineDeploymentUpgradeConcurrency int } @@ -82,7 +130,8 @@ func NewUpgradeTracker(opts ...UpgradeTrackerOption) *UpgradeTracker { } return &UpgradeTracker{ MachineDeployments: MachineDeploymentUpgradeTracker{ - pendingNames: sets.Set[string]{}, + pendingCreateTopologyNames: sets.Set[string]{}, + pendingUpgradeNames: sets.Set[string]{}, deferredNames: sets.Set[string]{}, upgradingNames: sets.Set[string]{}, maxMachineDeploymentUpgradeConcurrency: options.maxMDUpgradeConcurrency, @@ -103,42 +152,57 @@ func (m *MachineDeploymentUpgradeTracker) UpgradingNames() []string { return sets.List(m.upgradingNames) } -// HoldUpgrades is used to set if any subsequent upgrade operations should be paused, -// e.g. because a AfterControlPlaneUpgrade hook response asked to do so. -// If HoldUpgrades is called with `true` then AllowUpgrade would return false. -func (m *MachineDeploymentUpgradeTracker) HoldUpgrades(val bool) { - m.holdUpgrades = val +// UpgradeConcurrencyReached returns true if the number of MachineDeployments upgrading is at the concurrency limit. 
+func (m *MachineDeploymentUpgradeTracker) UpgradeConcurrencyReached() bool { + return m.upgradingNames.Len() >= m.maxMachineDeploymentUpgradeConcurrency } -// AllowUpgrade returns true if a MachineDeployment is allowed to upgrade, -// returns false otherwise. -// Note: If AllowUpgrade returns true the machine deployment will pick up -// the topology version. This will eventually trigger a machine deployment -// rollout. -func (m *MachineDeploymentUpgradeTracker) AllowUpgrade() bool { - if m.holdUpgrades { - return false - } - return m.upgradingNames.Len() < m.maxMachineDeploymentUpgradeConcurrency +// MarkPendingCreate marks a machine deployment topology that is pending to be created. +// This is generally used to capture machine deployments that are yet to be created +// because the control plane is not yet stable. +func (m *MachineDeploymentUpgradeTracker) MarkPendingCreate(mdTopologyName string) { + m.pendingCreateTopologyNames.Insert(mdTopologyName) +} + +// IsPendingCreate returns true if the MachineDeployment topology is marked as pending create. +func (m *MachineDeploymentUpgradeTracker) IsPendingCreate(mdTopologyName string) bool { + return m.pendingCreateTopologyNames.Has(mdTopologyName) +} + +// IsAnyPendingCreate returns true if any of the machine deployments are pending +// to be created. Returns false, otherwise. +func (m *MachineDeploymentUpgradeTracker) IsAnyPendingCreate() bool { + return len(m.pendingCreateTopologyNames) != 0 +} + +// PendingCreateTopologyNames returns the list of machine deployment topology names that +// are pending create. +func (m *MachineDeploymentUpgradeTracker) PendingCreateTopologyNames() []string { + return sets.List(m.pendingCreateTopologyNames) } // MarkPendingUpgrade marks a machine deployment as in need of an upgrade. // This is generally used to capture machine deployments that have not yet // picked up the topology version. 
func (m *MachineDeploymentUpgradeTracker) MarkPendingUpgrade(name string) { - m.pendingNames.Insert(name) + m.pendingUpgradeNames.Insert(name) } -// PendingUpgradeNames returns the list of machine deployment names that -// are pending an upgrade. -func (m *MachineDeploymentUpgradeTracker) PendingUpgradeNames() []string { - return sets.List(m.pendingNames) +// IsPendingUpgrade returns true if the MachineDeployment is marked as pending upgrade. +func (m *MachineDeploymentUpgradeTracker) IsPendingUpgrade(name string) bool { + return m.pendingUpgradeNames.Has(name) } -// PendingUpgrade returns true if any of the machine deployments are pending +// IsAnyPendingUpgrade returns true if any of the machine deployments are pending // an upgrade. Returns false, otherwise. -func (m *MachineDeploymentUpgradeTracker) PendingUpgrade() bool { - return len(m.pendingNames) != 0 +func (m *MachineDeploymentUpgradeTracker) IsAnyPendingUpgrade() bool { + return len(m.pendingUpgradeNames) != 0 +} + +// PendingUpgradeNames returns the list of machine deployment names that +// are pending an upgrade. +func (m *MachineDeploymentUpgradeTracker) PendingUpgradeNames() []string { + return sets.List(m.pendingUpgradeNames) } // MarkDeferredUpgrade marks that the upgrade for a MachineDeployment