From 95b8a595eeff7adcd7058f915088a79d48c769fb Mon Sep 17 00:00:00 2001 From: Cesar Wong Date: Tue, 14 May 2024 14:16:09 -0400 Subject: [PATCH] Improve resiliency of size tagging when hostedcluster KAS down When the kube apiserver of a hosted cluster is not available, the replica status of nodepools will not be accurate because the CAPI controllers can no longer get node counts from the API server. This commit improves the handling of this situation with 2 changes: - Switches to use .spec.replicas to determine node count of nodepools that do not have autoscaling turned on. - Once a hosted cluster has been tagged with a size, only if the kube apiserver of the hosted cluster is available is the hosted cluster allowed to move to a different size. --- .../hostedclustersizing_controller.go | 23 ++++++++++++++++++- .../hostedclustersizing_controller_test.go | 6 ++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller.go b/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller.go index dcad33b8afc..dae714dfd77 100644 --- a/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller.go +++ b/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller.go @@ -14,6 +14,7 @@ import ( "github.com/openshift/hypershift/support/releaseinfo" hyperutil "github.com/openshift/hypershift/support/util" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" @@ -216,7 +217,9 @@ func (r *reconciler) reconcile( // first, we figure out the node count for the hosted cluster var nodeCount uint32 + nodeCountRequiresAPIServer := false if hccoReportsNodeCount { + nodeCountRequiresAPIServer = true hostedControlPlane, err := r.hostedControlPlaneForHostedCluster(ctx, hostedCluster) if err != nil { return nil, nil 
@@ -232,7 +235,25 @@ func (r *reconciler) reconcile( } for _, nodePool := range nodePools.Items { - nodeCount += uint32(nodePool.Status.Replicas) + var replicas uint32 + // If autoscaling, the replicas should be returned from status + if nodePool.Spec.AutoScaling != nil { + nodeCountRequiresAPIServer = true + replicas = uint32(nodePool.Status.Replicas) + } else if nodePool.Spec.Replicas != nil { + replicas = uint32(*nodePool.Spec.Replicas) + } + nodeCount += replicas + } + } + + if sizeClassLabelPresent && nodeCountRequiresAPIServer { + // If already assigned a size, we cannot further transition to other sizes if the kube apiserver is not available + // and it's required to be available for node count calculation + kasAvailableCondition := meta.FindStatusCondition(hostedCluster.Status.Conditions, string(hypershiftv1beta1.KubeAPIServerAvailable)) + if kasAvailableCondition == nil || kasAvailableCondition.Status != metav1.ConditionTrue { + logger.Info("HostedCluster kube apiserver is not available, skipping sizing reconciliation") + return nil, nil } } diff --git a/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller_test.go b/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller_test.go index 0e62025cf08..b648a6d691f 100644 --- a/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller_test.go +++ b/hypershift-operator/controllers/hostedclustersizing/hostedclustersizing_controller_test.go @@ -106,9 +106,9 @@ func TestSizingController_Reconcile(t *testing.T) { }, nodePoolsForHostedCluster: func(_ context.Context, _ *hypershiftv1beta1.HostedCluster) (*hypershiftv1beta1.NodePoolList, error) { return &hypershiftv1beta1.NodePoolList{Items: []hypershiftv1beta1.NodePool{ - {Status: hypershiftv1beta1.NodePoolStatus{Replicas: 10}}, - {Status: hypershiftv1beta1.NodePoolStatus{Replicas: 3}}, - {Status: hypershiftv1beta1.NodePoolStatus{Replicas: 17}}, + {Spec: 
hypershiftv1beta1.NodePoolSpec{Replicas: ptr.To[int32](10)}}, + {Spec: hypershiftv1beta1.NodePoolSpec{Replicas: ptr.To[int32](3)}}, + {Spec: hypershiftv1beta1.NodePoolSpec{AutoScaling: &hypershiftv1beta1.NodePoolAutoScaling{Min: 1, Max: 20}}, Status: hypershiftv1beta1.NodePoolStatus{Replicas: 17}}, }}, nil }, expected: &action{applyCfg: &hypershiftv1beta1applyconfigurations.HostedClusterApplyConfiguration{