Skip to content

Commit

Permalink
Improve resiliency of size tagging when hosted cluster KAS is down
Browse files Browse the repository at this point in the history
When the kube apiserver of a hosted cluster is not available, the
replica status of nodepools will not be accurate because the CAPI
controllers can no longer get node counts from the API server. This
commit improves the handling of this situation with 2 changes:
- Switches to use .spec.replicas to determine node count of nodepools
  that do not have autoscaling turned on.
- Once a hosted cluster has been tagged with a size, it is allowed to
  move to a different size only while its kube apiserver is
  available.
  • Loading branch information
csrwng committed May 16, 2024
1 parent 97d244b commit e468da3
Show file tree
Hide file tree
Showing 2 changed files with 344 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
hyperutil "github.com/openshift/hypershift/support/util"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
Expand Down Expand Up @@ -218,6 +219,12 @@ func (r *reconciler) reconcile(
return nil, fmt.Errorf("failed to determine if HCCO reports node count: %w", err)
}

// Check whether the Kube API Server is available. This tells us whether the node count from the HCCO or nodepool.status.replicas can be trusted.
// If the Kube API Server is not available, neither of those sources is reliable.
// Ref: kubernetes-sigs/cluster-api#10195
kasAvailableCondition := meta.FindStatusCondition(hostedCluster.Status.Conditions, string(hypershiftv1beta1.KubeAPIServerAvailable))
kasAvailable := kasAvailableCondition != nil && kasAvailableCondition.Status == metav1.ConditionTrue

// first, we figure out the node count for the hosted cluster
var nodeCount uint32
if hccoReportsNodeCount {
Expand All @@ -236,7 +243,18 @@ func (r *reconciler) reconcile(
}

for _, nodePool := range nodePools.Items {
nodeCount += uint32(nodePool.Status.Replicas)
var replicas uint32
// If autoscaling, the replicas should be returned from status
if nodePool.Spec.AutoScaling != nil {
// If the Kube API Server is not available, and we already have a size label, skip processing
if !kasAvailable && sizeClassLabelPresent {
return nil, nil
}
replicas = uint32(nodePool.Status.Replicas)
} else if nodePool.Spec.Replicas != nil {
replicas = uint32(*nodePool.Spec.Replicas)
}
nodeCount += replicas
}
}

Expand Down
Loading

0 comments on commit e468da3

Please sign in to comment.