From dfcce594af9f8529dad5e8b4aee0cf982e79ddb7 Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Mon, 15 Apr 2024 12:15:05 +0800 Subject: [PATCH] Do not update MS status when unable to get workload cluster or machine node --- .../controllers/machineset/machineset_controller.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index aadce8caca5e..2ee582b48b3a 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -187,6 +187,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re // Requeue if the reconcile failed because the ClusterCacheTracker was locked for // the current cluster because of concurrent access. if errors.Is(err, remote.ErrClusterLocked) { + if aggr, ok := err.(kerrors.Aggregate); ok && len(aggr.Errors()) > 1 { + // Print the errors if it's not only ErrClusterLocked. + log.Info(aggr.Error()) + } log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker") return ctrl.Result{RequeueAfter: time.Minute}, nil } @@ -852,7 +856,8 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool { } // updateStatus updates the Status field for the MachineSet -// It checks for the current state of the replicas and updates the Status of the MachineSet. +// It checks for the current state of the replicas and updates the Status field of the MachineSet. +// When unable to retrieve the Node status, it returns an error and won't update the Status field of the MachineSet. 
func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error { log := ctrl.LoggerFrom(ctx) newStatus := ms.Status.DeepCopy() @@ -890,8 +895,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste node, err := r.getMachineNode(ctx, cluster, machine) if err != nil && machine.GetDeletionTimestamp().IsZero() { - log.Error(err, "Unable to retrieve Node status", "Node", klog.KObj(node)) - continue + return errors.Wrapf(err, "unable to retrieve the status of Node %s", klog.KObj(node)) } if noderefutil.IsNodeReady(node) { @@ -964,6 +968,9 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus } node := &corev1.Node{} if err := remoteClient.Get(ctx, client.ObjectKey{Name: machine.Status.NodeRef.Name}, node); err != nil { + if apierrors.IsNotFound(err) { + return nil, nil + } return nil, errors.Wrapf(err, "error retrieving node %s for machine %s/%s", machine.Status.NodeRef.Name, machine.Namespace, machine.Name) } return node, nil