From 81a64c22e7eebcdc7bb07f1d2b64692785288663 Mon Sep 17 00:00:00 2001 From: Andrea Frittoli Date: Wed, 10 Jun 2020 17:14:56 +0100 Subject: [PATCH] Use permanent errors in the taskrun reconciler Instead of returning nil on error during reconcile, if the error is transient return it, so that the key is requeue. If the error is permanent return a permanent error, so that the key is not requeued. --- pkg/reconciler/taskrun/taskrun.go | 47 +++++++++------- pkg/reconciler/taskrun/taskrun_test.go | 75 ++++++++++++++++++++------ 2 files changed, 85 insertions(+), 37 deletions(-) diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index dbf6bb16f1d..fa1132da088 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -48,6 +48,7 @@ import ( "k8s.io/apimachinery/pkg/types" "knative.dev/pkg/apis" "knative.dev/pkg/configmap" + "knative.dev/pkg/controller" pkgreconciler "knative.dev/pkg/reconciler" "knative.dev/pkg/tracker" ) @@ -179,7 +180,7 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1beta1.TaskRun) pkg c.Logger.Errorf("TaskRun prepare error: %v", err.Error()) // We only return an error if update failed, otherwise we don't want to // reconcile an invalid TaskRun anymore - return c.finishReconcileUpdateEmitEvents(tr, nil, nil) + return c.finishReconcileUpdateEmitEvents(tr, nil, err) } // Store the condition before reconcile @@ -200,6 +201,9 @@ func (c *Reconciler) finishReconcileUpdateEmitEvents(tr *v1beta1.TaskRun, before _, err := c.updateLabelsAndAnnotations(tr) events.EmitError(c.Recorder, err, tr) + if controller.IsPermanentError(previousError) { + return controller.NewPermanentError(multierror.Append(previousError, err)) + } return multierror.Append(previousError, err).ErrorOrNil() } @@ -239,7 +243,7 @@ func (c *Reconciler) prepare(ctx context.Context, tr *v1beta1.TaskRun) (*v1beta1 } c.Logger.Errorf("Failed to determine Task spec to use for taskrun %s: %v", tr.Name, err) tr.Status.MarkResourceFailed(podconvert.ReasonFailedResolution, err) - return nil, nil, err + return nil, nil, controller.NewPermanentError(err) } // Store the fetched TaskSpec on the TaskRun for auditing @@ -279,19 +283,19 @@ func (c *Reconciler) prepare(ctx context.Context, tr *v1beta1.TaskRun) (*v1beta1 if err != nil { c.Logger.Errorf("Failed to resolve references for taskrun %s: %v", tr.Name, err) tr.Status.MarkResourceFailed(podconvert.ReasonFailedResolution, err) - return nil, nil, err + return nil, nil, controller.NewPermanentError(err) } if err := ValidateResolvedTaskResources(tr.Spec.Params, rtr); err != nil { c.Logger.Errorf("TaskRun %q resources are invalid: %v", tr.Name, err) tr.Status.MarkResourceFailed(podconvert.ReasonFailedValidation, err) - return nil, nil, err + return nil, nil, controller.NewPermanentError(err) } if err := workspace.ValidateBindings(taskSpec.Workspaces, tr.Spec.Workspaces); err != nil { c.Logger.Errorf("TaskRun %q workspaces are invalid: %v", tr.Name, err) tr.Status.MarkResourceFailed(podconvert.ReasonFailedValidation, err) - return nil, nil, err + return nil, nil, controller.NewPermanentError(err) } // Initialize the cloud events if at least a CloudEventResource is defined @@ -353,7 +357,7 @@ func (c *Reconciler) reconcile(ctx context.Context, tr *v1beta1.TaskRun, tr.Status.MarkResourceFailed(volumeclaim.ReasonCouldntCreateWorkspacePVC, fmt.Errorf("Failed to create PVC for TaskRun %s workspaces correctly: %s", fmt.Sprintf("%s/%s", tr.Namespace, tr.Name), err)) - return nil + return controller.NewPermanentError(err) } taskRunWorkspaces := applyVolumeClaimTemplates(tr.Spec.Workspaces, tr.GetOwnerReference()) @@ -363,8 +367,9 @@ func (c *Reconciler) reconcile(ctx context.Context, tr *v1beta1.TaskRun, pod, err = c.createPod(ctx, tr, rtr) if err != nil { - c.handlePodCreationError(tr, err) - return nil + newErr := c.handlePodCreationError(tr, err) + c.Logger.Error("Failed to create task run pod for task %q: %v", tr.Name, newErr) + return newErr } go c.timeoutHandler.WaitTaskRun(tr, tr.Status.StartTime) } @@ -415,7 +420,7 @@ func (c *Reconciler) updateLabelsAndAnnotations(tr *v1beta1.TaskRun) (*v1beta1.T return newTr, nil } -func (c *Reconciler) handlePodCreationError(tr *v1beta1.TaskRun, err error) { +func (c *Reconciler) handlePodCreationError(tr *v1beta1.TaskRun, err error) error { var msg string if isExceededResourceQuotaError(err) { backoff, currentlyBackingOff := c.timeoutHandler.GetBackoff(tr) @@ -429,19 +434,21 @@ func (c *Reconciler) handlePodCreationError(tr *v1beta1.TaskRun, err error) { Reason: podconvert.ReasonExceededResourceQuota, Message: fmt.Sprintf("%s: %v", msg, err), }) + // return a transient error, so that the key is requeued + return err + } + // The pod creation failed, not because of quota issues. The most likely + // reason is that something is wrong with the spec of the Task, that we could + // not check with validation before - i.e. pod template fields + msg = fmt.Sprintf("failed to create task run pod %q: %v. Maybe ", tr.Name, err) + if tr.Spec.TaskRef != nil { + msg += fmt.Sprintf("missing or invalid Task %s/%s", tr.Namespace, tr.Spec.TaskRef.Name) } else { - // The pod creation failed, not because of quota issues. The most likely - // reason is that something is wrong with the spec of the Task, that we could - // not check with validation before - i.e. pod template fields - msg = fmt.Sprintf("failed to create task run pod %q: %v. Maybe ", tr.Name, err) - if tr.Spec.TaskRef != nil { - msg += fmt.Sprintf("missing or invalid Task %s/%s", tr.Namespace, tr.Spec.TaskRef.Name) - } else { - msg += "invalid TaskSpec" - } - tr.Status.MarkResourceFailed(podconvert.ReasonCouldntGetTask, errors.New(msg)) + msg += "invalid TaskSpec" } - c.Logger.Error("Failed to create task run pod for task %q: %v", tr.Name, err) + newErr := controller.NewPermanentError(errors.New(msg)) + tr.Status.MarkResourceFailed(podconvert.ReasonCouldntGetTask, newErr) + return newErr } // failTaskRun stops a TaskRun with the provided Reason diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index d915f79cf9a..9541466667e 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -1326,6 +1326,17 @@ func TestReconcile_SetsStartTime(t *testing.T) { } testAssets, cancel := getTaskRunController(t, d) defer cancel() + clients := testAssets.Clients + + t.Logf("Creating SA %s in %s", "default", "foo") + if _, err := clients.Kube.CoreV1().ServiceAccounts("foo").Create(&corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default", + Namespace: "foo", + }, + }); err != nil { + t.Fatal(err) + } if err := testAssets.Controller.Reconciler.Reconcile(context.Background(), getRunName(taskRun)); err != nil { t.Errorf("expected no error reconciling valid TaskRun but got %v", err) @@ -1476,16 +1487,18 @@ func TestReconcileInvalidTaskRuns(t *testing.T) { taskRun: noTaskRun, reason: podconvert.ReasonFailedResolution, wantEvents: []string{ - "Normal Started ", - "Warning Failed ", + "Normal Started", + "Warning Failed", + "Warning InternalError", }, }, { name: "task run with wrong ref", taskRun: withWrongRef, reason: podconvert.ReasonFailedResolution, wantEvents: []string{ - "Normal Started ", - "Warning Failed ", + "Normal Started", + "Warning Failed", + "Warning InternalError", }, }} @@ -1495,13 +1508,16 @@ func TestReconcileInvalidTaskRuns(t *testing.T) { defer cancel() c := testAssets.Controller clients := testAssets.Clients - err := c.Reconciler.Reconcile(context.Background(), getRunName(tc.taskRun)) + reconcileErr := c.Reconciler.Reconcile(context.Background(), getRunName(tc.taskRun)) - // When a TaskRun is invalid and can't run, we don't want to return an error because - // an error will tell the Reconciler to keep trying to reconcile; instead we want to stop + // When a TaskRun is invalid and can't run, we return a permanent error because + // a regular error will tell the Reconciler to keep trying to reconcile; instead we want to stop // and forget about the Run. - if err != nil { - t.Errorf("Did not expect to see error when reconciling invalid TaskRun but saw %q", err) + if reconcileErr == nil { + t.Fatalf("Expected to see error when reconciling invalid TaskRun but none") + } + if !controller.IsPermanentError(reconcileErr) { + t.Fatalf("Expected to see a permanet error when reconciling invalid TaskRun, got %s instead", reconcileErr) } // Check actions and events @@ -1510,7 +1526,7 @@ func TestReconcileInvalidTaskRuns(t *testing.T) { t.Errorf("expected 3 actions (first: list namespaces) created by the reconciler, got %d. Actions: %#v", len(actions), actions) } - err = checkEvents(testAssets.Recorder, tc.name, tc.wantEvents) + err := checkEvents(testAssets.Recorder, tc.name, tc.wantEvents) if !(err == nil) { t.Errorf(err.Error()) } @@ -2510,8 +2526,12 @@ func TestReconcileWorkspaceMissing(t *testing.T) { defer cancel() clients := testAssets.Clients - if err := testAssets.Controller.Reconciler.Reconcile(context.Background(), getRunName(taskRun)); err != nil { - t.Errorf("expected no error reconciling valid TaskRun but got %v", err) + err := testAssets.Controller.Reconciler.Reconcile(context.Background(), getRunName(taskRun)) + if err == nil { + t.Fatalf("expected error reconciling invalid TaskRun but got none") + } + if !controller.IsPermanentError(err) { + t.Fatalf("Expected to see a permanet error when reconciling invalid TaskRun, got %s instead", err) } tr, err := clients.Pipeline.TektonV1beta1().TaskRuns(taskRun.Namespace).Get(taskRun.Name, metav1.GetOptions{}) @@ -2526,7 +2546,7 @@ func TestReconcileWorkspaceMissing(t *testing.T) { } } if !failedCorrectly { - t.Errorf("Expected TaskRun to fail validation but it did not. Final conditions were:\n%#v", tr.Status.Conditions) + t.Fatalf("Expected TaskRun to fail validation but it did not. Final conditions were:\n%#v", tr.Status.Conditions) } } @@ -2559,7 +2579,8 @@ func TestReconcileTaskResourceResolutionAndValidation(t *testing.T) { wantFailedReason: podconvert.ReasonFailedResolution, wantEvents: []string{ "Normal Started ", - "Warning Failed", + "Warning Failed", // Event about the TaskRun state changed + "Warning InternalError", // Event about the error (generated by the genreconciler) }, }, { desc: "Fail ValidateResolvedTaskResources", @@ -2581,7 +2602,8 @@ func TestReconcileTaskResourceResolutionAndValidation(t *testing.T) { wantFailedReason: podconvert.ReasonFailedValidation, wantEvents: []string{ "Normal Started ", - "Warning Failed", + "Warning Failed", // Event about the TaskRun state changed + "Warning InternalError", // Event about the error (generated by the genreconciler) }, }} { t.Run(tt.desc, func(t *testing.T) { @@ -2591,8 +2613,16 @@ func TestReconcileTaskResourceResolutionAndValidation(t *testing.T) { clients := testAssets.Clients c := testAssets.Controller - if err := c.Reconciler.Reconcile(context.Background(), getRunName(tt.d.TaskRuns[0])); err != nil { - t.Errorf("expected no error reconciling valid TaskRun but got %v", err) + reconcileErr := c.Reconciler.Reconcile(context.Background(), getRunName(tt.d.TaskRuns[0])) + + // When a TaskRun is invalid and can't run, we return a permanent error because + // a regular error will tell the Reconciler to keep trying to reconcile; instead we want to stop + // and forget about the Run. + if reconcileErr == nil { + t.Fatalf("Expected to see error when reconciling invalid TaskRun but none") + } + if !controller.IsPermanentError(reconcileErr) { + t.Fatalf("Expected to see a permanet error when reconciling invalid TaskRun, got %s instead", reconcileErr) } tr, err := clients.Pipeline.TektonV1beta1().TaskRuns(tt.d.TaskRuns[0].Namespace).Get(tt.d.TaskRuns[0].Name, metav1.GetOptions{}) @@ -2622,6 +2652,7 @@ func TestReconcileWorkspaceWithVolumeClaimTemplate(t *testing.T) { taskWithWorkspace := tb.Task("test-task-with-workspace", tb.TaskNamespace("foo"), tb.TaskSpec( tb.TaskWorkspace(workspaceName, "a test task workspace", "", true), + tb.Step("foo", tb.StepName("simple-step"), tb.StepCommand("/mycmd")), )) taskRun := tb.TaskRun("test-taskrun-missing-workspace", tb.TaskRunNamespace("foo"), tb.TaskRunSpec( tb.TaskRunTaskRef(taskWithWorkspace.Name, tb.TaskRefAPIVersion("a1")), @@ -2643,6 +2674,16 @@ func TestReconcileWorkspaceWithVolumeClaimTemplate(t *testing.T) { defer cancel() clients := testAssets.Clients + t.Logf("Creating SA %s in %s", "default", "foo") + if _, err := clients.Kube.CoreV1().ServiceAccounts("foo").Create(&corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: "default", + Namespace: "foo", + }, + }); err != nil { + t.Fatal(err) + } + if err := testAssets.Controller.Reconciler.Reconcile(context.Background(), getRunName(taskRun)); err != nil { t.Errorf("expected no error reconciling valid TaskRun but got %v", err) }