Skip to content

Commit

Permalink
Ensure that PipelineRuns are marked as timed out if a task timed out …
Browse files Browse the repository at this point in the history
…due to the PR timeout

Fixes #5127

We've been seeing sporadic flaky failures for a number of e2e tests for a while now, such as `TestPipelineRunTimeout` and sidecar-related tests. I recently dug into exactly what differed between a success and a failure, specifically for `TestPipelineRunTimeout`, the most frequent of those flakes. I was able to determine that sometimes, the `TaskRun` would be timed out due to the `PipelineRun`-level timeout, but `pr.HasTimedOut` would not return true on the next reconciliation of the `PipelineRun`. This strongly suggests that the `TaskRun` timeout was calculated to end slightly before the `PipelineRun` timeout would end, and then the `PipelineRun` reconciliation happened in the very brief (milliseconds at most) window between the `TaskRun` completing as timed out and the `PipelineRun` timeout being reached.

It's not possible for us to make the end of the generated `TaskRun` timeout exactly match the end of the specified `PipelineRun` timeout, since the `TaskRun`'s `StartTime` won't be set until the `TaskRun` has actually been created, so there'll always be some difference there, as best as I can tell. So I decided to work around this inherent limitation by instead tracking cases where we've set the `TaskRun` timeout based on `PipelineRun.Status.StartTime + PipelineRun.PipelineTimeout(ctx)`, i.e., the `TaskRun` timeout is the difference between elapsed time of the `PipelineRun` and the time at which the `PipelineRun` proper would be timed out.

Then, if all tasks in a `PipelineRun` have completed, and at least one of them has timed out and had its timeout set based on that difference, we know that the `PipelineRun` has timed out, even if `pr.HasTimedOut` is returning false because we haven't quite yet hit the end of the `PipelineRun`'s timeout duration.

Signed-off-by: Andrew Bayer <andrew.bayer@gmail.com>
  • Loading branch information
abayer committed Jul 13, 2022
1 parent 5126d15 commit aca1517
Show file tree
Hide file tree
Showing 10 changed files with 295 additions and 44 deletions.
4 changes: 4 additions & 0 deletions pkg/apis/pipeline/v1alpha1/run_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ type RunSpec struct {
// Workspaces is a list of WorkspaceBindings from volumes to workspaces.
// +optional
Workspaces []v1beta1.WorkspaceBinding `json:"workspaces,omitempty"`

// TimeoutFromParent is set to true if this Run's timeout was set by a parent PipelineRun.
// +optional
TimeoutFromParent bool `json:"timeoutFromParent,omitempty"`
}

// RunSpecStatus defines the taskrun spec status the user can provide
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/pipeline/v1beta1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/apis/pipeline/v1beta1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -2507,6 +2507,10 @@
"description": "Time after which the build times out. Defaults to 1 hour. Specified build timeout should be less than 24h. Refer Go's ParseDuration documentation for expected format: https://golang.org/pkg/time/#ParseDuration",
"$ref": "#/definitions/v1.Duration"
},
"timeoutFromParent": {
"description": "TimeoutFromParent is set to true if this TaskRun's timeout was set by a parent PipelineRun.",
"type": "boolean"
},
"workspaces": {
"description": "Workspaces is a list of WorkspaceBindings from volumes to workspaces.",
"type": "array",
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/pipeline/v1beta1/taskrun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ type TaskRunSpec struct {
// +optional
Timeout *metav1.Duration `json:"timeout,omitempty"`
// PodTemplate holds pod specific configuration
// +optional
PodTemplate *PodTemplate `json:"podTemplate,omitempty"`
// Workspaces is a list of WorkspaceBindings from volumes to workspaces.
// +optional
Expand All @@ -78,7 +79,11 @@ type TaskRunSpec struct {
// +listType=atomic
SidecarOverrides []TaskRunSidecarOverride `json:"sidecarOverrides,omitempty"`
// Compute resources to use for this TaskRun
// +optional
ComputeResources *corev1.ResourceRequirements `json:"computeResources,omitempty"`
// TimeoutFromParent is set to true if this TaskRun's timeout was set by a parent PipelineRun.
// +optional
TimeoutFromParent bool `json:"timeoutFromParent,omitempty"`
}

// TaskRunSpecStatus defines the taskrun spec status the user can provide
Expand Down
44 changes: 27 additions & 17 deletions pkg/reconciler/pipelinerun/pipelinerun.go
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,7 @@ func (c *Reconciler) updateRunsStatusDirectly(pr *v1beta1.PipelineRun) error {
return nil
}

type getTimeoutFunc func(ctx context.Context, pr *v1beta1.PipelineRun, rpt *resources.ResolvedPipelineTask, c clock.PassiveClock) *metav1.Duration
type getTimeoutFunc func(ctx context.Context, pr *v1beta1.PipelineRun, rpt *resources.ResolvedPipelineTask, c clock.PassiveClock) (*metav1.Duration, bool)

func (c *Reconciler) createTaskRuns(ctx context.Context, rpt *resources.ResolvedPipelineTask, pr *v1beta1.PipelineRun, storageBasePath string, getTimeoutFunc getTimeoutFunc) ([]*v1beta1.TaskRun, error) {
var taskRuns []*v1beta1.TaskRun
Expand Down Expand Up @@ -804,6 +804,7 @@ func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, para
rpt.PipelineTask = resources.ApplyPipelineTaskContexts(rpt.PipelineTask)
taskRunSpec := pr.GetTaskRunSpec(rpt.PipelineTask.Name)
params = append(params, rpt.PipelineTask.Params...)
trTimeout, timeoutFromParent := getTimeoutFunc(ctx, pr, rpt, c.Clock)
tr = &v1beta1.TaskRun{
ObjectMeta: metav1.ObjectMeta{
Name: taskRunName,
Expand All @@ -815,10 +816,11 @@ func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, para
Spec: v1beta1.TaskRunSpec{
Params: params,
ServiceAccountName: taskRunSpec.TaskServiceAccountName,
Timeout: getTimeoutFunc(ctx, pr, rpt, c.Clock),
Timeout: trTimeout,
PodTemplate: taskRunSpec.TaskPodTemplate,
StepOverrides: taskRunSpec.StepOverrides,
SidecarOverrides: taskRunSpec.SidecarOverrides,
TimeoutFromParent: timeoutFromParent,
}}

if rpt.ResolvedTaskResources.TaskName != "" {
Expand Down Expand Up @@ -862,6 +864,7 @@ func (c *Reconciler) createRun(ctx context.Context, runName string, params []v1b
logger := logging.FromContext(ctx)
taskRunSpec := pr.GetTaskRunSpec(rpt.PipelineTask.Name)
params = append(params, rpt.PipelineTask.Params...)
runTimeout, timeoutFromParent := getTimeoutFunc(ctx, pr, rpt, c.Clock)
r := &v1alpha1.Run{
ObjectMeta: metav1.ObjectMeta{
Name: runName,
Expand All @@ -875,8 +878,9 @@ func (c *Reconciler) createRun(ctx context.Context, runName string, params []v1b
Ref: rpt.PipelineTask.TaskRef,
Params: params,
ServiceAccountName: taskRunSpec.TaskServiceAccountName,
Timeout: getTimeoutFunc(ctx, pr, rpt, c.Clock),
Timeout: runTimeout,
PodTemplate: taskRunSpec.TaskPodTemplate,
TimeoutFromParent: timeoutFromParent,
},
}

Expand Down Expand Up @@ -1099,34 +1103,38 @@ func addMetadataByPrecedence(metadata map[string]string, addedMetadata map[strin
}

// getFinallyTaskRunTimeout returns the timeout to set when creating the ResolvedPipelineTask, which is a finally Task.
// If there is no timeout for the finally TaskRun, returns 0.
// If there is no timeout for the finally TaskRun, returns 0 and false, since finally tasks are not covered by the
// If pipeline level timeouts have already been exceeded, returns 1 second.
func getFinallyTaskRunTimeout(ctx context.Context, pr *v1beta1.PipelineRun, rpt *resources.ResolvedPipelineTask, c clock.PassiveClock) *metav1.Duration {
taskRunTimeout := calculateTaskRunTimeout(pr.PipelineTimeout(ctx), pr, rpt, c)
// If the timeout for the TaskRun is determined based on the PipelineRun's own timeout, also returns true.
func getFinallyTaskRunTimeout(ctx context.Context, pr *v1beta1.PipelineRun, rpt *resources.ResolvedPipelineTask, c clock.PassiveClock) (*metav1.Duration, bool) {
taskRunTimeout, timeoutFromPR := calculateTaskRunTimeout(pr.PipelineTimeout(ctx), pr, rpt, c, true)
finallyTimeout := pr.FinallyTimeout()
// Return the smaller of taskRunTimeout and finallyTimeout
// This works because all finally tasks run in parallel, so there is no need to consider time spent by other finally tasks
// TODO(#4071): Account for time spent since finally task was first started (i.e. retries)
if finallyTimeout == nil || finallyTimeout.Duration == apisconfig.NoTimeoutDuration {
return taskRunTimeout
return taskRunTimeout, timeoutFromPR
}
if finallyTimeout.Duration < taskRunTimeout.Duration {
return finallyTimeout
return finallyTimeout, false
}
return taskRunTimeout
return taskRunTimeout, timeoutFromPR
}

// getTaskRunTimeout returns the timeout to set when creating the ResolvedPipelineTask.
// If there is no timeout for the TaskRun, returns 0.
// If pipeline level timeouts have already been exceeded, returns 1 second.
func getTaskRunTimeout(ctx context.Context, pr *v1beta1.PipelineRun, rpt *resources.ResolvedPipelineTask, c clock.PassiveClock) *metav1.Duration {
// If the timeout for the TaskRun is determined based on the PipelineRun's own timeout, also returns true.
func getTaskRunTimeout(ctx context.Context, pr *v1beta1.PipelineRun, rpt *resources.ResolvedPipelineTask, c clock.PassiveClock) (*metav1.Duration, bool) {
var timeout time.Duration
timeoutFromPR := false
if pr.TasksTimeout() != nil {
timeout = pr.TasksTimeout().Duration
} else {
timeout = pr.PipelineTimeout(ctx)
timeoutFromPR = true
}
return calculateTaskRunTimeout(timeout, pr, rpt, c)
return calculateTaskRunTimeout(timeout, pr, rpt, c, timeoutFromPR)
}

// calculateTaskRunTimeout returns the timeout to set when creating the ResolvedPipelineTask.
Expand All @@ -1135,24 +1143,26 @@ func getTaskRunTimeout(ctx context.Context, pr *v1beta1.PipelineRun, rpt *resour
// - If ResolvedPipelineTask is a Finally Task, `timeout` is the Pipeline Timeout
// If there is no timeout for the TaskRun, returns 0.
// If pipeline level timeouts have already been exceeded, returns 1 second.
func calculateTaskRunTimeout(timeout time.Duration, pr *v1beta1.PipelineRun, rpt *resources.ResolvedPipelineTask, c clock.PassiveClock) *metav1.Duration {
// If the timeout for the TaskRun is determined based on the PipelineRun's own timeout, also returns true.
func calculateTaskRunTimeout(timeout time.Duration, pr *v1beta1.PipelineRun, rpt *resources.ResolvedPipelineTask, c clock.PassiveClock, isPRTimeout bool) (*metav1.Duration, bool) {
if timeout != apisconfig.NoTimeoutDuration {
pElapsedTime := c.Since(pr.Status.StartTime.Time)
if pElapsedTime > timeout {
return &metav1.Duration{Duration: 1 * time.Second}
return &metav1.Duration{Duration: 1 * time.Second}, false
}
timeRemaining := (timeout - pElapsedTime)
// Return the smaller of timeRemaining and rpt.pipelineTask.timeout
if rpt.PipelineTask.Timeout != nil && rpt.PipelineTask.Timeout.Duration < timeRemaining {
return &metav1.Duration{Duration: rpt.PipelineTask.Timeout.Duration}
return &metav1.Duration{Duration: rpt.PipelineTask.Timeout.Duration}, false
}
return &metav1.Duration{Duration: timeRemaining}
// If the time
return &metav1.Duration{Duration: timeRemaining}, timeRemaining <= timeout && isPRTimeout
}

if timeout == apisconfig.NoTimeoutDuration && rpt.PipelineTask.Timeout != nil {
return &metav1.Duration{Duration: rpt.PipelineTask.Timeout.Duration}
return &metav1.Duration{Duration: rpt.PipelineTask.Timeout.Duration}, false
}
return &metav1.Duration{Duration: apisconfig.NoTimeoutDuration}
return &metav1.Duration{Duration: apisconfig.NoTimeoutDuration}, false
}

func (c *Reconciler) updateLabelsAndAnnotations(ctx context.Context, pr *v1beta1.PipelineRun) (*v1beta1.PipelineRun, error) {
Expand Down
Loading

0 comments on commit aca1517

Please sign in to comment.