From eb9ec3b86de295e8cbd1b4540a8e3b7294bdd2a9 Mon Sep 17 00:00:00 2001
From: Jerop
Date: Tue, 9 Nov 2021 16:56:08 -0500
Subject: [PATCH] Stop executing retried `Tasks` past their timeouts

Today, when a `Task` is retried, it executes past its timeout. Even worse,
a `Task` executing past its timeout causes `Finally Tasks` to be timed out
before they have run for as long as their allocated timeouts. As such, the
execution of `Finally Tasks` is not guaranteed even when a `finally`
timeout has been allocated.

@ornew described the problem in the issue report: "The problem for me is
that Tekton still doesn't guarantee the execution of `finally` at all.
This is a really serious problem for operating in production, as Tekton we
operate has caused a lot of resource leaks and a lot of damage by not
running finally. It's the biggest problem with existing `timeout`."

This problem is caused by the interaction between retries and timeouts.
When a `TaskRun` is retried:
- The status of its failed execution is stored in `RetriesStatus`.
- The status of the `TaskRun` is updated, including marking it as
  `Running` and resetting its `StartTime`.

Resetting the `StartTime` is useful for recording the `StartTime` of each
retry in the `RetriesStatus`. However, using the reset time as the start
time of the `TaskRun` when checking whether it has timed out is incorrect
and causes the issues described above.

In this change, we use the actual start time of the `TaskRun` to check
whether it has timed out. We do this by checking the start times of
previous attempts as well, instead of the current attempt only.

Alternative approaches considered include:
- not resetting the start time of the `TaskRun` upon retry; however, it
  would then be challenging to know the execution time of each retry.
- keeping track of the actual start time in an extra field, but this
  information is already available in the status.

References:
- [TEP-0046: Finally tasks execution post pipelinerun timeout](https://github.com/tektoncd/community/blob/main/teps/0046-finallytask-execution-post-timeout.md)
- [Issue: task time exceeded `timeouts.tasks` when task retried](https://github.com/tektoncd/pipeline/issues/4071)
- [Issue: Allow finally tasks to execute after pipeline timeout](https://github.com/tektoncd/pipeline/issues/2989)

Co-authored-by: Lee Bernick
---
 docs/pipelineruns.md                        |  2 +
 docs/pipelines.md                           |  2 +-
 docs/taskruns.md                            |  2 +
 pkg/apis/pipeline/v1beta1/taskrun_types.go  | 21 +++-
 .../pipeline/v1beta1/taskrun_types_test.go  | 68 ++++++++++++++++++-
 5 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/docs/pipelineruns.md b/docs/pipelineruns.md
index bdad3861eab..2c4c52b9ea0 100644
--- a/docs/pipelineruns.md
+++ b/docs/pipelineruns.md
@@ -528,6 +528,8 @@ The `timeout` value is a `duration` conforming to Go's
 values are `1h30m`, `1h`, `1m`, and `60s`. If you set the global timeout to 0,
 all `PipelineRuns` that do not have an individual timeout set will fail
 immediately upon encountering an error.
 
+The above `timeouts` include the time used to execute `retries`.
+
 ## Monitoring execution status
 As your `PipelineRun` executes, its `status` field accumulates information on the execution of each `TaskRun`
diff --git a/docs/pipelines.md b/docs/pipelines.md
index 8349b5411d5..80afb74e0ea 100644
--- a/docs/pipelines.md
+++ b/docs/pipelines.md
@@ -763,7 +763,7 @@ tasks:
 ### Configuring the failure timeout
 
 You can use the `Timeout` field in the `Task` spec within the `Pipeline` to set the timeout
-of the `TaskRun` that executes that `Task` within the `PipelineRun` that executes your `Pipeline.`
+of the `TaskRun` (and its `retries`) that executes that `Task` within the `PipelineRun` that executes your `Pipeline.`
 The `Timeout` value is a `duration` conforming to Go's
 [`ParseDuration`](https://golang.org/pkg/time/#ParseDuration) format. For example, valid
 values are `1h30m`, `1h`, `1m`, and `60s`.
diff --git a/docs/taskruns.md b/docs/taskruns.md
index 1cdf3bb6a7a..04496f05dbf 100644
--- a/docs/taskruns.md
+++ b/docs/taskruns.md
@@ -398,6 +398,8 @@ If a `TaskRun` runs longer than its timeout value, the pod associated with the `
 means that the logs of the `TaskRun` are not preserved. The deletion of the `TaskRun` pod is necessary
 in order to stop `TaskRun` step containers from running.
 
+The above `timeout` is inclusive of the time used to execute the `TaskRun`'s `retries`.
+
 ### Specifying `ServiceAccount` credentials
 
 You can execute the `Task` in your `TaskRun` with a specific set of credentials by
diff --git a/pkg/apis/pipeline/v1beta1/taskrun_types.go b/pkg/apis/pipeline/v1beta1/taskrun_types.go
index fd701419906..518b7661b40 100644
--- a/pkg/apis/pipeline/v1beta1/taskrun_types.go
+++ b/pkg/apis/pipeline/v1beta1/taskrun_types.go
@@ -385,7 +385,8 @@ func (tr *TaskRun) IsCancelled() bool {
 
 // HasTimedOut returns true if the TaskRun runtime is beyond the allowed timeout
 func (tr *TaskRun) HasTimedOut(ctx context.Context) bool {
-	if tr.Status.StartTime.IsZero() {
+	startTime := tr.getStartTime()
+	if startTime.IsZero() {
 		return false
 	}
 	timeout := tr.GetTimeout(ctx)
@@ -393,10 +394,26 @@ func (tr *TaskRun) HasTimedOut(ctx context.Context) bool {
 	if timeout == apisconfig.NoTimeoutDuration {
 		return false
 	}
-	runtime := time.Since(tr.Status.StartTime.Time)
+	runtime := time.Since(startTime.Time)
 	return runtime > timeout
 }
 
+func (tr *TaskRun) getStartTime() *metav1.Time {
+	startTime := tr.Status.StartTime
+	if startTime.IsZero() {
+		if len(tr.Status.RetriesStatus) == 0 {
+			return startTime
+		}
+		startTime = &metav1.Time{Time: time.Now()}
+	}
+	for _, retry := range tr.Status.RetriesStatus {
+		if retry.StartTime.Time.Before(startTime.Time) {
+			startTime = retry.StartTime
+		}
+	}
+	return startTime
+}
+
 // GetTimeout returns the timeout for the TaskRun, or the default if not specified
 func (tr *TaskRun) GetTimeout(ctx context.Context) time.Duration {
 	// Use the platform default is no timeout is set
diff --git a/pkg/apis/pipeline/v1beta1/taskrun_types_test.go b/pkg/apis/pipeline/v1beta1/taskrun_types_test.go
index 5ebbb59bffa..df71a43277b 100644
--- a/pkg/apis/pipeline/v1beta1/taskrun_types_test.go
+++ b/pkg/apis/pipeline/v1beta1/taskrun_types_test.go
@@ -309,12 +309,78 @@ func TestHasTimedOut(t *testing.T) {
 			},
 		},
 		expectedStatus: true,
+	}, {
+		name: "TaskRun timed out with retries",
+		taskRun: &v1beta1.TaskRun{
+			Spec: v1beta1.TaskRunSpec{
+				Timeout: &metav1.Duration{
+					Duration: 10 * time.Second,
+				},
+			},
+			Status: v1beta1.TaskRunStatus{
+				Status: duckv1beta1.Status{
+					Conditions: []apis.Condition{{
+						Type: apis.ConditionSucceeded,
+						Status: corev1.ConditionFalse,
+					}},
+				},
+				TaskRunStatusFields: v1beta1.TaskRunStatusFields{
+					StartTime: &metav1.Time{Time: time.Now().Add(-5 * time.Second)},
+					RetriesStatus: []v1beta1.TaskRunStatus{{
+						Status: duckv1beta1.Status{
+							Conditions: []apis.Condition{{
+								Type: apis.ConditionSucceeded,
+								Status: corev1.ConditionFalse,
+							}},
+						},
+						TaskRunStatusFields: v1beta1.TaskRunStatusFields{
+							StartTime: &metav1.Time{Time: time.Now().Add(-11 * time.Second)},
+							CompletionTime: &metav1.Time{Time: time.Now().Add(-6 * time.Second)},
+						},
+					}},
+				},
+			},
+		},
+		expectedStatus: true,
+	}, {
+		name: "TaskRun timed out with retries but current execution start time is nil",
+		taskRun: &v1beta1.TaskRun{
+			Spec: v1beta1.TaskRunSpec{
+				Timeout: &metav1.Duration{
+					Duration: 10 * time.Second,
+				},
+			},
+			Status: v1beta1.TaskRunStatus{
+				Status: duckv1beta1.Status{
+					Conditions: []apis.Condition{{
+						Type: apis.ConditionSucceeded,
+						Status: corev1.ConditionFalse,
+					}},
+				},
+				TaskRunStatusFields: v1beta1.TaskRunStatusFields{
+					StartTime: nil,
+					RetriesStatus: []v1beta1.TaskRunStatus{{
+						Status: duckv1beta1.Status{
+							Conditions: []apis.Condition{{
+								Type: apis.ConditionSucceeded,
+								Status: corev1.ConditionFalse,
+							}},
+						},
+						TaskRunStatusFields: v1beta1.TaskRunStatusFields{
+							StartTime: &metav1.Time{Time: time.Now().Add(-11 * time.Second)},
+							CompletionTime: &metav1.Time{Time: time.Now().Add(-6 * time.Second)},
+						},
+					}},
+				},
+			},
+		},
+		expectedStatus: true,
 	}}
 
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			result := tc.taskRun.HasTimedOut(context.Background())
-			if d := cmp.Diff(result, tc.expectedStatus); d != "" {
+			if d := cmp.Diff(tc.expectedStatus, result); d != "" {
 				t.Fatalf(diff.PrintWantGot(d))
 			}
 		})
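
The approach described in the commit message can be summarized outside of the Tekton
types with a minimal, self-contained Go sketch: the timeout check measures elapsed
time from the earliest start time across the current attempt and all recorded retries,
rather than from the reset start time of the latest attempt. The `attempt`,
`effectiveStartTime`, and `hasTimedOut` names below are illustrative only and are not
part of the Tekton API.

package main

import (
	"fmt"
	"time"
)

// attempt holds the only piece of per-attempt state the timeout check needs:
// when that attempt started. (Illustrative type, not the Tekton API.)
type attempt struct {
	startTime time.Time
}

// effectiveStartTime returns the earliest start time across the current
// attempt and all recorded retries, so a retried run is measured from its
// original start rather than from the reset start of the latest attempt.
func effectiveStartTime(current time.Time, retries []attempt) time.Time {
	earliest := current
	for _, r := range retries {
		if r.startTime.Before(earliest) {
			earliest = r.startTime
		}
	}
	return earliest
}

// hasTimedOut reports whether the run, including time spent on earlier
// retries, has exceeded its timeout. A zero timeout means "no timeout".
func hasTimedOut(current time.Time, retries []attempt, timeout time.Duration) bool {
	if timeout == 0 {
		return false
	}
	return time.Since(effectiveStartTime(current, retries)) > timeout
}

func main() {
	now := time.Now()
	// The current attempt started 5s ago, but the first attempt started 11s
	// ago, so a 10s timeout has already been exceeded -- this prints "true".
	retries := []attempt{{startTime: now.Add(-11 * time.Second)}}
	fmt.Println(hasTimedOut(now.Add(-5*time.Second), retries, 10*time.Second))
}

Deriving the effective start time at check time, while still resetting the per-attempt
start time in the status, keeps `RetriesStatus` able to record when each retry began
without letting a retried `TaskRun` (or the `Finally Tasks` after it) run past the
overall timeout.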