From 37e06b80ecc0d46b97ee4d2b18f0f019be7c6eef Mon Sep 17 00:00:00 2001
From: Jerop
Date: Tue, 9 Nov 2021 16:56:08 -0500
Subject: [PATCH] Stop executing retried `Tasks` past their timeouts

Today, when a `Task` is retried, it executes past its timeout. Even worse,
a `Task` executing past its timeout causes `Finally Tasks` to be timed out
before they have executed for as long as their allocated timeouts. As such,
the execution of `Finally Tasks` is not guaranteed even when a `finally`
timeout has been allocated.

@ornew described the problem in the issue report:

"The problem for me is that Tekton still doesn't guarantee the execution of
`finally` at all. This is a really serious problem for operating in
production, as Tekton we operate has caused a lot of resource leaks and a
lot of damage by not running finally. It's the biggest problem with
existing `timeout`."

This problem is caused by the interaction between retries and timeouts.
When a `TaskRun` is retried:
- The status of its failed execution is stored in `RetriesStatus`.
- The status of the `TaskRun` is updated, including marking it as `Running`
  and resetting its `StartTime`.

Resetting the `StartTime` is useful in recording the `StartTime` of each
retry in the `RetriesStatus`. However, using the reset time as the start
time of the `TaskRun` when checking whether it has timed out is incorrect
and causes the issues described above.

In this change, we use the actual start time of the `TaskRun` to check
whether the `TaskRun` has timed out. We do this by checking the start times
of previous attempts as well, instead of the current attempt only.

Alternative approaches considered include:
- not resetting the start time of the `TaskRun` upon retry; however, it
  would then be challenging to know the execution time of each retry.
- keeping track of the actual start time in an extra field, but this
  information is already available in the status.

References:
- [TEP-0046: Finally tasks execution post pipelinerun timeout](https://github.com/tektoncd/community/blob/main/teps/0046-finallytask-execution-post-timeout.md)
- [Issue: task time exceeded `timeouts.tasks` when task retried](https://github.com/tektoncd/pipeline/issues/4071)
- [Issue: Allow finally tasks to execute after pipeline timeout](https://github.com/tektoncd/pipeline/issues/2989)

Co-authored-by: Lee Bernick
---
 docs/pipelineruns.md                          |  2 +
 docs/pipelines.md                             |  2 +-
 docs/taskruns.md                              |  2 +
 pkg/apis/pipeline/v1beta1/taskrun_types.go    | 21 +++++-
 .../pipeline/v1beta1/taskrun_types_test.go    | 68 ++++++++++++++++++-
 5 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/docs/pipelineruns.md b/docs/pipelineruns.md
index bdad3861eab..2c4c52b9ea0 100644
--- a/docs/pipelineruns.md
+++ b/docs/pipelineruns.md
@@ -528,6 +528,8 @@ The `timeout` value is a `duration` conforming to Go's
 values are `1h30m`, `1h`, `1m`, and `60s`. If you set the global timeout to 0,
 all `PipelineRuns` that do not have an individual timeout set will fail
 immediately upon encountering an error.
+The above `timeouts` include the time used to execute `retries`.
+
 ## Monitoring execution status
 
 As your `PipelineRun` executes, its `status` field accumulates information on the execution of each `TaskRun`
diff --git a/docs/pipelines.md b/docs/pipelines.md
index 8349b5411d5..80afb74e0ea 100644
--- a/docs/pipelines.md
+++ b/docs/pipelines.md
@@ -763,7 +763,7 @@ tasks:
 ### Configuring the failure timeout
 
 You can use the `Timeout` field in the `Task` spec within the `Pipeline` to set the timeout
-of the `TaskRun` that executes that `Task` within the `PipelineRun` that executes your `Pipeline.`
+of the `TaskRun` (and its `retries`) that executes that `Task` within the `PipelineRun` that executes your `Pipeline.`
 The `Timeout` value is a `duration` conforming to Go's
 [`ParseDuration`](https://golang.org/pkg/time/#ParseDuration) format. For example, valid
 values are `1h30m`, `1h`, `1m`, and `60s`.
diff --git a/docs/taskruns.md b/docs/taskruns.md
index 1cdf3bb6a7a..04496f05dbf 100644
--- a/docs/taskruns.md
+++ b/docs/taskruns.md
@@ -398,6 +398,8 @@ If a `TaskRun` runs longer than its timeout value, the pod associated with the `
 means that the logs of the `TaskRun` are not preserved. The deletion of the `TaskRun` pod is
 necessary in order to stop `TaskRun` step containers from running.
 
+The above `timeout` is inclusive of the time used to execute the `TaskRun`'s `retries`.
+
 ### Specifying `ServiceAccount` credentials
 
 You can execute the `Task` in your `TaskRun` with a specific set of credentials by
diff --git a/pkg/apis/pipeline/v1beta1/taskrun_types.go b/pkg/apis/pipeline/v1beta1/taskrun_types.go
index fd701419906..518b7661b40 100644
--- a/pkg/apis/pipeline/v1beta1/taskrun_types.go
+++ b/pkg/apis/pipeline/v1beta1/taskrun_types.go
@@ -385,7 +385,8 @@ func (tr *TaskRun) IsCancelled() bool {
 
 // HasTimedOut returns true if the TaskRun runtime is beyond the allowed timeout
 func (tr *TaskRun) HasTimedOut(ctx context.Context) bool {
-	if tr.Status.StartTime.IsZero() {
+	startTime := tr.getStartTime()
+	if startTime.IsZero() {
 		return false
 	}
 	timeout := tr.GetTimeout(ctx)
@@ -393,10 +394,26 @@ func (tr *TaskRun) HasTimedOut(ctx context.Context) bool {
 	if timeout == apisconfig.NoTimeoutDuration {
 		return false
 	}
-	runtime := time.Since(tr.Status.StartTime.Time)
+	runtime := time.Since(startTime.Time)
 	return runtime > timeout
 }
 
+func (tr *TaskRun) getStartTime() *metav1.Time {
+	startTime := tr.Status.StartTime
+	if startTime.IsZero() {
+		if len(tr.Status.RetriesStatus) == 0 {
+			return startTime
+		}
+		startTime = &metav1.Time{Time: time.Now()}
+	}
+	for _, retry := range tr.Status.RetriesStatus {
+		if retry.StartTime.Time.Before(startTime.Time) {
+			startTime = retry.StartTime
+		}
+	}
+	return startTime
+}
+
 // GetTimeout returns the timeout for the TaskRun, or the default if not specified
 func (tr *TaskRun) GetTimeout(ctx context.Context) time.Duration {
 	// Use the platform default is no timeout is set
diff --git a/pkg/apis/pipeline/v1beta1/taskrun_types_test.go b/pkg/apis/pipeline/v1beta1/taskrun_types_test.go
index 5ebbb59bffa..df71a43277b 100644
--- a/pkg/apis/pipeline/v1beta1/taskrun_types_test.go
+++ b/pkg/apis/pipeline/v1beta1/taskrun_types_test.go
@@ -309,12 +309,78 @@ func TestHasTimedOut(t *testing.T) {
 			},
 		},
 		expectedStatus: true,
+	}, {
+		name: "TaskRun timed out with retries",
+		taskRun: &v1beta1.TaskRun{
+			Spec: v1beta1.TaskRunSpec{
+				Timeout: &metav1.Duration{
+					Duration: 10 * time.Second,
+				},
+			},
+			Status: v1beta1.TaskRunStatus{
+				Status: duckv1beta1.Status{
+					Conditions: []apis.Condition{{
+						Type: apis.ConditionSucceeded,
+						Status: corev1.ConditionFalse,
+					}},
+				},
+				TaskRunStatusFields: v1beta1.TaskRunStatusFields{
+					StartTime: &metav1.Time{Time: time.Now().Add(-5 * time.Second)},
+					RetriesStatus: []v1beta1.TaskRunStatus{{
+						Status: duckv1beta1.Status{
+							Conditions: []apis.Condition{{
+								Type: apis.ConditionSucceeded,
+								Status: corev1.ConditionFalse,
+							}},
+						},
+						TaskRunStatusFields: v1beta1.TaskRunStatusFields{
+							StartTime: &metav1.Time{Time: time.Now().Add(-11 * time.Second)},
+							CompletionTime: &metav1.Time{Time: time.Now().Add(-6 * time.Second)},
+						},
+					}},
+				},
+			},
+		},
+		expectedStatus: true,
+	}, {
+		name: "TaskRun timed out with retries but current execution start time is nil",
+		taskRun: &v1beta1.TaskRun{
+			Spec: v1beta1.TaskRunSpec{
+				Timeout: &metav1.Duration{
+					Duration: 10 * time.Second,
+				},
+			},
+			Status: v1beta1.TaskRunStatus{
+				Status: duckv1beta1.Status{
+					Conditions: []apis.Condition{{
+						Type: apis.ConditionSucceeded,
+						Status: corev1.ConditionFalse,
+					}},
+				},
+				TaskRunStatusFields: v1beta1.TaskRunStatusFields{
+					StartTime: nil,
+					RetriesStatus: []v1beta1.TaskRunStatus{{
+						Status: duckv1beta1.Status{
+							Conditions: []apis.Condition{{
+								Type: apis.ConditionSucceeded,
+								Status: corev1.ConditionFalse,
+							}},
+						},
+						TaskRunStatusFields: v1beta1.TaskRunStatusFields{
+							StartTime: &metav1.Time{Time: time.Now().Add(-11 * time.Second)},
+							CompletionTime: &metav1.Time{Time: time.Now().Add(-6 * time.Second)},
+						},
+					}},
+				},
+			},
+		},
+		expectedStatus: true,
 	}}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			result := tc.taskRun.HasTimedOut(context.Background())
-			if d := cmp.Diff(result, tc.expectedStatus); d != "" {
+			if d := cmp.Diff(tc.expectedStatus, result); d != "" {
 				t.Fatalf(diff.PrintWantGot(d))
 			}
 		})
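
For illustration only, a minimal standalone Go sketch of the idea behind the new getStartTime helper: the effective start time of a retried TaskRun is the earliest start time across the current attempt and the attempts recorded in RetriesStatus. The attempt type and earliestStart function below are hypothetical simplifications for this sketch, not part of the Tekton v1beta1 API.

package main

import (
	"fmt"
	"time"
)

// attempt is a simplified, hypothetical stand-in for one recorded execution
// (the real data lives in TaskRunStatusFields.RetriesStatus, a []TaskRunStatus).
type attempt struct {
	startTime time.Time
}

// earliestStart returns the earliest start time across the current attempt
// and all prior attempts, mirroring the idea behind getStartTime above.
func earliestStart(current time.Time, retries []attempt) time.Time {
	earliest := current
	for _, r := range retries {
		if r.startTime.Before(earliest) {
			earliest = r.startTime
		}
	}
	return earliest
}

func main() {
	now := time.Now()
	// The current (retried) attempt started 5s ago, but the first attempt
	// started 11s ago; with a 10s timeout the TaskRun should be timed out.
	retries := []attempt{{startTime: now.Add(-11 * time.Second)}}
	start := earliestStart(now.Add(-5*time.Second), retries)

	timeout := 10 * time.Second
	fmt.Println("timed out:", time.Since(start) > timeout) // prints: timed out: true
}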