Skip to content

Commit

Permalink
Terminate TaskRun when Pod fails due to ImagePullBackOff.
Browse files Browse the repository at this point in the history
Prior to this change, if the Pod was in the ImagePullBackOff state,
the TaskRun would remain `Running` with the message `Pending` until it eventually timed out.
This caused long delays before the failure was reported. The expected behavior is to
terminate the TaskRun and mark it as failed. This PR addresses issue
tektoncd#4895.
  • Loading branch information
chitrangpatel committed May 31, 2022
1 parent 2fc5f85 commit fff9b71
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,7 @@ False|\[Error message\]|No|The TaskRun encountered a non-permanent error, and it
False|\[Error message\]|Yes|The TaskRun failed with a permanent error (usually validation).
False|TaskRunCancelled|Yes|The TaskRun was cancelled successfully.
False|TaskRunTimeout|Yes|The TaskRun timed out.
False|TaskRunImagePullFailed|Yes|The TaskRun failed because one of its steps could not pull its image.

When a `TaskRun` changes status, [events](events.md#taskruns) are triggered accordingly.

Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/pipeline/v1beta1/taskrun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ const (
// TaskRunReasonResolvingTaskRef indicates that the TaskRun is waiting for
// its taskRef to be asynchronously resolved.
TaskRunReasonResolvingTaskRef = "ResolvingTaskRef"
// TaskRunReasonImagePullFailed is the reason set when a step of a task fails because its image could not be pulled.
TaskRunReasonImagePullFailed TaskRunReason = "TaskRunImagePullFailed"
)

func (t TaskRunReason) String() string {
Expand Down
11 changes: 11 additions & 0 deletions pkg/reconciler/taskrun/taskrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,17 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1beta1.TaskRun) pkg
return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err)
}

// Check for Pod Failures: Image Pull Failed
// TODO: Might want to wrap this in its own function
// to capture different types of pod failures that should terminate the TaskRun.
for i, _ := range tr.Status.Steps {
if tr.Status.Steps[i].Waiting != nil && tr.Status.Steps[i].Waiting.Reason == "ImagePullBackOff" {
message := fmt.Sprintf(`A step in TaskRun %q failed to pull the image. The pod errored with the message: "%s."`, tr.Name, tr.Status.Steps[i].Waiting.Message)
err := c.failTaskRun(ctx, tr, v1beta1.TaskRunReasonImagePullFailed, message)
return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err)
}
}

// prepare fetches all required resources, validates them together with the
// taskrun, runs API conversions. In case of error we update, emit events and return.
_, rtr, err := c.prepare(ctx, tr)
Expand Down
80 changes: 80 additions & 0 deletions pkg/reconciler/taskrun/taskrun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1884,6 +1884,86 @@ status:
}
}

// TestReconcilePodFailures verifies that reconciling a TaskRun whose step
// status reports a waiting container with reason ImagePullBackOff results in
// a Succeeded condition of False with reason TaskRunImagePullFailed, and that
// the expected Started and Failed events are emitted in order.
func TestReconcilePodFailures(t *testing.T) {
	type testCase struct {
		name           string
		taskRun        *v1beta1.TaskRun
		expectedStatus *apis.Condition
		wantEvents     []string
	}

	testcases := []testCase{{
		name: "taskrun with pod image pull failure",
		taskRun: parse.MustParseTaskRun(t, `
metadata:
  name: test-imagepull-fail
  namespace: foo
spec:
  taskSpec:
    steps:
    - image: whatever
      name: ""
      resources: {}
      script: "true"
  timeout: 1h0m0s
status:
  steps:
  - container: step-unnamed-0
    name: unnamed-0
    waiting:
      message: Back-off pulling image "whatever"
      reason: ImagePullBackOff
  taskSpec:
    steps:
    - image: whatever
      name: ""
      resources: {}
      script: "true"
`),

		expectedStatus: &apis.Condition{
			Type:    apis.ConditionSucceeded,
			Status:  corev1.ConditionFalse,
			Reason:  "TaskRunImagePullFailed",
			Message: `A step in TaskRun "test-imagepull-fail" failed to pull the image. The pod errored with the message: "Back-off pulling image "whatever"."`,
		},
		wantEvents: []string{
			"Normal Started ",
			`Warning Failed A step in TaskRun "test-imagepull-fail" failed to pull the image. The pod errored with the message: "Back-off pulling image "whatever".`,
		},
	}}

	for _, tc := range testcases {
		t.Run(tc.name, func(t *testing.T) {
			// Seed the fake clients with only the TaskRun under test; the
			// TaskRun embeds its taskSpec, so no Task objects are supplied.
			data := test.Data{
				TaskRuns: []*v1beta1.TaskRun{tc.taskRun},
			}
			testAssets, cancel := getTaskRunController(t, data)
			defer cancel()
			c := testAssets.Controller
			clients := testAssets.Clients

			if err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(tc.taskRun)); err != nil {
				t.Fatalf("Unexpected error when reconciling completed TaskRun : %v", err)
			}

			// Re-read the TaskRun through the client to observe the status
			// written by the reconciler.
			newTr, err := clients.Pipeline.TektonV1beta1().TaskRuns(tc.taskRun.Namespace).Get(testAssets.Ctx, tc.taskRun.Name, metav1.GetOptions{})
			if err != nil {
				t.Fatalf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", tc.taskRun.Name, err)
			}

			condition := newTr.Status.GetCondition(apis.ConditionSucceeded)
			if diffOut := cmp.Diff(tc.expectedStatus, condition, ignoreLastTransitionTime); diffOut != "" {
				t.Fatalf("Did not get expected condition %s", diff.PrintWantGot(diffOut))
			}

			// Report the error value directly: passing err.Error() as a
			// format string would misinterpret any '%' in the message.
			if err := eventstest.CheckEventsOrdered(t, testAssets.Recorder.Events, tc.taskRun.Name, tc.wantEvents); err != nil {
				t.Error(err)
			}
		})
	}
}

func TestReconcileTimeouts(t *testing.T) {
type testCase struct {
name string
Expand Down

0 comments on commit fff9b71

Please sign in to comment.