From 6fbbcde7b41a9826fad91cd4338e75b9d8aa8083 Mon Sep 17 00:00:00 2001 From: Chitrang Patel Date: Tue, 31 May 2022 10:07:11 -0400 Subject: [PATCH] Terminate TaskRun when Pod fails due to ImagePullBackOff. Prior to this, if the Pod was in ImagePullBackOff state, the TaskRun would remain `Running` with the message `Pending` until it eventually timed out. This led to lots of delays. The expected behavior is to terminate the TaskRun and mark it as failed (condition `Succeeded` set to `False` with reason `TaskRunImagePullFailed`). This PR addresses issue https://github.com/tektoncd/pipeline/issues/4895. --- docs/taskruns.md | 1 + pkg/apis/pipeline/v1beta1/taskrun_types.go | 2 + pkg/reconciler/taskrun/taskrun.go | 11 +++++ pkg/reconciler/taskrun/taskrun_test.go | 56 ++++++++++++++++++++++ 4 files changed, 70 insertions(+) diff --git a/docs/taskruns.md b/docs/taskruns.md index 5da4f98a1af..5a8d7045327 100644 --- a/docs/taskruns.md +++ b/docs/taskruns.md @@ -560,6 +560,7 @@ False|\[Error message\]|No|The TaskRun encountered a non-permanent error, and it False|\[Error message\]|Yes|The TaskRun failed with a permanent error (usually validation). False|TaskRunCancelled|Yes|The TaskRun was cancelled successfully. False|TaskRunTimeout|Yes|The TaskRun timed out. +False|TaskRunImagePullFailed|Yes|The TaskRun failed due to one of its steps not being able to pull the image. When a `TaskRun` changes status, [events](events.md#taskruns) are triggered accordingly. diff --git a/pkg/apis/pipeline/v1beta1/taskrun_types.go b/pkg/apis/pipeline/v1beta1/taskrun_types.go index c6cea9e7199..1bb1dea40ac 100644 --- a/pkg/apis/pipeline/v1beta1/taskrun_types.go +++ b/pkg/apis/pipeline/v1beta1/taskrun_types.go @@ -143,6 +143,8 @@ const ( // TaskRunReasonResolvingTaskRef indicates that the TaskRun is waiting for // its taskRef to be asynchronously resolved. 
TaskRunReasonResolvingTaskRef = "ResolvingTaskRef" + // TaskRunReasonImagePullFailed is the reason set when a step of a task fails because its image could not be pulled + TaskRunReasonImagePullFailed TaskRunReason = "TaskRunImagePullFailed" ) func (t TaskRunReason) String() string { diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index 62faf73632c..b2cd6c251f0 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -155,6 +155,17 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1beta1.TaskRun) pkg return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err) } + // Check for Pod failures: fail the TaskRun if any step is waiting with reason ImagePullBackOff. + // TODO: Might want to wrap this in its own function + // to capture different types of pod failures that should terminate the TaskRun. + for _, step := range tr.Status.Steps { + if step.Waiting != nil && step.Waiting.Reason == "ImagePullBackOff" { + message := fmt.Sprintf(`A step in TaskRun %q failed to pull the image. The pod errored with the message: "%s."`, tr.Name, step.Waiting.Message) + err := c.failTaskRun(ctx, tr, v1beta1.TaskRunReasonImagePullFailed, message) + return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err) + } + } + // prepare fetches all required resources, validates them together with the // taskrun, runs API conversions. In case of error we update, emit events and return. 
_, rtr, err := c.prepare(ctx, tr) diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index 6e685056bc6..78fd4fcdd00 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -1884,6 +1884,62 @@ status: } } +func TestReconcilePodFailures(t *testing.T) { + taskRun := parse.MustParseTaskRun(t, ` +metadata: + name: test-imagepull-fail + namespace: foo +spec: + taskSpec: + steps: + - image: whatever +status: + steps: + - container: step-unnamed-0 + name: unnamed-0 + waiting: + message: Back-off pulling image "whatever" + reason: ImagePullBackOff + taskSpec: + steps: + - image: whatever +`) + expectedStatus := &apis.Condition{ + Type: apis.ConditionSucceeded, + Status: corev1.ConditionFalse, + Reason: "TaskRunImagePullFailed", + Message: `A step in TaskRun "test-imagepull-fail" failed to pull the image. The pod errored with the message: "Back-off pulling image "whatever"."`, + } + + wantEvents := []string{ + "Normal Started ", + `Warning Failed A step in TaskRun "test-imagepull-fail" failed to pull the image. 
The pod errored with the message: "Back-off pulling image "whatever".`, + } + d := test.Data{ + TaskRuns: []*v1beta1.TaskRun{taskRun}, + } + testAssets, cancel := getTaskRunController(t, d) + defer cancel() + c := testAssets.Controller + clients := testAssets.Clients + + if err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil { + t.Fatalf("Unexpected error when reconciling completed TaskRun : %v", err) + } + newTr, err := clients.Pipeline.TektonV1beta1().TaskRuns(taskRun.Namespace).Get(testAssets.Ctx, taskRun.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", taskRun.Name, err) + } + condition := newTr.Status.GetCondition(apis.ConditionSucceeded) + if d := cmp.Diff(expectedStatus, condition, ignoreLastTransitionTime); d != "" { + t.Fatalf("Did not get expected condition %s", diff.PrintWantGot(d)) + } + err = eventstest.CheckEventsOrdered(t, testAssets.Recorder.Events, taskRun.Name, wantEvents) + if err != nil { + t.Errorf(err.Error()) + } +} + func TestReconcileTimeouts(t *testing.T) { type testCase struct { name string