From 4cae9cba9ec503508c78b2c3fcc53fef6703e08d Mon Sep 17 00:00:00 2001 From: joey Date: Tue, 27 Jun 2023 09:56:20 -0400 Subject: [PATCH] The cancellation of taskruns is now done through the entrypoint binary through a new flag called 'stop_on_cancel'. This removes the need for deleting the pods to cancel a taskrun, allowing examination of the logs on the pods from cancelled taskruns. Part of work on issue #3238 Signed-off-by: chengjoey --- cmd/entrypoint/main.go | 12 +- cmd/entrypoint/runner.go | 13 +- cmd/entrypoint/runner_test.go | 47 ++++- cmd/entrypoint/waiter.go | 17 +- cmd/entrypoint/waiter_test.go | 186 +++++++++++++++++- config/config-feature-flags.yaml | 3 + docs/additional-configs.md | 3 +- docs/taskruns.md | 3 + pkg/apis/config/feature_flags.go | 8 + pkg/apis/config/feature_flags_test.go | 10 + .../testdata/feature-flags-all-flags-set.yaml | 1 + ...gs-invalid-disable-affinity-assistant.yaml | 21 ++ ...ture-flags-invalid-keep-pod-on-cancel.yaml | 21 ++ ...in-environment-with-injected-sidecars.yaml | 21 ++ pkg/entrypoint/entrypointer.go | 60 +++++- pkg/entrypoint/entrypointer_test.go | 130 +++++++++++- pkg/pod/entrypoint.go | 43 +++- pkg/pod/entrypoint_test.go | 139 ++++++++++++- pkg/pod/pod.go | 13 +- pkg/pod/pod_test.go | 114 +++++++++++ pkg/reconciler/taskrun/taskrun.go | 12 +- pkg/reconciler/taskrun/taskrun_test.go | 55 ++++++ 22 files changed, 881 insertions(+), 51 deletions(-) create mode 100644 pkg/apis/config/testdata/feature-flags-invalid-disable-affinity-assistant.yaml create mode 100644 pkg/apis/config/testdata/feature-flags-invalid-keep-pod-on-cancel.yaml create mode 100644 pkg/apis/config/testdata/feature-flags-invalid-running-in-environment-with-injected-sidecars.yaml diff --git a/cmd/entrypoint/main.go b/cmd/entrypoint/main.go index c8a0033b67c..9f690e14113 100644 --- a/cmd/entrypoint/main.go +++ b/cmd/entrypoint/main.go @@ -17,6 +17,7 @@ limitations under the License. package main import ( + "context" "encoding/json" "errors" "flag" @@ -67,7 +68,7 @@ const ( func checkForBreakpointOnFailure(e entrypoint.Entrypointer, breakpointExitPostFile string) { if e.BreakpointOnFailure { - if waitErr := e.Waiter.Wait(breakpointExitPostFile, false, false); waitErr != nil { + if waitErr := e.Waiter.Wait(context.Background(), breakpointExitPostFile, false, false); waitErr != nil { log.Println("error occurred while waiting for " + breakpointExitPostFile + " : " + waitErr.Error()) } // get exitcode from .breakpointexit @@ -181,6 +182,15 @@ func main() { case termination.MessageLengthError: log.Print(err.Error()) os.Exit(1) + case entrypoint.ContextError: + if errors.Is(err, entrypoint.ErrContextCanceled) { + log.Print("Step was cancelled") + // use the SIGKILL signal to distinguish normal exit programs, just like kill -9 PID + os.Exit(int(syscall.SIGKILL)) + } else { + log.Print(err.Error()) + os.Exit(1) + } case *exec.ExitError: // Copied from https://stackoverflow.com/questions/10385551/get-exit-code-go // This works on both Unix and Windows. Although diff --git a/cmd/entrypoint/runner.go b/cmd/entrypoint/runner.go index c9031e4ae73..6e137378531 100644 --- a/cmd/entrypoint/runner.go +++ b/cmd/entrypoint/runner.go @@ -118,7 +118,10 @@ func (rr *realRunner) Run(ctx context.Context, args ...string) error { // Start defined command if err := cmd.Start(); err != nil { if errors.Is(ctx.Err(), context.DeadlineExceeded) { - return context.DeadlineExceeded + return entrypoint.ErrContextDeadlineExceeded + } + if errors.Is(ctx.Err(), context.Canceled) { + return entrypoint.ErrContextCanceled } return err } @@ -134,9 +137,15 @@ func (rr *realRunner) Run(ctx context.Context, args ...string) error { }() // Wait for command to exit + // as os.exec [note](https://github.com/golang/go/blob/ee522e2cdad04a43bc9374776483b6249eb97ec9/src/os/exec/exec.go#L897-L906) + // cmd.Wait prefer Process error over context error + // but we want to return context error instead if err := cmd.Wait(); err != nil { if errors.Is(ctx.Err(), context.DeadlineExceeded) { - return context.DeadlineExceeded + return entrypoint.ErrContextDeadlineExceeded + } + if errors.Is(ctx.Err(), context.Canceled) { + return entrypoint.ErrContextCanceled } return err } diff --git a/cmd/entrypoint/runner_test.go b/cmd/entrypoint/runner_test.go index b2af31eb1da..a237a5ecec2 100644 --- a/cmd/entrypoint/runner_test.go +++ b/cmd/entrypoint/runner_test.go @@ -22,12 +22,15 @@ import ( "errors" "fmt" "io" + "math/rand" "os" "path/filepath" "strings" "syscall" "testing" "time" + + "github.com/tektoncd/pipeline/pkg/entrypoint" ) // TestRealRunnerSignalForwarding will artificially put an interrupt signal (SIGINT) in the rr.signals chan. @@ -183,10 +186,52 @@ func TestRealRunnerTimeout(t *testing.T) { defer cancel() if err := rr.Run(ctx, "sleep", "0.01"); err != nil { - if !errors.Is(err, context.DeadlineExceeded) { + if !errors.Is(err, entrypoint.ErrContextDeadlineExceeded) { t.Fatalf("unexpected error received: %v", err) } } else { t.Fatalf("step didn't timeout") } } + +func TestRealRunnerCancel(t *testing.T) { + testCases := []struct { + name string + timeout time.Duration + wantErr error + }{ + { + name: "cancel before cmd wait", + timeout: 0, + wantErr: entrypoint.ErrContextCanceled, + }, + { + name: "cancel on cmd wait", + timeout: time.Second * time.Duration(rand.Intn(3)), + wantErr: entrypoint.ErrContextCanceled, + }, + { + name: "cancel after cmd wait", + timeout: time.Second * 4, + wantErr: nil, + }, + } + for _, tc := range testCases { + rr := realRunner{} + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(tc.timeout) + cancel() + }() + err := rr.Run(ctx, "sleep", "3") + if tc.wantErr != nil { + if !errors.Is(err, tc.wantErr) { + t.Fatalf("unexpected error received: %v", err) + } + } else { + if err != nil { + t.Fatalf("unexpected error received: %v", err) + } + } + } +} diff --git a/cmd/entrypoint/waiter.go b/cmd/entrypoint/waiter.go index 29ebf9b4441..729c747e33c 100644 --- a/cmd/entrypoint/waiter.go +++ b/cmd/entrypoint/waiter.go @@ -17,6 +17,8 @@ limitations under the License. package main import ( + "context" + "errors" "fmt" "os" "time" @@ -47,11 +49,22 @@ func (rw *realWaiter) setWaitPollingInterval(pollingInterval time.Duration) *rea // // If a file of the same name with a ".err" extension exists then this Wait // will end with a skipError. -func (rw *realWaiter) Wait(file string, expectContent bool, breakpointOnFailure bool) error { +func (rw *realWaiter) Wait(ctx context.Context, file string, expectContent bool, breakpointOnFailure bool) error { if file == "" { return nil } - for ; ; time.Sleep(rw.waitPollingInterval) { + for { + select { + case <-ctx.Done(): + if errors.Is(ctx.Err(), context.Canceled) { + return entrypoint.ErrContextCanceled + } + if errors.Is(ctx.Err(), context.DeadlineExceeded) { + return entrypoint.ErrContextDeadlineExceeded + } + return nil + case <-time.After(rw.waitPollingInterval): + } if info, err := os.Stat(file); err == nil { if !expectContent || info.Size() > 0 { return nil diff --git a/cmd/entrypoint/waiter_test.go b/cmd/entrypoint/waiter_test.go index 422211802bc..c8d2b016a30 100644 --- a/cmd/entrypoint/waiter_test.go +++ b/cmd/entrypoint/waiter_test.go @@ -17,11 +17,14 @@ limitations under the License. package main import ( + "context" "errors" "os" "strings" "testing" "time" + + "github.com/tektoncd/pipeline/pkg/entrypoint" ) const testWaitPollingInterval = 50 * time.Millisecond @@ -38,7 +41,7 @@ func TestRealWaiterWaitMissingFile(t *testing.T) { rw := realWaiter{} doneCh := make(chan struct{}) go func() { - err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false, false) + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmp.Name(), false, false) if err != nil { t.Errorf("error waiting on tmp file %q", tmp.Name()) } @@ -66,7 +69,7 @@ func TestRealWaiterWaitWithFile(t *testing.T) { rw := realWaiter{} doneCh := make(chan struct{}) go func() { - err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false, false) + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmp.Name(), false, false) if err != nil { t.Errorf("error waiting on tmp file %q", tmp.Name()) } @@ -90,7 +93,7 @@ func TestRealWaiterWaitMissingContent(t *testing.T) { rw := realWaiter{} doneCh := make(chan struct{}) go func() { - err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true, false) + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmp.Name(), true, false) if err != nil { t.Errorf("error waiting on tmp file %q", tmp.Name()) } @@ -117,7 +120,7 @@ func TestRealWaiterWaitWithContent(t *testing.T) { rw := realWaiter{} doneCh := make(chan struct{}) go func() { - err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true, false) + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmp.Name(), true, false) if err != nil { t.Errorf("error waiting on tmp file %q", tmp.Name()) } @@ -146,7 +149,7 @@ func TestRealWaiterWaitWithErrorWaitfile(t *testing.T) { doneCh := make(chan struct{}) go func() { // error of type skipError is returned after encountering a error waitfile - err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmpFileName, false, false) + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmpFileName, false, false) if err == nil { t.Errorf("expected skipError upon encounter error waitfile") } @@ -177,12 +180,152 @@ func TestRealWaiterWaitWithBreakpointOnFailure(t *testing.T) { doneCh := make(chan struct{}) go func() { // When breakpoint on failure is enabled skipError shouldn't be returned for a error waitfile - err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmpFileName, false, true) + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmpFileName, false, true) + if err != nil { + t.Errorf("error waiting on tmp file %q", tmp.Name()) + } + close(doneCh) + }() + delay := time.NewTimer(2 * testWaitPollingInterval) + select { + case <-doneCh: + // Success + case <-delay.C: + t.Errorf("expected Wait() to have detected a non-zero file size by now") + } +} + +func TestRealWaiterWaitWithContextCanceled(t *testing.T) { + tmp, err := os.CreateTemp("", "real_waiter_test_file") + if err != nil { + t.Errorf("error creating temp file: %v", err) + } + defer os.Remove(tmp.Name()) + ctx, cancel := context.WithCancel(context.Background()) + rw := realWaiter{} + errCh := make(chan error) + go func() { + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(ctx, tmp.Name(), true, false) + if err == nil { + t.Errorf("expected context canceled error") + } + errCh <- err + }() + cancel() + delay := time.NewTimer(2 * testWaitPollingInterval) + select { + case err := <-errCh: + if !errors.Is(err, entrypoint.ErrContextCanceled) { + t.Errorf("expected ErrContextCanceled, got %T", err) + } + case <-delay.C: + t.Errorf("expected Wait() to have a ErrContextCanceled") + } +} + +func TestRealWaiterWaitWithTimeout(t *testing.T) { + tmp, err := os.CreateTemp("", "real_waiter_test_file") + if err != nil { + t.Errorf("error creating temp file: %v", err) + } + defer os.Remove(tmp.Name()) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond) + defer cancel() + rw := realWaiter{} + errCh := make(chan error) + go func() { + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(ctx, tmp.Name(), true, false) + if err == nil { + t.Errorf("expected context deadline error") + } + errCh <- err + }() + delay := time.NewTimer(2 * time.Second) + select { + case err := <-errCh: + if !errors.Is(err, entrypoint.ErrContextDeadlineExceeded) { + t.Errorf("expected ErrContextDeadlineExceeded, got %T", err) + } + case <-delay.C: + t.Errorf("expected Wait() to have a ErrContextDeadlineExceeded") + } +} + +func TestRealWaiterWaitContextWithBreakpointOnFailure(t *testing.T) { + tmp, err := os.CreateTemp("", "real_waiter_test_file*.err") + if err != nil { + t.Errorf("error creating temp file: %v", err) + } + tmpFileName := strings.Replace(tmp.Name(), ".err", "", 1) + defer os.Remove(tmp.Name()) + rw := realWaiter{} + doneCh := make(chan struct{}) + go func() { + // When breakpoint on failure is enabled skipError shouldn't be returned for a error waitfile + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmpFileName, false, true) + if err != nil { + t.Errorf("error waiting on tmp file %q", tmp.Name()) + } + close(doneCh) + }() + delay := time.NewTimer(2 * testWaitPollingInterval) + select { + case <-doneCh: + // Success + case <-delay.C: + t.Errorf("expected Wait() to have detected a non-zero file size by now") + } +} + +func TestRealWaiterWaitContextWithErrorWaitfile(t *testing.T) { + tmp, err := os.CreateTemp("", "real_waiter_test_file*.err") + if err != nil { + t.Errorf("error creating temp file: %v", err) + } + tmpFileName := strings.Replace(tmp.Name(), ".err", "", 1) + defer os.Remove(tmp.Name()) + rw := realWaiter{} + doneCh := make(chan struct{}) + go func() { + // error of type skipError is returned after encountering a error waitfile + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmpFileName, false, false) + if err == nil { + t.Errorf("expected skipError upon encounter error waitfile") + } + var skipErr skipError + if errors.As(err, &skipErr) { + close(doneCh) + } else { + t.Errorf("unexpected error type %T", err) + } + }() + delay := time.NewTimer(10 * testWaitPollingInterval) + select { + case <-doneCh: + // Success + case <-delay.C: + t.Errorf("expected Wait() to have detected a non-zero file size by now") + } +} + +func TestRealWaiterWaitContextWithContent(t *testing.T) { + tmp, err := os.CreateTemp("", "real_waiter_test_file") + if err != nil { + t.Errorf("error creating temp file: %v", err) + } + defer os.Remove(tmp.Name()) + rw := realWaiter{} + doneCh := make(chan struct{}) + go func() { + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmp.Name(), true, false) if err != nil { t.Errorf("error waiting on tmp file %q", tmp.Name()) } close(doneCh) }() + if err := os.WriteFile(tmp.Name(), []byte("😺"), 0700); err != nil { + t.Errorf("error writing content to temp file: %v", err) + } delay := time.NewTimer(2 * testWaitPollingInterval) select { case <-doneCh: @@ -191,3 +334,34 @@ func TestRealWaiterWaitWithBreakpointOnFailure(t *testing.T) { t.Errorf("expected Wait() to have detected a non-zero file size by now") } } + +func TestRealWaiterWaitContextMissingFile(t *testing.T) { + // Create a temp file and then immediately delete it to get + // a legitimate tmp path and ensure the file doesnt exist + // prior to testing Wait(). + tmp, err := os.CreateTemp("", "real_waiter_test_file") + if err != nil { + t.Errorf("error creating temp file: %v", err) + } + os.Remove(tmp.Name()) + rw := realWaiter{} + doneCh := make(chan struct{}) + go func() { + err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(context.Background(), tmp.Name(), false, false) + if err != nil { + t.Errorf("error waiting on tmp file %q", tmp.Name()) + } + close(doneCh) + }() + + delay := time.NewTimer(2 * testWaitPollingInterval) + select { + case <-delay.C: + // Success + case <-doneCh: + t.Errorf("did not expect Wait() to have detected a file at path %q", tmp.Name()) + if !delay.Stop() { + <-delay.C + } + } +} diff --git a/config/config-feature-flags.yaml b/config/config-feature-flags.yaml index 4eb7bf4aeff..af73d4a005a 100644 --- a/config/config-feature-flags.yaml +++ b/config/config-feature-flags.yaml @@ -118,3 +118,6 @@ data: # This allows TaskRuns to run in namespaces with "restricted" pod security standards. # Not all Kubernetes implementations support this option. set-security-context: "false" + # Setting this flag to "true" will keep pod on cancellation + # allowing examination of the logs on the pods from cancelled taskruns + keep-pod-on-cancel: "false" diff --git a/docs/additional-configs.md b/docs/additional-configs.md index 598f6352acf..2bdf6c565e4 100644 --- a/docs/additional-configs.md +++ b/docs/additional-configs.md @@ -316,7 +316,8 @@ Features currently in "alpha" are: | [Trusted Resources](./trusted-resources.md) | [TEP-0091](https://github.com/tektoncd/community/blob/main/teps/0091-trusted-resources.md) | N/A | `trusted-resources-verification-no-match-policy` | | [Larger Results via Sidecar Logs](#enabling-larger-results-using-sidecar-logs) | [TEP-0127](https://github.com/tektoncd/community/blob/main/teps/0127-larger-results-via-sidecar-logs.md) | [v0.43.0](https://github.com/tektoncd/pipeline/releases/tag/v0.43.0) | `results-from` | | [Configure Default Resolver](./resolution.md#configuring-built-in-resolvers) | [TEP-0133](https://github.com/tektoncd/community/blob/main/teps/0133-configure-default-resolver.md) | N/A | | -| [Coschedule](./affinityassistants.md) | [TEP-0135](https://github.com/tektoncd/community/blob/main/teps/0135-coscheduling-pipelinerun-pods.md) | N/A |`coschedule` | +| [Coschedule](./affinityassistants.md) | [TEP-0135](https://github.com/tektoncd/community/blob/main/teps/0135-coscheduling-pipelinerun-pods.md) | N/A |`coschedule` | +| [keep pod on cancel](./taskruns.md#cancelling-a-taskrun) | N/A | v0.52 | keep-pod-on-cancel | ### Beta Features diff --git a/docs/taskruns.md b/docs/taskruns.md index 4cb81ed0bb1..f6046ee2ed2 100644 --- a/docs/taskruns.md +++ b/docs/taskruns.md @@ -910,6 +910,9 @@ When you cancel a TaskRun, the running pod associated with that `TaskRun` is del means that the logs of the `TaskRun` are not preserved. The deletion of the `TaskRun` pod is necessary in order to stop `TaskRun` step containers from running. +**Note: if `keep-pod-on-cancel` is set to +`"true"` in the `feature-flags`, the pod associated with that `TaskRun` will not be deleted** + Example of cancelling a `TaskRun`: ```yaml diff --git a/pkg/apis/config/feature_flags.go b/pkg/apis/config/feature_flags.go index e3f43cc2dce..5ad99f228ee 100644 --- a/pkg/apis/config/feature_flags.go +++ b/pkg/apis/config/feature_flags.go @@ -88,6 +88,10 @@ const ( DefaultSetSecurityContext = false // DefaultCoschedule is the default value for coschedule DefaultCoschedule = CoscheduleWorkspaces + // KeepPodOnCancel is the flag used to enable cancelling a pod using the entrypoint, and keep pod on cancel + KeepPodOnCancel = "keep-pod-on-cancel" + // DefaultEnableKeepPodOnCancel is the default value for "keep-pod-on-cancel" + DefaultEnableKeepPodOnCancel = false disableAffinityAssistantKey = "disable-affinity-assistant" disableCredsInitKey = "disable-creds-init" @@ -124,6 +128,7 @@ type FeatureFlags struct { SendCloudEventsForRuns bool AwaitSidecarReadiness bool EnforceNonfalsifiability string + EnableKeepPodOnCancel bool // VerificationNoMatchPolicy is the feature flag for "trusted-resources-verification-no-match-policy" // VerificationNoMatchPolicy can be set to "ignore", "warn" and "fail" values. // ignore: skip trusted resources verification when no matching verification policies found @@ -195,6 +200,9 @@ func NewFeatureFlagsFromMap(cfgMap map[string]string) (*FeatureFlags, error) { if err := setMaxResultSize(cfgMap, DefaultMaxResultSize, &tc.MaxResultSize); err != nil { return nil, err } + if err := setFeature(KeepPodOnCancel, DefaultEnableKeepPodOnCancel, &tc.EnableKeepPodOnCancel); err != nil { + return nil, err + } if err := setEnforceNonFalsifiability(cfgMap, &tc.EnforceNonfalsifiability); err != nil { return nil, err } diff --git a/pkg/apis/config/feature_flags_test.go b/pkg/apis/config/feature_flags_test.go index cddaca7c024..a6f4e60bb1a 100644 --- a/pkg/apis/config/feature_flags_test.go +++ b/pkg/apis/config/feature_flags_test.go @@ -69,6 +69,7 @@ func TestNewFeatureFlagsFromConfigMap(t *testing.T) { VerificationNoMatchPolicy: config.FailNoMatchPolicy, EnableProvenanceInStatus: false, ResultExtractionMethod: "termination-message", + EnableKeepPodOnCancel: true, MaxResultSize: 4096, SetSecurityContext: true, Coschedule: config.CoscheduleDisabled, @@ -259,6 +260,15 @@ func TestNewFeatureFlagsConfigMapErrors(t *testing.T) { }, { fileName: "feature-flags-invalid-coschedule", want: `invalid value for feature flag "coschedule": "invalid"`, + }, { + fileName: "feature-flags-invalid-keep-pod-on-cancel", + want: `failed parsing feature flags config "invalid": strconv.ParseBool: parsing "invalid": invalid syntax`, + }, { + fileName: "feature-flags-invalid-running-in-environment-with-injected-sidecars", + want: `failed parsing feature flags config "invalid-boolean": strconv.ParseBool: parsing "invalid-boolean": invalid syntax`, + }, { + fileName: "feature-flags-invalid-disable-affinity-assistant", + want: `failed parsing feature flags config "truee": strconv.ParseBool: parsing "truee": invalid syntax`, }} { t.Run(tc.fileName, func(t *testing.T) { cm := test.ConfigMapFromTestFile(t, tc.fileName) diff --git a/pkg/apis/config/testdata/feature-flags-all-flags-set.yaml b/pkg/apis/config/testdata/feature-flags-all-flags-set.yaml index a93a71588ea..9e01abd1944 100644 --- a/pkg/apis/config/testdata/feature-flags-all-flags-set.yaml +++ b/pkg/apis/config/testdata/feature-flags-all-flags-set.yaml @@ -31,3 +31,4 @@ data: trusted-resources-verification-no-match-policy: "fail" enable-provenance-in-status: "false" set-security-context: "true" + keep-pod-on-cancel: "true" diff --git a/pkg/apis/config/testdata/feature-flags-invalid-disable-affinity-assistant.yaml b/pkg/apis/config/testdata/feature-flags-invalid-disable-affinity-assistant.yaml new file mode 100644 index 00000000000..f6940f7b4cb --- /dev/null +++ b/pkg/apis/config/testdata/feature-flags-invalid-disable-affinity-assistant.yaml @@ -0,0 +1,21 @@ +# Copyright 2023 The Tekton Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: feature-flags + namespace: tekton-pipelines +data: + disable-affinity-assistant: "truee" diff --git a/pkg/apis/config/testdata/feature-flags-invalid-keep-pod-on-cancel.yaml b/pkg/apis/config/testdata/feature-flags-invalid-keep-pod-on-cancel.yaml new file mode 100644 index 00000000000..c28ca2b6e7d --- /dev/null +++ b/pkg/apis/config/testdata/feature-flags-invalid-keep-pod-on-cancel.yaml @@ -0,0 +1,21 @@ +# Copyright 2023 The Tekton Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: feature-flags + namespace: tekton-pipelines +data: + keep-pod-on-cancel: "invalid" diff --git a/pkg/apis/config/testdata/feature-flags-invalid-running-in-environment-with-injected-sidecars.yaml b/pkg/apis/config/testdata/feature-flags-invalid-running-in-environment-with-injected-sidecars.yaml new file mode 100644 index 00000000000..716c2311cf3 --- /dev/null +++ b/pkg/apis/config/testdata/feature-flags-invalid-running-in-environment-with-injected-sidecars.yaml @@ -0,0 +1,21 @@ +# Copyright 2023 The Tekton Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: feature-flags + namespace: tekton-pipelines +data: + running-in-environment-with-injected-sidecars: "invalid-boolean" diff --git a/pkg/entrypoint/entrypointer.go b/pkg/entrypoint/entrypointer.go index 1f1e63c1ed4..ba0efd66aba 100644 --- a/pkg/entrypoint/entrypointer.go +++ b/pkg/entrypoint/entrypointer.go @@ -26,10 +26,12 @@ import ( "path/filepath" "strconv" "strings" + "syscall" "time" "github.com/tektoncd/pipeline/pkg/apis/config" "github.com/tektoncd/pipeline/pkg/apis/pipeline" + "github.com/tektoncd/pipeline/pkg/pod" "github.com/tektoncd/pipeline/pkg/result" "github.com/tektoncd/pipeline/pkg/spire" "github.com/tektoncd/pipeline/pkg/termination" @@ -43,6 +45,21 @@ const ( FailOnError = "stopAndFail" ) +// ContextError context error type +type ContextError string + +// Error implements error interface +func (e ContextError) Error() string { + return string(e) +} + +var ( + // ErrContextDeadlineExceeded is the error returned when the context deadline is exceeded + ErrContextDeadlineExceeded = ContextError(context.DeadlineExceeded.Error()) + // ErrContextCanceled is the error returned when the context is canceled + ErrContextCanceled = ContextError(context.Canceled.Error()) +) + // Entrypointer holds fields for running commands with redirected // entrypoints. type Entrypointer struct { @@ -91,8 +108,8 @@ type Entrypointer struct { // Waiter encapsulates waiting for files to exist. type Waiter interface { - // Wait blocks until the specified file exists. - Wait(file string, expectContent bool, breakpointOnFailure bool) error + // Wait blocks until the specified file exists or the context is done. + Wait(ctx context.Context, file string, expectContent bool, breakpointOnFailure bool) error } // Runner encapsulates running commands. @@ -121,7 +138,7 @@ func (e Entrypointer) Go() error { }() for _, f := range e.WaitFiles { - if err := e.Waiter.Wait(f, e.WaitFileContent, e.BreakpointOnFailure); err != nil { + if err := e.Waiter.Wait(context.Background(), f, e.WaitFileContent, e.BreakpointOnFailure); err != nil { // An error happened while waiting, so we bail // *but* we write postfile to make next steps bail too. // In case of breakpoint on failure do not write post file. @@ -143,21 +160,26 @@ func (e Entrypointer) Go() error { ResultType: result.InternalTektonResultType, }) - ctx := context.Background() var err error - if e.Timeout != nil && *e.Timeout < time.Duration(0) { err = fmt.Errorf("negative timeout specified") } - + ctx := context.Background() + var cancel context.CancelFunc if err == nil { - var cancel context.CancelFunc - if e.Timeout != nil && *e.Timeout != time.Duration(0) { + ctx, cancel = context.WithCancel(ctx) + if e.Timeout != nil && *e.Timeout > time.Duration(0) { ctx, cancel = context.WithTimeout(ctx, *e.Timeout) - defer cancel() } + defer cancel() + // start a goroutine to listen for cancellation file + go func() { + if err := e.waitingCancellation(ctx, cancel); err != nil { + logger.Error("Error while waiting for cancellation", zap.Error(err)) + } + }() err = e.Runner.Run(ctx, e.Command...) - if errors.Is(err, context.DeadlineExceeded) { + if errors.Is(err, ErrContextDeadlineExceeded) { output = append(output, result.RunResult{ Key: "Reason", Value: "TimeoutExceeded", @@ -168,6 +190,15 @@ func (e Entrypointer) Go() error { var ee *exec.ExitError switch { + case err != nil && errors.Is(err, ErrContextCanceled): + logger.Info("Step was canceling") + output = append(output, result.RunResult{ + Key: "Reason", + Value: "Cancelled", + ResultType: result.InternalTektonResultType, + }) + e.WritePostFile(e.PostFile, ErrContextCanceled) + e.WriteExitCodeFile(e.StepMetadataDir, syscall.SIGKILL.String()) case err != nil && e.BreakpointOnFailure: logger.Info("Skipping writing to PostFile") case e.OnError == ContinueOnError && errors.As(err, &ee): @@ -267,3 +298,12 @@ func (e Entrypointer) WriteExitCodeFile(stepPath, content string) { exitCodeFile := filepath.Join(stepPath, "exitCode") e.PostWriter.Write(exitCodeFile, content) } + +// waitingCancellation waiting cancellation file, if no error occurs, call cancelFunc to cancel the context +func (e Entrypointer) waitingCancellation(ctx context.Context, cancel context.CancelFunc) error { + if err := e.Waiter.Wait(ctx, pod.DownwardMountCancelFile, true, false); err != nil { + return err + } + cancel() + return nil +} diff --git a/pkg/entrypoint/entrypointer_test.go b/pkg/entrypoint/entrypointer_test.go index f451593f839..d2b749cab1b 100644 --- a/pkg/entrypoint/entrypointer_test.go +++ b/pkg/entrypoint/entrypointer_test.go @@ -26,12 +26,14 @@ import ( "path" "path/filepath" "reflect" + "sync" "testing" "time" "github.com/google/go-cmp/cmp" "github.com/tektoncd/pipeline/pkg/apis/config" "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1beta1" + "github.com/tektoncd/pipeline/pkg/pod" "github.com/tektoncd/pipeline/pkg/result" "github.com/tektoncd/pipeline/pkg/spire" "github.com/tektoncd/pipeline/pkg/termination" @@ -194,15 +196,19 @@ func TestEntrypointer(t *testing.T) { } if len(c.waitFiles) > 0 { + fw.Lock() if fw.waited == nil { t.Error("Wanted waited file, got nil") - } else if !reflect.DeepEqual(fw.waited, c.waitFiles) { + } else if !reflect.DeepEqual(fw.waited, append(c.waitFiles, "/tekton/downward/cancel")) { t.Errorf("Waited for %v, want %v", fw.waited, c.waitFiles) } + fw.Unlock() } - if len(c.waitFiles) == 0 && fw.waited != nil { + fw.Lock() + if len(c.waitFiles) == 0 && len(fw.waited) != 1 { t.Errorf("Waited for file when not required") } + fw.Unlock() wantArgs := append([]string{c.entrypoint}, c.args...) if len(wantArgs) != 0 { @@ -602,10 +608,101 @@ func TestEntrypointerResults(t *testing.T) { } } -type fakeWaiter struct{ waited []string } +func Test_waitingCancellation(t *testing.T) { + testCases := []struct { + name string + expectCtxErr error + }{ + { + name: "stopOnCancel is true and want context canceled", + expectCtxErr: context.Canceled, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + fw := &fakeWaiter{} + err := Entrypointer{ + Waiter: fw, + }.waitingCancellation(ctx, cancel) + if err != nil { + t.Fatalf("Entrypointer waitingCancellation failed: %v", err) + } + if tc.expectCtxErr != nil && !errors.Is(ctx.Err(), tc.expectCtxErr) { + t.Errorf("expected context error %v, got %v", tc.expectCtxErr, ctx.Err()) + } + }) + } +} -func (f *fakeWaiter) Wait(file string, _ bool, _ bool) error { +func TestEntrypointerStopOnCancel(t *testing.T) { + testCases := []struct { + name string + runningDuration time.Duration + waitingDuration time.Duration + runningWaitingDuration time.Duration + expectError error + }{ + { + name: "generally running, expect no error", + runningDuration: 1 * time.Second, + runningWaitingDuration: 1 * time.Second, + waitingDuration: 2 * time.Second, + expectError: nil, + }, + { + name: "context canceled during running, expect context canceled error", + runningDuration: 2 * time.Second, + runningWaitingDuration: 2 * time.Second, + waitingDuration: 1 * time.Second, + expectError: ErrContextCanceled, + }, + { + name: "time exceeded during running, expect context deadline exceeded error", + runningDuration: 2 * time.Second, + runningWaitingDuration: 1 * time.Second, + waitingDuration: 1 * time.Second, + expectError: ErrContextDeadlineExceeded, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + terminationPath := "termination" + if terminationFile, err := os.CreateTemp("", "termination"); err != nil { + t.Fatalf("unexpected error creating temporary termination file: %v", err) + } else { + terminationPath = terminationFile.Name() + defer os.Remove(terminationFile.Name()) + } + fw := &fakeWaiter{waitCancelDuration: tc.waitingDuration} + fr := &fakeLongRunner{runningDuration: tc.runningDuration, waitingDuration: tc.runningWaitingDuration} + fp := &fakePostWriter{} + err := Entrypointer{ + Waiter: fw, + Runner: fr, + PostWriter: fp, + TerminationPath: terminationPath, + }.Go() + if !errors.Is(err, tc.expectError) { + t.Errorf("expected error %v, got %v", tc.expectError, err) + } + }) + } +} + +type fakeWaiter struct { + sync.Mutex + waited []string + waitCancelDuration time.Duration +} + +func (f *fakeWaiter) Wait(ctx context.Context, file string, _ bool, _ bool) error { + if file == pod.DownwardMountCancelFile && f.waitCancelDuration > 0 { + time.Sleep(f.waitCancelDuration) + } + f.Lock() f.waited = append(f.waited, file) + f.Unlock() return nil } @@ -633,7 +730,7 @@ func (f *fakePostWriter) Write(file, content string) { type fakeErrorWaiter struct{ waited *string } -func (f *fakeErrorWaiter) Wait(file string, expectContent bool, breakpointOnFailure bool) error { +func (f *fakeErrorWaiter) Wait(ctx context.Context, file string, expectContent bool, breakpointOnFailure bool) error { f.waited = &file return errors.New("waiter failed") } @@ -672,6 +769,29 @@ func (f *fakeExitErrorRunner) Run(ctx context.Context, args ...string) error { return exec.Command("ls", "/bogus/path").Run() } +type fakeLongRunner struct { + runningDuration time.Duration + waitingDuration time.Duration +} + +func (f *fakeLongRunner) Run(ctx context.Context, _ ...string) error { + if f.waitingDuration < f.runningDuration { + return ErrContextDeadlineExceeded + } + select { + case <-time.After(f.runningDuration): + return nil + case <-ctx.Done(): + if errors.Is(ctx.Err(), context.Canceled) { + return ErrContextCanceled + } + if errors.Is(ctx.Err(), context.DeadlineExceeded) { + return ErrContextDeadlineExceeded + } + return nil + } +} + type fakeResultsWriter struct { args *[]string resultsToWrite map[string]string diff --git a/pkg/pod/entrypoint.go b/pkg/pod/entrypoint.go index f1cb453dc55..d7cd3506405 100644 --- a/pkg/pod/entrypoint.go +++ b/pkg/pod/entrypoint.go @@ -59,6 +59,10 @@ const ( stepPrefix = "step-" sidecarPrefix = "sidecar-" + + downwardMountCancelFile = "cancel" + cancelAnnotation = "tekton.dev/cancel" + cancelAnnotationValue = "CANCEL" ) var ( @@ -81,6 +85,12 @@ var ( MountPath: pipeline.StepsDir, } + downwardCancelVolumeItem = corev1.DownwardAPIVolumeFile{ + Path: downwardMountCancelFile, + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: fmt.Sprintf("metadata.annotations['%s']", cancelAnnotation), + }, + } // TODO(#1605): Signal sidecar readiness by injecting entrypoint, // remove dependency on Downward API. downwardVolume = corev1.Volume{ @@ -103,6 +113,8 @@ var ( // since the volume itself is readonly, but including for completeness. ReadOnly: true, } + // DownwardMountCancelFile is cancellation file mount to step, entrypoint will check this file to cancel the step. + DownwardMountCancelFile = filepath.Join(downwardMountPoint, downwardMountCancelFile) ) // orderContainers returns the specified steps, modified so that they are @@ -112,7 +124,7 @@ var ( // command, we must have fetched the image's ENTRYPOINT before calling this // method, using entrypoint_lookup.go. // Additionally, Step timeouts are added as entrypoint flag. -func orderContainers(commonExtraEntrypointArgs []string, steps []corev1.Container, taskSpec *v1.TaskSpec, breakpointConfig *v1.TaskRunDebug, waitForReadyAnnotation bool) ([]corev1.Container, error) { +func orderContainers(commonExtraEntrypointArgs []string, steps []corev1.Container, taskSpec *v1.TaskSpec, breakpointConfig *v1.TaskRunDebug, waitForReadyAnnotation, enableKeepPodOnCancel bool) ([]corev1.Container, error) { if len(steps) == 0 { return nil, errors.New("No steps specified") } @@ -177,10 +189,11 @@ func orderContainers(commonExtraEntrypointArgs []string, steps []corev1.Containe steps[i].Command = []string{entrypointBinary} steps[i].Args = argsForEntrypoint steps[i].TerminationMessagePath = terminationPath - } - if waitForReadyAnnotation { - // Mount the Downward volume into the first step container. - steps[0].VolumeMounts = append(steps[0].VolumeMounts, downwardMount) + if (i == 0 && waitForReadyAnnotation) || enableKeepPodOnCancel { + // Mount the Downward volume into the first step container. + // if enableKeepPodOnCancel is true, mount the Downward volume into all the steps. + steps[i].VolumeMounts = append(steps[i].VolumeMounts, downwardMount) + } } return steps, nil @@ -201,7 +214,7 @@ func collectResultsName(results []v1.TaskResult) string { return strings.Join(resultNames, ",") } -var replaceReadyPatchBytes []byte +var replaceReadyPatchBytes, replaceCancelPatchBytes []byte func init() { // https://stackoverflow.com/questions/55573724/create-a-patch-to-add-a-kubernetes-annotation @@ -215,6 +228,24 @@ func init() { if err != nil { log.Fatalf("failed to marshal replace ready patch bytes: %v", err) } + + cancelAnnotationPath := "/metadata/annotations/" + strings.Replace(cancelAnnotation, "/", "~1", 1) + replaceCancelPatchBytes, err = json.Marshal([]jsonpatch.JsonPatchOperation{{ + Operation: "replace", + Path: cancelAnnotationPath, + Value: cancelAnnotationValue, + }}) + if err != nil { + log.Fatalf("failed to marshal replace cancel patch bytes: %v", err) + } +} + +// CancelPod cancels the pod +func CancelPod(ctx context.Context, kubeClient kubernetes.Interface, namespace, podName string) error { + // PATCH the Pod's annotations to replace the cancel annotation with the + // "CANCEL" value, to signal the pod to be cancelled. + _, err := kubeClient.CoreV1().Pods(namespace).Patch(ctx, podName, types.JSONPatchType, replaceCancelPatchBytes, metav1.PatchOptions{}) + return err } // UpdateReady updates the Pod's annotations to signal the first step to start diff --git a/pkg/pod/entrypoint_test.go b/pkg/pod/entrypoint_test.go index 766d9d3c1c4..56d57c8a146 100644 --- a/pkg/pod/entrypoint_test.go +++ b/pkg/pod/entrypoint_test.go @@ -95,7 +95,7 @@ func TestOrderContainers(t *testing.T) { }, TerminationMessagePath: "/tekton/termination", }} - got, err := orderContainers([]string{}, steps, nil, nil, true) + got, err := orderContainers([]string{}, steps, nil, nil, true, false) if err != nil { t.Fatalf("orderContainers: %v", err) } @@ -163,7 +163,7 @@ func TestOrderContainersWithResultsSidecarLogs(t *testing.T) { }, TerminationMessagePath: "/tekton/termination", }} - got, err := orderContainers([]string{"-dont_send_results_to_termination_path"}, steps, nil, nil, true) + got, err := orderContainers([]string{"-dont_send_results_to_termination_path"}, steps, nil, nil, true, false) if err != nil { t.Fatalf("orderContainers: %v", err) } @@ -209,7 +209,7 @@ func TestOrderContainersWithNoWait(t *testing.T) { VolumeMounts: []corev1.VolumeMount{volumeMount}, TerminationMessagePath: "/tekton/termination", }} - got, err := orderContainers([]string{}, steps, nil, nil, false) + got, err := orderContainers([]string{}, steps, nil, nil, false, false) if err != nil { t.Fatalf("orderContainers: %v", err) } @@ -245,7 +245,35 @@ func TestOrderContainersWithDebugOnFailure(t *testing.T) { OnFailure: "enabled", }, } - got, err := orderContainers([]string{}, steps, nil, taskRunDebugConfig, true) + got, err := orderContainers([]string{}, steps, nil, taskRunDebugConfig, true, false) + if err != nil { + t.Fatalf("orderContainers: %v", err) + } + if d := cmp.Diff(want, got); d != "" { + t.Errorf("Diff %s", diff.PrintWantGot(d)) + } +} + +func TestOrderContainersWithEnabelKeepPodOnCancel(t *testing.T) { + steps := []corev1.Container{{ + Image: "step-1", + Command: []string{"cmd"}, + Args: []string{"arg1", "arg2"}, + }} + want := []corev1.Container{{ + Image: "step-1", + Command: []string{entrypointBinary}, + Args: []string{ + "-post_file", "/tekton/run/0/out", + "-termination_path", "/tekton/termination", + "-step_metadata_dir", "/tekton/run/0/status", + "-entrypoint", "cmd", "--", + "arg1", "arg2", + }, + VolumeMounts: []corev1.VolumeMount{downwardMount}, + TerminationMessagePath: "/tekton/termination", + }} + got, err := orderContainers([]string{}, steps, nil, nil, false, true) if err != nil { t.Fatalf("orderContainers: %v", err) } @@ -323,7 +351,7 @@ func TestEntryPointResults(t *testing.T) { }, TerminationMessagePath: "/tekton/termination", }} - got, err := orderContainers([]string{}, steps, &taskSpec, nil, true) + got, err := orderContainers([]string{}, steps, &taskSpec, nil, true, false) if err != nil { t.Fatalf("orderContainers: %v", err) } @@ -364,7 +392,7 @@ func TestEntryPointResultsSingleStep(t *testing.T) { VolumeMounts: []corev1.VolumeMount{downwardMount}, TerminationMessagePath: "/tekton/termination", }} - got, err := orderContainers([]string{}, steps, &taskSpec, nil, true) + got, err := orderContainers([]string{}, steps, &taskSpec, nil, true, false) if err != nil { t.Fatalf("orderContainers: %v", err) } @@ -401,7 +429,7 @@ func TestEntryPointSingleResultsSingleStep(t *testing.T) { VolumeMounts: []corev1.VolumeMount{downwardMount}, TerminationMessagePath: "/tekton/termination", }} - got, err := orderContainers([]string{}, steps, &taskSpec, nil, true) + got, err := orderContainers([]string{}, steps, &taskSpec, nil, true, false) if err != nil { t.Fatalf("orderContainers: %v", err) } @@ -472,7 +500,7 @@ func TestEntryPointOnError(t *testing.T) { err: errors.New("task step onError must be either \"continue\" or \"stopAndFail\" but it is set to an invalid value \"invalid-on-error\""), }} { t.Run(tc.desc, func(t *testing.T) { - got, err := orderContainers([]string{}, steps, &tc.taskSpec, nil, true) + got, err := orderContainers([]string{}, steps, &tc.taskSpec, nil, true, false) if len(tc.wantContainers) == 0 { if err == nil { t.Fatalf("expected an error for an invalid value for onError but received none") @@ -571,7 +599,7 @@ func TestEntryPointStepOutputConfigs(t *testing.T) { }, TerminationMessagePath: "/tekton/termination", }} - got, err := orderContainers([]string{}, steps, &taskSpec, nil, true) + got, err := orderContainers([]string{}, steps, &taskSpec, nil, true, false) if err != nil { t.Fatalf("orderContainers: %v", err) } @@ -874,3 +902,96 @@ func TestStopSidecars(t *testing.T) { }) } } + +func TestCancelPod(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + Annotations: map[string]string{"tekton.dev/cancel": ""}, + }, + } + kubeClient := fakek8s.NewSimpleClientset(pod) + type args struct { + namespace string + podName string + } + testCases := []struct { + name string + args args + expectedErr bool + }{ + { + name: "cancel existed pod", + args: args{ + namespace: "default", + podName: "test-pod", + }, + expectedErr: false, + }, + { + name: "cancel non-existed pod", + args: args{ + namespace: "default", + podName: "non-existed-pod", + }, + expectedErr: true, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + err := CancelPod(context.Background(), kubeClient, tc.args.namespace, tc.args.podName) + if (err != nil) != tc.expectedErr { + t.Errorf("expected error: %v, but got: %v", tc.expectedErr, err) + } + }) + } +} + +func TestIsSidecarStatusRunning(t *testing.T) { + testCases := []struct { + name string + tr *v1.TaskRun + want bool + }{ + { + name: "task not running", + tr: &v1.TaskRun{ + Status: v1.TaskRunStatus{ + TaskRunStatusFields: v1.TaskRunStatusFields{ + Sidecars: []v1.SidecarState{ + { + ContainerState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{}, + }, + }, + }, + }, + }, + }, + want: false, + }, + { + name: "task running", + tr: &v1.TaskRun{ + Status: v1.TaskRunStatus{ + TaskRunStatusFields: v1.TaskRunStatusFields{ + Sidecars: []v1.SidecarState{ + { + ContainerState: corev1.ContainerState{}, + }, + }, + }, + }, + }, + want: true, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if got := IsSidecarStatusRunning(tc.tr); got != tc.want { + t.Errorf("expected IsSidecarStatusRunning: %v, but got: %v", tc.want, got) + } + }) + } +} diff --git a/pkg/pod/pod.go b/pkg/pod/pod.go index 74468b12761..e99ee7e2c62 100644 --- a/pkg/pod/pod.go +++ b/pkg/pod/pod.go @@ -151,6 +151,7 @@ func (b *Builder) Build(ctx context.Context, taskRun *v1.TaskRun, taskSpec v1.Ta defaultForbiddenEnv := config.FromContextOrDefaults(ctx).Defaults.DefaultForbiddenEnv alphaAPIEnabled := featureFlags.EnableAPIFields == config.AlphaAPIFields sidecarLogsResultsEnabled := config.FromContextOrDefaults(ctx).FeatureFlags.ResultExtractionMethod == config.ResultExtractionMethodSidecarLogs + enableKeepPodOnCancel := alphaAPIEnabled && featureFlags.EnableKeepPodOnCancel setSecurityContext := config.FromContextOrDefaults(ctx).FeatureFlags.SetSecurityContext // Add our implicit volumes first, so they can be overridden by the user if they prefer. @@ -238,16 +239,20 @@ func (b *Builder) Build(ctx context.Context, taskRun *v1.TaskRun, taskSpec v1.Ta readyImmediately := isPodReadyImmediately(*featureFlags, taskSpec.Sidecars) if alphaAPIEnabled { - stepContainers, err = orderContainers(commonExtraEntrypointArgs, stepContainers, &taskSpec, taskRun.Spec.Debug, !readyImmediately) + stepContainers, err = orderContainers(commonExtraEntrypointArgs, stepContainers, &taskSpec, taskRun.Spec.Debug, !readyImmediately, enableKeepPodOnCancel) } else { - stepContainers, err = orderContainers(commonExtraEntrypointArgs, stepContainers, &taskSpec, nil, !readyImmediately) + stepContainers, err = orderContainers(commonExtraEntrypointArgs, stepContainers, &taskSpec, nil, !readyImmediately, enableKeepPodOnCancel) } if err != nil { return nil, err } volumes = append(volumes, binVolume) - if !readyImmediately { - volumes = append(volumes, downwardVolume) + if !readyImmediately || enableKeepPodOnCancel { + downwardVolumeDup := downwardVolume.DeepCopy() + if enableKeepPodOnCancel { + downwardVolumeDup.VolumeSource.DownwardAPI.Items = append(downwardVolumeDup.VolumeSource.DownwardAPI.Items, downwardCancelVolumeItem) + } + volumes = append(volumes, *downwardVolumeDup) } // Order of precedence for envs diff --git a/pkg/pod/pod_test.go b/pkg/pod/pod_test.go index ff5ee7db064..6a8960fcc40 100644 --- a/pkg/pod/pod_test.go +++ b/pkg/pod/pod_test.go @@ -2079,6 +2079,120 @@ _EOF_ }, runVolume(0)), ActiveDeadlineSeconds: &defaultActiveDeadlineSeconds, }, + }, { + desc: "keep pod on cancel enabled", + featureFlags: map[string]string{"keep-pod-on-cancel": "true", "enable-api-fields": "alpha"}, + ts: v1.TaskSpec{ + Steps: []v1.Step{{ + Name: "name", + Image: "image", + Command: []string{"cmd"}, // avoid entrypoint lookup. + }}, + }, + want: &corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + InitContainers: []corev1.Container{ + entrypointInitContainer(images.EntrypointImage, []v1.Step{{Name: "name"}}, false, false), + }, + Containers: []corev1.Container{{ + Name: "step-name", + Image: "image", + Command: []string{"/tekton/bin/entrypoint"}, + Args: []string{ + "-wait_file", + "/tekton/downward/ready", + "-wait_file_content", + "-post_file", + "/tekton/run/0/out", + "-termination_path", + "/tekton/termination", + "-step_metadata_dir", + "/tekton/run/0/status", + "-entrypoint", + "cmd", + "--", + }, + VolumeMounts: append([]corev1.VolumeMount{binROMount, runMount(0, false), downwardMount, { + Name: "tekton-creds-init-home-0", + MountPath: "/tekton/creds", + }}, implicitVolumeMounts...), + TerminationMessagePath: "/tekton/termination", + }}, + Volumes: append(implicitVolumes, binVolume, runVolume(0), corev1.Volume{ + Name: downwardVolumeName, + VolumeSource: corev1.VolumeSource{ + DownwardAPI: &corev1.DownwardAPIVolumeSource{ + Items: []corev1.DownwardAPIVolumeFile{{ + Path: downwardMountReadyFile, + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: fmt.Sprintf("metadata.annotations['%s']", readyAnnotation), + }, + }, downwardCancelVolumeItem}, + }, + }, + }, corev1.Volume{ + Name: "tekton-creds-init-home-0", + VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{Medium: corev1.StorageMediumMemory}}, + }), + ActiveDeadlineSeconds: &defaultActiveDeadlineSeconds, + }, + }, { + desc: "keep pod on cancel enabled but not alpha", + featureFlags: map[string]string{"keep-pod-on-cancel": "true"}, + ts: v1.TaskSpec{ + Steps: []v1.Step{{ + Name: "name", + Image: "image", + Command: []string{"cmd"}, // avoid entrypoint lookup. + }}, + }, + want: &corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + InitContainers: []corev1.Container{ + entrypointInitContainer(images.EntrypointImage, []v1.Step{{Name: "name"}}, false, false), + }, + Containers: []corev1.Container{{ + Name: "step-name", + Image: "image", + Command: []string{"/tekton/bin/entrypoint"}, + Args: []string{ + "-wait_file", + "/tekton/downward/ready", + "-wait_file_content", + "-post_file", + "/tekton/run/0/out", + "-termination_path", + "/tekton/termination", + "-step_metadata_dir", + "/tekton/run/0/status", + "-entrypoint", + "cmd", + "--", + }, + VolumeMounts: append([]corev1.VolumeMount{binROMount, runMount(0, false), downwardMount, { + Name: "tekton-creds-init-home-0", + MountPath: "/tekton/creds", + }}, implicitVolumeMounts...), + TerminationMessagePath: "/tekton/termination", + }}, + Volumes: append(implicitVolumes, binVolume, runVolume(0), corev1.Volume{ + Name: downwardVolumeName, + VolumeSource: corev1.VolumeSource{ + DownwardAPI: &corev1.DownwardAPIVolumeSource{ + Items: []corev1.DownwardAPIVolumeFile{{ + Path: downwardMountReadyFile, + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: fmt.Sprintf("metadata.annotations['%s']", readyAnnotation), + }, + }}, + }, + }, + }, corev1.Volume{ + Name: "tekton-creds-init-home-0", + VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{Medium: corev1.StorageMediumMemory}}, + }), + ActiveDeadlineSeconds: &defaultActiveDeadlineSeconds, + }, }} { t.Run(c.desc, func(t *testing.T) { names.TestingSeed() diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index 5a9a891fc2a..a72f278eab4 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -687,10 +687,14 @@ func (c *Reconciler) failTaskRun(ctx context.Context, tr *v1.TaskRun, reason v1. return nil } - // tr.Status.PodName will be empty if the pod was never successfully created. This condition - // can be reached, for example, by the pod never being schedulable due to limits imposed by - // a namespace's ResourceQuota. - err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Delete(ctx, tr.Status.PodName, metav1.DeleteOptions{}) + var err error + if reason == v1.TaskRunReasonCancelled && + (config.FromContextOrDefaults(ctx).FeatureFlags.EnableKeepPodOnCancel && config.FromContextOrDefaults(ctx).FeatureFlags.EnableAPIFields == config.AlphaAPIFields) { + logger.Infof("canceling task run %q by entrypoint", tr.Name) + err = podconvert.CancelPod(ctx, c.KubeClientSet, tr.Namespace, tr.Status.PodName) + } else { + err = c.KubeClientSet.CoreV1().Pods(tr.Namespace).Delete(ctx, tr.Status.PodName, metav1.DeleteOptions{}) + } if err != nil && !k8serrors.IsNotFound(err) { logger.Infof("Failed to terminate pod: %v", err) return err diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index fcda8c5974f..b77b3a51891 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -3583,6 +3583,7 @@ func TestFailTaskRun(t *testing.T) { pod *corev1.Pod reason v1.TaskRunReason message string + featureFlags map[string]string expectedStatus apis.Condition expectedStepStates []v1.StepState }{{ @@ -3679,6 +3680,51 @@ status: }, }, }, + }, { + name: "step-status-update-cancel-with-keep-pod-on-cancel", + taskRun: parse.MustParseV1TaskRun(t, ` +metadata: + name: test-taskrun-run-cancel + namespace: foo +spec: + status: TaskRunCancelled + statusMessage: "Test cancellation message." + taskRef: + name: test-task +status: + conditions: + - status: Unknown + type: Succeeded + podName: foo-is-bar + steps: + - running: + startedAt: "2022-01-01T00:00:00Z" +`), + pod: &corev1.Pod{ObjectMeta: metav1.ObjectMeta{ + Namespace: "foo", + Name: "foo-is-bar", + }}, + reason: v1.TaskRunReasonCancelled, + message: "TaskRun test-taskrun-run-cancel was cancelled. Test cancellation message.", + featureFlags: map[string]string{ + "keep-pod-on-cancel": "true", + }, + expectedStatus: apis.Condition{ + Type: apis.ConditionSucceeded, + Status: corev1.ConditionFalse, + Reason: v1beta1.TaskRunReasonCancelled.String(), + Message: "TaskRun test-taskrun-run-cancel was cancelled. Test cancellation message.", + }, + expectedStepStates: []v1.StepState{ + { + ContainerState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + Reason: v1.TaskRunReasonCancelled.String(), + }, + }, + }, + }, }, { name: "step-status-update-timeout", taskRun: parse.MustParseV1TaskRun(t, ` @@ -3904,6 +3950,15 @@ status: t.Run(tc.name, func(t *testing.T) { d := test.Data{ TaskRuns: []*v1.TaskRun{tc.taskRun}, + ConfigMaps: []*corev1.ConfigMap{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: config.GetFeatureFlagsConfigName(), + Namespace: system.Namespace(), + }, + Data: tc.featureFlags, + }, + }, } if tc.pod != nil { d.Pods = []*corev1.Pod{tc.pod}