From 2bd7909900e56986e33d4ef095f7c1f064060769 Mon Sep 17 00:00:00 2001 From: Matt Moore Date: Mon, 13 Dec 2021 18:22:47 -0800 Subject: [PATCH] Fix `TestDAGPipelineRun` flakiness. _See also the linked issue for a detailed explanation of the issue this fixes._ This change alters the DAG tests in two meaningful ways: 1. Have the tasks sleep, to actually increase the likelihood of task execution overlap, 2. Use the sleep duration for the minimum delta in start times. These changes combine should guarantee that the tasks *actually* executed in parallel, but the second part also enables this test to be less flaky on busy clusters where `5s` may not be sufficient for the task to start. A fun anecdote to note here is that the Kubernetes [SLO for Pod startup latency](https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md#definition) is `5s` at `99P`, which means Tekton has effectively zero room for overhead. Fixes: https://github.com/tektoncd/pipeline/issues/4418 --- test/dag_test.go | 10 ++++++++-- test/v1alpha1/dag_test.go | 11 +++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/test/dag_test.go b/test/dag_test.go index d0f65cbe1ed..9ed648d902b 100644 --- a/test/dag_test.go +++ b/test/dag_test.go @@ -34,6 +34,8 @@ import ( knativetest "knative.dev/pkg/test" ) +const sleepDuration = 30 * time.Second + // TestDAGPipelineRun creates a graph of arbitrary Tasks, then looks at the corresponding // TaskRun start times to ensure they were run in the order intended, which is: // | @@ -73,9 +75,13 @@ spec: steps: - image: busybox script: 'echo $(params["text"])' + - image: busybox + # Sleep for N seconds so that we can check that tasks that + # should be run in parallel have overlap. + script: 'sleep %d' - image: busybox script: 'ln -s $(resources.inputs.repo.path) $(resources.outputs.repo.path)' -`, namespace)) +`, namespace, int(sleepDuration.Seconds()))) if _, err := c.TaskClient.Create(ctx, echoTask, metav1.CreateOptions{}); err != nil { t.Fatalf("Failed to create echo Task: %s", err) } @@ -242,7 +248,7 @@ func verifyExpectedOrder(ctx context.Context, t *testing.T, c clientset.TaskRunI s1 := taskRuns[1].Status.StartTime.Time s2 := taskRuns[2].Status.StartTime.Time absDiff := time.Duration(math.Abs(float64(s2.Sub(s1)))) - if absDiff > (time.Second * 5) { + if absDiff > sleepDuration { t.Errorf("Expected parallel tasks to execute more or less at the same time, but they were %v apart", absDiff) } } diff --git a/test/v1alpha1/dag_test.go b/test/v1alpha1/dag_test.go index f4d3d5a64af..fdda66c5bbc 100644 --- a/test/v1alpha1/dag_test.go +++ b/test/v1alpha1/dag_test.go @@ -1,3 +1,4 @@ +//go:build e2e // +build e2e /* @@ -34,6 +35,8 @@ import ( knativetest "knative.dev/pkg/test" ) +const sleepDuration = 30 * time.Second + // TestDAGPipelineRun creates a graph of arbitrary Tasks, then looks at the corresponding // TaskRun start times to ensure they were run in the order intended, which is: // | @@ -75,9 +78,13 @@ spec: steps: - image: busybox script: 'echo $(params["text"])' + - image: busybox + # Sleep for N seconds so that we can check that tasks that + # should be run in parallel have overlap. + script: 'sleep %d' - image: busybox script: 'ln -s $(resources.inputs.repo.path) $(resources.outputs.repo.path)' -`, namespace)) +`, namespace, int(sleepDuration.Seconds()))) if _, err := c.TaskClient.Create(ctx, echoTask, metav1.CreateOptions{}); err != nil { t.Fatalf("Failed to create echo Task: %s", err) } @@ -244,7 +251,7 @@ func verifyExpectedOrder(ctx context.Context, t *testing.T, c clientset.TaskRunI s1 := taskRuns[1].Status.StartTime.Time s2 := taskRuns[2].Status.StartTime.Time absDiff := time.Duration(math.Abs(float64(s2.Sub(s1)))) - if absDiff > (time.Second * 5) { + if absDiff > sleepDuration { t.Errorf("Expected parallel tasks to execute more or less at the same time, but they were %v apart", absDiff) } }