From be757f6acd22e3c56ad43499fc737168dc5d9225 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:27:05 +0200 Subject: [PATCH 01/16] fix: continue paused container, when the abort is requested --- cmd/testworkflow-init/orchestration/executions.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/testworkflow-init/orchestration/executions.go b/cmd/testworkflow-init/orchestration/executions.go index 98277c4e534..9cc2cb0e7b1 100644 --- a/cmd/testworkflow-init/orchestration/executions.go +++ b/cmd/testworkflow-init/orchestration/executions.go @@ -133,6 +133,7 @@ func (e *executionGroup) Kill() (err error) { func (e *executionGroup) Abort() { e.aborted.Store(true) _ = e.Kill() + _ = e.Resume() } func (e *executionGroup) IsAborted() bool { From 7022c543d956a2dbe42b956b1b48c06331795f59 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:28:00 +0200 Subject: [PATCH 02/16] fix: ensure the lightweight container watcher will get `finishedAt` timestamp --- pkg/testworkflows/testworkflowcontroller/controller.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/testworkflows/testworkflowcontroller/controller.go b/pkg/testworkflows/testworkflowcontroller/controller.go index 93cbf05de4e..81f8b4de5f3 100644 --- a/pkg/testworkflows/testworkflowcontroller/controller.go +++ b/pkg/testworkflows/testworkflowcontroller/controller.go @@ -231,6 +231,7 @@ func (c *controller) WatchLightweight(parentCtx context.Context) <-chan Lightwei prevNodeName := "" prevPodIP := "" prevStatus := testkube.QUEUED_TestWorkflowStatus + prevIsFinished := false sig := stage.MapSignatureListToInternal(c.signature) ch := make(chan LightweightNotification) go func() { @@ -245,6 +246,7 @@ func (c *controller) WatchLightweight(parentCtx context.Context) <-chan Lightwei podIP, _ := c.PodIP(parentCtx) current := prevCurrent status := prevStatus + isFinished := prevIsFinished if v.Value.Result != nil { if v.Value.Result.Status != nil { status = *v.Value.Result.Status @@ -252,13 +254,17 @@ func (c *controller) WatchLightweight(parentCtx context.Context) <-chan Lightwei status = testkube.QUEUED_TestWorkflowStatus } current = v.Value.Result.Current(sig) + isFinished = v.Value.Result.IsFinished() } - if nodeName != prevNodeName || podIP != prevPodIP || prevStatus != status || prevCurrent != current { + // TODO: the final status should always have the finishedAt too, + // there should be no need for checking isFinished diff + if nodeName != prevNodeName || isFinished != prevIsFinished || podIP != prevPodIP || prevStatus != status || prevCurrent != current { prevNodeName = nodeName prevPodIP = podIP prevStatus = status prevCurrent = current + prevIsFinished = isFinished ch <- LightweightNotification{NodeName: nodeName, PodIP: podIP, Status: status, Current: current, Result: v.Value.Result} } } From 3e349ed9dc51491a6fbbf0de79d62254ba0cff0b Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:28:36 +0200 Subject: [PATCH 03/16] chore: add minor todos --- cmd/testworkflow-init/main.go | 2 ++ pkg/testworkflows/testworkflowprocessor/processor.go | 1 + 2 files changed, 3 insertions(+) diff --git a/cmd/testworkflow-init/main.go b/cmd/testworkflow-init/main.go index 6d37c36437f..3c7be29e66d 100644 --- a/cmd/testworkflow-init/main.go +++ b/cmd/testworkflow-init/main.go @@ -110,6 +110,7 @@ func main() { orchestration.Pause(step, *step.StartedAt) for _, parentRef := range step.Parents { parent := state.GetStep(parentRef) + // TODO: What about parents of the parents? orchestration.Pause(parent, *step.StartedAt) } return err @@ -125,6 +126,7 @@ func main() { orchestration.Resume(step, ts) for _, parentRef := range step.Parents { parent := state.GetStep(parentRef) + // TODO: What about parents of the parents? orchestration.Resume(parent, ts) } return err diff --git a/pkg/testworkflows/testworkflowprocessor/processor.go b/pkg/testworkflows/testworkflowprocessor/processor.go index f941003205f..66c863327ee 100644 --- a/pkg/testworkflows/testworkflowprocessor/processor.go +++ b/pkg/testworkflows/testworkflowprocessor/processor.go @@ -361,6 +361,7 @@ func (p *processor) Bundle(ctx context.Context, workflow *testworkflowsv1.TestWo podSpec.Spec.Containers = containers[len(containers)-1:] // Build job spec + // TODO: Add ownerReferences in case of parent pod? jobSpec := batchv1.Job{ TypeMeta: metav1.TypeMeta{ Kind: "Job", From 1d3c269657b9d5ac0fa58d449a46577edc78b01c Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:29:30 +0200 Subject: [PATCH 04/16] fix: configure no preemption policy by default for Test Workflows --- pkg/testworkflows/testworkflowprocessor/processor.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/testworkflows/testworkflowprocessor/processor.go b/pkg/testworkflows/testworkflowprocessor/processor.go index 66c863327ee..4a1b841cfe9 100644 --- a/pkg/testworkflows/testworkflowprocessor/processor.go +++ b/pkg/testworkflows/testworkflowprocessor/processor.go @@ -326,6 +326,9 @@ func (p *processor) Bundle(ctx context.Context, workflow *testworkflowsv1.TestWo if podConfig.SecurityContext.FSGroup == nil { podConfig.SecurityContext.FSGroup = common.Ptr(constants.DefaultFsGroup) } + if podConfig.PreemptionPolicy == nil { + podConfig.PreemptionPolicy = common.Ptr(corev1.PreemptNever) + } podSpec := corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Annotations: podConfig.Annotations, From 47b9935e4df4ed828c945f2cab80f2440ea72485 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:29:59 +0200 Subject: [PATCH 05/16] fix: allow Test Workflow status notifier to update "Aborted" status with details --- pkg/testworkflows/testworkflowcontroller/notifier.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/testworkflows/testworkflowcontroller/notifier.go b/pkg/testworkflows/testworkflowcontroller/notifier.go index 83e2985aeb1..7892c3eb10d 100644 --- a/pkg/testworkflows/testworkflowcontroller/notifier.go +++ b/pkg/testworkflows/testworkflowcontroller/notifier.go @@ -312,7 +312,7 @@ func (n *notifier) UpdateStepStatus(ref string, status testkube.TestWorkflowStep } func (n *notifier) finishInit(status ContainerResultStep) { - if n.result.Initialization.FinishedAt.Equal(status.FinishedAt) && n.result.Initialization.Status != nil && *n.result.Initialization.Status == status.Status { + if n.result.Initialization.FinishedAt.Equal(status.FinishedAt) && n.result.Initialization.Status != nil && *n.result.Initialization.Status == status.Status && (status.Status != testkube.ABORTED_TestWorkflowStepStatus || n.result.Initialization.ErrorMessage == status.Details) { return } n.result.Initialization.FinishedAt = status.FinishedAt.UTC() @@ -352,7 +352,7 @@ func (n *notifier) FinishStep(ref string, status ContainerResultStep) { n.finishInit(status) return } - if n.result.Steps[ref].FinishedAt.Equal(status.FinishedAt) && n.result.Steps[ref].Status != nil && *n.result.Steps[ref].Status == status.Status { + if n.result.Steps[ref].FinishedAt.Equal(status.FinishedAt) && n.result.Steps[ref].Status != nil && *n.result.Steps[ref].Status == status.Status && (status.Status != testkube.ABORTED_TestWorkflowStepStatus || n.result.Steps[ref].ErrorMessage == status.Details) { return } s := n.result.Steps[ref] From 0f3cdf9d5a7983027827ca70fd12e745755bcfbc Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:30:29 +0200 Subject: [PATCH 06/16] fix: ensure the parallel workers will not end without result --- cmd/tcl/testworkflow-toolkit/commands/parallel.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmd/tcl/testworkflow-toolkit/commands/parallel.go b/cmd/tcl/testworkflow-toolkit/commands/parallel.go index 24405cf1565..d9831faacc6 100644 --- a/cmd/tcl/testworkflow-toolkit/commands/parallel.go +++ b/cmd/tcl/testworkflow-toolkit/commands/parallel.go @@ -263,6 +263,7 @@ func NewParallelCmd() *cobra.Command { prevStatus := testkube.QUEUED_TestWorkflowStatus prevStep := "" + prevIsFinished := false scheduled := false for v := range ctrl.WatchLightweight(ctx) { // Handle error @@ -283,14 +284,17 @@ func NewParallelCmd() *cobra.Command { } // Handle result change - if v.Status != prevStatus || v.Current != prevStep { + // TODO: the final status should always have the finishedAt too, + // there should be no need for checking isFinished diff + if v.Status != prevStatus || lastResult.IsFinished() != prevIsFinished || v.Current != prevStep { if v.Status != prevStatus { log(string(v.Status)) } updates <- Update{index: index, result: v.Result} prevStep = v.Current prevStatus = v.Status - if v.Result.IsFinished() { + prevIsFinished = lastResult.IsFinished() + if lastResult.IsFinished() { instructions.PrintOutput(env.Ref(), "parallel", ParallelStatus{Index: int(index), Status: v.Status, Result: v.Result}) ctxCancel() return v.Result.IsPassed() From ad1c241215c01572bf6505cc5d34f832bde538ee Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:30:57 +0200 Subject: [PATCH 07/16] fix: properly build timestamps and detect finished resul in the TestWorkflowResult model --- .../model_test_workflow_result_extended.go | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/pkg/api/v1/testkube/model_test_workflow_result_extended.go b/pkg/api/v1/testkube/model_test_workflow_result_extended.go index 5b8f22dc227..7ce5a041edf 100644 --- a/pkg/api/v1/testkube/model_test_workflow_result_extended.go +++ b/pkg/api/v1/testkube/model_test_workflow_result_extended.go @@ -8,7 +8,7 @@ import ( ) func (r *TestWorkflowResult) IsFinished() bool { - return !r.IsStatus(QUEUED_TestWorkflowStatus) && !r.IsStatus(RUNNING_TestWorkflowStatus) && !r.IsStatus(PAUSED_TestWorkflowStatus) + return !r.FinishedAt.IsZero() && !r.IsStatus(QUEUED_TestWorkflowStatus) && !r.IsStatus(RUNNING_TestWorkflowStatus) && !r.IsStatus(PAUSED_TestWorkflowStatus) } func (r *TestWorkflowResult) IsStatus(s TestWorkflowStatus) bool { @@ -90,7 +90,7 @@ func (r *TestWorkflowResult) Fatal(err error, aborted bool, ts time.Time) { if r.FinishedAt.IsZero() { r.FinishedAt = ts.UTC() } - if r.Initialization.Status == nil || (*r.Initialization.Status == QUEUED_TestWorkflowStepStatus) || (*r.Initialization.Status == RUNNING_TestWorkflowStepStatus) { + if r.Initialization.Status == nil || !(*r.Initialization.Status).Finished() { r.Initialization.Status = common.Ptr(FAILED_TestWorkflowStepStatus) if aborted { r.Initialization.Status = common.Ptr(ABORTED_TestWorkflowStepStatus) @@ -158,16 +158,23 @@ func (r *TestWorkflowResult) RecomputeDuration() { if !r.FinishedAt.IsZero() { r.PausedMs = 0 + // Finalize pauses + for i := range r.Pauses { + step := r.Steps[r.Pauses[i].Ref] + if !step.FinishedAt.IsZero() { + if r.Pauses[i].ResumedAt.IsZero() { + r.Pauses[i].ResumedAt = step.FinishedAt + } + if r.Pauses[i].PausedAt.Before(step.StartedAt) { + r.Pauses[i].PausedAt = step.StartedAt + } + } + } + // Get unique pause periods pauses := make([]TestWorkflowPause, 0) loop: for _, p := range r.Pauses { - // Finalize the pause if it's not - step := r.Steps[p.Ref] - if !step.FinishedAt.IsZero() && p.ResumedAt.IsZero() { - p.ResumedAt = step.FinishedAt - } - for i := range pauses { // They don't overlap if p.PausedAt.After(pauses[i].ResumedAt) || p.ResumedAt.Before(pauses[i].PausedAt) { @@ -326,6 +333,14 @@ func (r *TestWorkflowResult) Recompute(sig []TestWorkflowSignature, scheduledAt r.Steps[s.ref] = s.result } + // Ensure initialization timestamps + if !r.Initialization.FinishedAt.IsZero() && r.Initialization.StartedAt.IsZero() { + r.Initialization.StartedAt = r.Initialization.FinishedAt + } + if !r.Initialization.StartedAt.IsZero() && r.Initialization.QueuedAt.IsZero() { + r.Initialization.QueuedAt = r.Initialization.StartedAt + } + // Calibrate the clock for group steps walkSteps(sig, func(s TestWorkflowSignature) { if len(s.Children) == 0 { @@ -543,7 +558,16 @@ func recomputeTestWorkflowStepResult(v TestWorkflowStepResult, sig TestWorkflowS // Ensure there is a start time if the step is already finished if v.StartedAt.IsZero() && !v.FinishedAt.IsZero() { - v.StartedAt = v.QueuedAt + if v.QueuedAt.IsZero() { + v.StartedAt = v.FinishedAt + } else { + v.StartedAt = v.QueuedAt + } + } + + // Ensure there is a queued time if the step is already finished + if v.QueuedAt.IsZero() && !v.StartedAt.IsZero() { + v.QueuedAt = v.StartedAt } // Compute children From 891585667562d897fbdfedb2a0d8488a4459673e Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:32:16 +0200 Subject: [PATCH 08/16] fix: use Pod/Job StatusConditions for detecting the status, make watching more resilient to external problems, expose more Kubernetes error details --- .../testworkflowcontroller/logs.go | 17 ++ .../testworkflowcontroller/podstate.go | 159 +++++++++++++----- .../testworkflowcontroller/utils.go | 11 +- .../watchinstrumentedpod.go | 112 +++++++++--- 4 files changed, 232 insertions(+), 67 deletions(-) diff --git a/pkg/testworkflows/testworkflowcontroller/logs.go b/pkg/testworkflows/testworkflowcontroller/logs.go index ea04f770fe1..bcd303924a9 100644 --- a/pkg/testworkflows/testworkflowcontroller/logs.go +++ b/pkg/testworkflows/testworkflowcontroller/logs.go @@ -40,6 +40,23 @@ type ContainerLog struct { Output *instructions.Instruction } +type ContainerLogType string + +const ( + ContainerLogTypeHint ContainerLogType = "hint" + ContainerLogTypeOutput ContainerLogType = "output" + ContainerLogTypeLog ContainerLogType = "" +) + +func (c *ContainerLog) Type() ContainerLogType { + if c.Hint != nil { + return ContainerLogTypeHint + } else if c.Output != nil { + return ContainerLogTypeOutput + } + return ContainerLogTypeLog +} + // getContainerLogsStream is getting logs stream, and tries to reinitialize the stream on EOF. // EOF may happen not only on the actual container end, but also in case of the log rotation. // @see {@link https://stackoverflow.com/a/68673451} diff --git a/pkg/testworkflows/testworkflowcontroller/podstate.go b/pkg/testworkflows/testworkflowcontroller/podstate.go index ee1a267a5b0..a504e72ee81 100644 --- a/pkg/testworkflows/testworkflowcontroller/podstate.go +++ b/pkg/testworkflows/testworkflowcontroller/podstate.go @@ -2,6 +2,7 @@ package testworkflowcontroller import ( "context" + "fmt" "regexp" "slices" "strconv" @@ -85,6 +86,9 @@ func newPodState(parentCtx context.Context) *podState { func (p *podState) preStartWatcher(name string) *channel[podStateUpdate] { if _, ok := p.prestart[name]; !ok { p.prestart[name] = newChannel[podStateUpdate](p.ctx, eventBufferSize) + if p.ctx.Err() != nil || p.unsafeIsStarted(name) || p.unsafeIsFinished(name) { + p.prestart[name].Close() + } } return p.prestart[name] } @@ -278,6 +282,13 @@ func (p *podState) RegisterJob(job *batchv1.Job) { p.setQueuedAt("", job.CreationTimestamp.Time) if job.Status.CompletionTime != nil { p.setFinishedAt("", job.Status.CompletionTime.Time) + } else if slices.ContainsFunc(job.Status.Conditions, isJobConditionEnd) { + for i := range job.Status.Conditions { + if isJobConditionEnd(job.Status.Conditions[i]) { + p.setFinishedAt("", job.Status.Conditions[i].LastTransitionTime.Time) + break + } + } } else if job.DeletionTimestamp != nil { p.setFinishedAt("", job.DeletionTimestamp.Time) } @@ -293,11 +304,18 @@ func (p *podState) PreStart(name string) <-chan ChannelMessage[podStateUpdate] { return p.preStartWatcher(name).Channel() } +func (p *podState) unsafeIsStarted(name string) bool { + return !p.started[name].IsZero() +} + +func (p *podState) unsafeIsFinished(name string) bool { + return !p.finished[name].IsZero() +} + func (p *podState) IsFinished(name string) bool { p.mu.Lock() defer p.mu.Unlock() - _, ok := p.finished[name] - return ok && p.ctx.Err() == nil + return p.unsafeIsFinished(name) } func (p *podState) Finished(name string) chan struct{} { @@ -331,9 +349,21 @@ func (p *podState) FinishedAt(name string) time.Time { func (p *podState) containerResult(name string) (ContainerResult, error) { status := p.containerStatus(name) if status == nil || status.State.Terminated == nil { - // TODO: Handle it nicer if p.pod != nil && IsPodDone(p.pod) { - return UnknownContainerResult, nil + result := UnknownContainerResult + for _, c := range p.pod.Status.Conditions { + if c.Type == corev1.DisruptionTarget && c.Status == corev1.ConditionTrue { + if c.Reason == "EvictionByEvictionAPI" { + result.Details = "Pod has been requested for deletion using the Kubernetes API" + } else if c.Message == "" { + result.Details = c.Reason + } else { + result.Details = fmt.Sprintf("%s: %s", c.Reason, c.Message) + } + break + } + } + return result, nil } return UnknownContainerResult, ErrNotTerminatedYet } @@ -381,58 +411,97 @@ func (p *podState) ContainerResult(name string) (ContainerResult, error) { func initializePodState(parentCtx context.Context, pod Channel[*corev1.Pod], podEvents Channel[*corev1.Event], job Channel[*batchv1.Job], jobEvents Channel[*corev1.Event], errorHandler func(error)) *podState { ctx, ctxCancel := context.WithCancel(parentCtx) state := newPodState(ctx) + + // Fill optional channels + if job == nil { + job = newChannel[*batchv1.Job](ctx, 0) + job.Close() + } + if jobEvents == nil { + jobEvents = newChannel[*corev1.Event](ctx, 0) + jobEvents.Close() + } + go func() { defer ctxCancel() - var wg sync.WaitGroup - if job != nil { - wg.Add(1) - go func() { - for v := range job.Channel() { - if v.Error != nil { - errorHandler(v.Error) - } else { - state.RegisterJob(v.Value) - } + + // Build channels for the streams + left := 4 + jobCh := job.Channel() + jobEventsCh := jobEvents.Channel() + podCh := pod.Channel() + podEventsCh := podEvents.Channel() + + // Loop for the data + for { + if left == 0 { + return + } + + // Prioritize pod & events + select { + case <-parentCtx.Done(): + return + case v, ok := <-podCh: + if !ok { + podCh = nil + left-- + continue } - wg.Done() - }() - } - if jobEvents != nil { - wg.Add(1) - go func() { - for v := range jobEvents.Channel() { - if v.Error != nil { - errorHandler(v.Error) - } else { - state.RegisterEvent(v.Value) - } + if v.Error != nil { + errorHandler(v.Error) + continue + } + state.RegisterPod(v.Value) + case v, ok := <-jobEventsCh: + if !ok { + jobEventsCh = nil + left-- + continue } - wg.Done() - }() - } - wg.Add(1) - go func() { - for v := range podEvents.Channel() { if v.Error != nil { errorHandler(v.Error) - } else { - state.RegisterEvent(v.Value) + continue + } + state.RegisterEvent(v.Value) + case v, ok := <-podEventsCh: + if !ok { + podEventsCh = nil + left-- + continue + } + if v.Error != nil { + errorHandler(v.Error) + continue + } + state.RegisterEvent(v.Value) + case v, ok := <-jobCh: + if !ok { + jobCh = nil + left-- + continue } - } - wg.Done() - }() - wg.Add(1) - go func() { - for v := range pod.Channel() { if v.Error != nil { errorHandler(v.Error) - } else { - state.RegisterPod(v.Value) + continue } + + // Try to firstly finish with the Pod information when it's possible + if IsJobDone(v.Value) && state.FinishedAt("").IsZero() && HadPodScheduled(v.Value) { + select { + case p, ok := <-podCh: + if p.Error != nil { + errorHandler(p.Error) + } else if ok { + state.RegisterPod(p.Value) + } + case <-time.After(2 * time.Second): + // Continue - likely we won't receive Pod status + } + } + state.RegisterJob(v.Value) } - wg.Done() - }() - wg.Wait() + } }() return state } diff --git a/pkg/testworkflows/testworkflowcontroller/utils.go b/pkg/testworkflows/testworkflowcontroller/utils.go index bfe7fd2a6ee..064bd6147eb 100644 --- a/pkg/testworkflows/testworkflowcontroller/utils.go +++ b/pkg/testworkflows/testworkflowcontroller/utils.go @@ -2,6 +2,7 @@ package testworkflowcontroller import ( "regexp" + "slices" "time" batchv1 "k8s.io/api/batch/v1" @@ -29,8 +30,16 @@ func IsPodDone(pod *corev1.Pod) bool { return (pod.Status.Phase != corev1.PodPending && pod.Status.Phase != corev1.PodRunning) || pod.ObjectMeta.DeletionTimestamp != nil } +func isJobConditionEnd(condition batchv1.JobCondition) bool { + return (condition.Type == batchv1.JobFailed || condition.Type == batchv1.JobComplete) && condition.Status == corev1.ConditionTrue && !condition.LastTransitionTime.IsZero() +} + func IsJobDone(job *batchv1.Job) bool { - return (job.Status.Active == 0 && (job.Status.Succeeded > 0 || job.Status.Failed > 0)) || job.ObjectMeta.DeletionTimestamp != nil + return (job.Status.Active == 0 && (job.Status.Succeeded > 0 || job.Status.Failed > 0)) || job.ObjectMeta.DeletionTimestamp != nil || job.Status.CompletionTime != nil || slices.ContainsFunc(job.Status.Conditions, isJobConditionEnd) +} + +func HadPodScheduled(job *batchv1.Job) bool { + return job.Status.Active > 0 || job.Status.Succeeded > 0 || job.Status.Failed > 0 } type ContainerResultStep struct { diff --git a/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go b/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go index 1485d593e7b..8aa03509879 100644 --- a/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go +++ b/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go @@ -7,6 +7,7 @@ import ( "strconv" "time" + "github.com/gookit/color" "github.com/pkg/errors" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" @@ -65,13 +66,27 @@ func WatchInstrumentedPod(parentCtx context.Context, clientSet kubernetes.Interf } // Ensure the queue/start time has been saved - if s.result.QueuedAt.IsZero() || s.result.StartedAt.IsZero() { + if (s.result.QueuedAt.IsZero() || s.result.StartedAt.IsZero()) && state.FinishedAt("").IsZero() { s.Error(errors.New("missing information about scheduled pod")) return } // Load the namespace information - podObj := <-pod.Peek(ctx) + var podObj *corev1.Pod + select { + // Obtain the Pod information for further execution + case p := <-pod.Peek(ctx): + podObj = p + // Handle when the execution have been finished, but there may be no Pod + case <-state.Finished(""): + select { + case <-time.After(1 * time.Second): + s.Error(fmt.Errorf("the execution is finished, but failed to get pod")) + return + case p := <-pod.Peek(ctx): + podObj = p + } + } // Load the references var refs, endRefs [][]string @@ -113,8 +128,30 @@ func WatchInstrumentedPod(parentCtx context.Context, clientSet kubernetes.Interf // Update queue time s.Queue(initialRef, lastTs) - // Watch the container events - for v := range state.PreStart(containerName) { + // Watch the container events, along with final finish too + preStartCh := state.PreStart(containerName) + finishedCh := state.Finished("") + loop: + for { + var v ChannelMessage[podStateUpdate] + select { + case vv, ok := <-preStartCh: + if !ok { + break loop + } + v = vv + default: + select { + case vv, ok := <-preStartCh: + if !ok { + break loop + } + v = vv + case <-finishedCh: + break loop + } + } + if v.Value.Queued != nil { s.Queue(initialRef, state.QueuedAt(containerName)) } else if v.Value.Started != nil { @@ -127,22 +164,27 @@ func WatchInstrumentedPod(parentCtx context.Context, clientSet kubernetes.Interf } // Ensure the queue/start time has been saved - if s.GetStepResult(initialRef).QueuedAt.IsZero() || s.GetStepResult(initialRef).StartedAt.IsZero() { + if (s.GetStepResult(initialRef).QueuedAt.IsZero() || s.GetStepResult(initialRef).StartedAt.IsZero()) && state.FinishedAt("").IsZero() { s.Error(fmt.Errorf("missing information about scheduled '%s' step in '%s' container", initialRef, container.Name)) return } // Watch the container logs - follow := common.ResolvePtr(opts.Follow, true) && !state.IsFinished(containerName) + follow := common.ResolvePtr(opts.Follow, true) && !state.IsFinished(containerName) && !state.IsFinished("") aborted := false lastStarted := initialRef executionStatuses := map[string]constants.ExecutionResult{} for v := range WatchContainerLogs(ctx, clientSet, podObj.Namespace, podObj.Name, containerName, follow, 10, pod).Channel() { if v.Error != nil { s.Error(v.Error) - } else if v.Value.Output != nil { + } + + switch v.Value.Type() { + case ContainerLogTypeLog: + s.Raw(lastStarted, v.Value.Time, string(v.Value.Log), false) + case ContainerLogTypeOutput: s.Output(v.Value.Output.Ref, v.Value.Time, v.Value.Output) - } else if v.Value.Hint != nil { + case ContainerLogTypeHint: if v.Value.Hint.Ref == constants2.RootOperationName { continue } @@ -189,18 +231,22 @@ func WatchInstrumentedPod(parentCtx context.Context, clientSet kubernetes.Interf } s.Resume(v.Value.Hint.Ref, end) } - } else { - s.Raw(lastStarted, v.Value.Time, string(v.Value.Log), false) } } if aborted { // Don't wait for any other statuses if we already know that some task has been aborted } else if follow { - <-state.Finished(container.Name) + select { + case <-state.Finished(container.Name): + case <-state.Finished(""): + // Finish fast when the whole execution has been finished + } } else { select { case <-state.Finished(container.Name): + case <-state.Finished(""): + // Finish fast when the whole execution has been finished case <-time.After(IdleTimeout): return } @@ -222,48 +268,72 @@ func WatchInstrumentedPod(parentCtx context.Context, clientSet kubernetes.Interf status := ContainerResultStep{ Status: testkube.ABORTED_TestWorkflowStepStatus, ExitCode: -1, - Details: "", + Details: result.Details, FinishedAt: result.FinishedAt, } + if status.FinishedAt.IsZero() { + status.FinishedAt = state.FinishedAt("") + if status.FinishedAt.IsZero() { + status.FinishedAt = s.GetLastTimestamp(lastStarted) + } + } + if len(result.Steps) > i { status = result.Steps[i] + if status.Details == "" { + status.Details = result.Details + } } - if !s.IsFinished(ref) { - s.FinishStep(ref, status) + s.FinishStep(ref, status) + if status.Status == testkube.ABORTED_TestWorkflowStepStatus { + lastStarted = ref + break } } } // Update the last timestamp - lastTs = s.GetLastTimestamp(lastStarted) + nextLastTs := s.GetLastTimestamp(lastStarted) + if nextLastTs.After(lastTs) { + lastTs = nextLastTs + } // Break the function if the step has been aborted. // Breaking only to the loop is not enough, // because due to GKE bug, the Job is still pending, // so it will get stuck there. if s.IsAnyAborted() { + result, _ := state.ContainerResult(container.Name) reason := s.result.Steps[lastStarted].ErrorMessage + if reason == "" { + reason = result.Details + } message := "Aborted" if reason == "" { - message = fmt.Sprintf("\n%s Aborted", s.GetLastTimestamp(lastStarted).Format(KubernetesLogTimeFormat)) + message = fmt.Sprintf("\n%s Aborted", lastTs.Format(KubernetesLogTimeFormat)) } else { - message = fmt.Sprintf("\n%s Aborted (%s)", s.GetLastTimestamp(lastStarted).Format(KubernetesLogTimeFormat), reason) + message = fmt.Sprintf("\n%s Aborted (%s)", lastTs.Format(KubernetesLogTimeFormat), reason) } - s.Raw(lastStarted, s.GetLastTimestamp(lastStarted), message, false) + s.Raw(lastStarted, lastTs, message, false) // Mark all not started steps as skipped for ref := range s.result.Steps { if !s.IsFinished(ref) { status := testkube.SKIPPED_TestWorkflowStepStatus - details := "The execution was aborted before." + details := "The execution was aborted before" if s.result.Steps[ref].Status != nil && *s.result.Steps[ref].Status != testkube.QUEUED_TestWorkflowStepStatus { status = testkube.ABORTED_TestWorkflowStepStatus - details = "" + details = result.Details + } else if result.Details != "" { + details = fmt.Sprintf("The execution was aborted before %s", color.FgDarkGray.Render("("+result.Details+")")) + } + if details != "" { + s.Raw(ref, lastTs, fmt.Sprintf("%s %s", lastTs.Format(KubernetesLogTimeFormat), details), false) } s.FinishStep(ref, ContainerResultStep{ Status: status, ExitCode: -1, - Details: details, + Details: "", FinishedAt: lastTs, }) } From aaf1673bd7a29c6dc02f18b3bdde950f9c8add90 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:32:41 +0200 Subject: [PATCH 09/16] chore: do not require job/pod events when fetching logs of parallel workers and services --- pkg/testworkflows/testworkflowcontroller/controller.go | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pkg/testworkflows/testworkflowcontroller/controller.go b/pkg/testworkflows/testworkflowcontroller/controller.go index 81f8b4de5f3..843c959ac6c 100644 --- a/pkg/testworkflows/testworkflowcontroller/controller.go +++ b/pkg/testworkflows/testworkflowcontroller/controller.go @@ -277,16 +277,6 @@ func (c *controller) Logs(parentCtx context.Context, follow bool) io.Reader { go func() { defer writer.Close() ref := "" - // Wait until there will be events fetched first - alignTimeoutCh := time.After(alignmentTimeout) - select { - case <-c.jobEvents.Peek(parentCtx): - case <-alignTimeoutCh: - } - select { - case <-c.podEvents.Peek(parentCtx): - case <-alignTimeoutCh: - } ch, err := WatchInstrumentedPod(parentCtx, c.clientSet, c.signature, c.scheduledAt, c.pod, c.podEvents, WatchInstrumentedPodOptions{ JobEvents: c.jobEvents, Job: c.job, From 2a98b47c66958c17e4f7c1aaa64896e58bf1bd39 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:40:50 +0200 Subject: [PATCH 10/16] fixup unit tests --- .../testworkflowprocessor/presets/processor_test.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go b/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go index fb2f9fb17a1..8aff4b55e22 100644 --- a/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go +++ b/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go @@ -175,6 +175,7 @@ func TestProcessBasic(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, + PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{}, Containers: []corev1.Container{ { @@ -289,6 +290,7 @@ func TestProcessShellWithNonStandardImage(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, + PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", @@ -395,6 +397,7 @@ func TestProcessBasicEnvReference(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, + PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{}, Containers: []corev1.Container{ { @@ -488,6 +491,7 @@ func TestProcessMultipleSteps(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, + PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", @@ -626,6 +630,7 @@ func TestProcessNestedSteps(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, + PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", @@ -727,6 +732,7 @@ func TestProcessLocalContent(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, + PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", @@ -806,6 +812,7 @@ func TestProcessGlobalContent(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, + PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", From d7bdcd70dc638cfce66f925975155244651acca3 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 15:54:24 +0200 Subject: [PATCH 11/16] fix: delete preemption policy setup --- pkg/testworkflows/testworkflowprocessor/processor.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/testworkflows/testworkflowprocessor/processor.go b/pkg/testworkflows/testworkflowprocessor/processor.go index 4a1b841cfe9..66c863327ee 100644 --- a/pkg/testworkflows/testworkflowprocessor/processor.go +++ b/pkg/testworkflows/testworkflowprocessor/processor.go @@ -326,9 +326,6 @@ func (p *processor) Bundle(ctx context.Context, workflow *testworkflowsv1.TestWo if podConfig.SecurityContext.FSGroup == nil { podConfig.SecurityContext.FSGroup = common.Ptr(constants.DefaultFsGroup) } - if podConfig.PreemptionPolicy == nil { - podConfig.PreemptionPolicy = common.Ptr(corev1.PreemptNever) - } podSpec := corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Annotations: podConfig.Annotations, From fc785b3824111dbab9e2bc3e378259b4c0db1cc0 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 20:57:43 +0200 Subject: [PATCH 12/16] fixup unit tests --- .../testworkflowprocessor/presets/processor_test.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go b/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go index 8aff4b55e22..2aa325e1715 100644 --- a/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go +++ b/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go @@ -175,7 +175,6 @@ func TestProcessBasic(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, - PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{}, Containers: []corev1.Container{ { @@ -290,7 +289,6 @@ func TestProcessShellWithNonStandardImage(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, - PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", @@ -397,7 +395,6 @@ func TestProcessBasicEnvReference(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, - PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{}, Containers: []corev1.Container{ { @@ -491,7 +488,6 @@ func TestProcessMultipleSteps(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, - PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", @@ -630,7 +626,6 @@ func TestProcessNestedSteps(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, - PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", @@ -812,7 +807,6 @@ func TestProcessGlobalContent(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, - PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1", From a70e92bcd652916a38d9579a223d33c0462ded2b Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 21:03:39 +0200 Subject: [PATCH 13/16] fix: adjust resume time to avoid negative duration --- pkg/api/v1/testkube/model_test_workflow_result_extended.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/api/v1/testkube/model_test_workflow_result_extended.go b/pkg/api/v1/testkube/model_test_workflow_result_extended.go index 7ce5a041edf..0ac3192d0f5 100644 --- a/pkg/api/v1/testkube/model_test_workflow_result_extended.go +++ b/pkg/api/v1/testkube/model_test_workflow_result_extended.go @@ -168,6 +168,9 @@ func (r *TestWorkflowResult) RecomputeDuration() { if r.Pauses[i].PausedAt.Before(step.StartedAt) { r.Pauses[i].PausedAt = step.StartedAt } + if r.Pauses[i].ResumedAt.Before(r.Pauses[i].PausedAt) { + r.Pauses[i].PausedAt = r.Pauses[i].ResumedAt + } } } From 5652bfd8ad9a32b8235eb3b471ddacc72fb5d3a1 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 21:59:53 +0200 Subject: [PATCH 14/16] fix: calibrate clocks --- .../model_test_workflow_result_extended.go | 8 ++++++-- .../testworkflowcontroller/notifier.go | 6 +++--- .../watchinstrumentedpod.go | 15 +++++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pkg/api/v1/testkube/model_test_workflow_result_extended.go b/pkg/api/v1/testkube/model_test_workflow_result_extended.go index 0ac3192d0f5..4909d47a1c3 100644 --- a/pkg/api/v1/testkube/model_test_workflow_result_extended.go +++ b/pkg/api/v1/testkube/model_test_workflow_result_extended.go @@ -404,8 +404,12 @@ func (r *TestWorkflowResult) Recompute(sig []TestWorkflowSignature, scheduledAt r.Status = common.Ptr(RUNNING_TestWorkflowStatus) } - if r.FinishedAt.IsZero() && r.Status != nil && *r.Status == ABORTED_TestWorkflowStatus { - r.FinishedAt = r.LatestTimestamp() + // Ensure the finish time is after all other timestamps + if !r.FinishedAt.IsZero() || (r.Status != nil && *r.Status == ABORTED_TestWorkflowStatus) { + lastTs := r.LatestTimestamp() + if r.FinishedAt.Before(lastTs) { + r.FinishedAt = lastTs + } } // Compute the duration diff --git a/pkg/testworkflows/testworkflowcontroller/notifier.go b/pkg/testworkflows/testworkflowcontroller/notifier.go index 7892c3eb10d..567355efbdb 100644 --- a/pkg/testworkflows/testworkflowcontroller/notifier.go +++ b/pkg/testworkflows/testworkflowcontroller/notifier.go @@ -292,11 +292,11 @@ func (n *notifier) Output(ref string, ts time.Time, output *instructions.Instruc } func (n *notifier) Finish(ts time.Time) { - n.resultMu.Lock() - defer n.resultMu.Unlock() - if !n.result.FinishedAt.Before(ts) { + if ts.IsZero() { return } + n.resultMu.Lock() + defer n.resultMu.Unlock() n.result.FinishedAt = ts n.emit() } diff --git a/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go b/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go index 8aa03509879..107009f5d66 100644 --- a/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go +++ b/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go @@ -269,13 +269,16 @@ func WatchInstrumentedPod(parentCtx context.Context, clientSet kubernetes.Interf Status: testkube.ABORTED_TestWorkflowStepStatus, ExitCode: -1, Details: result.Details, - FinishedAt: result.FinishedAt, + FinishedAt: s.GetStepResult(ref).FinishedAt, + } + if status.FinishedAt.IsZero() { + status.FinishedAt = result.FinishedAt } if status.FinishedAt.IsZero() { status.FinishedAt = state.FinishedAt("") - if status.FinishedAt.IsZero() { - status.FinishedAt = s.GetLastTimestamp(lastStarted) - } + } + if status.FinishedAt.IsZero() { + status.FinishedAt = s.GetLastTimestamp(lastStarted) } if len(result.Steps) > i { @@ -283,6 +286,10 @@ func WatchInstrumentedPod(parentCtx context.Context, clientSet kubernetes.Interf if status.Details == "" { status.Details = result.Details } + finishedAt := s.GetStepResult(ref).FinishedAt + if !finishedAt.IsZero() { + status.FinishedAt = finishedAt + } } s.FinishStep(ref, status) if status.Status == testkube.ABORTED_TestWorkflowStepStatus { From 95a19117d46d8dd2a031cbfab5456b2869289997 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 22:01:21 +0200 Subject: [PATCH 15/16] chore: use consts --- pkg/testworkflows/testworkflowcontroller/podstate.go | 2 +- .../testworkflowcontroller/watchinstrumentedpod.go | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/testworkflows/testworkflowcontroller/podstate.go b/pkg/testworkflows/testworkflowcontroller/podstate.go index a504e72ee81..cd925c8a7ce 100644 --- a/pkg/testworkflows/testworkflowcontroller/podstate.go +++ b/pkg/testworkflows/testworkflowcontroller/podstate.go @@ -495,7 +495,7 @@ func initializePodState(parentCtx context.Context, pod Channel[*corev1.Pod], pod } else if ok { state.RegisterPod(p.Value) } - case <-time.After(2 * time.Second): + case <-time.After(alignmentTimeout): // Continue - likely we won't receive Pod status } } diff --git a/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go b/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go index 107009f5d66..f0a2ec88e97 100644 --- a/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go +++ b/pkg/testworkflows/testworkflowcontroller/watchinstrumentedpod.go @@ -22,8 +22,9 @@ import ( ) const ( - InitContainerName = "tktw-init" - IdleTimeout = 100 * time.Millisecond + InitContainerName = "tktw-init" + IdleTimeout = 100 * time.Millisecond + ExpectedDelayTimeout = 1 * time.Second ) type WatchInstrumentedPodOptions struct { @@ -80,7 +81,7 @@ func WatchInstrumentedPod(parentCtx context.Context, clientSet kubernetes.Interf // Handle when the execution have been finished, but there may be no Pod case <-state.Finished(""): select { - case <-time.After(1 * time.Second): + case <-time.After(ExpectedDelayTimeout): s.Error(fmt.Errorf("the execution is finished, but failed to get pod")) return case p := <-pod.Peek(ctx): From f66c67f2227b4e782d7703a69a1e13aef21ffc72 Mon Sep 17 00:00:00 2001 From: Dawid Rusnak Date: Tue, 13 Aug 2024 22:06:35 +0200 Subject: [PATCH 16/16] fixup unit tests --- .../testworkflowprocessor/presets/processor_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go b/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go index 2aa325e1715..fb2f9fb17a1 100644 --- a/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go +++ b/pkg/testworkflows/testworkflowprocessor/presets/processor_test.go @@ -727,7 +727,6 @@ func TestProcessLocalContent(t *testing.T) { RestartPolicy: corev1.RestartPolicyNever, EnableServiceLinks: common.Ptr(false), Volumes: volumes, - PreemptionPolicy: common.Ptr(corev1.PreemptNever), InitContainers: []corev1.Container{ { Name: "1",