From 67253e8ab502a850415345a7945b2ba00f00cb82 Mon Sep 17 00:00:00 2001 From: jswxstw Date: Thu, 29 Aug 2024 17:05:14 +0800 Subject: [PATCH] fix: Mark taskResult completed if wait container terminated not gracefully. Fixes #13373 (#13491) Signed-off-by: oninowang --- workflow/controller/operator.go | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 3d9bb0923458..9a7b2e00dcb6 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1531,14 +1531,34 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, old *wfv1.NodeStatus } } + waitContainerCleanedUp := true // We cannot fail the node if the wait container is still running because it may be busy saving outputs, and these // would not get captured successfully. for _, c := range pod.Status.ContainerStatuses { - if c.Name == common.WaitContainerName && c.State.Running != nil && new.Phase.Completed() { - woc.log.WithField("new.phase", new.Phase).Info("leaving phase un-changed: wait container is not yet terminated ") - new.Phase = old.Phase + if c.Name == common.WaitContainerName { + waitContainerCleanedUp = false + switch { + case c.State.Running != nil && new.Phase.Completed(): + woc.log.WithField("new.phase", new.Phase).Info("leaving phase un-changed: wait container is not yet terminated ") + new.Phase = old.Phase + case c.State.Terminated != nil && c.State.Terminated.ExitCode != 0: + // Mark its taskResult as completed directly since wait container did not exit normally, + // and it will never have a chance to report taskResult correctly. + nodeID := woc.nodeID(pod) + woc.log.WithFields(log.Fields{"nodeID": nodeID, "exitCode": c.State.Terminated.ExitCode, "reason": c.State.Terminated.Reason}). + Warn("marking its taskResult as completed since wait container did not exit normally") + woc.wf.Status.MarkTaskResultComplete(nodeID) + } } } + if pod.Status.Phase == apiv1.PodFailed && pod.Status.Reason == "Evicted" && waitContainerCleanedUp { + // Mark its taskResult as completed directly since wait container has been cleaned up because of pod evicted, + // and it will never have a chance to report taskResult correctly. + nodeID := woc.nodeID(pod) + woc.log.WithFields(log.Fields{"nodeID": nodeID}). + Warn("marking its taskResult as completed since wait container has been cleaned up.") + woc.wf.Status.MarkTaskResultComplete(nodeID) + } // if we are transitioning from Pending to a different state, clear out unchanged message if old.Phase == wfv1.NodePending && new.Phase != wfv1.NodePending && old.Message == new.Message {