Skip to content

Commit

Permalink
fix: Mark its taskResult as completed if wait container has been terminated not gracefully. Fixes argoproj#13373
Browse files Browse the repository at this point in the history

Signed-off-by: oninowang <oninowang@tencent.com>
  • Loading branch information
jswxstw authored and oninowang committed Aug 23, 2024
1 parent dcd9436 commit 606f994
Showing 1 changed file with 23 additions and 3 deletions.
26 changes: 23 additions & 3 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1495,14 +1495,34 @@ func (woc *wfOperationCtx) assessNodeStatus(ctx context.Context, pod *apiv1.Pod,
}
}

waitContainerCleanedUp := true
// We cannot fail the node if the wait container is still running because it may be busy saving outputs, and these
// would not get captured successfully.
for _, c := range pod.Status.ContainerStatuses {
if c.Name == common.WaitContainerName && c.State.Running != nil && new.Phase.Completed() {
woc.log.WithField("new.phase", new.Phase).Info("leaving phase un-changed: wait container is not yet terminated ")
new.Phase = old.Phase
if c.Name == common.WaitContainerName {
waitContainerCleanedUp = false
switch {
case c.State.Running != nil && new.Phase.Completed():
woc.log.WithField("new.phase", new.Phase).Info("leaving phase un-changed: wait container is not yet terminated ")
new.Phase = old.Phase
case c.State.Terminated != nil && c.State.Terminated.ExitCode != 0:
// Mark its taskResult as completed directly since wait container did not exit normally,
// and it will never have a chance to report taskResult correctly.
nodeID := woc.nodeID(pod)
woc.log.WithFields(log.Fields{"nodeID": nodeID, "exitCode": c.State.Terminated.ExitCode, "reason": c.State.Terminated.Reason}).
Debug("marking its taskResult as completed since wait container did not exit normally")
woc.wf.Status.MarkTaskResultComplete(nodeID)
}
}
}
if pod.Status.Phase == apiv1.PodFailed && pod.Status.Reason == "Evicted" && waitContainerCleanedUp {
// Mark its taskResult as completed directly since wait container has been cleaned up because of pod evicted,
// and it will never have a chance to report taskResult correctly.
nodeID := woc.nodeID(pod)
woc.log.WithFields(log.Fields{"nodeID": nodeID}).
Debug("marking its taskResult as completed since wait container has been cleaned up.")
woc.wf.Status.MarkTaskResultComplete(nodeID)
}

// if we are transitioning from Pending to a different state, clear out unchanged message
if old.Phase == wfv1.NodePending && new.Phase != wfv1.NodePending && old.Message == new.Message {
Expand Down

0 comments on commit 606f994

Please sign in to comment.