-
Notifications
You must be signed in to change notification settings - Fork 3.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Allow step restart on workflow retry. Closes #2334 #2431
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
apiVersion: argoproj.io/v1alpha1 | ||
kind: Workflow | ||
metadata: | ||
name: retry-test | ||
labels: | ||
argo-e2e: "true" | ||
spec: | ||
entrypoint: steps-outer | ||
templates: | ||
- name: steps-outer | ||
steps: | ||
- - name: steps-outer-step1 | ||
template: whalesay | ||
- - name: steps-outer-step2 | ||
template: steps-inner | ||
|
||
- name: steps-inner | ||
steps: | ||
- - name: steps-inner-step1 | ||
template: whalesay | ||
- - name: steps-inner-step2 | ||
template: approve | ||
|
||
- name: approve | ||
suspend: {} | ||
|
||
- name: whalesay | ||
container: | ||
image: docker/whalesay:latest | ||
command: [cowsay] | ||
args: ["hello world"] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -218,7 +218,7 @@ func (woc *wfOperationCtx) operate() { | |
woc.workflowDeadline = woc.getWorkflowDeadline() | ||
err := woc.podReconciliation() | ||
if err == nil { | ||
err = woc.failSuspendedNodesAfterDeadline() | ||
err = woc.failSuspendedNodesAfterDeadlineOrShutdown() | ||
} | ||
if err != nil { | ||
woc.log.Errorf("%s error: %+v", woc.wf.ObjectMeta.Name, err) | ||
|
@@ -636,6 +636,17 @@ func (woc *wfOperationCtx) processNodeRetries(node *wfv1.NodeStatus, retryStrate | |
return woc.markNodePhase(node.Name, wfv1.NodeSucceeded), true, nil | ||
} | ||
|
||
if woc.wf.Spec.Shutdown != "" || (woc.workflowDeadline != nil && time.Now().UTC().After(*woc.workflowDeadline)) { | ||
var message string | ||
if woc.wf.Spec.Shutdown != "" { | ||
message = fmt.Sprintf("Stopped with strategy '%s'", woc.wf.Spec.Shutdown) | ||
} else { | ||
message = fmt.Sprintf("retry exceeded workflow deadline %s", *woc.workflowDeadline) | ||
} | ||
woc.log.Infoln(message) | ||
return woc.markNodePhase(node.Name, lastChildNode.Phase, message), true, nil | ||
} | ||
|
||
if retryStrategy.Backoff != nil { | ||
// Process max duration limit | ||
if retryStrategy.Backoff.MaxDuration != "" && len(node.Children) > 0 { | ||
|
@@ -808,11 +819,17 @@ func (woc *wfOperationCtx) shouldPrintPodSpec(node wfv1.NodeStatus) bool { | |
} | ||
|
||
//fails any suspended nodes if the workflow deadline has passed | ||
func (woc *wfOperationCtx) failSuspendedNodesAfterDeadline() error { | ||
if woc.workflowDeadline != nil && time.Now().UTC().After(*woc.workflowDeadline) { | ||
func (woc *wfOperationCtx) failSuspendedNodesAfterDeadlineOrShutdown() error { | ||
if woc.wf.Spec.Shutdown != "" || (woc.workflowDeadline != nil && time.Now().UTC().After(*woc.workflowDeadline)) { | ||
Comment on lines
+822
to
+823
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for this! |
||
for _, node := range woc.wf.Status.Nodes { | ||
if node.IsActiveSuspendNode() { | ||
woc.markNodePhase(node.Name, wfv1.NodeFailed, fmt.Sprintf("step exceeded workflow deadline %s", *woc.workflowDeadline)) | ||
var message string | ||
if woc.wf.Spec.Shutdown != "" { | ||
message = fmt.Sprintf("Stopped with strategy '%s'", woc.wf.Spec.Shutdown) | ||
} else { | ||
message = fmt.Sprintf("step exceeded workflow deadline %s", *woc.workflowDeadline) | ||
} | ||
woc.markNodePhase(node.Name, wfv1.NodeFailed, message) | ||
} | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -421,6 +421,20 @@ func ResumeWorkflow(wfIf v1alpha1.WorkflowInterface, repo sqldb.OffloadNodeStatu | |||||
} | ||||||
} | ||||||
|
||||||
func selectorMatchesNode(selector fields.Selector, node wfv1.NodeStatus) bool { | ||||||
nodeFields := fields.Set{ | ||||||
"displayName": node.DisplayName, | ||||||
"templateName": node.TemplateName, | ||||||
} | ||||||
if node.Inputs != nil { | ||||||
for _, inParam := range node.Inputs.Parameters { | ||||||
nodeFields[fmt.Sprintf("inputs.parameters.%s.value", inParam.Name)] = *inParam.Value | ||||||
} | ||||||
} | ||||||
|
||||||
return selector.Matches(nodeFields) | ||||||
} | ||||||
|
||||||
func updateWorkflowNodeByKey(wfIf v1alpha1.WorkflowInterface, workflowName string, nodeFieldSelector string, phase wfv1.NodePhase, message string) error { | ||||||
selector, err := fields.ParseSelector(nodeFieldSelector) | ||||||
|
||||||
|
@@ -441,16 +455,7 @@ func updateWorkflowNodeByKey(wfIf v1alpha1.WorkflowInterface, workflowName strin | |||||
nodeUpdated := false | ||||||
for nodeID, node := range wf.Status.Nodes { | ||||||
if node.IsActiveSuspendNode() { | ||||||
nodeFields := fields.Set{ | ||||||
"displayName": node.DisplayName, | ||||||
} | ||||||
if node.Inputs != nil { | ||||||
for _, inParam := range node.Inputs.Parameters { | ||||||
nodeFields[fmt.Sprintf("inputs.parameters.%s.value", inParam.Name)] = *inParam.Value | ||||||
} | ||||||
} | ||||||
|
||||||
if selector.Matches(nodeFields) { | ||||||
if selectorMatchesNode(selector, node) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! |
||||||
node.Phase = phase | ||||||
node.FinishedAt = metav1.Time{Time: time.Now().UTC()} | ||||||
if len(message) > 0 { | ||||||
|
@@ -612,7 +617,7 @@ func convertNodeID(newWf *wfv1.Workflow, regex *regexp.Regexp, oldNodeID string, | |||||
} | ||||||
|
||||||
// RetryWorkflow updates a workflow, deleting all failed steps as well as the onExit node (and children) | ||||||
func RetryWorkflow(kubeClient kubernetes.Interface, repo sqldb.OffloadNodeStatusRepo, wfClient v1alpha1.WorkflowInterface, wf *wfv1.Workflow) (*wfv1.Workflow, error) { | ||||||
func RetryWorkflow(kubeClient kubernetes.Interface, repo sqldb.OffloadNodeStatusRepo, wfClient v1alpha1.WorkflowInterface, wf *wfv1.Workflow, restartSuccessful bool, nodeFieldSelector string) (*wfv1.Workflow, error) { | ||||||
switch wf.Status.Phase { | ||||||
case wfv1.NodeFailed, wfv1.NodeError: | ||||||
default: | ||||||
|
@@ -627,13 +632,41 @@ func RetryWorkflow(kubeClient kubernetes.Interface, repo sqldb.OffloadNodeStatus | |||||
newWF := wf.DeepCopy() | ||||||
podIf := kubeClient.CoreV1().Pods(wf.ObjectMeta.Namespace) | ||||||
|
||||||
// Get all children of nodes that match filter | ||||||
nodeIDsToReset := make(map[string]bool) | ||||||
if restartSuccessful && len(nodeFieldSelector) > 0 { | ||||||
selector, err := fields.ParseSelector(nodeFieldSelector) | ||||||
if err != nil { | ||||||
return nil, err | ||||||
} else { | ||||||
for _, node := range wf.Status.Nodes { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hey @mark9white, because of #2645 this actually needs to be moved further down the code. Workflows with offloaded nodes are only retrieved starting in line 678, so if a Workflow has offloaded nodes, `wf.Status.Nodes` will not contain them yet at this point. While you're at this, would you mind extracting this block out to a helper function? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||||||
if selectorMatchesNode(selector, node) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Seems like this code could be included in the large `for` loop over the nodes below. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need to determine the list of nodes including child nodes first. |
||||||
//traverse all children of the node | ||||||
var queue []string | ||||||
queue = append(queue, node.ID) | ||||||
|
||||||
for len(queue) > 0 { | ||||||
childNode := queue[0] | ||||||
//if the child isn't already in nodeIDsToReset then we add it and traverse its children | ||||||
if _, present := nodeIDsToReset[childNode]; !present { | ||||||
nodeIDsToReset[childNode] = true | ||||||
queue = append(queue, wf.Status.Nodes[childNode].Children...) | ||||||
} | ||||||
queue = queue[1:] | ||||||
} | ||||||
} | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
// Delete/reset fields which indicate workflow completed | ||||||
delete(newWF.Labels, common.LabelKeyCompleted) | ||||||
newWF.Status.Conditions.UpsertCondition(wfv1.WorkflowCondition{Status: metav1.ConditionFalse, Type: wfv1.WorkflowConditionCompleted}) | ||||||
newWF.ObjectMeta.Labels[common.LabelKeyPhase] = string(wfv1.NodeRunning) | ||||||
newWF.Status.Phase = wfv1.NodeRunning | ||||||
newWF.Status.Message = "" | ||||||
newWF.Status.FinishedAt = metav1.Time{} | ||||||
newWF.Spec.Shutdown = "" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice catch! |
||||||
if newWF.Spec.ActiveDeadlineSeconds != nil && *newWF.Spec.ActiveDeadlineSeconds == 0 { | ||||||
// if it was terminated, unset the deadline | ||||||
newWF.Spec.ActiveDeadlineSeconds = nil | ||||||
|
@@ -655,14 +688,19 @@ func RetryWorkflow(kubeClient kubernetes.Interface, repo sqldb.OffloadNodeStatus | |||||
} | ||||||
|
||||||
for _, node := range nodes { | ||||||
var doForceResetNode = false | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||||||
if _, present := nodeIDsToReset[node.ID]; present { | ||||||
// if we are resetting this node then don't carry it across regardless of its phase | ||||||
doForceResetNode = true | ||||||
} | ||||||
switch node.Phase { | ||||||
case wfv1.NodeSucceeded, wfv1.NodeSkipped: | ||||||
if !strings.HasPrefix(node.Name, onExitNodeName) { | ||||||
if !strings.HasPrefix(node.Name, onExitNodeName) && !doForceResetNode { | ||||||
newNodes[node.ID] = node | ||||||
continue | ||||||
} | ||||||
case wfv1.NodeError, wfv1.NodeFailed: | ||||||
if !strings.HasPrefix(node.Name, onExitNodeName) && (node.Type == wfv1.NodeTypeDAG || node.Type == wfv1.NodeTypeStepGroup) { | ||||||
if !strings.HasPrefix(node.Name, onExitNodeName) && (node.Type == wfv1.NodeTypeDAG || node.Type == wfv1.NodeTypeStepGroup) && !doForceResetNode { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, fixed |
||||||
newNode := node.DeepCopy() | ||||||
newNode.Phase = wfv1.NodeRunning | ||||||
newNode.Message = "" | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this here? Isn't this covered by `failSuspendedNodesAfterDeadlineOrShutdown()`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Without this, in the case of a retry parent node it just keeps retrying pods that continually fail because they are being executed after the deadline. The integration test didn't work without it.