Skip to content

Commit

Permalink
fix: Workflow should fail on Pod failure before container starts. Fixes #3879
Browse files Browse the repository at this point in the history
  • Loading branch information
sarabala1979 authored Sep 1, 2020
1 parent c4c8006 commit be91d76
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 0 deletions.
13 changes: 13 additions & 0 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1190,6 +1190,12 @@ func inferFailedReason(pod *apiv1.Pod) (wfv1.NodePhase, string) {
// If multiple containers failed, in order of preference:
// init, main (annotated), main (exit code), wait, sidecars
for _, ctr := range pod.Status.InitContainerStatuses {
// The Virtual Kubelet environment does not set a Terminated state on a waiting container
// https://github.com/argoproj/argo/issues/3879
// https://github.com/virtual-kubelet/virtual-kubelet/blob/7f2a02291530d2df14905702e6d51500dd57640a/node/sync.go#L195-L208
if ctr.State.Waiting != nil {
return wfv1.NodeError, fmt.Sprintf("Pod failed before %s container starts", ctr.Name)
}
if ctr.State.Terminated == nil {
// We should never get here
log.Warnf("Pod %s phase was Failed but %s did not have terminated state", pod.ObjectMeta.Name, ctr.Name)
Expand All @@ -1210,6 +1216,13 @@ func inferFailedReason(pod *apiv1.Pod) (wfv1.NodePhase, string) {
}
failMessages := make(map[string]string)
for _, ctr := range pod.Status.ContainerStatuses {
// The Virtual Kubelet environment does not set a Terminated state on a waiting container
// https://github.com/argoproj/argo/issues/3879
// https://github.com/virtual-kubelet/virtual-kubelet/blob/7f2a02291530d2df14905702e6d51500dd57640a/node/sync.go#L195-L208

if ctr.State.Waiting != nil {
return wfv1.NodeError, fmt.Sprintf("Pod failed before %s container starts", ctr.Name)
}
if ctr.State.Terminated == nil {
// We should never get here
log.Warnf("Pod %s phase was Failed but %s did not have terminated state", pod.ObjectMeta.Name, ctr.Name)
Expand Down
133 changes: 133 additions & 0 deletions workflow/controller/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/argoproj/argo/config"
wfv1 "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1"
"github.com/argoproj/argo/test"
testutil "github.com/argoproj/argo/test/util"
intstrutil "github.com/argoproj/argo/util/intstr"
"github.com/argoproj/argo/workflow/common"
"github.com/argoproj/argo/workflow/controller/cache"
Expand Down Expand Up @@ -4646,3 +4647,135 @@ func TestStorageQuota(t *testing.T) {
assert.Equal(t, wfv1.NodeError, woc.wf.Status.Phase)
assert.Contains(t, woc.wf.Status.Message, "BadRequest")
}

// podWithFailed is a fixture manifest for a pod in phase Failed whose main
// and wait containers are both stuck in a Waiting state with no Terminated
// state set, mimicking the Virtual Kubelet behavior reported in
// https://github.com/argoproj/argo/issues/3879.
var podWithFailed = `
apiVersion: v1
kind: Pod
metadata:
  annotations:
  creationTimestamp: '2020-08-27T18:14:19Z'
  name: hello-world-lbgpt-2607732259
  namespace: argo
spec:
  containers:
  - command:
    - argoexec
    - wait
    env:
    - name: ARGO_POD_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.name
    image: argoproj/argoexec:v2.9.5
    imagePullPolicy: IfNotPresent
    name: wait
    resources: {}
    terminationMessagePath: "/dev/termination-log"
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: "/argo/podmetadata"
      name: podmetadata
    - mountPath: "/var/run/docker.sock"
      name: docker-sock
      readOnly: true
    - mountPath: "/var/run/secrets/kubernetes.io/serviceaccount"
      name: default-token-rc4ml
      readOnly: true
  - args:
    - import random; import sys; exit_code = random.choice([0, 1, 1]); sys.exit(exit_code)
    command:
    - python
    - "-c"
    image: python:alpine3.6
    imagePullPolicy: IfNotPresent
    name: main
    resources: {}
    terminationMessagePath: "/dev/termination-log"
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: "/var/run/secrets/kubernetes.io/serviceaccount"
      name: default-token-rc4ml
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  nodeName: docker-desktop
  priority: 0
  restartPolicy: Never
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - downwardAPI:
      defaultMode: 420
      items:
      - fieldRef:
          apiVersion: v1
          fieldPath: metadata.annotations
        path: annotations
    name: podmetadata
  - hostPath:
      path: "/var/run/docker.sock"
      type: Socket
    name: docker-sock
  - name: default-token-rc4ml
    secret:
      defaultMode: 420
      secretName: default-token-rc4ml
status:
  conditions:
  - lastProbeTime:
    lastTransitionTime: '2020-08-27T18:14:19Z'
    status: 'True'
    type: PodScheduled
  containerStatuses:
  - containerID: docker://502dda61a8f05e08d10cffc972d2fb9226e82af7daaacff98e84727bb96f11e6
    image: python:alpine3.6
    imageID: docker-pullable://python@sha256:766a961bf699491995cc29e20958ef11fd63741ff41dcc70ec34355b39d52971
    lastState:
      waiting: {}
    name: main
    ready: false
    restartCount: 0
    started: false
    state:
      waiting: {}
  - containerID: docker://d31f0d56f29b6962ef1493b2df6b7cdb54d48d8b8fa95d7e9c98ddc56f857b35
    image: argoproj/argoexec:v2.9.5
    imageID: docker-pullable://argoproj/argoexec@sha256:989114232892e051c25be323af626149452578d3ebbdc3e9ec7205bba3918d48
    lastState:
      waiting: {}
    name: wait
    ready: false
    restartCount: 0
    started: false
    state:
      waiting: {}
  hostIP: 192.168.65.3
  phase: Failed
  podIP: 10.1.28.244
  podIPs:
  - ip: 10.1.28.244
  qosClass: BestEffort
  startTime: '2020-08-27T18:14:19Z'
`

// TestPodFailureWithContainerWaitingState verifies that inferFailedReason
// reports a NodeError when a Failed pod's containers are still in a Waiting
// state with no Terminated state (Virtual Kubelet behavior, issue #3879).
func TestPodFailureWithContainerWaitingState(t *testing.T) {
	var pod apiv1.Pod
	testutil.MustUnmarshallYAML(podWithFailed, &pod)
	// assert.NotNil on a non-pointer struct value is always true; assert on a
	// decoded field instead to prove the YAML fixture actually unmarshalled.
	assert.Equal(t, "hello-world-lbgpt-2607732259", pod.Name)
	nodeStatus, msg := inferFailedReason(&pod)
	assert.Equal(t, wfv1.NodeError, nodeStatus)
	assert.Contains(t, msg, "Pod failed before")
}

0 comments on commit be91d76

Please sign in to comment.