-
Notifications
You must be signed in to change notification settings - Fork 28
/
slurm-sge-status.sh
executable file
·160 lines (141 loc) · 5.33 KB
/
slurm-sge-status.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/bin/sh
#
# Report the status of a grid job.
#
# Usage: <this script> JOBID
#
# Prints exactly one word on stdout -- "running", "success" or "failed" --
# describing the job with the supplied grid job id, and always exits 0.
#
# The scheduler is auto-detected, in this order:
#   Slurm  if the 'sinfo' command is available
#   SGE    if SGE_ROOT is set in the environment
#   LSF    if LSF_ENVDIR is set in the environment
#   PBS    if the 'pbsnodes' command is available
# If none is detected the job is reported as "failed".
#
# Written for POSIX sh: no arrays, no 'local', no [[ ]].

# Map Slurm job states (sacct-style output on stdin) to a status word.
#
# Only the first word of the first non-blank line is considered.  Input
# with no lines at all yields "running"; any state not listed as success
# or running yields "failed".
slurm_status_from_sacct() {
  awk '
    NF > 0 && state == "" { state = $1 }
    END {
      if (NR == 0) {
        stat = "running"
      } else if (state == "COMPLETED") {
        stat = "success"
      } else if (state ~ /^(PENDING|CONFIGURING|RUNNING|SUSPENDED|PREEMPTED|COMPLETING)$/) {
        # PREEMPTED is deliberately treated as still running.
        stat = "running"
      } else {
        # BOOT_FAIL, CANCELLED, DEADLINE, FAILED, NODE_FAIL, OUT_OF_MEMORY,
        # TIMEOUT, and every unrecognized state.
        stat = "failed"
      }
      print stat
    }'
}

# Map SGE accounting records (qacct -j output on stdin) to a status word.
#
# "failed" if any 'failed' or 'exit_status' record carries a value greater
# than zero, otherwise "success".
#
# NOTE(review): empty input (for example qacct not knowing the job) also
# yields "success" -- confirm that this is the intended default.
sge_status_from_qacct() {
  awk '
    ($1 == "failed" || $1 == "exit_status") && $2 > 0 { fail = $2 }
    END {
      if (fail > 0) { print "failed" }
      else          { print "success" }
    }'
}

# Map the LSF job state (bjobs -o stat -noheader output on stdin) to a
# status word.  Only the first line is considered; no input at all yields
# "running".
#
# https://github.com/Snakemake-Profiles/lsf/blob/master/%7B%7Bcookiecutter.profile_name%7D%7D/lsf_status.py
#   running -- PEND RUN PSUSP USUSP SSUSP WAIT
#   success -- DONE POST_DONE
#   failed  -- UNKWN ZOMBI EXIT POST_ERR (and anything unrecognized)
lsf_status_from_bjobs() {
  awk '
    BEGIN { stat = "running" }
    NR == 1 {
      if ($1 == "DONE" || $1 == "POST_DONE") {
        stat = "success"
      } else if ($1 ~ /^(PEND|RUN|PSUSP|USUSP|SSUSP|WAIT)$/) {
        stat = "running"
      } else {
        stat = "failed"
      }
    }
    END { print stat }'
}

# Map PBS job details (qstat -f output on stdin) to a status word.
#
# Uses the last field of the lines matching 'job_state' and 'exit_status'.
# States R Q H T W S B E are "running"; state C is "success" when the exit
# status compares equal to 0 (which includes a missing exit_status line)
# and "failed" otherwise; any other or missing state is "failed".
pbs_status_from_qstat() {
  awk '
    /job_state/   { state = $NF }
    /exit_status/ { exit_status = $NF }
    END {
      if (state ~ /^[RQHTWSBE]$/) {
        stat = "running"
      } else if (state == "C") {
        if (exit_status == 0) { stat = "success" }
        else                  { stat = "failed" }
      } else {
        stat = "failed"
      }
      print stat
    }'
}

# Print the status of Slurm job $1.
#
# Derived from https://github.com/jdblischak/smk-simple-slurm
#
# Note that if sacct is queried immediately after job submission, it is
# possible that sacct returns no output, and so the default state is
# "running".
slurm_job_status() {
  sacct_rc=0
  if [ -x /usr/local/bin/dashboard_cli ]; then
    # dashboard_cli is an NIH biowulf extension that caches sacct results.
    # NOTE(review): the pause presumably lets the cache catch up -- confirm.
    sleep 2
    sacct_out=$(/usr/local/bin/dashboard_cli jobs --fields state \
                  -j "$1" --noheader) || sacct_rc=$?
  else
    sacct_out=$(sacct --allclusters --format State \
                  -j "$1" --noheader) || sacct_rc=$?
  fi

  # If the query fails or says nothing, pause a bit, then report that the
  # job is running.
  if [ "$sacct_rc" -ne 0 ] || [ -z "$sacct_out" ]; then
    sleep 2
    echo running
    echo 1>&2 "Job $1 not known (yet) by slurm; will return 'running' by default."
    return 0
  fi

  printf '%s\n' "$sacct_out" | slurm_status_from_sacct
}

# Print the status of SGE job $1.
#
# Query qstat for the job.  If the job is in an error state, qstat reports
# 'error reason'; otherwise, if there is any output, the job is pending or
# running.  If there is no output, query qacct for the final job status.
sge_job_status() {
  # qstat is called once and its output reused for both checks.
  qstat_out=$(qstat -j "$1" 2> /dev/null)

  if printf '%s\n' "$qstat_out" | grep -q 'error reason'; then
    echo failed
  elif [ -n "$qstat_out" ]; then
    echo running
  else
    qacct -j "$1" 2> /dev/null | sge_status_from_qacct
  fi
}

# Print the status of LSF job $1.
lsf_job_status() {
  bjobs -o stat -noheader "$1" | lsf_status_from_bjobs
}

# Print the status of PBS job $1.
pbs_job_status() {
  qstat -f "$1" | pbs_status_from_qstat
}

# Detect the scheduler and print the status of job $1.  Always returns 0.
main() {
  jobid=${1-}

  # 'command -v' is the POSIX builtin for locating a command; it does not
  # depend on an external 'which' being installed.
  if command -v sinfo > /dev/null 2>&1; then
    slurm_job_status "$jobid"
  elif [ -n "${SGE_ROOT-}" ]; then
    sge_job_status "$jobid"
  elif [ -n "${LSF_ENVDIR-}" ]; then
    lsf_job_status "$jobid"
  elif command -v pbsnodes > /dev/null 2>&1; then
    pbs_job_status "$jobid"
  else
    # No known scheduler: fail.
    echo failed
  fi

  return 0
}

# main always returns 0, so the script's exit status is always 0.
main "$@"