re-add examples
dholt committed Jun 1, 2021
1 parent 60d1bb0 commit 22c570a
Showing 15 changed files with 246 additions and 1 deletion.
2 changes: 1 addition & 1 deletion docs/slurm-cluster/README.md
@@ -73,7 +73,7 @@ Read through the [slurm usage guide](slurm-usage.md) and [Open OnDemand guide](o
## Prolog and Epilog

The default Slurm deployment includes a collection of prolog and epilog scripts that should be modified to suit a particular system.
-For more information, see the [prolog/epilog documentation](slurm-prolog-epilog.md).
+For more information, see the [prolog/epilog documentation](slurm-prolog-epilog/README.md).

## Node Health Check

File renamed without changes.
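
For context, fragments like the ones added below have no shebang of their own; they are meant to be pulled into the node-level prolog and epilog that slurm.conf's Prolog= and Epilog= options point slurmd at. A minimal sketch of one possible wiring is shown here; the wrapper path, the prolog.d directory, and the sourcing loop are illustrative assumptions, not part of this commit.

#!/bin/bash
# Hypothetical /etc/slurm/prolog.sh wrapper (an assumption, not from this commit).
# slurm.conf would reference it with, for example:
#   Prolog=/etc/slurm/prolog.sh
#   Epilog=/etc/slurm/epilog.sh
# Source each prolog fragment in order so that variables exported by one
# fragment (such as NUMGPUS from prolog-lspci) are visible to later ones.
for f in /etc/slurm/prolog.d/*; do
    [ -r "$f" ] && . "$f"
done
exit 0
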
8 changes: 8 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/epilog-dcgmstats
@@ -0,0 +1,8 @@
# Stop DCGM GPU stats collection if requested
scontrol show job $SLURM_JOBID | grep Comment | grep -i dcgmstats > /dev/null
if [ $? -eq 0 ]; then
    OUTPUTDIR=`scontrol show job $SLURM_JOBID | grep WorkDir | cut -d = -f 2`
    sudo -u $SLURM_JOB_USER dcgmi stats -x $SLURM_JOBID
    sudo -u $SLURM_JOB_USER dcgmi stats -v -j $SLURM_JOBID | sudo -u $SLURM_JOB_USER tee $OUTPUTDIR/dcgm-gpu-stats-$HOSTNAME-$SLURM_JOBID.out
    sudo -u $SLURM_JOB_USER nv-hostengine -t
fi
18 changes: 18 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/epilog-ecc
@@ -0,0 +1,18 @@
# Make sure ECC is on.
nvidia-smi -a | grep -A1 Ecc | grep -i disabled > /dev/null
if [ $? -eq 0 ]; then
    logger -t EPILOG "Enabling ECC"
    nvidia-smi -e 1
    GPUCOUNT=`nvidia-smi -L | wc -l`
    GPUMAXINDEX=`expr $GPUCOUNT - 1`
    systemctl stop collectd
    logger -t EPILOG "Triggering GPU reset"
    for i in `seq 0 $GPUMAXINDEX`; do
        e=`nvidia-smi -r -i $i 2>&1`
        if [ $? -ne 0 ]; then
            logger -t EPILOG "WARNING! GPU $i reset failed"
            logger -t EPILOG "GPU $i reset error: $e"
        fi
    done
    logger -t EPILOG "GPU reset done"
    systemctl start collectd
fi
11 changes: 11 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/epilog-mps
@@ -0,0 +1,11 @@
# Quit cuda mps if it's running
ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null
if [ $? -eq 0 ]; then
    echo quit | nvidia-cuda-mps-control
fi

# Test for presence of mps zombie
ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null
if [ $? -eq 0 ]; then
    killall nvidia-cuda-mps-server
fi
13 changes: 13 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingoff
@@ -0,0 +1,13 @@
# Disable hyperthreading if requested
scontrol show job $SLURM_JOBID | grep Comment | grep -i nohyperthreading > /dev/null
if [ $? -eq 0 ]; then
    cat `find /sys/devices/system/cpu -name thread_siblings_list` | sort | uniq > /tmp/thread_siblings_list
    for sibs in `cat /tmp/thread_siblings_list` ; do
        echo $sibs | grep ',' > /dev/null 2>&1 # if there is a comma (','), the second sibling needs to be disabled
        if [ $? -eq 0 ] ; then
            x=`echo $sibs | cut -f 2 -d ','`
            echo "Disabling CPU $x"
            echo 0 > /sys/devices/system/cpu/cpu$x/online
        fi
    done
fi
8 changes: 8 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingon
@@ -0,0 +1,8 @@
# Enable hyperthreading if requested
scontrol show job $SLURM_JOBID | grep Comment | grep -i hyperthreading | grep -v nohyperthreading > /dev/null
if [ $? -eq 0 ]; then
    for i in /sys/devices/system/cpu/*/online ; do
        echo 1 > $i
        echo "Enabling CPU $(echo $i | egrep -o 'cpu[0-9]+' | tr -d 'cpu')"
    done
fi
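
Both hyperthreading fragments are gated on the job's comment string, which they read back with scontrol show job. A user would opt in at submission time roughly as follows; job.sh is a placeholder job script name:

    sbatch --comment=nohyperthreading job.sh   # second sibling of each core is taken offline
    sbatch --comment=hyperthreading job.sh     # all logical CPUs are brought back online
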
13 changes: 13 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/prolog-checkmounts
@@ -0,0 +1,13 @@
#
# Check that mounts exist
#
MOUNTS="/raid /gpfs/fs1"
for i in $MOUNTS
do
    mount | grep $i &> /dev/null
    if [ $? -ne 0 ]
    then
        echo "$HOSTNAME is missing $i"
        echo "scontrol update nodename=$HOSTNAME state=drain reason=\"Mount missing: $i\""
    fi
done
19 changes: 19 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmhealth
@@ -0,0 +1,19 @@
#
# Check that all GPUs are healthy via dcgm
#
# NUMGPUS is expected to be set by an earlier prolog fragment (see prolog-lspci)
if [ $NUMGPUS -gt 0 ]; then
    echo "Execute dcgm health check"
    GPULIST=`nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/'`
    rm /tmp/dcgm.out 2> /dev/null
    nv-hostengine
    dcgmi group -c gpuinfo
    dcgmi group -g 1 -a $GPULIST
    dcgmi diag -g 1 -r 1 1> /tmp/dcgm.out
    dcgmi group -d 1
    nv-hostengine -t
    # Drain the node if any DCGM test reported a failure
    grep -i fail /tmp/dcgm.out > /dev/null
    if [ $? -eq 0 ]; then
        scontrol update nodename=$HOSTNAME state=drain reason="Failed DCGM, see /tmp/dcgm.out"
        exit 0
    fi
fi
10 changes: 10 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmstats
@@ -0,0 +1,10 @@
# Start DCGM GPU stats collection if requested
scontrol show job $SLURM_JOBID | grep Comment | grep -i dcgmstats > /dev/null
if [ $? -eq 0 ]; then
    GPULIST=`nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/'`
    # Resolve the job user's home directory ("~$SLURM_JOB_USER" is not tilde-expanded by the shell)
    USERHOME=`getent passwd $SLURM_JOB_USER | cut -d : -f 6`
    sudo -u $SLURM_JOB_USER nv-hostengine $USERHOME/nvhost.pid
    sudo -u $SLURM_JOB_USER dcgmi group -c gpuinfo
    sudo -u $SLURM_JOB_USER dcgmi group -g 1 -a $GPULIST
    sudo -u $SLURM_JOB_USER dcgmi stats -g 1 --enable
    sudo -u $SLURM_JOB_USER dcgmi stats -g 1 -s $SLURM_JOBID
fi
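
This fragment and the matching epilog-dcgmstats above are both keyed on the dcgmstats keyword in the job comment. A submission would opt in roughly like this; job.sh is a placeholder name:

    sbatch --comment=dcgmstats job.sh
    # After the job completes, the epilog writes dcgm-gpu-stats-<hostname>-<jobid>.out
    # into the job's working directory.
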
22 changes: 22 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/prolog-ecc
@@ -0,0 +1,22 @@
# Disable ECC if requested
scontrol show job $SLURM_JOBID | grep Comment | grep -i ecc > /dev/null
if [ $? -eq 0 ]; then
    logger -t PROLOG "Disabling ECC"
    nvidia-smi -e 0
    GPUCOUNT=`nvidia-smi -L | wc -l`
    GPUMAXINDEX=`expr $GPUCOUNT - 1`
    systemctl stop collectd
    logger -t PROLOG "Triggering GPU reset"
    for i in `seq 0 $GPUMAXINDEX`; do
        logger -t PROLOG "Resetting GPU $i"
        e=`nvidia-smi -r -i $i 2>&1`
        if [ $? -ne 0 ]; then
            logger -t PROLOG "WARNING! GPU $i reset failed"
            logger -t PROLOG "GPU $i reset error: $e"
            nvidia-smi -e 1
        fi
        sleep 1
    done
    logger -t PROLOG "GPU reset done"
    systemctl start collectd
fi
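
Per-job ECC disabling follows the same comment-gated pattern, with epilog-ecc above turning ECC back on afterwards if it finds it disabled. Roughly, with job.sh as a placeholder name:

    sbatch --comment=ecc job.sh   # ECC is disabled and the GPUs are reset before the job starts
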
27 changes: 27 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/prolog-gpureset
@@ -0,0 +1,27 @@
#
# Uncomment the following for a full GPU reset
# Can be used to reset the GPU HW state in situations
# that would otherwise require a machine reboot.
# Typically useful if a double bit ECC error has
# occurred.
# Reset operations are not guaranteed to work in
# all cases and should be used with caution.
#
#GPUCOUNT=`nvidia-smi -L | wc -l`
#GPUMAXINDEX=`expr $GPUCOUNT - 1`
#for i in `seq 0 $GPUMAXINDEX`; do
# e=`nvidia-smi -r -i $i 2>&1`
# if [ $? -ne 0 ]; then
# logger -t PROLOG "WARNING! GPU $i reset failed"
# logger -t PROLOG "GPU $i reset error: $e"
# fi
# sleep 1
#done

# Reset application clocks
nvidia-smi -rac 2>/dev/null
nvidia-smi -acp 0 2>/dev/null

# Make sure accounting is activated and clear current logs
nvidia-smi -am 1 > /dev/null
nvidia-smi -caa > /dev/null
13 changes: 13 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/prolog-lspci
@@ -0,0 +1,13 @@
#
# Check that all GPUs are present
#
NUMGPUS=`scontrol -a show nodes $HOSTNAME | grep "Gres=gpu" | cut -d : -f 2`
NUMGPUS=${NUMGPUS:-0}   # default to 0 when the node has no GPU gres configured
if [ $NUMGPUS -gt 0 ]; then
    PCIGPUSFOUND=`lspci | grep "3D controller: NVIDIA Corporation" | wc -l`
    if [ $PCIGPUSFOUND -ne $NUMGPUS ]; then
        echo "Slurm expects $NUMGPUS GPUs but lspci found: $PCIGPUSFOUND"
        scontrol update nodename=$HOSTNAME state=drain reason="Missing GPUs"
        exit 0
    fi
    echo "Slurm expects $NUMGPUS GPUs and lspci found $PCIGPUSFOUND"
fi
24 changes: 24 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/prolog-mps
@@ -0,0 +1,24 @@
# Activate NVIDIA MPS if requested
# Server will start automatically, no need to start explicitly
scontrol show job $SLURM_JOBID | grep Comment | grep -i mps | grep -v mps-per-gpu > /dev/null
if [ $? -eq 0 ]; then
    NUM_GPUS=$(/usr/sbin/lspci -n -d 10de: | wc -l)
    case $NUM_GPUS in
        '' | *[!0-9]*) ;; # skip if non-numeric
        *)
            # Build a comma-separated list of all GPU indices, e.g. "0,1,2,3"
            GPUS=""
            i=0
            while [ ${i} -lt ${NUM_GPUS} ]
            do
                GPUS+="${i},"
                let i=i+1
            done
            GPUS=${GPUS%%,}
    esac
    sudo nvidia-smi -c 3
    export CUDA_VISIBLE_DEVICES=$GPUS
    export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
    export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
    sudo -u $SLURM_JOB_USER PATH=${PATH} CUDA_VISIBLE_DEVICES=$GPUS CUDA_MPS_LOG_DIRECTORY=$CUDA_MPS_LOG_DIRECTORY CUDA_MPS_PIPE_DIRECTORY=$CUDA_MPS_PIPE_DIRECTORY -s nvidia-cuda-mps-control -d
    export CUDA_VISIBLE_DEVICES=$GPUS
fi
59 changes: 59 additions & 0 deletions docs/slurm-cluster/slurm-prolog-epilog/prolog-mps-per-gpu
@@ -0,0 +1,59 @@
#
# Check if user gave --comment=mps-per-gpu when submitting their job.
#
scontrol show job $SLURM_JOBID | grep Comment | grep -i mps-per-gpu > /dev/null
if [ $? -eq 0 ]; then
    #
    # Count how many GPUs are on the system
    #
    NUM_GPUS=$(/usr/sbin/lspci -n -d 10de: | wc -l)
    case $NUM_GPUS in
        '' | *[!0-9]*) ;; # skip if non-numeric
        *)
            i=0
            #
            # In this loop we iterate through the number of GPUs starting
            # an MPS daemon and MPS server for each GPU.
            #
            while [ ${i} -lt ${NUM_GPUS} ]
            do
                GPUINDEX=$i
                export CUDA_VISIBLE_DEVICES=$GPUINDEX
                export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps_$GPUINDEX
                export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log_$GPUINDEX
                #
                # Set GPU compute mode to EXCLUSIVE_PROCESS
                #
                nvidia-smi -i $GPUINDEX -c 3
                sleep 1
                #
                # Start MPS daemon
                #
                sudo -u $SLURM_JOB_USER PATH=${PATH} CUDA_VISIBLE_DEVICES=$GPUINDEX CUDA_MPS_LOG_DIRECTORY=$CUDA_MPS_LOG_DIRECTORY CUDA_MPS_PIPE_DIRECTORY=$CUDA_MPS_PIPE_DIRECTORY -s nvidia-cuda-mps-control -d
                sleep 1
                let i=i+1
            done
            i=0
            while [ ${i} -lt ${NUM_GPUS} ]
            do
                GPUINDEX=$i
                export CUDA_VISIBLE_DEVICES=$GPUINDEX
                export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps_$GPUINDEX
                export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log_$GPUINDEX
                #
                # Start MPS server
                #
                echo "start_server -uid $SLURM_JOB_UID" | CUDA_MPS_LOG_DIRECTORY=$CUDA_MPS_LOG_DIRECTORY CUDA_MPS_PIPE_DIRECTORY=$CUDA_MPS_PIPE_DIRECTORY nvidia-cuda-mps-control
                sleep 1
                #
                # Determine MPS server PID and GPU CPU affinity. Bind MPS server to that CPU.
                #
                MPS_SERVER_PID=`nvidia-smi -q -i $GPUINDEX -d PIDS | grep "Process ID" | awk '{print $4}'`
                GPUCPUAFFINITY=$(nvidia-smi topo -m | grep -m2 GPU${GPUINDEX} | tail -n1 | \
                    awk '{print $NF}')
                taskset -p -c ${GPUCPUAFFINITY} $MPS_SERVER_PID

                let i=i+1
            done
    esac
fi
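
Both MPS fragments are likewise selected through the job comment, and both put the GPUs into EXCLUSIVE_PROCESS compute mode (nvidia-smi -c 3) before starting the MPS control daemon. Illustrative submissions, with job.sh as a placeholder name:

    sbatch --comment=mps job.sh           # one MPS control daemon spanning all GPUs
    sbatch --comment=mps-per-gpu job.sh   # one MPS daemon and server per GPU, bound to that GPU's CPU affinity
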
