diff --git a/docs/slurm-cluster/README.md b/docs/slurm-cluster/README.md
index 63546989f..30d7ddb44 100644
--- a/docs/slurm-cluster/README.md
+++ b/docs/slurm-cluster/README.md
@@ -73,7 +73,7 @@ Read through the [slurm usage guide](slurm-usage.md) and [Open OnDemand guide](o
 ## Prolog and Epilog
 
 The default Slurm deployment includes a collection of prolog and epilog scripts that should be modified to suit a particular system.
-For more information, see the [prolog/epilog documentation](slurm-prolog-epilog.md).
+For more information, see the [prolog/epilog documentation](slurm-prolog-epilog/README.md).
 
 ## Node Health Check
 
diff --git a/docs/slurm-cluster/slurm-prolog-epilog.md b/docs/slurm-cluster/slurm-prolog-epilog/README.md
similarity index 100%
rename from docs/slurm-cluster/slurm-prolog-epilog.md
rename to docs/slurm-cluster/slurm-prolog-epilog/README.md
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/epilog-dcgmstats b/docs/slurm-cluster/slurm-prolog-epilog/epilog-dcgmstats
new file mode 100644
index 000000000..a2ca270fd
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/epilog-dcgmstats
@@ -0,0 +1,13 @@
+# Stop DCGM GPU stats collection if requested (job comment contains
+# "dcgmstats") and write the job's GPU statistics into its working directory.
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi dcgmstats; then
+    # -f 2- keeps the whole path even if the directory name contains '='.
+    OUTPUTDIR=$(scontrol show job "$SLURM_JOBID" | grep WorkDir | cut -d = -f 2-)
+    # Same pid file that prolog-dcgmstats passes when starting nv-hostengine.
+    PIDFILE="$(getent passwd "$SLURM_JOB_USER" | cut -d : -f 6)/nvhost.pid"
+    sudo -u "$SLURM_JOB_USER" dcgmi stats -x "$SLURM_JOBID"
+    # tee runs as the job user, so that user owns the report file.
+    sudo -u "$SLURM_JOB_USER" dcgmi stats -v -j "$SLURM_JOBID" | \
+        sudo -u "$SLURM_JOB_USER" tee "$OUTPUTDIR/dcgm-gpu-stats-$HOSTNAME-$SLURM_JOBID.out"
+    sudo -u "$SLURM_JOB_USER" nv-hostengine -t --pid "$PIDFILE"
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/epilog-ecc b/docs/slurm-cluster/slurm-prolog-epilog/epilog-ecc
new file mode 100644
index 000000000..24e403785
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/epilog-ecc
@@ -0,0 +1,21 @@
+# Make sure ECC is on.
+# NOTE(review): the GPUs are reset after the mode change, presumably because
+# the new ECC mode stays pending until a reset or reboot - confirm.
+if nvidia-smi -a | grep -A1 Ecc | grep -qi disabled; then
+    logger -t EPILOG "Enabling ECC"
+    nvidia-smi -e 1
+    GPUCOUNT=$(nvidia-smi -L | wc -l)
+    systemctl stop collectd
+    logger -t EPILOG "Triggering GPU reset"
+    i=0
+    while [ "$i" -lt "$GPUCOUNT" ]; do
+        if ! e=$(nvidia-smi -r -i "$i" 2>&1); then
+            logger -t EPILOG "WARNING! GPU $i reset failed"
+            logger -t EPILOG "GPU $i reset error: $e"
+        fi
+        i=$((i + 1))
+    done
+    logger -t EPILOG "GPU reset done"
+    # Bring collectd back up once the resets are finished, as prolog-ecc does.
+    systemctl start collectd
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/epilog-mps b/docs/slurm-cluster/slurm-prolog-epilog/epilog-mps
new file mode 100644
index 000000000..7271dfb5a
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/epilog-mps
@@ -0,0 +1,16 @@
+# Quit cuda mps if it's running.
+# pgrep matches on the process name, which covers both
+# nvidia-cuda-mps-control and nvidia-cuda-mps-server.
+if pgrep nvidia-cuda-mps > /dev/null; then
+    # A control daemon is reached through its pipe directory: /tmp/nvidia-mps
+    # is set by prolog-mps, /tmp/nvidia-mps_<gpu> by prolog-mps-per-gpu.
+    for pipedir in /tmp/nvidia-mps /tmp/nvidia-mps_*; do
+        [ -d "$pipedir" ] || continue
+        echo quit | CUDA_MPS_PIPE_DIRECTORY="$pipedir" nvidia-cuda-mps-control
+    done
+fi
+
+# Test for presence of mps zombie
+if pgrep nvidia-cuda-mps > /dev/null; then
+    killall nvidia-cuda-mps-server
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingoff b/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingoff
new file mode 100644
index 000000000..e1675d7f8
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingoff
@@ -0,0 +1,15 @@
+# Disable hyperthreading if requested
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi nohyperthreading; then
+    # Read the distinct sibling lists straight from a pipeline rather than
+    # through a fixed file name in /tmp.
+    for sibs in $(find /sys/devices/system/cpu -name thread_siblings_list -exec cat {} + | sort -u); do
+        case $sibs in
+            *,*)
+                # if there is a comma (','), then need to disable 2nd
+                x=$(echo "$sibs" | cut -f 2 -d ',')
+                echo "Disabling CPU $x"
+                echo 0 > "/sys/devices/system/cpu/cpu$x/online"
+                ;;
+        esac
+    done
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingon b/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingon
new file mode 100644
index 000000000..43ec6a8bf
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingon
@@ -0,0 +1,11 @@
+# Enable hyperthreading if requested
+# Both greps are case-insensitive so that a "NoHyperthreading" comment is
+# excluded here just as it is accepted by hyperthreadingoff.
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -i hyperthreading | grep -qiv nohyperthreading; then
+    for i in /sys/devices/system/cpu/*/online; do
+        echo 1 > "$i"
+        # Print the N of .../cpuN/online; the pattern is quoted so the shell
+        # cannot glob-expand it.
+        echo "Enabling CPU $(echo "$i" | grep -Eo 'cpu[0-9]+' | tr -d 'cpu')"
+    done
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-checkmounts b/docs/slurm-cluster/slurm-prolog-epilog/prolog-checkmounts
new file mode 100644
index 000000000..8bf7aa535
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/prolog-checkmounts
@@ -0,0 +1,14 @@
+#
+# Check that mounts exist
+#
+MOUNTS="/raid /gpfs/fs1"
+# $MOUNTS is deliberately unquoted: it is a space-separated list of paths.
+for i in $MOUNTS; do
+    # mountpoint tests the exact path; grepping the mount table also matched
+    # longer paths and device names that merely contain it.
+    if ! mountpoint -q "$i"; then
+        echo "$HOSTNAME is missing $i"
+        # NOTE(review): the drain command is only printed, never executed.
+        echo "scontrol update nodename=$HOSTNAME state=drain reason=\"Mount missing: $i\""
+    fi
+done
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmhealth b/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmhealth
new file mode 100644
index 000000000..165951052
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmhealth
@@ -0,0 +1,23 @@
+#
+# Check that all GPUs are healthy via dcgm
+#
+# NUMGPUS is not assigned here; prolog-lspci assigns it, so this check
+# presumably runs after that one. Unset or empty counts as 0.
+if [ "${NUMGPUS:-0}" -gt 0 ]; then
+    echo "Execute dcgm health check"
+    # Comma-separated GPU indices, independent of the GPU product name.
+    GPULIST=$(nvidia-smi --query-gpu=index --format=csv,noheader | paste -sd, -)
+    rm -f /tmp/dcgm.out
+    nv-hostengine
+    dcgmi group -c gpuinfo
+    # NOTE(review): assumes the group created above gets id 1 - confirm.
+    dcgmi group -g 1 -a "$GPULIST"
+    dcgmi diag -g 1 -r 1 > /tmp/dcgm.out
+    dcgmi group -d 1
+    nv-hostengine -t
+    # grep succeeds when a "fail" line exists; drain the node only then.
+    if grep -qi fail /tmp/dcgm.out; then
+        scontrol update nodename="$HOSTNAME" state=drain reason="Failed DCGM, see /tmp/dcgm.out"
+        exit 0
+    fi
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmstats b/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmstats
new file mode 100644
index 000000000..932d6730c
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmstats
@@ -0,0 +1,13 @@
+# Start DCGM GPU stats collection if requested
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi dcgmstats; then
+    # Comma-separated GPU indices, independent of the GPU product name.
+    GPULIST=$(nvidia-smi --query-gpu=index --format=csv,noheader | paste -sd, -)
+    # "~$SLURM_JOB_USER" is never tilde-expanded, so look the home directory
+    # up explicitly and hand the pid file to nv-hostengine through --pid.
+    PIDFILE="$(getent passwd "$SLURM_JOB_USER" | cut -d : -f 6)/nvhost.pid"
+    sudo -u "$SLURM_JOB_USER" nv-hostengine --pid "$PIDFILE"
+    sudo -u "$SLURM_JOB_USER" dcgmi group -c gpuinfo
+    sudo -u "$SLURM_JOB_USER" dcgmi group -g 1 -a "$GPULIST"
+    sudo -u "$SLURM_JOB_USER" dcgmi stats -g 1 --enable
+    sudo -u "$SLURM_JOB_USER" dcgmi stats -g 1 -s "$SLURM_JOBID"
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-ecc b/docs/slurm-cluster/slurm-prolog-epilog/prolog-ecc
new file mode 100644
index 000000000..0e22765bb
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/prolog-ecc
@@ -0,0 +1,22 @@
+# Disable ECC if requested
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi ecc; then
+    logger -t PROLOG "Disabling ECC"
+    nvidia-smi -e 0
+    GPUCOUNT=$(nvidia-smi -L | wc -l)
+    systemctl stop collectd
+    logger -t PROLOG "Triggering GPU reset"
+    i=0
+    while [ "$i" -lt "$GPUCOUNT" ]; do
+        logger -t PROLOG "Resetting GPU $i"
+        if ! e=$(nvidia-smi -r -i "$i" 2>&1); then
+            logger -t PROLOG "WARNING! GPU $i reset failed"
+            logger -t PROLOG "GPU $i reset error: $e"
+            # Switch ECC back on when a reset fails.
+            nvidia-smi -e 1
+        fi
+        sleep 1
+        i=$((i + 1))
+    done
+    logger -t PROLOG "GPU reset done"
+    systemctl start collectd
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-gpureset b/docs/slurm-cluster/slurm-prolog-epilog/prolog-gpureset
new file mode 100644
index 000000000..e65adcf74
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/prolog-gpureset
@@ -0,0 +1,27 @@
+#
+# Uncomment the following for a full GPU reset
+# Can be used to reset the GPU HW state in situations
+# that would otherwise require a machine reboot.
+# Typically useful if a double bit ECC error has
+# occurred.
+# Reset operations are not guaranteed to work in
+# all cases and should be used with caution.
+#
+#GPUCOUNT=`nvidia-smi -L | wc -l`
+#GPUMAXINDEX=`expr $GPUCOUNT - 1`
+#for i in `seq 0 $GPUMAXINDEX`; do
+#    e=`nvidia-smi -r -i $i 2>&1`
+#    if [ $? -ne 0 ]; then
+#        logger -t PROLOG "WARNING! GPU $i reset failed"
+#        logger -t PROLOG "GPU $i reset error: $e"
+#    fi
+#    sleep 1
+#done
+
+# Reset application clocks
+nvidia-smi -rac 2>/dev/null
+nvidia-smi -acp 0 2>/dev/null
+
+# Make sure accounting is activated and clear current logs
+nvidia-smi -am 1 > /dev/null
+nvidia-smi -caa > /dev/null
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-lspci b/docs/slurm-cluster/slurm-prolog-epilog/prolog-lspci
new file mode 100644
index 000000000..b24139309
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/prolog-lspci
@@ -0,0 +1,18 @@
+#
+# Check that all GPUs are present
+#
+# scontrol prints the GPU gres as Gres=gpu:<count> or Gres=gpu:<type>:<count>,
+# optionally with a trailing "(S:...)" suffix; keep only <count>.
+NUMGPUS=$(scontrol -a show nodes "$HOSTNAME" | \
+    sed -n 's/.*Gres=gpu:\([^:,( ]*:\)\{0,1\}\([0-9]\{1,\}\).*/\2/p' | head -n 1)
+# A node without a GPU gres yields an empty string; treat it as 0 GPUs.
+NUMGPUS=${NUMGPUS:-0}
+if [ "$NUMGPUS" -gt 0 ]; then
+    PCIGPUSFOUND=$(lspci | grep -c "3D controller: NVIDIA Corporation")
+    if [ "$PCIGPUSFOUND" -ne "$NUMGPUS" ]; then
+        echo "Slurm expects $NUMGPUS GPUs but lspci found: $PCIGPUSFOUND"
+        scontrol update nodename="$HOSTNAME" state=drain reason="Missing GPUs"
+        exit 0
+    fi
+    echo "Slurm expects $NUMGPUS GPUs lspci found: $PCIGPUSFOUND"
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps b/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps
new file mode 100644
index 000000000..5c384c1a9
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps
@@ -0,0 +1,27 @@
+# Activate NVIDIA MPS if requested
+# Server will start automatically, no need to start explicitly
+# The exclusion is case-insensitive too, so "MPS-per-GPU" comments are left
+# to prolog-mps-per-gpu.
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -i mps | grep -qiv mps-per-gpu; then
+    NUM_GPUS=$(/usr/sbin/lspci -n -d 10de: | wc -l)
+    # Start from an empty list so a non-numeric count cannot leave a stale value.
+    GPUS=""
+    case $NUM_GPUS in
+        '' | *[!0-9]*) ;; #skip if non-numeric
+        *)
+            # Build the comma-separated list 0,1,...,NUM_GPUS-1.
+            i=0
+            while [ "$i" -lt "$NUM_GPUS" ]; do
+                GPUS="${GPUS}${i},"
+                i=$((i + 1))
+            done
+            GPUS=${GPUS%,}
+            ;;
+    esac
+    sudo nvidia-smi -c 3
+    export CUDA_VISIBLE_DEVICES=$GPUS
+    export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
+    export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
+    # Pass both the pipe and the log directory to the control daemon.
+    sudo -u "$SLURM_JOB_USER" PATH="${PATH}" CUDA_VISIBLE_DEVICES="$GPUS" CUDA_MPS_PIPE_DIRECTORY="$CUDA_MPS_PIPE_DIRECTORY" CUDA_MPS_LOG_DIRECTORY="$CUDA_MPS_LOG_DIRECTORY" -s nvidia-cuda-mps-control -d
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps-per-gpu b/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps-per-gpu
new file mode 100644
index 000000000..1cc08b56f
--- /dev/null
+++ b/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps-per-gpu
@@ -0,0 +1,62 @@
+#
+# Check if user gave --comment=mps-per-gpu when submitting their job.
+#
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi mps-per-gpu; then
+    #
+    # Count how many GPUs are on the system
+    #
+    NUM_GPUS=$(/usr/sbin/lspci -n -d 10de: | wc -l)
+    case $NUM_GPUS in
+        '' | *[!0-9]*) ;; #skip if non-numeric
+        *)
+            i=0
+            #
+            # In this loop we iterate through the number of GPUs starting
+            # an MPS daemon and MPS server for each GPU.
+            #
+            while [ "$i" -lt "$NUM_GPUS" ]; do
+                GPUINDEX=$i
+                export CUDA_VISIBLE_DEVICES=$GPUINDEX
+                export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps_$GPUINDEX
+                export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log_$GPUINDEX
+                #
+                # Set GPU compute mode to EXCLUSIVE_PROCESS
+                #
+                nvidia-smi -i "$GPUINDEX" -c 3
+                sleep 1
+                #
+                # Start MPS daemon
+                #
+                sudo -u "$SLURM_JOB_USER" PATH="${PATH}" CUDA_VISIBLE_DEVICES="$GPUINDEX" CUDA_MPS_LOG_DIRECTORY="$CUDA_MPS_LOG_DIRECTORY" CUDA_MPS_PIPE_DIRECTORY="$CUDA_MPS_PIPE_DIRECTORY" -s nvidia-cuda-mps-control -d
+                sleep 1
+                i=$((i + 1))
+            done
+            i=0
+            while [ "$i" -lt "$NUM_GPUS" ]; do
+                GPUINDEX=$i
+                export CUDA_VISIBLE_DEVICES=$GPUINDEX
+                export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps_$GPUINDEX
+                export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log_$GPUINDEX
+                #
+                # Start MPS server
+                #
+                echo "start_server -uid $SLURM_JOB_UID" | CUDA_MPS_LOG_DIRECTORY="$CUDA_MPS_LOG_DIRECTORY" CUDA_MPS_PIPE_DIRECTORY="$CUDA_MPS_PIPE_DIRECTORY" nvidia-cuda-mps-control
+                sleep 1
+                #
+                # Determine MPS server PID and GPU CPU affinity. Bind MPS server to that CPU.
+                #
+                MPS_SERVER_PID=$(nvidia-smi -q -i "$GPUINDEX" -d PIDS | grep "Process ID" | awk '{print $4}')
+                GPUCPUAFFINITY=$(nvidia-smi topo -m | grep -m2 "GPU${GPUINDEX}" | tail -n1 | \
+                    awk '{print $NF}')
+                #
+                # Bind every reported PID; nothing is bound when none was found.
+                #
+                for pid in $MPS_SERVER_PID; do
+                    taskset -p -c "$GPUCPUAFFINITY" "$pid"
+                done
+
+                i=$((i + 1))
+            done
+            ;;
+    esac
+fi