re-add examples

NVIDIA · Jun 1, 2021 · 22c570a · 22c570a
1 parent 60d1bb0
commit 22c570a
Show file tree

Hide file tree

Showing 15 changed files with 246 additions and 1 deletion.
diff --git a/docs/slurm-cluster/README.md b/docs/slurm-cluster/README.md
@@ -73,7 +73,7 @@ Read through the [slurm usage guide](slurm-usage.md) and [Open OnDemand guide](o
 ## Prolog and Epilog
 
 The default Slurm deployment includes a collection of prolog and epilog scripts that should be modified to suit a particular system.
-For more information, see the [prolog/epilog documentation](slurm-prolog-epilog.md).
+For more information, see the [prolog/epilog documentation](slurm-prolog-epilog/README.md).
 
 ## Node Health Check
 

diff --git a/docs/slurm-cluster/slurm-prolog-epilog.md b/docs/slurm-cluster/slurm-prolog-epilog/README.md
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/epilog-dcgmstats b/docs/slurm-cluster/slurm-prolog-epilog/epilog-dcgmstats
@@ -0,0 +1,11 @@
+# Stop DCGM GPU stats collection if requested
+# The job opts in with "dcgmstats" (any case) in its Slurm comment; the report
+# is written into the job's working directory.
+JOBINFO=$(scontrol show job "$SLURM_JOBID")
+if printf '%s\n' "$JOBINFO" | grep Comment | grep -qi dcgmstats; then
+        # -f 2- keeps everything after the first '=' so a WorkDir that itself contains '=' is not truncated.
+        OUTPUTDIR=$(printf '%s\n' "$JOBINFO" | grep WorkDir | cut -d = -f 2-)
+        sudo -u "$SLURM_JOB_USER" dcgmi stats -x "$SLURM_JOBID"
+        sudo -u "$SLURM_JOB_USER" dcgmi stats -v -j "$SLURM_JOBID" | sudo -u "$SLURM_JOB_USER" tee "$OUTPUTDIR/dcgm-gpu-stats-$HOSTNAME-$SLURM_JOBID.out"
+        sudo -u "$SLURM_JOB_USER" nv-hostengine -t
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/epilog-ecc b/docs/slurm-cluster/slurm-prolog-epilog/epilog-ecc
@@ -0,0 +1,23 @@
+# Make sure ECC is on.
+# If nvidia-smi reports ECC as disabled, re-enable it and then reset every GPU.
+# collectd is stopped before the resets and started again afterwards.
+# NOTE(review): messages are logged with the PROLOG tag although this is an
+# epilog fragment; the tag is left unchanged in case log filters rely on it.
+if nvidia-smi -a | grep -A1 Ecc | grep -qi disabled; then
+    logger -t PROLOG "Enabling ECC"
+    nvidia-smi -e 1
+    GPUCOUNT=$(nvidia-smi -L | wc -l)
+    GPUMAXINDEX=$((GPUCOUNT - 1))
+    systemctl stop collectd
+    logger -t PROLOG "Triggering GPU reset"
+    for i in $(seq 0 "$GPUMAXINDEX"); do
+        # Keep the reset output so it can be logged when the reset fails.
+        if ! e=$(nvidia-smi -r -i "$i" 2>&1); then
+            logger -t PROLOG "WARNING! GPU $i reset failed"
+            logger -t PROLOG "GPU $i reset error: $e"
+        fi
+    done
+    logger -t PROLOG "GPU reset done"
+    # Bring back the collector that was stopped above.
+    systemctl start collectd
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/epilog-mps b/docs/slurm-cluster/slurm-prolog-epilog/epilog-mps
@@ -0,0 +1,11 @@
+# If any nvidia-cuda-mps process shows up in the process list, ask the
+# MPS control daemon to quit.
+if ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null; then
+    echo quit | nvidia-cuda-mps-control
+fi
+
+# Check the process list again; anything still matching is treated as a
+# leftover (zombie) and the MPS server is killed by name.
+if ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null; then
+    killall nvidia-cuda-mps-server
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingoff b/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingoff
@@ -0,0 +1,20 @@
+# Disable hyperthreading if requested
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi nohyperthreading; then
+	# Each distinct thread_siblings_list names the hardware threads of one core,
+	# either as a comma list ("0,64") or as a range ("0-1"). The first thread stays
+	# online; every other one is taken offline. sort reads all lists before the loop starts.
+	find /sys/devices/system/cpu -name thread_siblings_list -exec cat {} + | sort -u |
+	while IFS= read -r sibs; do
+		keep=1
+		for part in $(echo "$sibs" | tr ',' ' '); do
+			for x in $(seq "${part%-*}" "${part#*-}"); do
+				if [ "$keep" -eq 1 ]; then
+					keep=0
+					continue
+				fi
+				echo "Disabling CPU $x"
+				echo 0 > "/sys/devices/system/cpu/cpu$x/online"
+			done
+		done
+	done
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingon b/docs/slurm-cluster/slurm-prolog-epilog/hyperthreadingon
@@ -0,0 +1,11 @@
+# Enable hyperthreading if requested
+# The job opts in with "hyperthreading" in its Slurm comment. "nohyperthreading"
+# is excluded case-insensitively, the same way the disable fragment matches it.
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -i hyperthreading | grep -qvi nohyperthreading; then
+	for i in /sys/devices/system/cpu/*/online ; do
+		# Skip the literal pattern when the glob matches nothing.
+		[ -e "$i" ] || continue
+		echo 1 > "$i"
+		echo Enabling CPU $(echo "$i" | grep -E -o 'cpu[0-9]+' | tr -d 'cpu')
+	done
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-checkmounts b/docs/slurm-cluster/slurm-prolog-epilog/prolog-checkmounts
@@ -0,0 +1,14 @@
+#
+# Check that mounts exist
+#
+# Each path in MOUNTS must appear as a mount point in /proc/mounts (exact match,
+# so "/raid" is not satisfied by e.g. "/raid2").
+# NOTE(review): the drain command is only printed, not executed; confirm
+# whether that is intentional.
+MOUNTS="/raid /gpfs/fs1"
+for i in $MOUNTS; do
+	if ! awk -v m="$i" '$2 == m { found = 1 } END { exit !found }' /proc/mounts; then
+		echo "$HOSTNAME is missing $i"
+		echo "scontrol update nodename=$HOSTNAME state=drain reason=\"Mount missing: $i\""
+	fi
+done
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmhealth b/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmhealth
@@ -0,0 +1,22 @@
+#
+# Check that all GPUs are healthy via dcgm
+#
+# NUMGPUS is not assigned in this fragment; the lspci check fragment assigns it,
+# so that one is expected to run first. An unset value is treated as 0.
+if [ "${NUMGPUS:-0}" -gt 0 ]; then
+        echo "Execute dcgm health check"
+        GPULIST=$(nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/')
+        rm /tmp/dcgm.out 2> /dev/null
+        nv-hostengine
+        dcgmi group -c gpuinfo
+        dcgmi group -g 1 -a "$GPULIST"
+        dcgmi diag -g 1 -r 1 1> /tmp/dcgm.out
+        dcgmi group -d 1
+        nv-hostengine -t
+        # grep exits 0 when it finds a match, so status 0 means the report
+        # mentions a failure and the node has to be drained.
+        if grep -qi fail /tmp/dcgm.out; then
+                scontrol update nodename="$HOSTNAME" state=drain reason="Failed DCGM, see /tmp/dcgm.out"
+                exit 0
+        fi
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmstats b/docs/slurm-cluster/slurm-prolog-epilog/prolog-dcgmstats
@@ -0,0 +1,15 @@
+# Start DCGM GPU stats collection if requested
+# The job opts in with "dcgmstats" (any case) in its Slurm comment.
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi dcgmstats; then
+        GPULIST=$(nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/')
+        # "~$SLURM_JOB_USER" is never tilde-expanded (tilde expansion happens
+        # before parameter expansion), so look the home directory up explicitly.
+        JOB_USER_HOME=$(getent passwd "$SLURM_JOB_USER" | cut -d : -f 6)
+        # NOTE(review): the pid-file path is passed as a bare argument, as before;
+        # confirm whether nv-hostengine expects an option flag in front of it.
+        sudo -u "$SLURM_JOB_USER" nv-hostengine "$JOB_USER_HOME/nvhost.pid"
+        sudo -u "$SLURM_JOB_USER" dcgmi group -c gpuinfo
+        sudo -u "$SLURM_JOB_USER" dcgmi group -g 1 -a "$GPULIST"
+        sudo -u "$SLURM_JOB_USER" dcgmi stats -g 1 --enable
+        sudo -u "$SLURM_JOB_USER" dcgmi stats -g 1 -s "$SLURM_JOBID"
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-ecc b/docs/slurm-cluster/slurm-prolog-epilog/prolog-ecc
@@ -0,0 +1,22 @@
+# Turn ECC off when the job's Slurm comment contains "ecc" (any case)
+if scontrol show job $SLURM_JOBID | grep Comment | grep -i ecc > /dev/null; then
+    logger -t PROLOG "Disabling ECC"
+    nvidia-smi -e 0
+    GPUCOUNT=$(nvidia-smi -L | wc -l)
+    GPUMAXINDEX=$((GPUCOUNT - 1))
+    systemctl stop collectd
+    logger -t PROLOG "Triggering GPU reset"
+    for i in $(seq 0 $GPUMAXINDEX); do
+        logger -t PROLOG "Resetting GPU $i"
+        # Keep the reset output so it can be logged when the reset fails.
+        if ! e=$(nvidia-smi -r -i $i 2>&1); then
+            logger -t PROLOG "WARNING! GPU $i reset failed"
+            logger -t PROLOG "GPU $i reset error: $e"
+            # After a failed reset, ECC is switched back on.
+            nvidia-smi -e 1
+        fi
+        sleep 1
+    done
+    logger -t PROLOG "GPU reset done"
+    systemctl start collectd
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-gpureset b/docs/slurm-cluster/slurm-prolog-epilog/prolog-gpureset
@@ -0,0 +1,27 @@
+#
+# Uncomment the following for a full GPU reset
+# Can be used to reset the GPU HW state in situations
+# that would otherwise require a machine reboot.
+# Typically useful if a double bit ECC error has
+# occurred.
+# Reset operations are not guaranteed to work in
+# all cases and should be used with caution.
+#
+#GPUCOUNT=`nvidia-smi -L | wc -l`
+#GPUMAXINDEX=`expr $GPUCOUNT - 1`
+#for i in `seq 0 $GPUMAXINDEX`; do
+#	e=`nvidia-smi -r -i $i 2>&1`
+#        if [ $? -ne 0 ]; then
+#            logger -t PROLOG "WARNING! GPU $i reset failed"
+#            logger -t PROLOG "GPU $i reset error: $e"
+#        fi
+#        sleep 1
+#done
+
+# Reset application clocks (error output of both commands is discarded)
+nvidia-smi -rac 2>/dev/null
+nvidia-smi -acp 0 2>/dev/null
+
+# Make sure accounting is activated and clear current logs (normal output is discarded)
+nvidia-smi -am 1 > /dev/null
+nvidia-smi -caa > /dev/null
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-lspci b/docs/slurm-cluster/slurm-prolog-epilog/prolog-lspci
@@ -0,0 +1,19 @@
+#
+# Check that all GPUs are present
+#
+# NUMGPUS is taken from the second ':'-separated field of the node's "Gres=gpu"
+# line. An empty or non-numeric value is treated as 0 so the numeric tests
+# below never receive a malformed operand; the check is then skipped.
+NUMGPUS=$(scontrol -a show nodes "$HOSTNAME" | grep "Gres=gpu" | cut -d : -f 2)
+case $NUMGPUS in
+        '' | *[!0-9]*) NUMGPUS=0 ;;
+esac
+if [ "$NUMGPUS" -gt 0 ]; then
+        PCIGPUSFOUND=$(lspci | grep -c "3D controller: NVIDIA Corporation")
+        if [ "$PCIGPUSFOUND" -ne "$NUMGPUS" ]; then
+                echo "Slurm expects $NUMGPUS GPUs but lspci found: $PCIGPUSFOUND"
+                scontrol update nodename="$HOSTNAME" state=drain reason="Missing GPUs"
+                exit 0
+        fi
+        echo "Slurm expects $NUMGPUS GPUs lspci found: $PCIGPUSFOUND"
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps b/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps
@@ -0,0 +1,27 @@
+# Activate NVIDIA MPS if requested
+# Server will start automatically, no need to start explicitly
+# Requested with "mps" in the job's Slurm comment; comments containing
+# "mps-per-gpu" are excluded here.
+if scontrol show job "$SLURM_JOBID" | grep Comment | grep -i mps | grep -qv mps-per-gpu; then
+    NUM_GPUS=$(/usr/sbin/lspci -n -d 10de: | wc -l)
+    # Build the comma-separated index list "0,1,...,N-1". GPUS starts empty so a
+    # non-numeric count cannot leave a stale value behind.
+    GPUS=""
+    case $NUM_GPUS in
+        '' | *[!0-9]*) ;; #skip if non-numeric
+        *)
+            i=0
+            while [ "$i" -lt "$NUM_GPUS" ]; do
+                GPUS="${GPUS}${i},"
+                i=$((i + 1))
+            done
+            GPUS=${GPUS%,}
+            ;;
+    esac
+    sudo nvidia-smi -c 3
+    export CUDA_VISIBLE_DEVICES=$GPUS
+    export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
+    export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
+    # Hand both MPS directories (log and pipe) through sudo to the control daemon.
+    sudo -u "$SLURM_JOB_USER" PATH="${PATH}" CUDA_VISIBLE_DEVICES="$GPUS" CUDA_MPS_LOG_DIRECTORY="$CUDA_MPS_LOG_DIRECTORY" CUDA_MPS_PIPE_DIRECTORY="$CUDA_MPS_PIPE_DIRECTORY" -s nvidia-cuda-mps-control -d
+fi
diff --git a/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps-per-gpu b/docs/slurm-cluster/slurm-prolog-epilog/prolog-mps-per-gpu
@@ -0,0 +1,57 @@
+#
+# Per-GPU MPS: runs when the job was submitted with --comment=mps-per-gpu.
+#
+if scontrol show job $SLURM_JOBID | grep Comment | grep -i mps-per-gpu > /dev/null; then
+    #
+    # Count the PCI devices lspci lists for vendor ID 10de (used as the GPU count)
+    #
+    NUM_GPUS=$(/usr/sbin/lspci -n -d 10de: | wc -l)
+    case $NUM_GPUS in
+        '' | *[!0-9]*) ;; #skip if non-numeric
+        *)
+            #
+            # First pass: one MPS control daemon per GPU index, each with its
+            # own pipe and log directory under /tmp.
+            #
+            i=0
+            while [ ${i} -lt ${NUM_GPUS} ]; do
+                GPUINDEX=$i
+                export CUDA_VISIBLE_DEVICES=$GPUINDEX
+                export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps_$GPUINDEX
+                export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log_$GPUINDEX
+                #
+                # Put the GPU into compute mode 3 (EXCLUSIVE_PROCESS)
+                #
+                nvidia-smi -i $GPUINDEX -c 3
+                sleep 1
+                #
+                # Start the MPS control daemon as the job user
+                #
+                sudo -u $SLURM_JOB_USER PATH=${PATH} CUDA_VISIBLE_DEVICES=$GPUINDEX CUDA_MPS_LOG_DIRECTORY=$CUDA_MPS_LOG_DIRECTORY CUDA_MPS_PIPE_DIRECTORY=$CUDA_MPS_PIPE_DIRECTORY -s nvidia-cuda-mps-control -d
+                sleep 1
+                i=$((i + 1))
+            done
+            #
+            # Second pass: start an MPS server through each daemon and pin it
+            #
+            i=0
+            while [ ${i} -lt ${NUM_GPUS} ]; do
+                GPUINDEX=$i
+                export CUDA_VISIBLE_DEVICES=$GPUINDEX
+                export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps_$GPUINDEX
+                export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log_$GPUINDEX
+                #
+                # Ask this GPU's control daemon to start a server for the job's UID
+                #
+                echo "start_server -uid $SLURM_JOB_UID"  | CUDA_MPS_LOG_DIRECTORY=$CUDA_MPS_LOG_DIRECTORY CUDA_MPS_PIPE_DIRECTORY=$CUDA_MPS_PIPE_DIRECTORY nvidia-cuda-mps-control
+                sleep 1
+                #
+                # Look up the server PID and the GPU's CPU affinity, then bind the server to those CPUs.
+                #
+                MPS_SERVER_PID=$(nvidia-smi -q -i $GPUINDEX -d PIDS | grep "Process ID" | awk '{print $4}')
+                GPUCPUAFFINITY=$(nvidia-smi topo -m | grep -m2 GPU${GPUINDEX} | tail -n1 | awk '{print $NF}')
+                taskset -p -c ${GPUCPUAFFINITY} $MPS_SERVER_PID
+                i=$((i + 1))
+            done
+    esac
+fi