-
Notifications
You must be signed in to change notification settings - Fork 332
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
246 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Stop DCGM GPU stats collection if requested.
#
# Runs only when the Comment line of `scontrol show job` mentions "dcgmstats"
# (case-insensitive). Stops stats recording for the job, writes the verbose
# per-job report (as the job user) into the job's WorkDir, then asks
# nv-hostengine to terminate.
if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi dcgmstats; then
  # Keep everything after the first '=' so a WorkDir containing '=' survives.
  OUTPUTDIR=$(scontrol show job "$SLURM_JOBID" | grep WorkDir | cut -d = -f 2-)
  sudo -u "$SLURM_JOB_USER" dcgmi stats -x "$SLURM_JOBID"
  # The target path is quoted so a WorkDir with spaces stays a single argument.
  sudo -u "$SLURM_JOB_USER" dcgmi stats -v -j "$SLURM_JOBID" \
    | sudo -u "$SLURM_JOB_USER" tee "$OUTPUTDIR/dcgm-gpu-stats-$HOSTNAME-$SLURM_JOBID.out"
  sudo -u "$SLURM_JOB_USER" nv-hostengine -t
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Make sure ECC is on.
#
# When `nvidia-smi -a` reports a disabled ECC mode, enable ECC on all GPUs and
# reset every GPU in turn. collectd is stopped while the resets run and is
# started again afterwards so monitoring is not left off after the epilog.
if nvidia-smi -a | grep -A1 Ecc | grep -qi disabled; then
  logger -t PROLOG "Enabling ECC"
  nvidia-smi -e 1
  GPUCOUNT=$(nvidia-smi -L | wc -l)
  GPUMAXINDEX=$((GPUCOUNT - 1))
  systemctl stop collectd
  logger -t PROLOG "Triggering GPU reset"
  for ((i = 0; i <= GPUMAXINDEX; i++)); do
    # Capture the reset output so a failure can be logged with its reason.
    if ! e=$(nvidia-smi -r -i "$i" 2>&1); then
      logger -t PROLOG "WARNING! GPU $i reset failed"
      logger -t PROLOG "GPU $i reset error: $e"
    fi
  done
  logger -t PROLOG "GPU reset done"
  # Previously collectd stayed stopped once this branch ran.
  systemctl start collectd
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Succeeds when `ps aux` lists at least one line mentioning nvidia-cuda-mps
# (the grep process itself is filtered out).
_mps_process_listed() {
  ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null
}

# Ask a running MPS control daemon to quit.
if _mps_process_listed; then
  echo quit | nvidia-cuda-mps-control
fi

# If an MPS process is still listed afterwards, kill the MPS server.
if _mps_process_listed; then
  killall nvidia-cuda-mps-server
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Disable hyperthreading if requested.
#
# Runs when the job's Comment line mentions "nohyperthreading"
# (case-insensitive). For every distinct thread_siblings_list found under
# /sys/devices/system/cpu, all CPUs except the first one listed are taken
# offline.

# Print, one per line, every CPU id of a thread_siblings_list value ($1) except
# the first. Accepts comma-separated ids and dash ranges, e.g. "0,20", "0-1",
# "0,32,64,96" or "0-3".
_secondary_siblings() {
  local list=$1 part lo hi cpu first=1
  local -a parts
  IFS=',' read -r -a parts <<< "$list"
  for part in "${parts[@]}"; do
    # A part without '-' is left unchanged by both expansions (lo == hi).
    lo=${part%-*}
    hi=${part#*-}
    for ((cpu = lo; cpu <= hi; cpu++)); do
      if ((first)); then
        first=0
        continue
      fi
      printf '%s\n' "$cpu"
    done
  done
}

if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi nohyperthreading; then
  # Collect the distinct sibling lists up front, in memory rather than in a
  # predictable file under /tmp.
  sibling_lists=$(find /sys/devices/system/cpu -name thread_siblings_list \
    -exec cat {} + | sort -u)
  while IFS= read -r sibs; do
    for x in $(_secondary_siblings "$sibs"); do
      echo "Disabling CPU $x"
      echo 0 > "/sys/devices/system/cpu/cpu$x/online"
    done
  done <<< "$sibling_lists"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Enable hyperthreading if requested.
#
# Runs when the job's Comment line mentions "hyperthreading" but not
# "nohyperthreading". Writes 1 to every /sys/devices/system/cpu/*/online file.

# Read Comment line(s) on stdin; succeed when hyperthreading is requested and
# nohyperthreading is not. Both matches are case-insensitive so that a comment
# such as "NoHyperthreading" does not count as a request to enable.
_comment_requests_hyperthreading() {
  grep -i hyperthreading | grep -qvi nohyperthreading
}

if scontrol show job "$SLURM_JOBID" | grep Comment | _comment_requests_hyperthreading; then
  for online_file in /sys/devices/system/cpu/*/online; do
    # An unmatched glob stays literal; skip it instead of writing to it.
    [ -e "$online_file" ] || continue
    echo 1 > "$online_file"
    # .../cpu12/online -> cpu12 -> 12
    cpu_name=${online_file%/online}
    cpu_name=${cpu_name##*/}
    echo "Enabling CPU ${cpu_name#cpu}"
  done
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#
# Check that mounts exist
#
# For every path in MOUNTS, report when it is not a mount point and print the
# scontrol command that would drain this node.

# Succeed when path $1 is listed as a mount point (second field) in the mount
# table file $2 (default: /proc/mounts). The comparison is exact, so "/raid"
# is not satisfied by "/raid2" or by a device name that merely contains it.
_is_mount_point() {
  awk -v target="$1" '$2 == target { found = 1 } END { exit !found }' \
    "${2:-/proc/mounts}"
}

MOUNTS="/raid /gpfs/fs1"
for i in $MOUNTS; do
  if ! _is_mount_point "$i"; then
    echo "$HOSTNAME is missing $i"
    # The drain command is only printed, not executed. The inner quotes are
    # escaped so the printed command keeps its reason as one quoted argument.
    echo "scontrol update nodename=$HOSTNAME state=drain reason=\"Mount missing: $i\""
  fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#
# Check that all GPUs are healthy via dcgm
#
# When NUMGPUS (set outside this fragment) is greater than zero, start
# nv-hostengine, create a "gpuinfo" group holding the GPUs that nvidia-smi
# lists as Tesla, run `dcgmi diag -r 1` into /tmp/dcgm.out and stop the host
# engine again. If the diag output mentions "fail" (case-insensitive) the node
# is drained and the calling script exits with status 0.
#
# NOTE(review): group id 1 is assumed to be the group just created - confirm
# against the DCGM version in use.
_dcgm_health_check() {
  # Default to 0 so an unset NUMGPUS skips the check instead of making `[` fail.
  if [ "${NUMGPUS:-0}" -gt 0 ]; then
    echo "Execute dcgm health check"
    GPULIST=$(nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/')
    rm /tmp/dcgm.out 2> /dev/null
    nv-hostengine
    dcgmi group -c gpuinfo
    dcgmi group -g 1 -a "$GPULIST"
    dcgmi diag -g 1 -r 1 1> /tmp/dcgm.out
    dcgmi group -d 1
    nv-hostengine -t
    # grep succeeds when a failure IS reported; the earlier `$? -gt 0` test
    # drained the node in exactly the opposite case.
    if grep -qi fail /tmp/dcgm.out; then
      scontrol update nodename="$HOSTNAME" state=drain reason="Failed DCGM, see /tmp/dcgm.out"
      exit 0
    fi
  fi
}
_dcgm_health_check
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Start DCGM GPU stats collection if requested.
#
# Runs when the job's Comment line mentions "dcgmstats" (case-insensitive).
# As the job user: start nv-hostengine, create a "gpuinfo" group with the GPUs
# nvidia-smi lists as Tesla, enable stats watches and start recording stats
# under the job id.
_prolog_dcgmstats_start() {
  if scontrol show job "$SLURM_JOBID" | grep Comment | grep -qi dcgmstats; then
    GPULIST=$(nvidia-smi | grep Tesla | awk -vORS=, '{print $2}' | sed 's/,$/\n/')
    # "~$SLURM_JOB_USER" is never tilde-expanded (tilde expansion runs before
    # parameter expansion), so the home directory is looked up explicitly.
    JOB_USER_HOME=$(getent passwd "$SLURM_JOB_USER" | cut -d : -f 6)
    # NOTE(review): the pid-file path is passed as a bare argument, as before;
    # confirm nv-hostengine accepts it in this form.
    if [ -n "$JOB_USER_HOME" ]; then
      sudo -u "$SLURM_JOB_USER" nv-hostengine "$JOB_USER_HOME/nvhost.pid"
    else
      # No home directory found: start the engine without a pid-file path
      # rather than pointing it at "/nvhost.pid".
      sudo -u "$SLURM_JOB_USER" nv-hostengine
    fi
    sudo -u "$SLURM_JOB_USER" dcgmi group -c gpuinfo
    sudo -u "$SLURM_JOB_USER" dcgmi group -g 1 -a "$GPULIST"
    sudo -u "$SLURM_JOB_USER" dcgmi stats -g 1 --enable
    sudo -u "$SLURM_JOB_USER" dcgmi stats -g 1 -s "$SLURM_JOBID"
  fi
}
_prolog_dcgmstats_start
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Turn ECC off when the job's Comment line mentions "ecc" (case-insensitive).
# Every GPU is reset afterwards; collectd is stopped for the duration of the
# resets and started again at the end.
if scontrol show job $SLURM_JOBID | grep Comment | grep -i ecc > /dev/null; then
  logger -t PROLOG "Disabling ECC"
  nvidia-smi -e 0
  GPUCOUNT=$(nvidia-smi -L | wc -l)
  GPUMAXINDEX=$((GPUCOUNT - 1))
  systemctl stop collectd
  logger -t PROLOG "Triggering GPU reset"
  for i in $(seq 0 $GPUMAXINDEX); do
    logger -t PROLOG "Resetting GPU $i"
    # Keep the reset output around for the failure log line.
    if ! e=$(nvidia-smi -r -i $i 2>&1); then
      logger -t PROLOG "WARNING! GPU $i reset failed"
      logger -t PROLOG "GPU $i reset error: $e"
      # A failed reset switches ECC back on.
      nvidia-smi -e 1
    fi
    sleep 1
  done
  logger -t PROLOG "GPU reset done"
  systemctl start collectd
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#
# Optional full GPU reset (disabled; uncomment the lines below to use it).
# A reset can restore the GPU HW state in situations that would otherwise
# require a machine reboot, typically after a double bit ECC error.
# Reset operations are not guaranteed to work in all cases and should be
# used with caution.
#
#GPUCOUNT=`nvidia-smi -L | wc -l`
#GPUMAXINDEX=`expr $GPUCOUNT - 1`
#for i in `seq 0 $GPUMAXINDEX`; do
#  e=`nvidia-smi -r -i $i 2>&1`
#  if [ $? -ne 0 ]; then
#    logger -t PROLOG "WARNING! GPU $i reset failed"
#    logger -t PROLOG "GPU $i reset error: $e"
#  fi
#  sleep 1
#done

# Reset application clocks; error output of both commands is discarded.
{
  nvidia-smi -rac
  nvidia-smi -acp 0
} 2>/dev/null

# Make sure accounting is activated and clear current logs; regular output of
# both commands is discarded.
{
  nvidia-smi -am 1
  nvidia-smi -caa
} > /dev/null
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#
# Check that all GPUs are present
#
# Compares the GPU count Slurm has configured for this node (from the
# "Gres=gpu..." line of `scontrol show nodes`) with the number of NVIDIA
# "3D controller" devices lspci reports. On a mismatch the node is drained and
# the calling script exits with status 0. NUMGPUS is left set for later use.

# Print the GPU count from a Gres line ($1), or 0 when there is none.
# Handles "Gres=gpu:8", typed entries such as "Gres=gpu:tesla:8", socket
# annotations such as "gpu:8(S:0-1)" and further comma-separated resources.
_gres_gpu_count() {
  local gres_line=$1
  local re='Gres=gpu(:[^:,( ]+)?:([0-9]+)'
  if [[ $gres_line =~ $re ]]; then
    printf '%s\n' "${BASH_REMATCH[2]}"
  else
    printf '0\n'
  fi
}

NUMGPUS=$(_gres_gpu_count "$(scontrol -a show nodes "$HOSTNAME" | grep -m1 "Gres=gpu")")
if [ "$NUMGPUS" -gt 0 ]; then
  PCIGPUSFOUND=$(lspci | grep -c "3D controller: NVIDIA Corporation")
  if [ "$PCIGPUSFOUND" -ne "$NUMGPUS" ]; then
    echo "Slurm expects $NUMGPUS GPUs but lspci found: $PCIGPUSFOUND"
    scontrol update nodename="$HOSTNAME" state=drain reason="Missing GPUs"
    exit 0
  fi
  echo "Slurm expects $NUMGPUS GPUs lspci found: $PCIGPUSFOUND"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Activate NVIDIA MPS if requested
# Server will start automatically, no need to start explicitly
#
# Runs when the job's Comment line mentions "mps" but not "mps-per-gpu" (both
# matched case-insensitively). Builds a comma-separated index list 0..N-1 from
# the number of PCI devices with vendor id 10de, sets compute mode 3 on all
# GPUs, exports the MPS environment and starts the MPS control daemon as the
# job user.
_prolog_mps_start() {
  if scontrol show job "$SLURM_JOBID" | grep Comment | grep -i mps \
      | grep -qvi mps-per-gpu; then
    NUM_GPUS=$(/usr/sbin/lspci -n -d 10de: | wc -l)
    case $NUM_GPUS in
      '' | *[!0-9]*)
        # skip if non-numeric
        ;;
      *)
        GPUS=""
        i=0
        while [ "$i" -lt "$NUM_GPUS" ]; do
          GPUS+="${i},"
          i=$((i + 1))
        done
        # Drop the trailing comma.
        GPUS=${GPUS%,}
        ;;
    esac
    sudo nvidia-smi -c 3
    export CUDA_VISIBLE_DEVICES=$GPUS
    export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
    export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
    # Pass BOTH MPS directories to the daemon; the log directory used to be
    # given twice and the pipe directory not at all.
    sudo -u "$SLURM_JOB_USER" PATH="${PATH}" CUDA_VISIBLE_DEVICES="$GPUS" \
      CUDA_MPS_PIPE_DIRECTORY="$CUDA_MPS_PIPE_DIRECTORY" \
      CUDA_MPS_LOG_DIRECTORY="$CUDA_MPS_LOG_DIRECTORY" \
      -s nvidia-cuda-mps-control -d
  fi
}
_prolog_mps_start
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#
# Per-GPU MPS: when the job's Comment line mentions "mps-per-gpu"
# (case-insensitive), start one MPS control daemon and one MPS server for
# every GPU index.
#

# Set GPUINDEX to $1 and export the CUDA/MPS environment for that GPU index.
_mps_per_gpu_env() {
  GPUINDEX=$1
  export CUDA_VISIBLE_DEVICES=$GPUINDEX
  export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps_$GPUINDEX
  export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log_$GPUINDEX
}

if scontrol show job $SLURM_JOBID | grep Comment | grep -i mps-per-gpu > /dev/null; then
  # The number of PCI devices with vendor id 10de is used as the GPU count.
  NUM_GPUS=$(/usr/sbin/lspci -n -d 10de: | wc -l)
  case $NUM_GPUS in
    '' | *[!0-9]*)
      # skip if non-numeric
      ;;
    *)
      # First pass: put each GPU into compute mode 3 and start its MPS
      # control daemon as the job user.
      for ((i = 0; i < NUM_GPUS; i++)); do
        _mps_per_gpu_env $i
        nvidia-smi -i $GPUINDEX -c 3
        sleep 1
        sudo -u $SLURM_JOB_USER PATH=${PATH} CUDA_VISIBLE_DEVICES=$GPUINDEX CUDA_MPS_LOG_DIRECTORY=$CUDA_MPS_LOG_DIRECTORY CUDA_MPS_PIPE_DIRECTORY=$CUDA_MPS_PIPE_DIRECTORY -s nvidia-cuda-mps-control -d
        sleep 1
      done

      # Second pass: ask each control daemon to start a server for the job's
      # uid, then pin the process nvidia-smi reports on that GPU to the CPUs
      # named in the last column of the GPU's `nvidia-smi topo -m` row.
      for ((i = 0; i < NUM_GPUS; i++)); do
        _mps_per_gpu_env $i
        echo "start_server -uid $SLURM_JOB_UID" | CUDA_MPS_LOG_DIRECTORY=$CUDA_MPS_LOG_DIRECTORY CUDA_MPS_PIPE_DIRECTORY=$CUDA_MPS_PIPE_DIRECTORY nvidia-cuda-mps-control
        sleep 1
        MPS_SERVER_PID=$(nvidia-smi -q -i $GPUINDEX -d PIDS | grep "Process ID" | awk '{print $4}')
        GPUCPUAFFINITY=$(nvidia-smi topo -m | grep -m2 GPU${GPUINDEX} | tail -n1 | awk '{print $NF}')
        taskset -p -c ${GPUCPUAFFINITY} $MPS_SERVER_PID
      done
      ;;
  esac
fi