Nexus: Add Perlmutter to Nexus machines #4356

Merged Dec 12, 2022 · 5 commits
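For context, a minimal user-side sketch of how the new machine might be targeted from a Nexus script. This is illustrative only and not part of the PR: the account value is a placeholder, and it assumes constraint and processes_per_node can be passed straight through job(), since these are the Job fields read by the new write_job_header below.

from nexus import settings, job

# Hypothetical user-side setup; 'm1234' is a placeholder allocation name.
settings(
    machine = 'perlmutter',
    account = 'm1234',   # '_g' is appended automatically for GPU jobs (see below)
    )

# CPU job: 2 nodes in the 'regular' queue (the defaults filled in by pre_process_job)
cpu_job = job(nodes=2, hours=6, constraint='cpu')

# GPU job: one MPI task per GPU on a single node
gpu_job = job(nodes=1, hours=6, constraint='gpu', processes_per_node=4)
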
138 changes: 132 additions & 6 deletions nexus/lib/machines.py
@@ -2121,11 +2121,6 @@ def write_job_header(self,job):
#end class NerscMachine


class Edison(NerscMachine):
name = 'edison'
#end class Edison


class Cori(NerscMachine):
name = 'cori'

@@ -2206,6 +2201,137 @@ def write_job_header(self,job):




class Perlmutter(NerscMachine):
name = 'perlmutter'

def pre_process_job(self,job):
# Set default queue and node type
if job.queue is None:
job.queue = 'regular'
#end if
if job.constraint is None:
job.constraint = 'cpu'
#end if
# Account for dual nature of Perlmutter
if 'cpu' in job.constraint:
self.nodes = 3072
self.procs_per_node = 2
self.cores_per_node = 128
self.ram_per_node = 512
elif 'gpu' in job.constraint:
self.nodes = 1536
self.procs_per_node = 1
self.cores_per_node = 64
self.ram_per_node = 256
self.gpus_per_node = 4
else:
self.error('SLURM input "constraint" must contain either "cpu" or "gpu" on Perlmutter\nyou provided: {0}'.format(job.constraint))
#end if
#end def pre_process_job

def write_job_header(self,job):
self.pre_process_job(job) # sync machine view with job

# Check if the user gave reasonable processes_per_node
if 'cpu' in job.constraint:
if job.processes_per_node > self.cores_per_node:
self.error('processes_per_node can not be greater than the physical cores per node (128)\nyou provided: {0}'.format(job.processes_per_node))
#end if
elif 'gpu' in job.constraint:
if job.processes_per_node > self.gpus_per_node:
self.error('processes_per_node can not be greater than GPUs per node (4)\nyou provided: {0}'.format(job.processes_per_node))
#end if
# Also check if the user forgot to include '_g' in the account name for GPU jobs
if '_g' not in job.account:
job.account = job.account + '_g'
#end if
#end if

# Check if the user gave reasonable queue inputs
if job.queue == 'debug':
base_partition = 1
max_partition = 8
max_time = 0.5
elif job.queue == 'regular':
base_partition = 1
max_partition = self.nodes
max_time = 12
elif job.queue == 'preempt':
base_partition = 1
max_partition = 128
max_time = 24
elif job.queue == 'overrun':
base_partition = 1
max_partition = self.nodes
max_time = 12
else:
self.error('The requested queue is not implemented.')
#end if
job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
if job.total_hours > max_time:
self.error('The maximum runtime on {0} queue should not be more than {1} hours\n you requested: {2} hours'.format(job.queue,max_time,job.total_hours))
#end if
if job.nodes<base_partition:
self.error('The number of nodes on {0} queue should not be less than {1}\n you requested: {2}'.format(job.queue,base_partition,job.nodes))
elif job.nodes>max_partition:
self.error('The number of nodes on {0} queue should not be more than {1}\n you requested: {2}'.format(job.queue,max_partition,job.nodes))
#end if

# Use the user cpus_per_task if specified. If not specified, then use available cpus for each process
if job.cpus_per_task is not None:
cpus_per_task = job.cpus_per_task
else:
hyperthreads = 2 # Both CPU and GPU nodes use the same AMD EPYC 7763 (Milan) CPUs
cpus_per_task = int(floor(float(self.cores_per_node)/job.processes_per_node))*hyperthreads
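# e.g. a fully packed CPU job with 128 tasks per node gets floor(128/128)*2 = 2, matching '#SBATCH -c 2' in the unit test below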
#end if

c='#!/bin/bash\n'
if job.account is not None:
c+= '#SBATCH -A '+job.account+'\n'
#end if
c+='#SBATCH -C '+str(job.constraint)+'\n'
c+='#SBATCH -q '+job.queue+'\n'
c+='#SBATCH -t '+job.sbatch_walltime()+'\n'
c+='#SBATCH -N '+str(job.nodes)+'\n'
c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
c+='#SBATCH -c '+str(cpus_per_task)+'\n'
c+='#SBATCH -J '+str(job.name)+'\n'
c+='#SBATCH -o '+job.outfile+'\n'
c+='#SBATCH -e '+job.errfile+'\n'

if 'gpu' in job.constraint:
gpus_per_task = int(floor(float(self.gpus_per_node)/job.processes_per_node))
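# e.g. 4 tasks per node on a GPU node gives floor(4/4) = 1 GPU per task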
c+='#SBATCH --gpus-per-task={0}\n'.format(gpus_per_task)
#end if

if job.user_env:
c+='#SBATCH --export=ALL\n' # equiv to PBS -V
else:
c+='#SBATCH --export=NONE\n'
#end if
c+='''
echo $SLURM_SUBMIT_DIR
cd $SLURM_SUBMIT_DIR
'''
if job.threads>1:
c+='''
export OMP_PROC_BIND=true
export OMP_PLACES=threads
'''
#end if
if 'gpu' in job.constraint:
c+='''
export SLURM_CPU_BIND="cores"
'''
#end if
return c
#end def write_job_header
#end class Perlmutter




class BlueWatersXK(Supercomputer):

name = 'bluewaters_xk'
@@ -3454,7 +3580,6 @@ def specialized_bundle_commands(self,job,launcher,serial):
Kraken( 9408, 2, 6, 16, 100, 'aprun', 'qsub', 'qstat', 'qdel')
Golub( 512, 2, 6, 32, 1000, 'mpirun', 'qsub', 'qstat', 'qdel')
OIC5( 28, 2, 16, 128, 1000, 'mpirun', 'qsub', 'qstat', 'qdel')
Edison( 664, 2, 12, 64, 100, 'srun', 'sbatch', 'squeue', 'scancel')
Cori( 9688, 1, 68, 96, 100, 'srun', 'sbatch', 'squeue', 'scancel')
BlueWatersXK( 3072, 1, 16, 32, 100, 'aprun', 'qsub', 'qstat', 'qdel')
BlueWatersXE(22640, 2, 16, 64, 100, 'aprun', 'qsub', 'qstat', 'qdel')
@@ -3486,6 +3611,7 @@ def specialized_bundle_commands(self,job,launcher,serial):
SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
Archer2( 5860, 2, 64, 512, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Polaris( 560, 1, 32, 512, 8,'mpiexec', 'qsub', 'qstat', 'qdel')
Perlmutter( 3072, 2, 128, 512, 5000, 'srun', 'sbatch', 'squeue', 'scancel')


#machine accessor functions
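Since the new unit test below only pins down the CPU-constraint header, here is a hand-traced sketch of what the write_job_header added above would emit for a hypothetical GPU job (placeholder account 'm1234', 2 nodes, 4 tasks per node, 1 thread, and the same walltime and file names as the CPU fixture). It is derived by reading the code, not a committed fixture:

# Hand-derived GPU-constraint header: floor(64/4)*2 = 32 cpus per task,
# floor(4/4) = 1 gpu per task, and '_g' appended to the account name.
perlmutter_gpu = '''#!/bin/bash
#SBATCH -A m1234_g
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 06:30:00
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH -c 32
#SBATCH -J jobname
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH --gpus-per-task=1
#SBATCH --export=ALL

echo $SLURM_SUBMIT_DIR
cd $SLURM_SUBMIT_DIR

export SLURM_CPU_BIND="cores"
'''
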
47 changes: 24 additions & 23 deletions nexus/tests/unit/test_machines.py
@@ -1099,12 +1099,6 @@ def job_commands_equal(c1,c2):
('eclipse' , 'n2_t2' ) : 'srun test.x',
('eclipse' , 'n2_t2_e' ) : 'srun test.x',
('eclipse' , 'n2_t2_p2' ) : 'srun test.x',
('edison' , 'n1' ) : 'srun test.x',
('edison' , 'n1_p1' ) : 'srun test.x',
('edison' , 'n2' ) : 'srun test.x',
('edison' , 'n2_t2' ) : 'srun test.x',
('edison' , 'n2_t2_e' ) : 'srun test.x',
('edison' , 'n2_t2_p2' ) : 'srun test.x',
('eos' , 'n1' ) : 'aprun -n 16 test.x',
('eos' , 'n1_p1' ) : 'aprun -n 1 test.x',
('eos' , 'n2' ) : 'aprun -n 32 test.x',
@@ -1153,6 +1147,12 @@ def job_commands_equal(c1,c2):
('oic5' , 'n2_t2' ) : 'mpirun -np 32 test.x',
('oic5' , 'n2_t2_e' ) : 'mpirun -np 32 test.x',
('oic5' , 'n2_t2_p2' ) : 'mpirun -np 4 test.x',
('perlmutter' , 'n1' ) : 'srun test.x',
('perlmutter' , 'n1_p1' ) : 'srun test.x',
('perlmutter' , 'n2' ) : 'srun test.x',
('perlmutter' , 'n2_t2' ) : 'srun test.x',
('perlmutter' , 'n2_t2_e' ) : 'srun test.x',
('perlmutter' , 'n2_t2_p2' ) : 'srun test.x',
('polaris' , 'n1' ) : 'mpiexec --cpu-bind depth --depth=1 -n 32 --ppn 32 --env OMP_NUM_THREADS=1 test.x',
('polaris' , 'n1_p1' ) : 'mpiexec --cpu-bind depth --depth=1 -n 1 --ppn 1 --env OMP_NUM_THREADS=1 test.x',
('polaris' , 'n2' ) : 'mpiexec --cpu-bind depth --depth=1 -n 64 --ppn 32 --env OMP_NUM_THREADS=1 test.x',
@@ -1555,23 +1555,6 @@ def test_write_job():
#SBATCH -o test.out
#SBATCH -e test.err

export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
edison = '''#!/bin/bash
#SBATCH -p regular
#SBATCH -J jobname
#SBATCH -t 06:30:00
#SBATCH -N 2
#SBATCH --ntasks-per-node=24
#SBATCH --cpus-per-task=1
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH --export=ALL

echo $SLURM_SUBMIT_DIR
cd $SLURM_SUBMIT_DIR

export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
@@ -1702,6 +1685,24 @@ def test_write_job():
export OMP_NUM_THREADS=1
export ENV_VAR=1
mpirun -np 64 test.x''',
perlmutter = '''#!/bin/bash
#SBATCH -C cpu
#SBATCH -q regular
#SBATCH -t 06:30:00
#SBATCH -N 2
#SBATCH --ntasks-per-node=128
#SBATCH -c 2
#SBATCH -J jobname
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH --export=ALL

echo $SLURM_SUBMIT_DIR
cd $SLURM_SUBMIT_DIR

export OMP_NUM_THREADS=1
export ENV_VAR=1
srun test.x''',
polaris = '''#!/bin/sh
#PBS -l select=2:system=polaris
#PBS -l place=scatter