Nexus: Add Perlmutter to Nexus machines #4356

Merged · 5 commits · Dec 12, 2022

Changes from 1 commit
139 changes: 133 additions & 6 deletions nexus/lib/machines.py
@@ -2113,11 +2113,6 @@ def write_job_header(self,job):
#end class NerscMachine


class Edison(NerscMachine):
    name = 'edison'
#end class Edison


class Cori(NerscMachine):
    name = 'cori'

@@ -2198,6 +2193,138 @@ def write_job_header(self,job):




class Perlmutter(NerscMachine):
    name = 'perlmutter'

    def pre_process_job(self,job):
        # Set default queue and node type first, so the constraint
        # check below never sees an unset (None) constraint
        if job.queue is None:
            job.queue = 'regular'
        #end if
        if job.constraint is None:
            job.constraint = 'cpu'
        #end if

        # Account for dual nature of Perlmutter
        if 'cpu' in job.constraint:
            self.nodes = 3072
            self.procs_per_node = 2
            self.cores_per_node = 128
            self.ram_per_node = 512
        elif 'gpu' in job.constraint:
            self.nodes = 1536
            self.procs_per_node = 1
            self.cores_per_node = 64
            self.ram_per_node = 256
            self.gpus_per_node = 4
        else:
            self.error('SLURM input "constraint" must contain either "cpu" or "gpu" on Perlmutter\nyou provided: {0}'.format(job.constraint))
        #end if
    #end def pre_process_job

    def write_job_header(self,job):
        self.pre_process_job(job) # sync machine view with job

        # Check whether the user gave a reasonable processes_per_node
        if 'cpu' in job.constraint:
            if job.processes_per_node > self.cores_per_node:
                self.error('processes_per_node can not be greater than physical cores per node (128)\nyou provided: {0}'.format(job.processes_per_node))
            #end if
        elif 'gpu' in job.constraint:
            if job.processes_per_node > self.gpus_per_node:
                self.error('processes_per_node can not be greater than GPUs per node (4)\nyou provided: {0}'.format(job.processes_per_node))
            #end if
            # Also check whether the user forgot to include '_g' in the account name for GPU jobs
            if job.account is not None and '_g' not in job.account:
                job.account = job.account + '_g'
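                # e.g. a CPU-time account 'm1234' (hypothetical name) becomes 'm1234_g'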
            #end if
        #end if

        # Check if the user gave reasonable queue inputs
        if job.queue == 'debug':
            base_partition = 1
            max_partition = 8
            max_time = 0.5
        elif job.queue == 'regular':
            base_partition = 1
            max_partition = self.nodes
            max_time = 12
        elif job.queue == 'preempt':
            base_partition = 1
            max_partition = 128
            max_time = 24
        elif job.queue == 'overrun':
            base_partition = 1
            max_partition = self.nodes
            max_time = 12
        else:
            self.error('The requested queue "{0}" is not implemented.'.format(job.queue))
        #end if
        job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
        if job.total_hours > max_time:
            self.error('The maximum runtime on {0} queue should not be more than {1} hours\nyou requested: {2} hours'.format(job.queue,max_time,job.total_hours))
        #end if
        if job.nodes<base_partition:
            self.error('The number of nodes on {0} queue should not be less than {1}\nyou requested: {2}'.format(job.queue,base_partition,job.nodes))
        elif job.nodes>max_partition:
            self.error('The number of nodes on {0} queue should not be more than {1}\nyou requested: {2}'.format(job.queue,max_partition,job.nodes))
        #end if

        # Use the user cpus_per_task if specified; otherwise use all available cpus for each process
        if job.cpus_per_task is not None:
            cpus_per_task = job.cpus_per_task
        else:
            hyperthreads = 2 # Both CPU and GPU nodes use the same AMD EPYC 7763 (Milan) CPUs
            cpus_per_task = int(floor(float(self.cores_per_node)/job.processes_per_node))*hyperthreads
        #end if
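        # Worked example with illustrative values (not from the PR): 4 tasks per
        # CPU node gives floor(128/4) = 32 physical cores per task, x2
        # hyperthreads -> '#SBATCH -c 64'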

        c='#!/bin/bash\n'
        if job.account is not None:
            c+='#SBATCH -A '+job.account+'\n'
        #end if
        c+='#SBATCH -C '+str(job.constraint)+'\n'
        c+='#SBATCH -q '+job.queue+'\n'
        c+='#SBATCH -t '+job.sbatch_walltime()+'\n'
        c+='#SBATCH -N '+str(job.nodes)+'\n'
        c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
        c+='#SBATCH -c '+str(cpus_per_task)+'\n'
        c+='#SBATCH -J '+str(job.name)+'\n'
        c+='#SBATCH -o '+job.outfile+'\n'
        c+='#SBATCH -e '+job.errfile+'\n'

        if 'gpu' in job.constraint:
            gpus_per_task = int(floor(float(self.gpus_per_node)/job.processes_per_node))
            c+='#SBATCH --gpus-per-task={0}\n'.format(gpus_per_task)
        #end if
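        # e.g. 4 tasks on a GPU node (illustrative) -> floor(4/4) = 1 GPU per task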

        if job.user_env:
            c+='#SBATCH --export=ALL\n' # equiv to PBS -V
        else:
            c+='#SBATCH --export=NONE\n'
        #end if
        c+='''
echo $SLURM_SUBMIT_DIR
cd $SLURM_SUBMIT_DIR
'''
        if job.threads>1:
            c+='''
export OMP_PROC_BIND=true
export OMP_PLACES=threads
'''
        #end if
        if 'gpu' in job.constraint:
            c+='''
export SLURM_CPU_BIND="cores"
'''
        #end if
        return c
    #end def write_job_header
#end class Perlmutter
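For context, this is roughly how a workflow script would target the new machine
through Nexus (a minimal sketch assuming the standard settings/job interface;
the account name 'm1234' and all resource numbers here are illustrative):

    from nexus import settings, job

    settings(
        machine = 'perlmutter',
        account = 'm1234',   # hypothetical NERSC project; '_g' is appended for GPU jobs
        )

    # 2 GPU nodes, 4 MPI tasks per node (1 GPU each), 6 hour walltime
    gpu_job = job(nodes=2, processes_per_node=4, hours=6,
                  queue='regular', constraint='gpu')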




class BlueWatersXK(Supercomputer):

    name = 'bluewaters_xk'
@@ -3383,7 +3510,6 @@ def write_job_header(self,job):
Kraken( 9408, 2, 6, 16, 100, 'aprun', 'qsub', 'qstat', 'qdel')
Golub( 512, 2, 6, 32, 1000, 'mpirun', 'qsub', 'qstat', 'qdel')
OIC5( 28, 2, 16, 128, 1000, 'mpirun', 'qsub', 'qstat', 'qdel')
Edison( 664, 2, 12, 64, 100, 'srun', 'sbatch', 'squeue', 'scancel')
Cori( 9688, 1, 68, 96, 100, 'srun', 'sbatch', 'squeue', 'scancel')
BlueWatersXK( 3072, 1, 16, 32, 100, 'aprun', 'qsub', 'qstat', 'qdel')
BlueWatersXE(22640, 2, 16, 64, 100, 'aprun', 'qsub', 'qstat', 'qdel')
@@ -3414,6 +3540,7 @@ def write_job_header(self,job):
Tomcat3( 8, 1, 64, 192, 1000, 'mpirun', 'sbatch', 'sacct', 'scancel')
SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
Archer2( 5860, 2, 64, 512, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Perlmutter( 3072, 2, 128, 512, 5000, 'srun', 'sbatch', 'squeue', 'scancel')
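For reference, the positional arguments in these registrations appear to follow
the Supercomputer constructor: total nodes (3072), sockets per node (2), cores
per node (128), RAM per node in GB (512), queue slot limit (5000), then the
application launcher and the SLURM submit/status/delete commands. The Perlmutter
entry is registered with CPU-partition values; pre_process_job switches them per
job based on the requested constraint.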


#machine accessor functions