Skip to content

Commit

Permalink
Organized job memory handling
Browse files Browse the repository at this point in the history
- Organized the memory used for each ESS
- Added an optional memory attribute to the servers dictionary to limit
the requested memory if needed
- Modified the memory troubleshooting accordingly
  • Loading branch information
alongd committed Jun 15, 2019
1 parent faed1a0 commit f57d561
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 27 deletions.
2 changes: 1 addition & 1 deletion arc/job/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
input_files = {
'gaussian': """%chk=check.chk
%mem={memory}mb
%nproc={cpus}
%NProcShared={cpus}
#P {job_type_1} {restricted}{method}{slash}{basis} {job_type_2} {fine} {trsh} iop(2/9=2000)
Expand Down
38 changes: 28 additions & 10 deletions arc/job/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class Job(object):
`scan_res` ``int`` The rotor scan resolution in degrees
`software` ``str`` The electronic structure software to be used
`server_nodes` ``list`` A list of nodes this job was submitted to (for troubleshooting)
`memory` ``int`` The allocated memory (1500 MB by default)
`memory` ``int`` The total job allocated memory in GB (15 GB by default)
`method` ``str`` The calculation method (e.g., 'B3LYP', 'CCSD(T)', 'CBS-QB3'...)
`basis_set` ``str`` The basis set (e.g., '6-311++G(d,p)', 'aug-cc-pVTZ'...)
`fine` ``bool`` Whether to use fine geometry optimization parameters
Expand Down Expand Up @@ -97,7 +97,7 @@ class Job(object):
"""
def __init__(self, project, ess_settings, species_name, xyz, job_type, level_of_theory, multiplicity,
project_directory, charge=0, conformer=-1, fine=False, shift='', software=None, is_ts=False, scan='',
pivots=None, memory=15000, comments='', trsh='', scan_trsh='', ess_trsh_methods=None, bath_gas=None,
pivots=None, memory=15, comments='', trsh='', scan_trsh='', ess_trsh_methods=None, bath_gas=None,
initial_trsh=None, job_num=None, job_server_name=None, job_name=None, job_id=None, server=None,
initial_time=None, occ=None, max_job_time=120, scan_res=None, checkfile=None, number_of_radicals=None,
testing=False):
Expand Down Expand Up @@ -295,10 +295,30 @@ def __init__(self, project, ess_settings, species_name, xyz, job_type, level_of_

self.server = server if server is not None else self.ess_settings[self.software][0]

self.cpus = servers[self.server].get('cpus', 8) # set to 8 by default
self.mem_per_cpu = memory * 1000 / self.cpus # The `#SBATCH --mem-per-cpu` directive is in MB
max_mem = servers[self.server].get('memory', None)
if max_mem is not None and memory > max_mem * 0.9:
logging.warning('The memory for job {0} using {1} ({2} GB) exceeds 90% of the maximum node memory on '
'{3}. Setting it to {4} GB.'.format(self.job_name, self.software,
memory, self.server, max_mem))
memory = 0.9 * max_mem
self.memory_gb = memory # store the memory in GB for troubleshooting
if self.software == 'molpro':
# molpro's memory is in MW, 1500 MW should be enough as an initial general memory requirement assessment
memory /= 10
self.memory = memory
# Molpro's memory is per cpu and in MW (mega word; 1 MW ~= 8 MB; 1 GB = 128 MW)
self.memory = memory * 128 / self.cpus
if self.software == 'terachem':
# TeraChem's memory is in MW (mega word; 1 MW ~= 8 MB; 1 GB = 128 MW)
self.memory = memory * 128
elif self.software == 'gaussian':
# Gaussian's memory is in MB, total for all cpus
self.memory = memory * 1000
elif self.software == 'orca':
# Orca's memory is in MB
self.memory = memory * 1000
elif self.software == 'qchem':
pass # QChem manages its memory automatically, for now ARC will not intervene
# see http://www.q-chem.com/qchem-website/manual/qchem44_manual/CCparallel.html

self.fine = fine
self.shift = shift
Expand Down Expand Up @@ -452,18 +472,17 @@ def write_submit_script(self):
else:
raise JobError('Could not determine format for maximal job time.\n Format is determined by {0}, but '
'got {1} for {2}'.format(t_max_format, servers[self.server]['cluster_soft'], self.server))
cpus = servers[self.server]['cpus'] if 'cpus' in servers[self.server] else 8
architecture = ''
if self.server.lower() == 'pharos':
# here we're hard-coding ARC for Pharos, a Green Group server
# If your server has different node architectures, implement something similar
if cpus <= 8:
if self.cpus <= 8:
architecture = '\n#$ -l harpertown'
else:
architecture = '\n#$ -l magnycours'
try:
self.submit = submit_scripts[self.server][self.software.lower()].format(
name=self.job_server_name, un=un, t_max=t_max, mem_cpu=int(self.memory / cpus), cpus=cpus,
name=self.job_server_name, un=un, t_max=t_max, mem_per_cpu=int(self.mem_per_cpu), cpus=self.cpus,
architecture=architecture)
except KeyError:
logging.error('Could not find submit script for server {0}, make sure your submit scripts '
Expand Down Expand Up @@ -719,10 +738,9 @@ def write_input_file(self):
raise
else:
try:
cpus = servers[self.server]['cpus'] if 'cpus' in servers[self.server] else 8
self.input = self.input.format(memory=self.memory, method=self.method, slash=slash, bath=self.bath_gas,
basis=self.basis_set, charge=self.charge, multiplicity=self.multiplicity,
spin=self.spin, xyz=self.xyz, job_type_1=job_type_1, cpus=cpus,
spin=self.spin, xyz=self.xyz, job_type_1=job_type_1, cpus=self.cpus,
job_type_2=job_type_2, scan=scan_string, restricted=restricted,
fine=fine, shift=self.shift, trsh=self.trsh, scan_trsh=self.scan_trsh,)
except KeyError:
Expand Down
13 changes: 6 additions & 7 deletions arc/job/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu {mem_cpu}
#SBATCH --mem-per-cpu {mem_per_cpu}
module add c3ddb/gaussian/09.d01
which g09
Expand Down Expand Up @@ -62,7 +62,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu {mem_cpu}
#SBATCH --mem-per-cpu {mem_per_cpu}
module add c3ddb/orca/4.1.2
module add c3ddb/openmpi/3.1.3
Expand Down Expand Up @@ -106,7 +106,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu={mem_cpu}
#SBATCH --mem-per-cpu={mem_per_cpu}
#SBATCH -x node07, node05
which 16
Expand Down Expand Up @@ -148,7 +148,7 @@
#SBATCH -N 1
#SBATCH -n {cpus}
#SBATCH --time={t_max}
#SBATCH --mem-per-cpu={mem_cpu}
#SBATCH --mem-per-cpu={mem_per_cpu}
#SBATCH -x node07, node05
export PATH=/opt/molpro/molprop_2015_1_linux_x86_64_i8/bin:$PATH
Expand Down Expand Up @@ -187,7 +187,6 @@
#$ -l long{architecture}
#$ -l h_rt={t_max}
#$ -pe singlenode {cpus}
#$ -l h=!node60.cluster
#$ -cwd
#$ -o out.txt
#$ -e err.txt
Expand Down Expand Up @@ -254,7 +253,7 @@
mkdir -p /scratch/{un}/{name}/qlscratch
qchem -nt 6 input.in output.out
qchem -nt {cpus} input.in output.out
rm -r /scratch/{un}/{name}
Expand All @@ -276,7 +275,7 @@
sdir=/scratch/{un}
mkdir -p /scratch/{un}/qlscratch
molpro -d $sdir -n 6 input.in
molpro -d $sdir -n {cpus} input.in
""",
# oneDMin
'onedmin': """#! /bin/bash -l
Expand Down
4 changes: 2 additions & 2 deletions arc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class ARC(object):
`rmgdb` ``RMGDatabase`` The RMG database object
`allow_nonisomorphic_2d` ``bool`` Whether to optimize species even if they do not have a 3D conformer that is
isomorphic to the 2D graph representation
`memory` ``int`` The allocated job memory in MB (1500 MB by default)
`memory` ``int`` The total allocated job memory in GB (15 by default)
`job_types` ``dict`` A dictionary of job types to execute. Keys are job types, values are boolean
`bath_gas` ``str`` A bath gas. Currently used in OneDMin to calc L-J parameters.
Allowed values are He, Ne, Ar, Kr, H2, N2, O2
Expand All @@ -95,7 +95,7 @@ def __init__(self, input_dict=None, project=None, arc_species_list=None, arc_rxn
conformer_level='', composite_method='', opt_level='', freq_level='', sp_level='', scan_level='',
ts_guess_level='', use_bac=True, job_types=None, model_chemistry='', initial_trsh=None, t_min=None,
t_max=None, t_count=None, verbose=logging.INFO, project_directory=None, max_job_time=120,
allow_nonisomorphic_2d=False, job_memory=15000, ess_settings=None, bath_gas=None,
allow_nonisomorphic_2d=False, job_memory=15, ess_settings=None, bath_gas=None,
adaptive_levels=None):
self.__version__ = '1.0.0'
self.verbose = verbose
Expand Down
14 changes: 8 additions & 6 deletions arc/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
from arc.species.species import ARCSpecies, TSGuess, determine_rotor_symmetry
from arc.species.converter import get_xyz_string, molecules_from_xyz, check_isomorphism
from arc.ts.atst import autotst
from arc.settings import rotor_scan_resolution, inconsistency_ab, inconsistency_az, maximum_barrier, default_job_types
from arc.settings import rotor_scan_resolution, inconsistency_ab, inconsistency_az, maximum_barrier, default_job_types,\
servers

##################################################################

Expand Down Expand Up @@ -82,7 +83,7 @@ class Scheduler(object):
isomorphic to the 2D graph representation
`dont_gen_confs` ``list`` A list of species labels for which conformer jobs were loaded from a restart file,
and additional conformer generation should be avoided
`memory` ``int`` The allocated job memory (1500 MB by default)
`memory` ``int`` The total allocated job memory in GB (15 by default)
`job_types` ``dict`` A dictionary of job types to execute. Keys are job types, values are boolean
`bath_gas` ``str`` A bath gas. Currently used in OneDMin to calc L-J parameters.
Allowed values are He, Ne, Ar, Kr, H2, N2, O2
Expand Down Expand Up @@ -119,7 +120,7 @@ class Scheduler(object):
def __init__(self, project, ess_settings, species_list, composite_method, conformer_level, opt_level, freq_level,
sp_level, scan_level, ts_guess_level, orbitals_level, adaptive_levels, project_directory, rmgdatabase,
job_types=None, initial_trsh=None, rxn_list=None, restart_dict=None, max_job_time=120,
allow_nonisomorphic_2d=False, memory=15000, testing=False, bath_gas=None):
allow_nonisomorphic_2d=False, memory=15, testing=False, bath_gas=None):
self.rmgdb = rmgdatabase
self.restart_dict = restart_dict
self.species_list = species_list
Expand Down Expand Up @@ -1567,9 +1568,10 @@ def troubleshoot_ess(self, label, job, level_of_theory, job_type, conformer=-1):
conformer=conformer)
elif 'memory' not in job.ess_trsh_methods:
# Increase memory allocation
memory = job.memory * 2
logging.info('Troubleshooting {type} job in {software} using memory: {mem} MB instead of {old} MB'.
format(type=job_type, software=job.software, mem=memory, old=job.memory))
max_mem = servers[job.server].get('memory', 128) # Node memory in GB, default to 128 if not specified
memory = job.memory_gb * 2 if job.memory_gb * 2 < max_mem * 0.9 else max_mem * 0.9
logging.info('Troubleshooting {type} job in {software} using memory: {mem} GB instead of {old} GB'.
format(type=job_type, software=job.software, mem=memory, old=job.memory_gb))
job.ess_trsh_methods.append('memory')
self.run_job(label=label, xyz=xyz, level_of_theory=level_of_theory, software=job.software,
job_type=job_type, fine=job.fine, memory=memory, ess_trsh_methods=job.ess_trsh_methods,
Expand Down
3 changes: 2 additions & 1 deletion arc/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
'address': 'server2.host.edu',
'un': '<username>',
'key': 'path_to_rsa_key',
'cpus': 48, # optional (default: 8)
'cpus': 48, # number of cpu's per node, optional (default: 8)
'memory': 128, # amount of memory per node in GB, optional (default: 16)
},
'local': {
'cluster_soft': 'OGE',
Expand Down

0 comments on commit f57d561

Please sign in to comment.