From c8c2023e2cf5c1b39e5115c2c2dfbc477f3f1bb9 Mon Sep 17 00:00:00 2001
From: drewoldag <47493171+drewoldag@users.noreply.github.com>
Date: Tue, 10 Sep 2024 15:46:43 -0700
Subject: [PATCH 1/3] Took a first pass at reducing the resources to be
 requested for the various workflow tasks.

---
 .../resource_configs/klone_configuration.py   | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/kbmod_wf/resource_configs/klone_configuration.py b/src/kbmod_wf/resource_configs/klone_configuration.py
index 953622b..f62072e 100644
--- a/src/kbmod_wf/resource_configs/klone_configuration.py
+++ b/src/kbmod_wf/resource_configs/klone_configuration.py
@@ -8,8 +8,8 @@
 walltimes = {
     "compute_bigmem": "01:00:00",
     "large_mem": "04:00:00",
-    "sharded_reproject": "04:00:00",
-    "gpu_max": "08:00:00",
+    "sharded_reproject": "01:00:00",
+    "gpu_max": "01:00:00",
 }
 
 
@@ -21,7 +21,7 @@ def klone_resource_config():
             os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat())
         ),
         run_dir=os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat()),
-        retries=1,
+        retries=100,
         executors=[
             HighThroughputExecutor(
                 label="small_cpu",
@@ -35,7 +35,7 @@ def klone_resource_config():
                     parallelism=1,
                     nodes_per_block=1,
                     cores_per_node=1,  # perhaps should be 8???
-                    mem_per_node=256,  # In GB
+                    mem_per_node=32,  # In GB
                     exclusive=False,
                     walltime=walltimes["compute_bigmem"],
                     # Command to run before starting worker - i.e. conda activate
@@ -43,6 +43,7 @@ def klone_resource_config():
                 ),
             ),
             HighThroughputExecutor(
+                # This executor was used for the pre-TNO reprojection task
                 label="large_mem",
                 max_workers=1,
                 provider=SlurmProvider(
@@ -62,18 +63,19 @@ def klone_resource_config():
                 ),
             ),
             HighThroughputExecutor(
+                # This executor is used for reprojecting sharded WorkUnits
                 label="sharded_reproject",
                 max_workers=1,
                 provider=SlurmProvider(
                     partition="ckpt-g2",
                     account="astro",
                     min_blocks=0,
-                    max_blocks=2,
+                    max_blocks=10,
                     init_blocks=0,
                     parallelism=1,
                     nodes_per_block=1,
-                    cores_per_node=32,
-                    mem_per_node=128,  # ~2-4 GB per core
+                    cores_per_node=8,
+                    mem_per_node=32,  # ~2-4 GB per core
                     exclusive=False,
                     walltime=walltimes["sharded_reproject"],
                     # Command to run before starting worker - i.e. conda activate
@@ -91,8 +93,8 @@ def klone_resource_config():
                     init_blocks=0,
                     parallelism=1,
                     nodes_per_block=1,
-                    cores_per_node=2,  # perhaps should be 8???
-                    mem_per_node=512,  # In GB
+                    cores_per_node=1,
+                    mem_per_node=128,  # In GB
                     exclusive=False,
                     walltime=walltimes["gpu_max"],
                     # Command to run before starting worker - i.e. conda activate

From 38a11ad5bd05991645c3bdb168d50c3afd578c2d Mon Sep 17 00:00:00 2001
From: drewoldag <47493171+drewoldag@users.noreply.github.com>
Date: Tue, 10 Sep 2024 15:56:06 -0700
Subject: [PATCH 2/3] Need to update the `n_workers` parameter in the
 reprojection config as well.

---
 example_runtime_config.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example_runtime_config.toml b/example_runtime_config.toml
index a33daf1..48f51e9 100644
--- a/example_runtime_config.toml
+++ b/example_runtime_config.toml
@@ -30,7 +30,7 @@ overwrite = false
 
 [apps.reproject_wu]
 # Number of processors to use for parallelizing the reprojection
-n_workers = 32
+n_workers = 8
 
 # The name of the observation site to use for reflex correction
 observation_site = "ctio"

From 105afdb41683ddc8e702dd82a38db7399c8523a3 Mon Sep 17 00:00:00 2001
From: Drew Oldag
Date: Wed, 11 Sep 2024 12:51:58 -0700
Subject: [PATCH 3/3] Additional tweaks to the klone config.
---
 src/kbmod_wf/resource_configs/klone_configuration.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/kbmod_wf/resource_configs/klone_configuration.py b/src/kbmod_wf/resource_configs/klone_configuration.py
index f62072e..228ea2f 100644
--- a/src/kbmod_wf/resource_configs/klone_configuration.py
+++ b/src/kbmod_wf/resource_configs/klone_configuration.py
@@ -30,7 +30,7 @@ def klone_resource_config():
                     partition="ckpt-g2",
                     account="astro",
                     min_blocks=0,
-                    max_blocks=4,
+                    max_blocks=16,
                     init_blocks=0,
                     parallelism=1,
                     nodes_per_block=1,
@@ -70,7 +70,7 @@ def klone_resource_config():
                     partition="ckpt-g2",
                     account="astro",
                     min_blocks=0,
-                    max_blocks=10,
+                    max_blocks=16,
                     init_blocks=0,
                     parallelism=1,
                     nodes_per_block=1,
@@ -89,12 +89,12 @@ def klone_resource_config():
                     partition="ckpt-g2",
                     account="escience",
                     min_blocks=0,
-                    max_blocks=2,
+                    max_blocks=10,
                     init_blocks=0,
                     parallelism=1,
                     nodes_per_block=1,
                     cores_per_node=1,
-                    mem_per_node=128,  # In GB
+                    mem_per_node=64,  # In GB
                     exclusive=False,
                     walltime=walltimes["gpu_max"],
                     # Command to run before starting worker - i.e. conda activate
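
Note for reviewers unfamiliar with the file being patched: the sketch below
shows the overall shape of klone_configuration.py after this series, reduced
to the single sharded_reproject executor. It is a minimal reconstruction, not
a copy of the file: the import paths are Parsl's standard ones, the sketch
omits the checkpointing arguments that the diffs only show in passing, and the
worker_init string (never shown in these diffs) is a placeholder.

import datetime
import os

from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import SlurmProvider

# Walltimes after patch 1/3: ckpt-g2 requests are kept to one hour so jobs
# schedule quickly and lose little work when preempted.
walltimes = {
    "compute_bigmem": "01:00:00",
    "large_mem": "04:00:00",
    "sharded_reproject": "01:00:00",
    "gpu_max": "01:00:00",
}


def klone_resource_config():
    return Config(
        run_dir=os.path.join(
            "/gscratch/dirac/kbmod/workflow/run_logs",
            datetime.date.today().isoformat(),
        ),
        # retries=100 (up from 1) pairs with the checkpoint partition:
        # preempted tasks are resubmitted instead of failing the workflow.
        retries=100,
        executors=[
            HighThroughputExecutor(
                # This executor is used for reprojecting sharded WorkUnits
                label="sharded_reproject",
                max_workers=1,
                provider=SlurmProvider(
                    partition="ckpt-g2",
                    account="astro",
                    min_blocks=0,
                    max_blocks=16,  # at most 16 concurrent Slurm jobs
                    init_blocks=0,
                    parallelism=1,
                    nodes_per_block=1,
                    cores_per_node=8,
                    mem_per_node=32,  # In GB; 32 GB / 8 cores = 4 GB per core
                    exclusive=False,
                    walltime=walltimes["sharded_reproject"],
                    # Placeholder - the real value activates the kbmod environment
                    worker_init="conda activate kbmod",
                ),
            ),
        ],
    )

In Parsl, each "block" is one Slurm job, so after this series the
sharded_reproject executor can scale from zero up to 16 single-node jobs of
8 cores and 32 GB each. The matching change in patch 2/3 drops n_workers in
example_runtime_config.toml from 32 to 8, keeping the reprojection app's
worker count aligned with the reduced cores_per_node.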