Reducing resource requests #42

Open · wants to merge 3 commits into main
example_runtime_config.toml (2 changes: 1 addition & 1 deletion)
@@ -30,7 +30,7 @@ overwrite = false

 [apps.reproject_wu]
 # Number of processors to use for parallelizing the reprojection
-n_workers = 32
+n_workers = 8

 # The name of the observation site to use for reflex correction
 observation_site = "ctio"
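As an aside, here is a minimal sketch of how a value in the [apps.reproject_wu] section could be read back from this file. It uses the stdlib tomllib (Python 3.11+) purely for illustration and is not the actual kbmod_wf config loader.

```python
# Illustration only: reading [apps.reproject_wu] from the runtime config.
# Assumes Python 3.11+ for the stdlib tomllib module; kbmod_wf's real loader
# may work differently.
import tomllib

with open("example_runtime_config.toml", "rb") as f:
    cfg = tomllib.load(f)

n_workers = cfg["apps"]["reproject_wu"]["n_workers"]  # 8 after this change
print(f"reproject_wu will parallelize across {n_workers} workers")
```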
src/kbmod_wf/resource_configs/klone_configuration.py (24 changes: 13 additions & 11 deletions)
Expand Up @@ -8,8 +8,8 @@
 walltimes = {
     "compute_bigmem": "01:00:00",
     "large_mem": "04:00:00",
-    "sharded_reproject": "04:00:00",
-    "gpu_max": "08:00:00",
+    "sharded_reproject": "01:00:00",
+    "gpu_max": "01:00:00",
Collaborator Author:
I reduced the time requested for each of these. @DinoBektesevic I think that 1hr should generally be enough to finish a search, but let me know if this should be pushed back up.

 }


@@ -21,7 +21,7 @@ def klone_resource_config():
             os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat())
         ),
         run_dir=os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat()),
-        retries=1,
+        retries=100,
Collaborator Author:
Until we have a good way to catch and ignore pre-emption "failures" that would increment the retry counter, we can naively set the max retry number to something large.
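One possible follow-up, sketched here only as an idea: if the installed Parsl version supports the Config retry_handler hook, pre-emption style failures could be charged a negligible retry cost instead of relying on a very large retries value. The WorkerLost exception below is an assumption about how pre-emptions surface and should be verified.

```python
# Sketch only (not part of this PR): if the installed Parsl supports the
# Config(retry_handler=...) hook, pre-emptions could be charged a negligible
# retry cost so a small `retries` value still survives many pre-emptions.
# Assumes pre-empted workers surface as WorkerLost; verify against your
# Parsl version before relying on this.
from parsl.executors.high_throughput.errors import WorkerLost


def preemption_tolerant_retries(exception, task_record):
    """Return the retry cost of a failure; pre-emptions are nearly free."""
    if isinstance(exception, WorkerLost):
        return 0.01  # node was pre-empted: resubmit without burning a retry
    return 1.0  # genuine task failure: counts fully toward `retries`


# Usage (hypothetical):
#   Config(retries=5, retry_handler=preemption_tolerant_retries, executors=[...])
```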

         executors=[
             HighThroughputExecutor(
                 label="small_cpu",
@@ -30,19 +30,20 @@ def klone_resource_config():
                 partition="ckpt-g2",
                 account="astro",
                 min_blocks=0,
-                max_blocks=4,
+                max_blocks=16,
                 init_blocks=0,
                 parallelism=1,
                 nodes_per_block=1,
                 cores_per_node=1,  # perhaps should be 8???
-                mem_per_node=256,  # In GB
+                mem_per_node=32,  # In GB
Collaborator Author:
This executor is only used by the pre-TNO workflow to convert the URI file into an ImageCollection. So we probably never needed anywhere near the memory that was requested.
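For context on how this executor gets work, here is a hypothetical sketch of pinning a lightweight task to it by label; the function name and arguments are made up for illustration and are not the real kbmod_wf task definitions.

```python
# Hypothetical illustration (not the real kbmod_wf task): how a lightweight
# step can be pinned to the "small_cpu" executor by label, so it only ever
# needs 1 core and modest memory.
from parsl import python_app


@python_app(executors=["small_cpu"])
def uri_file_to_image_collection(uri_file_path, output_path):
    # Reading a list of URIs and writing an ImageCollection is a small,
    # mostly I/O-bound job; 32 GB per node is a generous ceiling for it.
    ...
```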

                 exclusive=False,
                 walltime=walltimes["compute_bigmem"],
                 # Command to run before starting worker - i.e. conda activate <special_env>
                 worker_init="",
             ),
         ),
         HighThroughputExecutor(
+            # This executor was used for the pre-TNO reprojection task
             label="large_mem",
             max_workers=1,
             provider=SlurmProvider(
@@ -62,18 +63,19 @@
             ),
         ),
         HighThroughputExecutor(
+            # This executor is used for reprojecting sharded WorkUnits
             label="sharded_reproject",
             max_workers=1,
             provider=SlurmProvider(
                 partition="ckpt-g2",
                 account="astro",
                 min_blocks=0,
-                max_blocks=2,
+                max_blocks=16,
                 init_blocks=0,
                 parallelism=1,
                 nodes_per_block=1,
-                cores_per_node=32,
-                mem_per_node=128,  # ~2-4 GB per core
+                cores_per_node=8,
+                mem_per_node=32,  # ~2-4 GB per core
Collaborator Author:
In this executor we're cranking up the maximum number of concurrent jobs while decreasing the cores and memory per node.
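For concreteness, a rough comparison of the aggregate ceiling implied by the old and new settings (values copied from this diff; this is illustrative arithmetic only, assuming every block fills):

```python
# Illustrative arithmetic only (values copied from this diff), assuming every
# block gets scheduled at the same time.
old = {"max_blocks": 2, "cores_per_node": 32, "mem_per_node": 128}  # mem in GB
new = {"max_blocks": 16, "cores_per_node": 8, "mem_per_node": 32}   # mem in GB

for label, cfg in (("old", old), ("new", new)):
    cores = cfg["max_blocks"] * cfg["cores_per_node"]
    mem = cfg["max_blocks"] * cfg["mem_per_node"]
    print(f"{label}: up to {cores} cores / {mem} GB across {cfg['max_blocks']} Slurm jobs")

# old: up to 64 cores / 256 GB across 2 Slurm jobs
# new: up to 128 cores / 512 GB across 16 Slurm jobs
```

Presumably the smaller per-job requests are also easier for Slurm to backfill on the pre-emptible ckpt-g2 partition than a couple of large ones.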

                 exclusive=False,
                 walltime=walltimes["sharded_reproject"],
                 # Command to run before starting worker - i.e. conda activate <special_env>
@@ -87,12 +89,12 @@
                 partition="ckpt-g2",
                 account="escience",
                 min_blocks=0,
-                max_blocks=2,
+                max_blocks=10,
                 init_blocks=0,
                 parallelism=1,
                 nodes_per_block=1,
-                cores_per_node=2,  # perhaps should be 8???
-                mem_per_node=512,  # In GB
+                cores_per_node=1,
+                mem_per_node=64,  # In GB
                 exclusive=False,
                 walltime=walltimes["gpu_max"],
                 # Command to run before starting worker - i.e. conda activate <special_env>