From 297b91213f5ede0f4134437f94ef2c5f26fb1245 Mon Sep 17 00:00:00 2001
From: "Keaton J. Burns" <keaton.burns@gmail.com>
Date: Tue, 9 Jan 2024 14:59:19 -0500
Subject: [PATCH 1/5] Add built-in cprofile tooling

---
 dedalus/core/solvers.py | 44 +++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/dedalus/core/solvers.py b/dedalus/core/solvers.py
index 135b1933..0ff2964a 100644
--- a/dedalus/core/solvers.py
+++ b/dedalus/core/solvers.py
@@ -6,6 +6,8 @@
 import h5py
 import pathlib
 import scipy.linalg
+import cProfile
+import pstats
 from math import prod
 
 from . import subsystems
@@ -14,6 +16,7 @@
 from ..libraries.matsolvers import matsolvers
 from ..tools.config import config
 from ..tools.array import scipy_sparse_eigs
+from ..tools.parallel import Sync
 
 import logging
 logger = logging.getLogger(__name__.split('.')[-1])
@@ -485,6 +488,8 @@ class InitialValueSolver(SolverBase):
         Iteration cadence for enforcing Hermitian symmetry on real variables (default: 100).
     warmup_iterations : int, optional
         Number of warmup iterations to disregard when computing runtime statistics (default: 10).
+    profile : bool, optional
+        Save profiles with cProfile (default: False).
     **kw :
         Other options passed to ProblemBase.
 
@@ -510,15 +515,20 @@ class InitialValueSolver(SolverBase):
     matsolver_default = 'MATRIX_FACTORIZER'
     matrices = ['M', 'L']
 
-    def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterations=10, **kw):
+    def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterations=10, profile=False, **kw):
         logger.debug('Beginning IVP instantiation')
-        super().__init__(problem, **kw)
-        if np.isrealobj(self.dtype.type()):
-            self.enforce_real_cadence = enforce_real_cadence
-        else:
-            self.enforce_real_cadence = None
+        # Setup timing and profiling
+        self.dist = problem.dist
         self._bcast_array = np.zeros(1, dtype=float)
         self.init_time = self.world_time
+        self.profile = profile
+        if profile:
+            self.setup_profiler = cProfile.Profile()
+            self.warmup_profiler = cProfile.Profile()
+            self.run_profiler = cProfile.Profile()
+            self.setup_profiler.enable()
+        # Build subsystems and subproblems
+        super().__init__(problem, **kw)
         # Build LHS matrices
         self.build_matrices(self.subproblems, ['M', 'L'])
         # Compute total modes
@@ -538,6 +548,10 @@ def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterat
         self.sim_time = self.initial_sim_time = problem.time.allreduce_data_max(layout='g')
         self.iteration = self.initial_iteration = 0
         self.warmup_iterations = warmup_iterations
+        if np.isrealobj(self.dtype.type()):
+            self.enforce_real_cadence = enforce_real_cadence
+        else:
+            self.enforce_real_cadence = None
         # Default integration parameters
         self.stop_sim_time = np.inf
         self.stop_wall_time = np.inf
@@ -648,8 +662,14 @@ def step(self, dt):
         wall_time = self.wall_time
         if self.iteration == self.initial_iteration:
             self.start_time = wall_time
+            if self.profile:
+                self.dump_profiles(self.setup_profiler, "setup")
+                self.warmup_profiler.enable()
         if self.iteration == self.initial_iteration + self.warmup_iterations:
             self.warmup_time = wall_time
+            if self.profile:
+                self.dump_profiles(self.warmup_profiler, "warmup")
+                self.run_profiler.enable()
         # Advance using timestepper
         self.timestepper.step(dt, wall_time)
         # Update iteration
@@ -704,6 +724,8 @@ def log_stats(self, format=".4g"):
         logger.info(f"Final iteration: {self.iteration}")
         logger.info(f"Final sim time: {self.sim_time}")
         logger.info(f"Setup time (init - iter 0): {self.start_time:{format}} sec")
+        if self.profile:
+            self.dump_profiles(self.run_profiler, "runtime")
         if self.iteration >= self.initial_iteration + self.warmup_iterations:
             warmup_time = self.warmup_time - self.start_time
             run_time = log_time - self.warmup_time
@@ -716,3 +738,13 @@ def log_stats(self, format=".4g"):
             logger.info(f"Speed: {(modes*stages/cpus/run_time):{format}} mode-stages/cpu-sec")
         else:
             logger.info(f"Timings unavailable because warmup did not complete.")
+
+    def dump_profiles(self, profiler, name):
+        comm = self.dist.comm
+        # Write stats from each process
+        profiler.dump_stats(f"/tmp/{name}_proc{comm.rank}.prof")
+        # Sum stats over processes
+        with Sync(comm):
+            if comm.rank == 0:
+                joint_stats = pstats.Stats(*(f"/tmp/{name}_proc{i}.prof" for i in range(comm.size)))
+                joint_stats.dump_stats(f"{name}.prof")

From 54a3f6a9c063eb158c5515e93687f69a7cd14aaa Mon Sep 17 00:00:00 2001
From: "Keaton J. Burns" <keaton.burns@gmail.com>
Date: Wed, 10 Jan 2024 13:57:48 -0500
Subject: [PATCH 2/5] Use pickling instead of /tmp to gather profiles from
 processes

---
 dedalus/core/solvers.py   | 18 ++++++++++--------
 dedalus/tools/parallel.py | 11 +++++++++++
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/dedalus/core/solvers.py b/dedalus/core/solvers.py
index 0ff2964a..976c71f1 100644
--- a/dedalus/core/solvers.py
+++ b/dedalus/core/solvers.py
@@ -16,7 +16,7 @@
 from ..libraries.matsolvers import matsolvers
 from ..tools.config import config
 from ..tools.array import scipy_sparse_eigs
-from ..tools.parallel import Sync
+from ..tools.parallel import ProfileWrapper
 
 import logging
 logger = logging.getLogger(__name__.split('.')[-1])
@@ -741,10 +741,12 @@ def log_stats(self, format=".4g"):
 
     def dump_profiles(self, profiler, name):
         comm = self.dist.comm
-        # Write stats from each process
-        profiler.dump_stats(f"/tmp/{name}_proc{comm.rank}.prof")
-        # Sum stats over processes
-        with Sync(comm):
-            if comm.rank == 0:
-                joint_stats = pstats.Stats(*(f"/tmp/{name}_proc{i}.prof" for i in range(comm.size)))
-                joint_stats.dump_stats(f"{name}.prof")
+        # Disable and create stats on each process
+        profiler.create_stats()
+        # Gather using wrapper class to avoid pickling issues
+        profiles = comm.gather(ProfileWrapper(profiler.stats), root=0)
+        # Sum stats on root process
+        if comm.rank == 0:
+            joint_stats = pstats.Stats(*profiles)
+            joint_stats.dump_stats(f"{name}.prof")
+
diff --git a/dedalus/tools/parallel.py b/dedalus/tools/parallel.py
index 0c03191a..be21c849 100644
--- a/dedalus/tools/parallel.py
+++ b/dedalus/tools/parallel.py
@@ -56,3 +56,14 @@ def __enter__(self):
     def __exit__(self, type, value, traceback):
         for i in range(self.size-self.rank):
             self.comm.Barrier()
+
+
+class ProfileWrapper:
+    """Pickleable wrapper for cProfile.Profile for use with pstats.Stats"""
+
+    def __init__(self, stats):
+        self.stats = stats
+
+    def create_stats(self):
+        pass
+

From dbc204132927f5ecd1727cc445bbcb714d185165 Mon Sep 17 00:00:00 2001
From: Benjamin Brown <bpbrown@gmail.com>
Date: Fri, 12 Jan 2024 20:38:12 -0800
Subject: [PATCH 3/5] Adds per-core outputs of profiling data, to determine
 imbalances and bottlenecks.  Currently saving via shelve.

---
 dedalus/core/solvers.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/dedalus/core/solvers.py b/dedalus/core/solvers.py
index 976c71f1..b5f030f8 100644
--- a/dedalus/core/solvers.py
+++ b/dedalus/core/solvers.py
@@ -10,6 +10,9 @@
 import pstats
 from math import prod
 
+from collections import defaultdict
+import shelve
+
 from . import subsystems
 from . import timesteppers
 from .evaluator import Evaluator
@@ -743,10 +746,25 @@ def dump_profiles(self, profiler, name):
         comm = self.dist.comm
         # Disable and create stats on each process
         profiler.create_stats()
+        p = pstats.Stats(profiler)
+        p.strip_dirs()
         # Gather using wrapper class to avoid pickling issues
-        profiles = comm.gather(ProfileWrapper(profiler.stats), root=0)
+        profiles = comm.gather(ProfileWrapper(p.stats), root=0)
         # Sum stats on root process
         if comm.rank == 0:
+            if self.profile=='verbose':
+                profile_database = pathlib.Path(f"{name}_profiles")
+                stats = {'primcalls':defaultdict(list),'totcalls':defaultdict(list),'tottime':defaultdict(list),'cumtime':defaultdict(list)}
+                for profile in profiles:
+                    for func, (primcalls, totcalls, tottime, cumtime, callers) in profile.stats.items():
+                        stats['primcalls'][func].append(primcalls)
+                        stats['totcalls'][func].append(totcalls)
+                        stats['tottime'][func].append(tottime)
+                        stats['cumtime'][func].append(cumtime)
+                with shelve.open(str(profile_database), flag='n') as shelf:
+                    for key in stats:
+                        shelf[key] = stats[key]
+
+            # creation of joint_stats destroys profiles, so do this second
             joint_stats = pstats.Stats(*profiles)
             joint_stats.dump_stats(f"{name}.prof")
-

From a5671ffb6289128e50f4b492009bdf490c79e3d3 Mon Sep 17 00:00:00 2001
From: Benjamin Brown <bpbrown@gmail.com>
Date: Sat, 13 Jan 2024 20:24:18 -0800
Subject: [PATCH 4/5] Adds config-file driving for profiling, in new
 'profiling' section.

---
 dedalus/core/solvers.py | 11 +++++++++--
 dedalus/dedalus.cfg     |  8 ++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/dedalus/core/solvers.py b/dedalus/core/solvers.py
index b5f030f8..9f7e126d 100644
--- a/dedalus/core/solvers.py
+++ b/dedalus/core/solvers.py
@@ -21,6 +21,9 @@
 from ..tools.array import scipy_sparse_eigs
 from ..tools.parallel import ProfileWrapper
 
+PROFILE_SCRIPT = config['profiling'].getboolean('PROFILE_SCRIPT')
+PROFILE_MODE_DEFAULT = config['profiling'].get('PROFILE_MODE_DEFAULT')
+
 import logging
 logger = logging.getLogger(__name__.split('.')[-1])
 
@@ -493,6 +496,9 @@ class InitialValueSolver(SolverBase):
         Number of warmup iterations to disregard when computing runtime statistics (default: 10).
     profile : bool, optional
         Save profiles with cProfile (default: False).
+    profile_mode : string, optional
+        Output joined files [summary] or also include per-core analysis [full] (default: full).
+
     **kw :
         Other options passed to ProblemBase.
 
@@ -518,7 +524,7 @@ class InitialValueSolver(SolverBase):
     matsolver_default = 'MATRIX_FACTORIZER'
     matrices = ['M', 'L']
 
-    def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterations=10, profile=False, **kw):
+    def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterations=10, profile=PROFILE_SCRIPT, profile_mode=PROFILE_MODE_DEFAULT, **kw):
         logger.debug('Beginning IVP instantiation')
         # Setup timing and profiling
         self.dist = problem.dist
@@ -526,6 +532,7 @@ def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterat
         self.init_time = self.world_time
         self.profile = profile
         if profile:
+            self.profile_mode = profile_mode.lower()
             self.setup_profiler = cProfile.Profile()
             self.warmup_profiler = cProfile.Profile()
             self.run_profiler = cProfile.Profile()
@@ -752,7 +759,7 @@ def dump_profiles(self, profiler, name):
         profiles = comm.gather(ProfileWrapper(p.stats), root=0)
         # Sum stats on root process
         if comm.rank == 0:
-            if self.profile=='verbose':
+            if self.profile_mode=='full':
                 profile_database = pathlib.Path(f"{name}_profiles")
                 stats = {'primcalls':defaultdict(list),'totcalls':defaultdict(list),'tottime':defaultdict(list),'cumtime':defaultdict(list)}
                 for profile in profiles:
diff --git a/dedalus/dedalus.cfg b/dedalus/dedalus.cfg
index 416e621d..ef4d5baa 100644
--- a/dedalus/dedalus.cfg
+++ b/dedalus/dedalus.cfg
@@ -118,3 +118,11 @@
     # This works around NFS caching issues
     FILEHANDLER_TOUCH_TMPFILE = False
 
+
+[profiling]
+
+    # Profile script using cProfile
+    PROFILE_SCRIPT = False
+
+    # Level of profiling to conduct (summary, full)
+    PROFILE_MODE_DEFAULT = full

From 09a3bc0b90a89355adf97d1a3be29ba4d0d8b4d3 Mon Sep 17 00:00:00 2001
From: "Keaton J. Burns" <keaton.burns@gmail.com>
Date: Wed, 31 Jan 2024 13:13:34 -0500
Subject: [PATCH 5/5] Update profile options and put in subdirectory by default

---
 dedalus/core/solvers.py   | 44 +++++++++++++++++++--------------------
 dedalus/dedalus.cfg       | 14 ++++++++-----
 dedalus/tools/parallel.py | 10 +++++++++
 3 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/dedalus/core/solvers.py b/dedalus/core/solvers.py
index 9f7e126d..81a4318b 100644
--- a/dedalus/core/solvers.py
+++ b/dedalus/core/solvers.py
@@ -9,9 +9,8 @@
 import cProfile
 import pstats
 from math import prod
-
 from collections import defaultdict
-import shelve
+import pickle
 
 from . import subsystems
 from . import timesteppers
@@ -19,10 +18,11 @@
 from ..libraries.matsolvers import matsolvers
 from ..tools.config import config
 from ..tools.array import scipy_sparse_eigs
-from ..tools.parallel import ProfileWrapper
+from ..tools.parallel import ProfileWrapper, parallel_mkdir
 
-PROFILE_SCRIPT = config['profiling'].getboolean('PROFILE_SCRIPT')
-PROFILE_MODE_DEFAULT = config['profiling'].get('PROFILE_MODE_DEFAULT')
+PROFILE_DEFAULT = config['profiling'].getboolean('PROFILE_DEFAULT')
+PARALLEL_PROFILE_DEFAULT = config['profiling'].getboolean('PARALLEL_PROFILE_DEFAULT')
+PROFILE_DIRECTORY = pathlib.Path(config['profiling'].get('PROFILE_DIRECTORY'))
 
 import logging
 logger = logging.getLogger(__name__.split('.')[-1])
@@ -495,10 +495,9 @@ class InitialValueSolver(SolverBase):
     warmup_iterations : int, optional
         Number of warmup iterations to disregard when computing runtime statistics (default: 10).
     profile : bool, optional
-        Save profiles with cProfile (default: False).
-    profile_mode : string, optional
-        Output joined files [summary] or also include per-core analysis [full] (default: full).
-
+        Save accumulated profiles with cProfile (default: False).
+    parallel_profile : bool, optional
+        Save per-process and accumulated profiles with cProfile (default: False).
     **kw :
         Other options passed to ProblemBase.
 
@@ -524,15 +523,16 @@ class InitialValueSolver(SolverBase):
     matsolver_default = 'MATRIX_FACTORIZER'
     matrices = ['M', 'L']
 
-    def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterations=10, profile=PROFILE_SCRIPT, profile_mode=PROFILE_MODE_DEFAULT, **kw):
+    def __init__(self, problem, timestepper, enforce_real_cadence=100, warmup_iterations=10, profile=PROFILE_DEFAULT, parallel_profile=PARALLEL_PROFILE_DEFAULT, **kw):
         logger.debug('Beginning IVP instantiation')
         # Setup timing and profiling
         self.dist = problem.dist
         self._bcast_array = np.zeros(1, dtype=float)
         self.init_time = self.world_time
-        self.profile = profile
-        if profile:
-            self.profile_mode = profile_mode.lower()
+        if profile or parallel_profile:
+            parallel_mkdir(PROFILE_DIRECTORY, comm=self.dist.comm)
+            self.profile = True
+            self.parallel_profile = parallel_profile
             self.setup_profiler = cProfile.Profile()
             self.warmup_profiler = cProfile.Profile()
             self.run_profiler = cProfile.Profile()
@@ -750,6 +750,7 @@ def log_stats(self, format=".4g"):
             logger.info(f"Timings unavailable because warmup did not complete.")
 
     def dump_profiles(self, profiler, name):
+        "Save profiling data to disk."
         comm = self.dist.comm
         # Disable and create stats on each process
         profiler.create_stats()
@@ -759,19 +760,18 @@ def dump_profiles(self, profiler, name):
         profiles = comm.gather(ProfileWrapper(p.stats), root=0)
         # Sum stats on root process
         if comm.rank == 0:
-            if self.profile_mode=='full':
-                profile_database = pathlib.Path(f"{name}_profiles")
-                stats = {'primcalls':defaultdict(list),'totcalls':defaultdict(list),'tottime':defaultdict(list),'cumtime':defaultdict(list)}
+            if self.parallel_profile:
+                stats = {'primcalls': defaultdict(list),
+                         'totcalls': defaultdict(list),
+                         'tottime': defaultdict(list),
+                         'cumtime': defaultdict(list)}
                 for profile in profiles:
                     for func, (primcalls, totcalls, tottime, cumtime, callers) in profile.stats.items():
                         stats['primcalls'][func].append(primcalls)
                         stats['totcalls'][func].append(totcalls)
                         stats['tottime'][func].append(tottime)
                         stats['cumtime'][func].append(cumtime)
-                with shelve.open(str(profile_database), flag='n') as shelf:
-                    for key in stats:
-                        shelf[key] = stats[key]
-
-            # creation of joint_stats destroys profiles, so do this second
+                pickle.dump(stats, open(PROFILE_DIRECTORY / f"{name}_parallel.pickle", 'wb'))
+            # Creation of joint_stats destroys profiles, so do this second
             joint_stats = pstats.Stats(*profiles)
-            joint_stats.dump_stats(f"{name}.prof")
+            joint_stats.dump_stats(PROFILE_DIRECTORY / f"{name}.prof")
diff --git a/dedalus/dedalus.cfg b/dedalus/dedalus.cfg
index ef4d5baa..edd0595c 100644
--- a/dedalus/dedalus.cfg
+++ b/dedalus/dedalus.cfg
@@ -118,11 +118,15 @@
     # This works around NFS caching issues
     FILEHANDLER_TOUCH_TMPFILE = False
 
-
 [profiling]
 
-    # Profile script using cProfile
-    PROFILE_SCRIPT = False
+    # Default profile setting for solvers
+    # This saves accumulated profiling data using cProfile
+    PROFILE_DEFAULT = False
+
+    # Default parallel profile setting for solvers
+    # This saves per-process and accumulated profiling data using cProfile
+    PARALLEL_PROFILE_DEFAULT = False
 
-    # Level of profiling to conduct (summary, full)
-    PROFILE_MODE_DEFAULT = full
+    # Profile directory base (will be expanded to <PROFILE_DIRECTORY>/runtime.prof, etc)
+    PROFILE_DIRECTORY = profiles
diff --git a/dedalus/tools/parallel.py b/dedalus/tools/parallel.py
index be21c849..b3d64940 100644
--- a/dedalus/tools/parallel.py
+++ b/dedalus/tools/parallel.py
@@ -3,6 +3,7 @@
 
 """
 
+import pathlib
 from mpi4py import MPI
 
 
@@ -67,3 +68,12 @@ def __init__(self, stats):
     def create_stats(self):
         pass
 
+
+def parallel_mkdir(path, comm=MPI.COMM_WORLD):
+    """Create a directory from root process."""
+    path = pathlib.Path(path)
+    with Sync(comm=comm, enter=False, exit=True) as sync:
+        if sync.comm.rank == 0:
+            if not path.exists():
+                path.mkdir()
+