Skip to content

Commit

Permalink
Profiling enhancements (#288)
Browse files Browse the repository at this point in the history
* Adds subroutine labelling via stub functions to help in profiling external calls with cProfile

* Adds profiling to transposes.pyx and extensive stub-labelling to help identify where we are spending time.

* Cleans up labelling.

* Cython profiling can be controlled using the CYTHON_PROFILE environment variable during cythonization or pip installation.

* Adds barrier at end of solvers.dump_stats to remove confusion caused by writing out profiling data.

* Adds additional timers to ensure that time related to dumping out profiles to disk (by core 0) is not included in overall timing.

---------

Co-authored-by: Benjamin Brown <bpbrown@gmail.com>
  • Loading branch information
kburns and bpbrown authored Mar 14, 2024
1 parent 4f96bb9 commit ac3c52b
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 24 deletions.
19 changes: 12 additions & 7 deletions dedalus/core/distributor.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,9 @@ def decrement(self, fields):
for field in fields:
self.decrement_single(field)

def plan_localize_columns(self, plan, data0, data1):
plan.localize_columns(data0, data1)

def increment_single(self, field):
"""Backward transpose a field."""
plan = self._single_plan(field)
Expand All @@ -797,11 +800,14 @@ def increment_single(self, field):
field.preset_layout(self.layout1)
data1 = field.data
# Transpose between data views
plan.localize_columns(data0, data1)
self.plan_localize_columns(plan, data0, data1)
else:
# No communication: just update field layout
field.preset_layout(self.layout1)

def plan_localize_rows(self, plan, data0, data1):
plan.localize_rows(data1, data0)

def decrement_single(self, field):
"""Forward transpose a field."""
plan = self._single_plan(field)
Expand All @@ -811,7 +817,7 @@ def decrement_single(self, field):
field.preset_layout(self.layout0)
data0 = field.data
# Transpose between data views
plan.localize_rows(data1, data0)
self.plan_localize_rows(plan, data0, data1)
else:
# No communication: just update field layout
field.preset_layout(self.layout0)
Expand All @@ -828,7 +834,7 @@ def increment_group(self, fields):
field.preset_layout(self.layout1)
data1 = field.data
# Transpose between data views
plan.localize_columns(data0, data1)
self.plan_localize_columns(plan, data0, data1)
else:
# Gather data across fields
data0 = []
Expand All @@ -852,7 +858,7 @@ def increment_group(self, fields):
else:
data1 = np.zeros(0, dtype=fields[0].dtype) # Assumes same dtypes
# Transpose between data views
plan.localize_columns(data0, data1)
self.plan_localize_columns(plan, data0, data1)
# Split up transposed data
i = 0
for field in fields:
Expand All @@ -877,7 +883,7 @@ def decrement_group(self, fields):
field.preset_layout(self.layout0)
data0 = field.data
# Transpose between data views
plan.localize_rows(data1, data0)
self.plan_localize_rows(plan, data0, data1)
else:
# Gather data across fields
data0 = []
Expand All @@ -901,7 +907,7 @@ def decrement_group(self, fields):
else:
data1 = np.zeros(0, dtype=fields[0].dtype) # Assumes same dtypes
# Transpose between data views
plan.localize_rows(data1, data0)
self.plan_localize_rows(plan, data0, data1)
# Split up transposed data
i = 0
for field in fields:
Expand Down Expand Up @@ -951,4 +957,3 @@ def decrement_group(self, fields):
# # No data: just update field layouts
# for field in fields:
# field.preset_layout(self.layout0)

19 changes: 13 additions & 6 deletions dedalus/core/solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -673,15 +673,17 @@ def step(self, dt):
# Record times
wall_time = self.wall_time
if self.iteration == self.initial_iteration:
self.start_time = wall_time
self.start_time_end = wall_time
if self.profile:
self.dump_profiles(self.setup_profiler, "setup")
self.warmup_profiler.enable()
self.warmup_time_start = self.wall_time
if self.iteration == self.initial_iteration + self.warmup_iterations:
self.warmup_time = wall_time
self.warmup_time_end = self.wall_time
if self.profile:
self.dump_profiles(self.warmup_profiler, "warmup")
self.run_profiler.enable()
self.run_time_start = self.wall_time
# Advance using timestepper
self.timestepper.step(dt, wall_time)
# Update iteration
Expand Down Expand Up @@ -732,19 +734,23 @@ def evaluate_handlers(self, handlers=None, dt=0):

def log_stats(self, format=".4g"):
"""Log timing statistics with specified string formatting (optional)."""
log_time = self.wall_time
self.run_time_end = self.wall_time
start_time = self.start_time_end
logger.info(f"Final iteration: {self.iteration}")
logger.info(f"Final sim time: {self.sim_time}")
logger.info(f"Setup time (init - iter 0): {self.start_time:{format}} sec")
logger.info(f"Setup time (init - iter 0): {start_time:{format}} sec")
if self.profile:
self.dump_profiles(self.run_profiler, "runtime")
if self.iteration >= self.initial_iteration + self.warmup_iterations:
warmup_time = self.warmup_time - self.start_time
run_time = log_time - self.warmup_time
warmup_time = self.warmup_time_end - self.warmup_time_start
run_time = self.run_time_end - self.run_time_start
profile_output_time = (self.warmup_time_start-self.start_time_end) + (self.run_time_start-self.warmup_time_end)
cpus = self.dist.comm.size
modes = self.total_modes
stages = (self.iteration - self.warmup_iterations - self.initial_iteration) * self.timestepper.stages
logger.info(f"Warmup time (iter 0-{self.warmup_iterations}): {warmup_time:{format}} sec")
if self.profile:
logger.info(f"Profile output time: {profile_output_time:{format}} sec")
logger.info(f"Run time (iter {self.warmup_iterations}-end): {run_time:{format}} sec")
logger.info(f"CPU time (iter {self.warmup_iterations}-end): {run_time*cpus/3600:{format}} cpu-hr")
logger.info(f"Speed: {(modes*stages/cpus/run_time):{format}} mode-stages/cpu-sec")
Expand Down Expand Up @@ -777,3 +783,4 @@ def dump_profiles(self, profiler, name):
# Creation of joint_stats destroys profiles, so do this second
joint_stats = pstats.Stats(*profiles)
joint_stats.dump_stats(PROFILE_DIRECTORY / f"{name}.prof")
comm.Barrier()
41 changes: 30 additions & 11 deletions dedalus/core/transposes.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

cimport cython
cimport numpy as cnp
import numpy as np
Expand Down Expand Up @@ -199,30 +198,50 @@ cdef class FFTWTranspose:
cfftw.fftw_destroy_plan(self.CL_to_RL_plan)
cfftw.fftw_destroy_plan(self.RL_to_CL_plan)

def make_views(self, RL, CL):
CL_reduced = np.ndarray(shape=self.CL_reduced_shape, dtype=CL.dtype, buffer=CL)
RL_reduced = np.ndarray(shape=self.RL_reduced_shape, dtype=RL.dtype, buffer=RL)
return CL_reduced, RL_reduced

def fftw_CL_to_RL(self):
cfftw.fftw_execute(self.CL_to_RL_plan)

def localize_rows(self, CL, RL):
"""Transpose from column-local to row-local data distribution."""
# Create reduced views of data arrays
CL_reduced = np.ndarray(shape=self.CL_reduced_shape, dtype=CL.dtype, buffer=CL)
RL_reduced = np.ndarray(shape=self.RL_reduced_shape, dtype=RL.dtype, buffer=RL)
CL_reduced, RL_reduced = self.make_views(RL, CL)
# Transpose from input array to buffer
np.copyto(self.CL_view, CL_reduced)
self.copy_in_CL_fftw(CL_reduced)
# Communicate between buffers
cfftw.fftw_execute(self.CL_to_RL_plan)
self.fftw_CL_to_RL()
# Transpose from buffer to output array
self.copy_out_RL_fftw(RL_reduced)

def fftw_RL_to_CL(self):
cfftw.fftw_execute(self.RL_to_CL_plan)

def copy_in_RL_fftw(self, RL_reduced):
np.copyto(self.RL_view, RL_reduced)

def copy_out_RL_fftw(self, RL_reduced):
np.copyto(RL_reduced, self.RL_view)

def copy_in_CL_fftw(self, CL_reduced):
np.copyto(self.CL_view, CL_reduced)

def copy_out_CL_fftw(self, CL_reduced):
np.copyto(CL_reduced, self.CL_view)

def localize_columns(self, RL, CL):
"""Transpose from row-local to column-local data distribution."""
# Create reduced views of data arrays
CL_reduced = np.ndarray(shape=self.CL_reduced_shape, dtype=CL.dtype, buffer=CL)
RL_reduced = np.ndarray(shape=self.RL_reduced_shape, dtype=RL.dtype, buffer=RL)
CL_reduced, RL_reduced = self.make_views(RL, CL)
# Transpose from input array to buffer
np.copyto(self.RL_view, RL_reduced)
self.copy_in_RL_fftw(RL_reduced)
# Communicate between buffers
cfftw.fftw_execute(self.RL_to_CL_plan)
self.fftw_RL_to_CL()
# Transpose from buffer to output array
np.copyto(CL_reduced, self.CL_view)

self.copy_out_CL_fftw(CL_reduced)

cdef class AlltoallvTranspose:
"""
Expand Down

0 comments on commit ac3c52b

Please sign in to comment.