diff --git a/dedalus/core/distributor.py b/dedalus/core/distributor.py index 346080e1..db63a8c7 100644 --- a/dedalus/core/distributor.py +++ b/dedalus/core/distributor.py @@ -788,6 +788,9 @@ def decrement(self, fields): for field in fields: self.decrement_single(field) + def plan_localize_columns(self, plan, data0, data1): + plan.localize_columns(data0, data1) + def increment_single(self, field): """Backward transpose a field.""" plan = self._single_plan(field) @@ -797,11 +800,14 @@ def increment_single(self, field): field.preset_layout(self.layout1) data1 = field.data # Transpose between data views - plan.localize_columns(data0, data1) + self.plan_localize_columns(plan, data0, data1) else: # No communication: just update field layout field.preset_layout(self.layout1) + def plan_localize_rows(self, plan, data0, data1): + plan.localize_rows(data1, data0) + def decrement_single(self, field): """Forward transpose a field.""" plan = self._single_plan(field) @@ -811,7 +817,7 @@ def decrement_single(self, field): field.preset_layout(self.layout0) data0 = field.data # Transpose between data views - plan.localize_rows(data1, data0) + self.plan_localize_rows(plan, data0, data1) else: # No communication: just update field layout field.preset_layout(self.layout0) @@ -828,7 +834,7 @@ def increment_group(self, fields): field.preset_layout(self.layout1) data1 = field.data # Transpose between data views - plan.localize_columns(data0, data1) + self.plan_localize_columns(plan, data0, data1) else: # Gather data across fields data0 = [] @@ -852,7 +858,7 @@ def increment_group(self, fields): else: data1 = np.zeros(0, dtype=fields[0].dtype) # Assumes same dtypes # Transpose between data views - plan.localize_columns(data0, data1) + self.plan_localize_columns(plan, data0, data1) # Split up transposed data i = 0 for field in fields: @@ -877,7 +883,7 @@ def decrement_group(self, fields): field.preset_layout(self.layout0) data0 = field.data # Transpose between data views - plan.localize_rows(data1, data0) + self.plan_localize_rows(plan, data0, data1) else: # Gather data across fields data0 = [] @@ -901,7 +907,7 @@ def decrement_group(self, fields): else: data1 = np.zeros(0, dtype=fields[0].dtype) # Assumes same dtypes # Transpose between data views - plan.localize_rows(data1, data0) + self.plan_localize_rows(plan, data0, data1) # Split up transposed data i = 0 for field in fields: @@ -951,4 +957,3 @@ def decrement_group(self, fields): # # No data: just update field layouts # for field in fields: # field.preset_layout(self.layout0) - diff --git a/dedalus/core/solvers.py b/dedalus/core/solvers.py index c444175d..fd5591fc 100644 --- a/dedalus/core/solvers.py +++ b/dedalus/core/solvers.py @@ -673,15 +673,17 @@ def step(self, dt): # Record times wall_time = self.wall_time if self.iteration == self.initial_iteration: - self.start_time = wall_time + self.start_time_end = wall_time if self.profile: self.dump_profiles(self.setup_profiler, "setup") self.warmup_profiler.enable() + self.warmup_time_start = self.wall_time if self.iteration == self.initial_iteration + self.warmup_iterations: - self.warmup_time = wall_time + self.warmup_time_end = self.wall_time if self.profile: self.dump_profiles(self.warmup_profiler, "warmup") self.run_profiler.enable() + self.run_time_start = self.wall_time # Advance using timestepper self.timestepper.step(dt, wall_time) # Update iteration @@ -732,19 +734,23 @@ def evaluate_handlers(self, handlers=None, dt=0): def log_stats(self, format=".4g"): """Log timing statistics with specified string formatting (optional).""" - log_time = self.wall_time + self.run_time_end = self.wall_time + start_time = self.start_time_end logger.info(f"Final iteration: {self.iteration}") logger.info(f"Final sim time: {self.sim_time}") - logger.info(f"Setup time (init - iter 0): {self.start_time:{format}} sec") + logger.info(f"Setup time (init - iter 0): {start_time:{format}} sec") if self.profile: self.dump_profiles(self.run_profiler, "runtime") if self.iteration >= self.initial_iteration + self.warmup_iterations: - warmup_time = self.warmup_time - self.start_time - run_time = log_time - self.warmup_time + warmup_time = self.warmup_time_end - self.warmup_time_start + run_time = self.run_time_end - self.run_time_start + profile_output_time = (self.warmup_time_start-self.start_time_end) + (self.run_time_start-self.warmup_time_end) cpus = self.dist.comm.size modes = self.total_modes stages = (self.iteration - self.warmup_iterations - self.initial_iteration) * self.timestepper.stages logger.info(f"Warmup time (iter 0-{self.warmup_iterations}): {warmup_time:{format}} sec") + if self.profile: + logger.info(f"Profile output time: {profile_output_time:{format}} sec") logger.info(f"Run time (iter {self.warmup_iterations}-end): {run_time:{format}} sec") logger.info(f"CPU time (iter {self.warmup_iterations}-end): {run_time*cpus/3600:{format}} cpu-hr") logger.info(f"Speed: {(modes*stages/cpus/run_time):{format}} mode-stages/cpu-sec") @@ -777,3 +783,4 @@ def dump_profiles(self, profiler, name): # Creation of joint_stats destroys profiles, so do this second joint_stats = pstats.Stats(*profiles) joint_stats.dump_stats(PROFILE_DIRECTORY / f"{name}.prof") + comm.Barrier() diff --git a/dedalus/core/transposes.pyx b/dedalus/core/transposes.pyx index 2ebb66b7..db98464f 100644 --- a/dedalus/core/transposes.pyx +++ b/dedalus/core/transposes.pyx @@ -1,4 +1,3 @@ - cimport cython cimport numpy as cnp import numpy as np @@ -199,30 +198,50 @@ cdef class FFTWTranspose: cfftw.fftw_destroy_plan(self.CL_to_RL_plan) cfftw.fftw_destroy_plan(self.RL_to_CL_plan) + def make_views(self, RL, CL): + CL_reduced = np.ndarray(shape=self.CL_reduced_shape, dtype=CL.dtype, buffer=CL) + RL_reduced = np.ndarray(shape=self.RL_reduced_shape, dtype=RL.dtype, buffer=RL) + return CL_reduced, RL_reduced + + def fftw_CL_to_RL(self): + cfftw.fftw_execute(self.CL_to_RL_plan) + def localize_rows(self, CL, RL): """Transpose from column-local to row-local data distribution.""" # Create reduced views of data arrays - CL_reduced = np.ndarray(shape=self.CL_reduced_shape, dtype=CL.dtype, buffer=CL) - RL_reduced = np.ndarray(shape=self.RL_reduced_shape, dtype=RL.dtype, buffer=RL) + CL_reduced, RL_reduced = self.make_views(RL, CL) # Transpose from input array to buffer - np.copyto(self.CL_view, CL_reduced) + self.copy_in_CL_fftw(CL_reduced) # Communicate between buffers - cfftw.fftw_execute(self.CL_to_RL_plan) + self.fftw_CL_to_RL() # Transpose from buffer to output array + self.copy_out_RL_fftw(RL_reduced) + + def fftw_RL_to_CL(self): + cfftw.fftw_execute(self.RL_to_CL_plan) + + def copy_in_RL_fftw(self, RL_reduced): + np.copyto(self.RL_view, RL_reduced) + + def copy_out_RL_fftw(self, RL_reduced): np.copyto(RL_reduced, self.RL_view) + def copy_in_CL_fftw(self, CL_reduced): + np.copyto(self.CL_view, CL_reduced) + + def copy_out_CL_fftw(self, CL_reduced): + np.copyto(CL_reduced, self.CL_view) + def localize_columns(self, RL, CL): """Transpose from row-local to column-local data distribution.""" # Create reduced views of data arrays - CL_reduced = np.ndarray(shape=self.CL_reduced_shape, dtype=CL.dtype, buffer=CL) - RL_reduced = np.ndarray(shape=self.RL_reduced_shape, dtype=RL.dtype, buffer=RL) + CL_reduced, RL_reduced = self.make_views(RL, CL) # Transpose from input array to buffer - np.copyto(self.RL_view, RL_reduced) + self.copy_in_RL_fftw(RL_reduced) # Communicate between buffers - cfftw.fftw_execute(self.RL_to_CL_plan) + self.fftw_RL_to_CL() # Transpose from buffer to output array - np.copyto(CL_reduced, self.CL_view) - + self.copy_out_CL_fftw(CL_reduced) cdef class AlltoallvTranspose: """