Skip to content

Commit

Permalink
Fixing shutdown bug when finalize is called without dump on frontier
Browse files Browse the repository at this point in the history
  • Loading branch information
khuck committed Dec 9, 2023
1 parent 797da5e commit eea16b8
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 24 deletions.
23 changes: 7 additions & 16 deletions src/apex/apex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,6 @@ std::shared_ptr<task_wrapper>& top_level_timer() {
}
*/

/**
 * Accessor for the main timer handle.
 *
 * The handle is kept in a function-local static that starts out as nullptr.
 * A mutable reference is returned, so callers can both read the current
 * handle and assign a new one through the returned reference.
 */
std::shared_ptr<task_wrapper>& main_timer() {
    static std::shared_ptr<task_wrapper> the_main_timer{nullptr};
    return the_main_timer;
}

/*
* The destructor will request power data from RCRToolkit
*/
Expand Down Expand Up @@ -545,7 +540,6 @@ uint64_t init(const char * thread_name, uint64_t comm_rank,
// be stopped on the thread that is calling apex::init. You've been warned.
main->explicit_trace_start = true;
start(main);
main_timer() = main;
if (apex_options::top_level_os_threads()) {
// start top-level timer for main thread, it will get automatically
// stopped when the main wrapper timer is stopped.
Expand Down Expand Up @@ -1663,10 +1657,12 @@ void finalize_plugins(void) {
#endif
}

std::string dump(bool reset) {
std::string dump(bool reset, bool finalizing) {
in_apex prevent_deadlocks;
// if APEX is disabled, do nothing.
if (apex_options::disable() == true) { return(std::string("")); }
if (apex_options::disable() == true ||
(!finalizing && apex_options::use_final_output_only()))
{ return(std::string("")); }
bool old_screen_output = apex_options::use_screen_output();
if (apex_options::use_jupyter_support()) {
// force output in the Jupyter notebook
Expand Down Expand Up @@ -1749,11 +1745,7 @@ void finalize(void)
}
// Second, stop the main timer, while the infrastructure is still
// functioning.
tmp = main_timer();
if (tmp != nullptr) {
stop(tmp);
main_timer() = nullptr;
}
instance->the_profiler_listener->stop_main_timer();
// if not done already...
shutdown_throttling(); // stop thread scheduler policies
/* Do this before OTF2 grabs a final timestamp - we might have
Expand All @@ -1763,7 +1755,6 @@ void finalize(void)
for (unsigned int i = 0 ; i < instance->listeners.size() ; i++) {
instance->listeners[i]->on_pre_shutdown();
}
//instance->the_profiler_listener->stop_main_timer();
stop_all_async_threads(); // stop OS/HW monitoring, including PAPI

/* This could take a while */
Expand All @@ -1778,7 +1769,7 @@ void finalize(void)
apex_options::suspend(true);

// now, process all output
dump(false);
dump(false, true);
exit_thread();
if (!_measurement_stopped)
{
Expand Down Expand Up @@ -1905,7 +1896,7 @@ void register_thread(const std::string &name,
// spawned by the main timer.
std::shared_ptr<task_wrapper> twp =
new_task(task_name, UINTMAX_MAX,
(parent == nullptr ? main_timer() : parent));
(parent == nullptr ? task_wrapper::get_apex_main_wrapper() : parent));
start(twp);
//printf("New thread: %p\n", &(*twp));
thread_instance::set_top_level_timer(twp);
Expand Down
3 changes: 2 additions & 1 deletion src/apex/apex_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,11 @@ APEX_EXPORT uint64_t init(const char * thread_name,
- write a profile to disk (if requested)
- output all other visualization data
\param reset Whether to reset all statistics
\param finalizing Whether this is the final dump at shutdown
\return a string containing the output
\sa @ref apex::finalize
*/
APEX_EXPORT std::string dump(bool reset);
APEX_EXPORT std::string dump(bool reset, bool finalizing = false);

/**
\brief Finalize APEX.
Expand Down
4 changes: 3 additions & 1 deletion src/apex/apex_kokkos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ void kokkosp_init_library(int loadseq, uint64_t version,
*/
void kokkosp_finalize_library() {
#ifndef APEX_HAVE_HPX
apex::finalize();
if (!apex::apex_options::use_mpi()) {
apex::finalize();
}
#endif
}

Expand Down
1 change: 1 addition & 0 deletions src/apex/apex_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ inline unsigned int sc_nprocessors_onln(void)
macro (APEX_MEASURE_CONCURRENCY, use_concurrency, int, 0, "Periodically sample thread activity and output report at exit.") \
macro (APEX_MEASURE_CONCURRENCY_MAX_TIMERS, concurrency_max_timers, int, 5, "Maximum number of timers in the concurrency report.") \
macro (APEX_MEASURE_CONCURRENCY_PERIOD, concurrency_period, int, 1000000, "Thread concurrency sampling period, in microseconds.") \
macro (APEX_FINAL_OUTPUT_ONLY, use_final_output_only, bool, false, "Output APEX performance log files only at exit (ignore intermediate dump calls).") \
macro (APEX_SCREEN_OUTPUT, use_screen_output, bool, false, "Output APEX performance summary at exit.") \
macro (APEX_SCREEN_OUTPUT_DETAIL, use_screen_output_detail, bool, false, "Output detailed APEX performance summary at exit.") \
macro (APEX_VERBOSE, use_verbose, bool, false, "Output APEX options at entry.") \
Expand Down
1 change: 1 addition & 0 deletions src/apex/profile_reducer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ std::map<std::string, apex_profile*> reduce_profiles_for_screen() {
std::vector<std::vector<std::string>> *rows = new std::vector<std::vector<std::string>>{};
treemerge::ThreadPool pool{};
pool.Start();
treemerge::node::reset();
treemerge::node * root{nullptr};
std::cout << "Merging common tree for all ranks... ";
auto start = high_resolution_clock::now();
Expand Down
11 changes: 6 additions & 5 deletions src/apex/profiler_listener.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ std::unordered_set<profile*> free_profiles;
total_time = get_profile(main_id);
}
#endif // APEX_SYNCHRONOUS_PROCESSING
double wall_clock_main = total_time->get_accumulated_seconds();
double wall_clock_main = (total_time != nullptr) ? total_time->get_accumulated_seconds() : 0.0;
#ifdef APEX_HAVE_HPX
num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
#endif
Expand Down Expand Up @@ -953,7 +953,7 @@ std::unordered_set<profile*> free_profiles;
int num_worker_threads = thread_instance::get_num_workers();
auto main_id = task_identifier::get_main_task_id();
profile * total_time = get_profile(*main_id);
double wall_clock_main = total_time->get_accumulated_seconds();
double wall_clock_main = (total_time != nullptr) ? total_time->get_accumulated_seconds() : 0.0;
#ifdef APEX_HAVE_HPX
num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
#endif
Expand Down Expand Up @@ -982,7 +982,7 @@ std::unordered_set<profile*> free_profiles;
dep != task_dependencies.end(); dep++) {
task_identifier parent = dep->first;
string parent_name = parent.get_tree_name();
if (parent_name.compare("APEX MAIN") == 0 ||
if (parent_name.compare(APEX_MAIN_STR) == 0 ||
parent_name.substr(0, pthread_wrapper.size()) == pthread_wrapper ||
parent_name.substr(0, preload_main.size()) == preload_main) {
auto children = dep->second;
Expand All @@ -1001,7 +1001,7 @@ std::unordered_set<profile*> free_profiles;
dep != task_dependencies.end(); dep++) {
task_identifier parent = dep->first;
string parent_name = parent.get_tree_name();
if (parent_name.compare("APEX MAIN") != 0 &&
if (parent_name.compare(APEX_MAIN_STR) != 0 &&
parent_name.substr(0, pthread_wrapper.size()) != pthread_wrapper &&
parent_name.substr(0, preload_main.size()) != preload_main) {
auto children = dep->second;
Expand Down Expand Up @@ -1131,7 +1131,7 @@ std::unordered_set<profile*> free_profiles;
// our TOTAL available time is the elapsed * the number of threads, or cores
auto main_id = task_identifier::get_main_task_id();
profile * total_time = get_profile(*main_id);
double wall_clock_main = total_time->get_accumulated_seconds();
double wall_clock_main = (total_time != nullptr) ? total_time->get_accumulated_seconds() : 0.0;

#if 0
int num_worker_threads = thread_instance::get_num_workers();
Expand Down Expand Up @@ -2171,6 +2171,7 @@ if (rc != 0) cout << "PAPI error! " << name << ": " << PAPI_strerror(rc) << endl

/* Pre-shutdown hook.
 * Stops the main timer first, then passes the dereferenced main_timer to
 * push_profiler() together with the calling thread's id. */
void profiler_listener::on_pre_shutdown(void) {
    stop_main_timer();
    push_profiler(static_cast<unsigned int>(thread_instance::get_id()),
                  *main_timer);
}

void profiler_listener::push_profiler_public(std::shared_ptr<profiler> &p) {
Expand Down
2 changes: 1 addition & 1 deletion src/apex/task_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ struct task_wrapper {
if (tt_ptr.get() == nullptr) {
mtx.lock();
if (tt_ptr.get() == nullptr) {
const std::string apex_main_str("APEX MAIN");
const std::string apex_main_str(APEX_MAIN_STR);
tt_ptr = std::make_shared<task_wrapper>();
tt_ptr->task_id = task_identifier::get_task_id(apex_main_str);
tt_ptr->tree_node = new dependency::Node(tt_ptr->task_id, nullptr);
Expand Down
15 changes: 15 additions & 0 deletions src/scripts/apex_exec
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ where APEX options are zero or more of:
--apex:screen enable screen text output (on by default)
--apex:screen-detail enable detailed text output (off by default)
--apex:quiet disable screen text output
--apex:final-output-only only output performance data at exit (ignore intermediate dump calls)
--apex:csv enable csv text output
--apex:tau enable tau profile output
--apex:taskgraph enable taskgraph output
Expand Down Expand Up @@ -97,6 +98,7 @@ where APEX options are zero or more of:
exit 1
}

apex_opts=yes
openacc=no
kokkos=no
kokkos_tuning=no
Expand Down Expand Up @@ -252,6 +254,10 @@ while (( "$#" )); do
# on by default
shift
;;
--apex:final-output-only)
export APEX_FINAL_OUTPUT_ONLY=1
shift
;;
--apex:screen_details|--apex:screen-details)
screen=yes
export APEX_SCREEN_OUTPUT_DETAIL=1
Expand Down Expand Up @@ -481,11 +487,20 @@ while (( "$#" )); do
usage
fi
;;
--apex:help|--help|-h)
if [ $apex_opts = yes ] ; then
usage
fi
# Could be a program argument!
PARAMS="$PARAMS $1"
shift
;;
--apex:*) # unsupported flags
echo "Error: Unsupported flag $1" >&2
usage
;;
*) # preserve positional arguments
apex_opts=no
if [ "$prog" = "" ] ; then
prog=$1
fi
Expand Down

0 comments on commit eea16b8

Please sign in to comment.