diff --git a/src/apex/apex_ompt.cpp b/src/apex/apex_ompt.cpp index 91dc8c81..2acb20e4 100644 --- a/src/apex/apex_ompt.cpp +++ b/src/apex/apex_ompt.cpp @@ -138,12 +138,15 @@ class Globals{ }; -std::shared_ptr start_async_task(const std::string &name, uint32_t correlationId) { +std::shared_ptr start_async_task(const std::string &name, uint32_t correlationId, long unsigned int& parent_thread) { apex::in_apex prevent_deadlocks; // get the parent GUID, then erase the correlation from the map std::shared_ptr parent = nullptr; if (correlationId > 0) { parent = Globals::find_timer(correlationId); + if (parent != nullptr) { + parent_thread = parent->thread_id; + } } // create a task_wrapper, as a GPU child of the parent on the CPU side std::shared_ptr tt = apex::new_task(name, UINT64_MAX, parent); @@ -151,13 +154,12 @@ std::shared_ptr start_async_task(const std::string &name, ui } void stop_async_task(std::shared_ptr tt, uint64_t start, uint64_t end, - uint32_t correlationId, apex::base_thread_node &node) { + uint32_t correlationId, apex::ompt_thread_node &node) { // create an APEX profiler to store this data - we can't start // then stop because we have timestamps already. auto prof = std::make_shared(tt); prof->set_start(start + Globals::delta()); prof->set_end(end + Globals::delta()); - std::cout << __func__ << prof->get_start_ns() << " " << prof->get_stop_ns() << std::endl; // important! Otherwise we might get the wrong end timestamp. prof->stopped = true; // Get the singleton APEX instance @@ -190,7 +192,7 @@ void stop_async_task(std::shared_ptr tt, uint64_t start, uin } void store_profiler_data(const std::string &name, - uint64_t start, uint64_t end, apex::base_thread_node &node, + uint64_t start, uint64_t end, apex::ompt_thread_node &node, std::shared_ptr parent, bool otf2_trace = true) { apex::in_apex prevent_deadlocks; apex::async_event_data as_data; @@ -229,7 +231,7 @@ void store_profiler_data(const std::string &name, /* Handle counters from asynchronous activity */ void store_counter_data(const char * name, const std::string& ctx, - uint64_t end, double value, apex::base_thread_node &node) { + uint64_t end, double value, apex::ompt_thread_node &node) { apex::in_apex prevent_deadlocks; std::stringstream ss; if (name == nullptr) { @@ -263,7 +265,7 @@ void store_counter_data(const char * name, const std::string& ctx, } void store_counter_data(const char * name, const std::string& ctx, - uint64_t end, size_t value, apex::base_thread_node &node) { + uint64_t end, size_t value, apex::ompt_thread_node &node) { store_counter_data(name, ctx, end, (double)(value), node); } @@ -279,7 +281,9 @@ static void print_record_ompt(ompt_record_ompt_t *rec) { static std::unordered_map active_target_devices; static std::unordered_map> target_map; static std::unordered_map target_start_times; + static std::unordered_map target_parent_thread_ids; static std::mutex target_lock; + long unsigned int parent_thread = 0; switch (rec->type) { case ompt_callback_target: @@ -296,12 +300,13 @@ static void print_record_ompt(ompt_record_ompt_t *rec) { ss << ": UNRESOLVED ADDR " << target_rec.codeptr_ra; } std::string name{ss.str()}; - auto tt = start_async_task(name, rec->target_id); + auto tt = start_async_task(name, rec->target_id, parent_thread); std::unique_lock l(target_lock); target_map[rec->target_id] = tt; target_start_times[rec->target_id] = rec->time; active_target_addrs[rec->target_id] = target_rec.codeptr_ra; active_target_devices[rec->target_id] = target_rec.device_num; + target_parent_thread_ids[rec->target_id] = parent_thread; } else if (target_rec.endpoint == ompt_scope_end) { std::shared_ptr tt; uint64_t start; @@ -309,15 +314,17 @@ static void print_record_ompt(ompt_record_ompt_t *rec) { std::unique_lock l(target_lock); tt = target_map[rec->target_id]; start = target_start_times[rec->target_id]; + parent_thread = target_parent_thread_ids[rec->target_id]; active_target_addrs.erase(rec->target_id); active_target_devices.erase(rec->target_id); target_map.erase(rec->target_id); target_start_times.erase(rec->target_id); + target_parent_thread_ids.erase(rec->target_id); } /* If we have a target region with a device id of -1, we might not get a target region start event - so ignore this end event for now. */ if (tt != nullptr) { - apex::base_thread_node node(target_rec.device_num, APEX_ASYNC_KERNEL); + apex::ompt_thread_node node(target_rec.device_num, parent_thread, APEX_ASYNC_KERNEL); stop_async_task(tt, start, rec->time, rec->target_id, node); } } @@ -335,7 +342,6 @@ static void print_record_ompt(ompt_record_ompt_t *rec) { target_data_op_rec.bytes, target_data_op_rec.end_time, target_data_op_rec.end_time - rec->time, target_data_op_rec.codeptr_ra); - apex::base_thread_node node(target_data_op_rec.dest_device_num, APEX_ASYNC_MEMORY); std::stringstream ss; ss << "GPU: OpenMP Target DataOp"; switch (target_data_op_rec.optype) { @@ -386,11 +392,14 @@ static void print_record_ompt(ompt_record_ompt_t *rec) { std::unique_lock l(target_lock); tt = target_map[rec->target_id]; codeptr_ra = active_target_addrs[rec->target_id]; + parent_thread = target_parent_thread_ids[rec->target_id]; } if (codeptr_ra != nullptr) { ss << ": UNRESOLVED ADDR " << codeptr_ra; } std::string name{ss.str()}; + apex::ompt_thread_node node(target_data_op_rec.dest_device_num, + parent_thread, APEX_ASYNC_MEMORY); store_profiler_data(name, rec->time, target_data_op_rec.end_time, node, tt); store_counter_data("OpenMP Target DataOp", "Bytes", target_data_op_rec.end_time, @@ -416,12 +425,14 @@ static void print_record_ompt(ompt_record_ompt_t *rec) { tt = target_map[rec->target_id]; codeptr_ra = active_target_addrs[rec->target_id]; device_num = active_target_devices[rec->target_id]; + parent_thread = target_parent_thread_ids[rec->target_id]; } if (codeptr_ra != nullptr) { ss << ": UNRESOLVED ADDR " << codeptr_ra; } std::string name{ss.str()}; - apex::base_thread_node node(device_num, APEX_ASYNC_KERNEL); + apex::ompt_thread_node node(device_num, parent_thread, + APEX_ASYNC_KERNEL); store_profiler_data(name, rec->time, target_kernel_rec.end_time, node, tt); break; @@ -438,7 +449,7 @@ static void print_record_ompt(ompt_record_ompt_t *rec) { // free is used for corresponding malloc static void delete_buffer_ompt(ompt_buffer_t *buffer) { free(buffer); - printf("Deallocated %p\n", buffer); + DEBUG_PRINT("Deallocated %p\n", buffer); } /* Function pointers. These are all queried from the runtime during @@ -1228,12 +1239,15 @@ extern "C" void apex_ompt_work ( sprintf(regionIDstr, "OpenMP Work %s", tmp_str); apex_ompt_start(regionIDstr, task_data, parallel_data, true); } + APEX_UNUSED(count_type); + /* if (apex::apex_options::ompt_high_overhead_events()) { std::stringstream ss; ss << count_type << ": " << regionIDstr; std::string tmp{ss.str()}; apex::sample_value(tmp, count); } + */ } else { DEBUG_PRINT("%" PRId64 ": %s End task: %p, region: %p\n", apex_threadid, tmp_str, (void*)task_data, (void*)parallel_data); diff --git a/src/apex/async_thread_node.hpp b/src/apex/async_thread_node.hpp index 1d85453d..e4df4351 100644 --- a/src/apex/async_thread_node.hpp +++ b/src/apex/async_thread_node.hpp @@ -17,45 +17,19 @@ namespace apex { class base_thread_node { public: uint32_t _device; - apex_async_activity_t _activity; - base_thread_node(uint32_t device, apex_async_activity_t activity) : - _device(device), _activity(activity) { } - virtual bool operator==(const base_thread_node &rhs) const { - return (_device == rhs._device && _activity == rhs._activity); - } - virtual bool operator<(const base_thread_node &rhs) const { - if (_device(node,id_shifted)); std::stringstream ss; ss << fixed; ss << "{\"name\":\"thread_name\"" << ",\"ph\":\"M\",\"pid\":" << saved_node_id << ",\"tid\":" << id_shifted - << ",\"args\":{\"name\":"; + << ",\"args\":{\"name\":\""; ss << node.name(); //ss << "" << activity_to_string(node._activity); ss << "\""; @@ -270,7 +268,7 @@ std::string trace_event_listener::make_tid (base_thread_node &node) { ss << "{\"name\":\"thread_sort_index\"" << ",\"ph\":\"M\",\"pid\":" << saved_node_id << ",\"tid\":" << id_shifted - << ",\"args\":{\"sort_index\":" << UINT32_MAX << "}},\n"; + << ",\"args\":{\"sort_index\":" << id_shifted << "}},\n"; write_to_trace(ss); } tid = vthread_map[node];