diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index 38cc62684..08f9db5f4 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -26,6 +26,7 @@ #include "library/components/functors.hpp" #include "library/components/fwd.hpp" #include "library/components/mpi_gotcha.hpp" +#include "library/components/pthread_create_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/components/pthread_mutex_gotcha.hpp" #include "library/config.hpp" @@ -752,13 +753,13 @@ omnitrace_init_tooling_hidden() using CategoryT = std::decay_t; uint64_t _ts = comp::wall_clock::record(); TRACE_EVENT_BEGIN(trait::name::value, perfetto::StaticString(name), - _ts); + _ts, "begin_ns", _ts); }; auto _pop_perfetto = [](auto _category, const char*) { using CategoryT = std::decay_t; uint64_t _ts = comp::wall_clock::record(); - TRACE_EVENT_END(trait::name::value, _ts); + TRACE_EVENT_END(trait::name::value, _ts, "end_ns", _ts); }; if(get_use_perfetto() && get_use_timemory()) @@ -855,6 +856,8 @@ omnitrace_init_tooling_hidden() if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n"); + pthread_create_gotcha::get_execution_time()->first = comp::wall_clock::record(); + return true; } @@ -949,6 +952,8 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a { get_gotcha_bundle()->start(); } + + pthread_create_gotcha::get_execution_time()->first = comp::wall_clock::record(); } //======================================================================================// @@ -970,6 +975,7 @@ omnitrace_finalize_hidden(void) } OMNITRACE_VERBOSE_F(0, "finalizing...\n"); + pthread_create_gotcha::get_execution_time()->second = comp::wall_clock::record(); // some functions called during finalization may alter the push/pop count so we need // to save them here diff --git a/source/lib/omnitrace/library/components/backtrace.cpp b/source/lib/omnitrace/library/components/backtrace.cpp index 8445dd4d0..fa15a0352 100644 --- a/source/lib/omnitrace/library/components/backtrace.cpp +++ b/source/lib/omnitrace/library/components/backtrace.cpp @@ -21,6 +21,7 @@ // SOFTWARE. #include "library/components/fwd.hpp" +#include "library/components/pthread_create_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/components/rocm_smi.hpp" #include "library/config.hpp" @@ -222,7 +223,7 @@ backtrace::size() const return m_size; } -backtrace::time_point_type +uint64_t backtrace::get_timestamp() const { return m_ts; @@ -273,7 +274,7 @@ backtrace::sample(int signum) m_size = 0; m_tid = threading::get_id(); - m_ts = clock_type::now(); + m_ts = comp::wall_clock::record(); m_thr_cpu_ts = tim::get_clock_thread_now(); auto _cache = tim::rusage_cache{ RUSAGE_THREAD }; m_mem_peak = _cache.get_peak_rss(); @@ -432,6 +433,9 @@ backtrace::post_process(int64_t _tid) return; } + _init->m_ts = std::max( + _init->m_ts, pthread_create_gotcha::get_execution_time(_tid)->first); + // check whether the call-stack entry should be used. -1 means break, 0 means continue auto _use_label = [](const std::string& _lbl, bool _check_internal) -> short { // debugging feature @@ -477,14 +481,15 @@ backtrace::post_process(int64_t _tid) : std::vector{}; auto _process_perfetto_counters = [&](const std::vector& _data) { + auto _tid_name = JOIN("", '[', _tid, ']'); if(!perfetto_counter_track::exists(_tid)) { perfetto_counter_track::emplace( - _tid, JOIN("", "Peak Memory Usage", " [Thread ", _tid, "] (S)"), "MB"); + _tid, JOIN(' ', "Thread Peak Memory Usage", _tid_name, "(S)"), "MB"); perfetto_counter_track::emplace( - _tid, JOIN("", "Context Switches", " [Thread ", _tid, "] (S)")); + _tid, JOIN(' ', "Thread Context Switches", _tid_name, "(S)")); perfetto_counter_track::emplace( - _tid, JOIN("", "Page Faults", " [Thread ", _tid, "] (S)")); + _tid, JOIN(' ', "Thread Page Faults", _tid_name, "(S)")); } if(!perfetto_counter_track::exists(_tid) && @@ -493,10 +498,8 @@ backtrace::post_process(int64_t _tid) for(auto& itr : _hw_cnt_labels) { perfetto_counter_track::emplace( - _tid, - JOIN("", tim::papi::get_event_info(itr).short_descr, " [Thread ", - _tid, "] (S)"), - ""); + _tid, JOIN(' ', "Thread", tim::papi::get_event_info(itr).short_descr, + _tid_name, "(S)")); } } @@ -507,7 +510,9 @@ backtrace::post_process(int64_t _tid) const auto* _bt = ditr->get(); if(_bt->m_tid != _tid) continue; - auto _ts = static_cast(_bt->m_ts.time_since_epoch().count()); + auto _ts = _bt->m_ts; + if(!pthread_create_gotcha::is_valid_execution_time(_tid, _ts)) continue; + _last_bt = _bt; _mean_ts += _ts; @@ -540,8 +545,7 @@ backtrace::post_process(int64_t _tid) if(_tid > 0 && _last_bt) { - auto _ts = static_cast(_last_bt->m_ts.time_since_epoch().count()) + - (_mean_ts / _data.size()); + auto _ts = pthread_create_gotcha::get_execution_time(_tid)->second; uint64_t _zero = 0; TRACE_COUNTER("sampling", perfetto_counter_track::at(_tid, 0), _ts, @@ -575,7 +579,7 @@ backtrace::post_process(int64_t _tid) bool _rename) { if(_rename) threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid, "(S)").c_str()); - time_point_type _last_wall_ts = _init->get_timestamp(); + auto _last_wall_ts = _init->get_timestamp(); for(const auto& ditr : _data) { @@ -594,12 +598,13 @@ backtrace::post_process(int64_t _tid) if(_use == 0) continue; auto sitr = _static_strings.emplace(_name); _last = *sitr.first; - auto _ts = static_cast(_bt->m_ts.time_since_epoch().count()); + auto _ts = _bt->m_ts; + if(!pthread_create_gotcha::is_valid_execution_time(_tid, _ts)) continue; - TRACE_EVENT_BEGIN( - "sampling", perfetto::StaticString{ sitr.first->c_str() }, - static_cast(_last_wall_ts.time_since_epoch().count())); - TRACE_EVENT_END("sampling", _ts); + TRACE_EVENT_BEGIN("sampling", + perfetto::StaticString{ sitr.first->c_str() }, + _last_wall_ts, "begin_ns", _last_wall_ts); + TRACE_EVENT_END("sampling", _ts, "end_ns", _ts); } _last_wall_ts = _bt->m_ts; } @@ -623,6 +628,7 @@ backtrace::post_process(int64_t _tid) continue; } if(_bt->empty()) continue; + if(!pthread_create_gotcha::is_valid_execution_time(_tid, _bt->m_ts)) continue; _data.emplace_back(&ritr); } } @@ -667,9 +673,10 @@ backtrace::post_process(int64_t _tid) auto* _bt = ditr->get(); + if(!pthread_create_gotcha::is_valid_execution_time(_tid, _bt->m_ts)) continue; if(_bt->m_ts < _last_bt->m_ts) continue; - double _elapsed_wc = (_bt->m_ts - _last_bt->m_ts).count(); + double _elapsed_wc = (_bt->m_ts - _last_bt->m_ts); double _elapsed_cc = (_bt->m_thr_cpu_ts - _last_bt->m_thr_cpu_ts); std::vector _tc{}; diff --git a/source/lib/omnitrace/library/components/backtrace.hpp b/source/lib/omnitrace/library/components/backtrace.hpp index ba3955279..006148f71 100644 --- a/source/lib/omnitrace/library/components/backtrace.hpp +++ b/source/lib/omnitrace/library/components/backtrace.hpp @@ -28,6 +28,7 @@ #include "library/thread_data.hpp" #include "library/timemory.hpp" +#include #include #include #include @@ -57,7 +58,6 @@ struct backtrace using data_t = std::array; using clock_type = std::chrono::steady_clock; - using time_point_type = typename clock_type::time_point; using value_type = void; using hw_counters = tim::component::papi_array; using hw_counter_data_t = typename hw_counters::value_type; @@ -88,7 +88,7 @@ struct backtrace bool empty() const; size_t size() const; std::vector get() const; - time_point_type get_timestamp() const; + uint64_t get_timestamp() const; int64_t get_thread_cpu_timestamp() const; private: @@ -97,8 +97,8 @@ struct backtrace int64_t m_mem_peak = 0; int64_t m_ctx_swch = 0; int64_t m_page_flt = 0; + uint64_t m_ts = {}; size_t m_size = 0; - time_point_type m_ts = {}; data_t m_data = {}; hw_counter_data_t m_hw_counter = {}; }; diff --git a/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp index aaed456bb..7d1389000 100644 --- a/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp @@ -29,6 +29,7 @@ #include "library/runtime.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" +#include "timemory/components/timing/wall_clock.hpp" #include #include @@ -130,7 +131,11 @@ pthread_create_gotcha::wrapper::operator()() const sampling::shutdown(); } - pthread_create_gotcha::shutdown(_tid); + if(_tid >= 0) + { + get_execution_time(_tid)->second = comp::wall_clock::record(); + pthread_create_gotcha::shutdown(_tid); + } set_thread_state(ThreadState::Completed); } }; @@ -148,7 +153,8 @@ pthread_create_gotcha::wrapper::operator()() const .first->second; } if(_bundle) start_bundle(*_bundle); - get_cpu_cid_stack(threading::get_id(), m_parent_tid); + get_execution_time(_tid)->first = comp::wall_clock::record(); + get_cpu_cid_stack(_tid, m_parent_tid); if(m_enable_sampling) { // initialize thread-local statics @@ -298,4 +304,10 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, return _ret; } +bool +pthread_create_gotcha::is_valid_execution_time(int64_t _tid, uint64_t _ts) +{ + return (_ts >= get_execution_time(_tid)->first && + _ts <= get_execution_time(_tid)->second); +} } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp b/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp index f6e20e6d3..7e6b466a6 100644 --- a/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp +++ b/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp @@ -24,6 +24,7 @@ #include "library/common.hpp" #include "library/defines.hpp" +#include "library/thread_data.hpp" #include "library/timemory.hpp" #include @@ -64,8 +65,22 @@ struct pthread_create_gotcha : tim::component::base // pthread_create int operator()(pthread_t* thread, const pthread_attr_t* attr, void* (*start_routine)(void*), void* arg) const; + + static auto& get_execution_time(int64_t _tid = threading::get_id()); + static bool is_valid_execution_time(int64_t _tid, uint64_t _ts); }; +inline auto& +pthread_create_gotcha::get_execution_time(int64_t _tid) +{ + struct omnitrace_thread_exec_time + {}; + using data_t = std::pair; + using thread_data_t = thread_data; + static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); + return _v.at(_tid); +} + using pthread_create_gotcha_t = tim::component::gotcha<2, std::tuple<>, pthread_create_gotcha>; } // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/rocm_smi.cpp b/source/lib/omnitrace/library/components/rocm_smi.cpp index 7bb0f1fb4..8203c65cb 100644 --- a/source/lib/omnitrace/library/components/rocm_smi.cpp +++ b/source/lib/omnitrace/library/components/rocm_smi.cpp @@ -33,6 +33,7 @@ #include "library/components/rocm_smi.hpp" #include "library/common.hpp" #include "library/components/fwd.hpp" +#include "library/components/pthread_create_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" #include "library/config.hpp" #include "library/critical_trace.hpp" @@ -237,11 +238,13 @@ data::post_process(uint32_t _dev_id) counter_track::emplace(_dev_id, addendum("Power"), "watts"); counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes"); } - uint64_t _ts = itr.m_ts; - double _busy = itr.m_busy_perc; - double _temp = itr.m_temp / 1.0e3; - double _power = itr.m_power / 1.0e6; - double _usage = itr.m_mem_usage / static_cast(units::megabyte); + uint64_t _ts = itr.m_ts; + if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue; + + double _busy = itr.m_busy_perc; + double _temp = itr.m_temp / 1.0e3; + double _power = itr.m_power / 1.0e6; + double _usage = itr.m_mem_usage / static_cast(units::megabyte); TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 0), _ts, _busy); TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 1), _ts, _temp); TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 2), _ts, _power); @@ -262,6 +265,8 @@ data::post_process(uint32_t _dev_id) { using entry_t = critical_trace::entry; auto _ts = itr.m_ts; + if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue; + auto _entries = critical_trace::get_entries(_ts, [](const entry_t& _e) { return _e.device == critical_trace::Device::GPU; }); diff --git a/source/lib/omnitrace/library/components/roctracer_callbacks.cpp b/source/lib/omnitrace/library/components/roctracer_callbacks.cpp index 374ddea18..3eace38d9 100644 --- a/source/lib/omnitrace/library/components/roctracer_callbacks.cpp +++ b/source/lib/omnitrace/library/components/roctracer_callbacks.cpp @@ -276,8 +276,10 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_use_perfetto()) { TRACE_EVENT_BEGIN("device", perfetto::StaticString{ _name }, + static_cast(begin_timestamp), "begin_ns", static_cast(begin_timestamp)); - TRACE_EVENT_END("device", static_cast(end_timestamp)); + TRACE_EVENT_END("device", static_cast(end_timestamp), + "end_ns", static_cast(end_timestamp)); } if(get_use_timemory()) @@ -347,8 +349,10 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg) if(get_use_perfetto()) { TRACE_EVENT_BEGIN("device", perfetto::StaticString{ *_name }, + static_cast(_beg_ns), "begin_ns", static_cast(_beg_ns)); - TRACE_EVENT_END("device", static_cast(_end_ns)); + TRACE_EVENT_END("device", static_cast(_end_ns), "end_ns", + static_cast(_end_ns)); } if(get_use_timemory()) { @@ -579,8 +583,9 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* { TRACE_EVENT_BEGIN( "host", perfetto::StaticString{ op_name }, static_cast(_ts), - perfetto::Flow::ProcessScoped(_cid), "pcid", _parent_cid, "cid", _cid, - "device", _device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id); + perfetto::Flow::ProcessScoped(_cid), "begin_ns", + static_cast(_ts), "pcid", _parent_cid, "cid", _cid, "device", + _device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id); } if(get_use_timemory()) { @@ -616,7 +621,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_use_perfetto()) { - TRACE_EVENT_END("host", static_cast(_ts)); + TRACE_EVENT_END("host", static_cast(_ts), "end_ns", + static_cast(_ts)); } if(get_use_timemory()) { @@ -774,12 +780,12 @@ hip_activity_callback(const char* begin, const char* end, void*) assert(_end_ns > _beg_ns); TRACE_EVENT_BEGIN("device", perfetto::StaticString{ _kernel_names.at(_name).c_str() }, - _beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id", - record->correlation_id, "device", _devid, "queue", _queid, - "op", _op_id_names.at(record->op)); - TRACE_EVENT_END("device", _end_ns); + _beg_ns, perfetto::Flow::ProcessScoped(_cid), "begin_ns", + _beg_ns, "corr_id", record->correlation_id, "device", + _devid, "queue", _queid, "op", _op_id_names.at(record->op)); + TRACE_EVENT_END("device", _end_ns, "end_ns", _end_ns); // for some reason, this is necessary to make sure very last one ends - TRACE_EVENT_END("device", _end_ns); + TRACE_EVENT_END("device", _end_ns, "end_ns", _end_ns); } if(_critical_trace) diff --git a/source/lib/omnitrace/library/cpu_freq.cpp b/source/lib/omnitrace/library/cpu_freq.cpp index cd2e84e4a..2fd757299 100644 --- a/source/lib/omnitrace/library/cpu_freq.cpp +++ b/source/lib/omnitrace/library/cpu_freq.cpp @@ -22,6 +22,7 @@ #include "library/cpu_freq.hpp" #include "library/common.hpp" +#include "library/components/pthread_create_gotcha.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/defines.hpp" @@ -252,8 +253,12 @@ post_process() { uint64_t _ts = std::get<0>(itr); double _freq = std::get<7>(itr).at(_offset); + if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue; TRACE_COUNTER("sampling", freq_track::at(_idx, 0), _ts, _freq); } + + auto _end_ts = pthread_create_gotcha::get_execution_time(0)->second; + TRACE_COUNTER("sampling", freq_track::at(_idx, 0), _end_ts, 0); }; auto _process_cpu_rusage = []() { @@ -261,34 +266,35 @@ post_process() type_list{}, { "Memory Usage", "Virtual Memory Usage", "Context Switches", "Page Faults", - "User CPU Time", "Kernel CPU Time" }, + "User Time", "Kernel Time" }, { "MB", "MB", "", "", "sec", "sec" }); - cpu_data_tuple_t* _last = nullptr; for(auto& itr : cpu_data) { - uint64_t _ts = std::get<0>(itr); + uint64_t _ts = std::get<0>(itr); + if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue; + double _page = std::get<1>(itr); double _virt = std::get<2>(itr); uint64_t _cntx = std::get<3>(itr); uint64_t _flts = std::get<4>(itr); double _user = std::get<5>(itr); double _kern = std::get<6>(itr); - if(_last) - { - _cntx -= std::get<3>(*_last); - _flts -= std::get<4>(*_last); - _user -= std::get<5>(*_last); - _kern -= std::get<6>(*_last); - } write_perfetto_counter_track(_ts, _page / units::megabyte); write_perfetto_counter_track(_ts, _virt / units::megabyte); write_perfetto_counter_track(_ts, _cntx); write_perfetto_counter_track(_ts, _flts); write_perfetto_counter_track(_ts, _user / units::sec); write_perfetto_counter_track(_ts, _kern / units::sec); - _last = &itr; } + + auto _end_ts = pthread_create_gotcha::get_execution_time(0)->second; + write_perfetto_counter_track(_end_ts, 0); + write_perfetto_counter_track(_end_ts, 0); + write_perfetto_counter_track(_end_ts, 0); + write_perfetto_counter_track(_end_ts, 0); + write_perfetto_counter_track(_end_ts, 0); + write_perfetto_counter_track(_end_ts, 0); }; _process_cpu_rusage(); diff --git a/source/lib/omnitrace/library/critical_trace.cpp b/source/lib/omnitrace/library/critical_trace.cpp index 35847c55c..d3758a1a2 100644 --- a/source/lib/omnitrace/library/critical_trace.cpp +++ b/source/lib/omnitrace/library/critical_trace.cpp @@ -434,14 +434,17 @@ call_chain::generate_perfetto(std::set& _used) const if(itr.device == Device::CPU) { TRACE_EVENT_BEGIN("device-critical-trace", "CPU", - static_cast(itr.begin_ns)); + static_cast(itr.begin_ns), "begin_ns", + itr.begin_ns); } else if(itr.device == Device::GPU) { TRACE_EVENT_BEGIN("device-critical-trace", "GPU", - static_cast(itr.begin_ns)); + static_cast(itr.begin_ns), "begin_ns", + itr.begin_ns); } - TRACE_EVENT_END("device-critical-trace", static_cast(itr.end_ns)); + TRACE_EVENT_END("device-critical-trace", static_cast(itr.end_ns), + "end_ns", itr.end_ns); } } @@ -462,8 +465,10 @@ call_chain::generate_perfetto(std::set& _used) const _static_mutex.unlock(); TRACE_EVENT_BEGIN("host-critical-trace", perfetto::StaticString{ sitr.first->c_str() }, + static_cast(itr.begin_ns), "begin_ns", static_cast(itr.begin_ns)); - TRACE_EVENT_END("host-critical-trace", static_cast(itr.end_ns)); + TRACE_EVENT_END("host-critical-trace", static_cast(itr.end_ns), + "end_ns", static_cast(itr.end_ns)); } } @@ -484,8 +489,10 @@ call_chain::generate_perfetto(std::set& _used) const _static_mutex.unlock(); TRACE_EVENT_BEGIN("device-critical-trace", perfetto::StaticString{ sitr.first->c_str() }, + static_cast(itr.begin_ns), "begin_ns", static_cast(itr.begin_ns)); - TRACE_EVENT_END("device-critical-trace", static_cast(itr.end_ns)); + TRACE_EVENT_END("device-critical-trace", static_cast(itr.end_ns), + "end_ns", static_cast(itr.end_ns)); } } @@ -504,8 +511,10 @@ call_chain::generate_perfetto(std::set& _used) const auto sitr = _static_strings.emplace(_name); _static_mutex.unlock(); TRACE_EVENT_BEGIN("critical-trace", perfetto::StaticString{ sitr.first->c_str() }, + static_cast(itr.begin_ns), "begin_ns", static_cast(itr.begin_ns)); - TRACE_EVENT_END("critical-trace", static_cast(itr.end_ns)); + TRACE_EVENT_END("critical-trace", static_cast(itr.end_ns), "end_ns", + static_cast(itr.end_ns)); } }