Skip to content

Commit

Permalink
Fix sampling counter time scales
Browse files Browse the repository at this point in the history
- All perfetto trace events have "begin_ns" and "end_ns" debug fields
- data for thread start and end timestamp in pthread_create_gotcha
- discard samples outside of thread start and end timestamps
- rename "CPU User CPU Time" perfetto counter to "CPU User Time"
- rename "CPU Kernel CPU Time" perfetto counter to "CPU Kernel Time"
- ensure CPU system samples in perfetto are set to zero at end
- backtrace uses comp::wall_clock record() for timestamps (consistency)
- "Peak Memory Usage [Thread X] (S)" renamed to "Thread Peak Memory Usage [X] (S)"
- "Context Switches [Thread X] (S)" renamed to "Thread Context Switches Usage [X] (S)"
- "Page Faults [Thread X] (S)" renamed to "Thread Page Faults Usage [X] (S)"
- "<PAPI_DESC> [Thread X] (S)" renamed to "Thread <PAPI_DESC> [X] (S)"
- samples
  • Loading branch information
jrmadsen committed Jun 10, 2022
1 parent 1db3934 commit 161f8b7
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 58 deletions.
10 changes: 8 additions & 2 deletions source/lib/omnitrace/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "library/components/functors.hpp"
#include "library/components/fwd.hpp"
#include "library/components/mpi_gotcha.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/pthread_mutex_gotcha.hpp"
#include "library/config.hpp"
Expand Down Expand Up @@ -752,13 +753,13 @@ omnitrace_init_tooling_hidden()
using CategoryT = std::decay_t<decltype(_category)>;
uint64_t _ts = comp::wall_clock::record();
TRACE_EVENT_BEGIN(trait::name<CategoryT>::value, perfetto::StaticString(name),
_ts);
_ts, "begin_ns", _ts);
};

auto _pop_perfetto = [](auto _category, const char*) {
using CategoryT = std::decay_t<decltype(_category)>;
uint64_t _ts = comp::wall_clock::record();
TRACE_EVENT_END(trait::name<CategoryT>::value, _ts);
TRACE_EVENT_END(trait::name<CategoryT>::value, _ts, "end_ns", _ts);
};

if(get_use_perfetto() && get_use_timemory())
Expand Down Expand Up @@ -855,6 +856,8 @@ omnitrace_init_tooling_hidden()

if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n");

pthread_create_gotcha::get_execution_time()->first = comp::wall_clock::record();

return true;
}

Expand Down Expand Up @@ -949,6 +952,8 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a
{
get_gotcha_bundle()->start();
}

pthread_create_gotcha::get_execution_time()->first = comp::wall_clock::record();
}

//======================================================================================//
Expand All @@ -970,6 +975,7 @@ omnitrace_finalize_hidden(void)
}

OMNITRACE_VERBOSE_F(0, "finalizing...\n");
pthread_create_gotcha::get_execution_time()->second = comp::wall_clock::record();

// some functions called during finalization may alter the push/pop count so we need
// to save them here
Expand Down
45 changes: 26 additions & 19 deletions source/lib/omnitrace/library/components/backtrace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
// SOFTWARE.

#include "library/components/fwd.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/rocm_smi.hpp"
#include "library/config.hpp"
Expand Down Expand Up @@ -222,7 +223,7 @@ backtrace::size() const
return m_size;
}

backtrace::time_point_type
uint64_t
backtrace::get_timestamp() const
{
return m_ts;
Expand Down Expand Up @@ -273,7 +274,7 @@ backtrace::sample(int signum)

m_size = 0;
m_tid = threading::get_id();
m_ts = clock_type::now();
m_ts = comp::wall_clock::record();
m_thr_cpu_ts = tim::get_clock_thread_now<int64_t, std::nano>();
auto _cache = tim::rusage_cache{ RUSAGE_THREAD };
m_mem_peak = _cache.get_peak_rss();
Expand Down Expand Up @@ -432,6 +433,9 @@ backtrace::post_process(int64_t _tid)
return;
}

_init->m_ts = std::max<uint64_t>(
_init->m_ts, pthread_create_gotcha::get_execution_time(_tid)->first);

// check whether the call-stack entry should be used. -1 means break, 0 means continue
auto _use_label = [](const std::string& _lbl, bool _check_internal) -> short {
// debugging feature
Expand Down Expand Up @@ -477,14 +481,15 @@ backtrace::post_process(int64_t _tid)
: std::vector<std::string>{};

auto _process_perfetto_counters = [&](const std::vector<sampling::bundle_t*>& _data) {
auto _tid_name = JOIN("", '[', _tid, ']');
if(!perfetto_counter_track<perfetto_rusage>::exists(_tid))
{
perfetto_counter_track<perfetto_rusage>::emplace(
_tid, JOIN("", "Peak Memory Usage", " [Thread ", _tid, "] (S)"), "MB");
_tid, JOIN(' ', "Thread Peak Memory Usage", _tid_name, "(S)"), "MB");
perfetto_counter_track<perfetto_rusage>::emplace(
_tid, JOIN("", "Context Switches", " [Thread ", _tid, "] (S)"));
_tid, JOIN(' ', "Thread Context Switches", _tid_name, "(S)"));
perfetto_counter_track<perfetto_rusage>::emplace(
_tid, JOIN("", "Page Faults", " [Thread ", _tid, "] (S)"));
_tid, JOIN(' ', "Thread Page Faults", _tid_name, "(S)"));
}

if(!perfetto_counter_track<hw_counters>::exists(_tid) &&
Expand All @@ -493,10 +498,8 @@ backtrace::post_process(int64_t _tid)
for(auto& itr : _hw_cnt_labels)
{
perfetto_counter_track<hw_counters>::emplace(
_tid,
JOIN("", tim::papi::get_event_info(itr).short_descr, " [Thread ",
_tid, "] (S)"),
"");
_tid, JOIN(' ', "Thread", tim::papi::get_event_info(itr).short_descr,
_tid_name, "(S)"));
}
}

Expand All @@ -507,7 +510,9 @@ backtrace::post_process(int64_t _tid)
const auto* _bt = ditr->get<backtrace>();
if(_bt->m_tid != _tid) continue;

auto _ts = static_cast<uint64_t>(_bt->m_ts.time_since_epoch().count());
auto _ts = _bt->m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _ts)) continue;

_last_bt = _bt;
_mean_ts += _ts;

Expand Down Expand Up @@ -540,8 +545,7 @@ backtrace::post_process(int64_t _tid)

if(_tid > 0 && _last_bt)
{
auto _ts = static_cast<uint64_t>(_last_bt->m_ts.time_since_epoch().count()) +
(_mean_ts / _data.size());
auto _ts = pthread_create_gotcha::get_execution_time(_tid)->second;
uint64_t _zero = 0;
TRACE_COUNTER("sampling",
perfetto_counter_track<perfetto_rusage>::at(_tid, 0), _ts,
Expand Down Expand Up @@ -575,7 +579,7 @@ backtrace::post_process(int64_t _tid)
bool _rename) {
if(_rename)
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid, "(S)").c_str());
time_point_type _last_wall_ts = _init->get_timestamp();
auto _last_wall_ts = _init->get_timestamp();

for(const auto& ditr : _data)
{
Expand All @@ -594,12 +598,13 @@ backtrace::post_process(int64_t _tid)
if(_use == 0) continue;
auto sitr = _static_strings.emplace(_name);
_last = *sitr.first;
auto _ts = static_cast<uint64_t>(_bt->m_ts.time_since_epoch().count());
auto _ts = _bt->m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _ts)) continue;

TRACE_EVENT_BEGIN(
"sampling", perfetto::StaticString{ sitr.first->c_str() },
static_cast<uint64_t>(_last_wall_ts.time_since_epoch().count()));
TRACE_EVENT_END("sampling", _ts);
TRACE_EVENT_BEGIN("sampling",
perfetto::StaticString{ sitr.first->c_str() },
_last_wall_ts, "begin_ns", _last_wall_ts);
TRACE_EVENT_END("sampling", _ts, "end_ns", _ts);
}
_last_wall_ts = _bt->m_ts;
}
Expand All @@ -623,6 +628,7 @@ backtrace::post_process(int64_t _tid)
continue;
}
if(_bt->empty()) continue;
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _bt->m_ts)) continue;
_data.emplace_back(&ritr);
}
}
Expand Down Expand Up @@ -667,9 +673,10 @@ backtrace::post_process(int64_t _tid)

auto* _bt = ditr->get<backtrace>();

if(!pthread_create_gotcha::is_valid_execution_time(_tid, _bt->m_ts)) continue;
if(_bt->m_ts < _last_bt->m_ts) continue;

double _elapsed_wc = (_bt->m_ts - _last_bt->m_ts).count();
double _elapsed_wc = (_bt->m_ts - _last_bt->m_ts);
double _elapsed_cc = (_bt->m_thr_cpu_ts - _last_bt->m_thr_cpu_ts);

std::vector<bundle_t> _tc{};
Expand Down
6 changes: 3 additions & 3 deletions source/lib/omnitrace/library/components/backtrace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "library/thread_data.hpp"
#include "library/timemory.hpp"

#include <bits/stdint-uintn.h>
#include <timemory/components/base.hpp>
#include <timemory/components/papi/papi_array.hpp>
#include <timemory/components/papi/types.hpp>
Expand Down Expand Up @@ -57,7 +58,6 @@ struct backtrace

using data_t = std::array<char[buffer_width], stack_depth>;
using clock_type = std::chrono::steady_clock;
using time_point_type = typename clock_type::time_point;
using value_type = void;
using hw_counters = tim::component::papi_array<num_hw_counters>;
using hw_counter_data_t = typename hw_counters::value_type;
Expand Down Expand Up @@ -88,7 +88,7 @@ struct backtrace
bool empty() const;
size_t size() const;
std::vector<std::string> get() const;
time_point_type get_timestamp() const;
uint64_t get_timestamp() const;
int64_t get_thread_cpu_timestamp() const;

private:
Expand All @@ -97,8 +97,8 @@ struct backtrace
int64_t m_mem_peak = 0;
int64_t m_ctx_swch = 0;
int64_t m_page_flt = 0;
uint64_t m_ts = {};
size_t m_size = 0;
time_point_type m_ts = {};
data_t m_data = {};
hw_counter_data_t m_hw_counter = {};
};
Expand Down
16 changes: 14 additions & 2 deletions source/lib/omnitrace/library/components/pthread_create_gotcha.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include "timemory/components/timing/wall_clock.hpp"

#include <bits/stdint-intn.h>
#include <timemory/backends/threading.hpp>
Expand Down Expand Up @@ -130,7 +131,11 @@ pthread_create_gotcha::wrapper::operator()() const
sampling::shutdown();
}

pthread_create_gotcha::shutdown(_tid);
if(_tid >= 0)
{
get_execution_time(_tid)->second = comp::wall_clock::record();
pthread_create_gotcha::shutdown(_tid);
}
set_thread_state(ThreadState::Completed);
} };

Expand All @@ -148,7 +153,8 @@ pthread_create_gotcha::wrapper::operator()() const
.first->second;
}
if(_bundle) start_bundle(*_bundle);
get_cpu_cid_stack(threading::get_id(), m_parent_tid);
get_execution_time(_tid)->first = comp::wall_clock::record();
get_cpu_cid_stack(_tid, m_parent_tid);
if(m_enable_sampling)
{
// initialize thread-local statics
Expand Down Expand Up @@ -298,4 +304,10 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
return _ret;
}

bool
pthread_create_gotcha::is_valid_execution_time(int64_t _tid, uint64_t _ts)
{
return (_ts >= get_execution_time(_tid)->first &&
_ts <= get_execution_time(_tid)->second);
}
} // namespace omnitrace
15 changes: 15 additions & 0 deletions source/lib/omnitrace/library/components/pthread_create_gotcha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include "library/common.hpp"
#include "library/defines.hpp"
#include "library/thread_data.hpp"
#include "library/timemory.hpp"

#include <cstdint>
Expand Down Expand Up @@ -64,8 +65,22 @@ struct pthread_create_gotcha : tim::component::base<pthread_create_gotcha, void>
// pthread_create
int operator()(pthread_t* thread, const pthread_attr_t* attr,
void* (*start_routine)(void*), void* arg) const;

static auto& get_execution_time(int64_t _tid = threading::get_id());
static bool is_valid_execution_time(int64_t _tid, uint64_t _ts);
};

inline auto&
pthread_create_gotcha::get_execution_time(int64_t _tid)
{
struct omnitrace_thread_exec_time
{};
using data_t = std::pair<uint64_t, uint64_t>;
using thread_data_t = thread_data<data_t, omnitrace_thread_exec_time>;
static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{});
return _v.at(_tid);
}

using pthread_create_gotcha_t =
tim::component::gotcha<2, std::tuple<>, pthread_create_gotcha>;
} // namespace omnitrace
15 changes: 10 additions & 5 deletions source/lib/omnitrace/library/components/rocm_smi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "library/components/rocm_smi.hpp"
#include "library/common.hpp"
#include "library/components/fwd.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/config.hpp"
#include "library/critical_trace.hpp"
Expand Down Expand Up @@ -237,11 +238,13 @@ data::post_process(uint32_t _dev_id)
counter_track::emplace(_dev_id, addendum("Power"), "watts");
counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes");
}
uint64_t _ts = itr.m_ts;
double _busy = itr.m_busy_perc;
double _temp = itr.m_temp / 1.0e3;
double _power = itr.m_power / 1.0e6;
double _usage = itr.m_mem_usage / static_cast<double>(units::megabyte);
uint64_t _ts = itr.m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue;

double _busy = itr.m_busy_perc;
double _temp = itr.m_temp / 1.0e3;
double _power = itr.m_power / 1.0e6;
double _usage = itr.m_mem_usage / static_cast<double>(units::megabyte);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 0), _ts, _busy);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 1), _ts, _temp);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 2), _ts, _power);
Expand All @@ -262,6 +265,8 @@ data::post_process(uint32_t _dev_id)
{
using entry_t = critical_trace::entry;
auto _ts = itr.m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue;

auto _entries = critical_trace::get_entries(_ts, [](const entry_t& _e) {
return _e.device == critical_trace::Device::GPU;
});
Expand Down
26 changes: 16 additions & 10 deletions source/lib/omnitrace/library/components/roctracer_callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,10 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_use_perfetto())
{
TRACE_EVENT_BEGIN("device", perfetto::StaticString{ _name },
static_cast<uint64_t>(begin_timestamp), "begin_ns",
static_cast<uint64_t>(begin_timestamp));
TRACE_EVENT_END("device", static_cast<uint64_t>(end_timestamp));
TRACE_EVENT_END("device", static_cast<uint64_t>(end_timestamp),
"end_ns", static_cast<uint64_t>(end_timestamp));
}

if(get_use_timemory())
Expand Down Expand Up @@ -347,8 +349,10 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)
if(get_use_perfetto())
{
TRACE_EVENT_BEGIN("device", perfetto::StaticString{ *_name },
static_cast<uint64_t>(_beg_ns), "begin_ns",
static_cast<uint64_t>(_beg_ns));
TRACE_EVENT_END("device", static_cast<uint64_t>(_end_ns));
TRACE_EVENT_END("device", static_cast<uint64_t>(_end_ns), "end_ns",
static_cast<uint64_t>(_end_ns));
}
if(get_use_timemory())
{
Expand Down Expand Up @@ -579,8 +583,9 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
{
TRACE_EVENT_BEGIN(
"host", perfetto::StaticString{ op_name }, static_cast<uint64_t>(_ts),
perfetto::Flow::ProcessScoped(_cid), "pcid", _parent_cid, "cid", _cid,
"device", _device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id);
perfetto::Flow::ProcessScoped(_cid), "begin_ns",
static_cast<uint64_t>(_ts), "pcid", _parent_cid, "cid", _cid, "device",
_device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id);
}
if(get_use_timemory())
{
Expand Down Expand Up @@ -616,7 +621,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*

if(get_use_perfetto())
{
TRACE_EVENT_END("host", static_cast<uint64_t>(_ts));
TRACE_EVENT_END("host", static_cast<uint64_t>(_ts), "end_ns",
static_cast<uint64_t>(_ts));
}
if(get_use_timemory())
{
Expand Down Expand Up @@ -774,12 +780,12 @@ hip_activity_callback(const char* begin, const char* end, void*)
assert(_end_ns > _beg_ns);
TRACE_EVENT_BEGIN("device",
perfetto::StaticString{ _kernel_names.at(_name).c_str() },
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id",
record->correlation_id, "device", _devid, "queue", _queid,
"op", _op_id_names.at(record->op));
TRACE_EVENT_END("device", _end_ns);
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "begin_ns",
_beg_ns, "corr_id", record->correlation_id, "device",
_devid, "queue", _queid, "op", _op_id_names.at(record->op));
TRACE_EVENT_END("device", _end_ns, "end_ns", _end_ns);
// for some reason, this is necessary to make sure very last one ends
TRACE_EVENT_END("device", _end_ns);
TRACE_EVENT_END("device", _end_ns, "end_ns", _end_ns);
}

if(_critical_trace)
Expand Down
Loading

0 comments on commit 161f8b7

Please sign in to comment.