-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add the parsing part for the profiling tool #7043
Changes from all commits
55b17c1
5837166
24458ae
770aff2
0f44107
2d94eca
0aa03a8
d7e5684
5a0a461
367a5c9
df3b250
7a4f3be
d09503b
ea0280b
6416eee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and | |
limitations under the License. */ | ||
|
||
#include "paddle/platform/profiler.h" | ||
#include <iomanip> | ||
#include <map> | ||
#include "glog/logging.h" | ||
|
||
namespace paddle { | ||
namespace platform { | ||
|
||
// The profiler state, the initial value is ProfilerState::kDisabled | ||
static ProfilerState g_state = ProfilerState::kDisabled; | ||
// To record which timer the profiler used, CUDA or CPU. | ||
static std::string g_profiler_place = ""; | ||
// The thread local event list only can be accessed by the specific thread | ||
// The thread index of each thread | ||
static thread_local int32_t g_thread_id; | ||
|
@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() { | |
|
||
Event::Event(EventKind kind, std::string name, uint32_t thread_id, | ||
DeviceContext* dev_ctx) | ||
: kind_(kind), | ||
name_(std::move(name)), | ||
thread_id_(thread_id), | ||
has_cuda_(false) { | ||
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do not you use |
||
#ifdef PADDLE_WITH_CUDA | ||
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx); | ||
if (cuda_dev_ctx) { | ||
|
@@ -72,19 +74,19 @@ std::string Event::kind() const { | |
PADDLE_THROW("Unknown EventKind."); | ||
} | ||
|
||
double Event::CpuElapsedUs(const Event& e) const { | ||
return (e.cpu_ns_ - cpu_ns_) / (1000.0); | ||
double Event::CpuElapsedMs(const Event& e) const { | ||
return (e.cpu_ns_ - cpu_ns_) / (1000000.0); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ii will be better to replace |
||
} | ||
|
||
double Event::CudaElapsedUs(const Event& e) const { | ||
double Event::CudaElapsedMs(const Event& e) const { | ||
#ifdef PADDLE_WITH_CUDA | ||
PADDLE_ENFORCE(e.has_cuda() && has_cuda()); | ||
PADDLE_ENFORCE(e.device() == device()); | ||
PADDLE_ENFORCE(cudaEventSynchronize(event_)); | ||
PADDLE_ENFORCE(cudaEventSynchronize(e.event())); | ||
float ms; | ||
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); | ||
return ms * 1000.0; | ||
return ms; | ||
#else | ||
PADDLE_THROW("CUDA is not enabled"); | ||
#endif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can not comment on line 110 but only write the comments here. |
||
|
@@ -113,21 +115,27 @@ inline EventList& GetEventList() { | |
} | ||
|
||
void Mark(const std::string& name, DeviceContext* dev_ctx) { | ||
GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id, | ||
dev_ctx); | ||
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); | ||
} | ||
|
||
void PushEvent(const std::string& name, DeviceContext* dev_ctx) { | ||
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); | ||
} | ||
|
||
void PopEvent(const std::string& name, DeviceContext* dev_ctx) { | ||
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); | ||
} | ||
|
||
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { | ||
if (g_state == ProfilerState::kDisabled) return; | ||
dev_ctx_ = dev_ctx; | ||
GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, | ||
dev_ctx_); | ||
name_ = name; | ||
PushEvent(name_, dev_ctx_); | ||
} | ||
|
||
RecordEvent::~RecordEvent() { | ||
if (g_state == ProfilerState::kDisabled) return; | ||
GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id, | ||
dev_ctx_); | ||
PopEvent(name_, dev_ctx_); | ||
} | ||
|
||
void EnableProfiler(ProfilerState state) { | ||
|
@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) { | |
"The profiling state should be disabled when calling ", | ||
"EnableProfiler."); | ||
g_state = state; | ||
g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; | ||
#ifdef PADDLE_WITH_CUDA | ||
if (g_state == ProfilerState::kCUDA) { | ||
// Generate some dummy evenets first to reduce the startup overhead. | ||
|
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() { | |
return result; | ||
} | ||
|
||
void ParseEvents(std::vector<std::vector<Event>>& events, | ||
EventSortingKey sorted_by) { | ||
if (g_profiler_place == "") return; | ||
|
||
std::string sorted_domain; | ||
std::function<bool(EventItem&, EventItem&)> sorted_func; | ||
switch (sorted_by) { | ||
case EventSortingKey::kCalls: | ||
sorted_domain = "number of calls"; | ||
sorted_func = [](EventItem& a, EventItem& b) { | ||
return a.calls > b.calls; | ||
}; | ||
break; | ||
case EventSortingKey::kTotal: | ||
sorted_domain = "total time"; | ||
sorted_func = [](EventItem& a, EventItem& b) { | ||
return a.total_time > b.total_time; | ||
}; | ||
break; | ||
case EventSortingKey::kMin: | ||
sorted_domain = "minimum time"; | ||
sorted_func = [](EventItem& a, EventItem& b) { | ||
return a.min_time > b.min_time; | ||
}; | ||
break; | ||
case EventSortingKey::kMax: | ||
sorted_domain = "maximum time"; | ||
sorted_func = [](EventItem& a, EventItem& b) { | ||
return a.max_time > b.max_time; | ||
}; | ||
break; | ||
case EventSortingKey::kAve: | ||
sorted_domain = "average time"; | ||
sorted_func = [](EventItem& a, EventItem& b) { | ||
return a.ave_time > b.ave_time; | ||
}; | ||
break; | ||
default: | ||
sorted_domain = "event end time"; | ||
} | ||
|
||
std::vector<std::vector<EventItem>> events_table; | ||
size_t max_name_width = 0; | ||
for (size_t i = 0; i < events.size(); i++) { | ||
std::list<Event> pushed_events; | ||
std::vector<EventItem> event_items; | ||
std::unordered_map<std::string, int> event_idx; | ||
|
||
for (size_t j = 0; j < events[i].size(); j++) { | ||
if (events[i][j].kind() == "push") { | ||
pushed_events.push_back(events[i][j]); | ||
} else if (events[i][j].kind() == "pop") { | ||
std::list<Event>::reverse_iterator rit = pushed_events.rbegin(); | ||
while (rit != pushed_events.rend() && | ||
rit->name() != events[i][j].name()) { | ||
++rit; | ||
} | ||
|
||
if (rit != pushed_events.rend()) { | ||
double event_time = (g_profiler_place == "CUDA") | ||
? rit->CudaElapsedMs(events[i][j]) | ||
: rit->CpuElapsedMs(events[i][j]); | ||
std::string event_name = | ||
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); | ||
max_name_width = std::max(max_name_width, event_name.size()); | ||
|
||
if (event_idx.find(event_name) == event_idx.end()) { | ||
event_idx[event_name] = event_items.size(); | ||
EventItem event_item = {event_name, 1, event_time, | ||
event_time, event_time, event_time}; | ||
event_items.push_back(event_item); | ||
} else { | ||
int index = event_idx[event_name]; | ||
event_items[index].calls += 1; | ||
// total time | ||
event_items[index].total_time += event_time; | ||
// min time | ||
event_items[index].min_time = | ||
std::min(event_time, event_items[index].min_time); | ||
// max time | ||
event_items[index].max_time = | ||
std::max(event_time, event_items[index].max_time); | ||
} | ||
|
||
// remove the push marker from the list | ||
pushed_events.erase((++rit).base()); | ||
} else { | ||
LOG(WARNING) << "Cannot find the push marker of event \'" | ||
<< events[i][j].name() | ||
<< "\', which will be ignored in profiling report."; | ||
} | ||
} | ||
} | ||
// average time | ||
for (auto& item : event_items) { | ||
item.ave_time = item.total_time / item.calls; | ||
} | ||
// sort | ||
if (sorted_by != EventSortingKey::kDefault) { | ||
std::sort(event_items.begin(), event_items.end(), sorted_func); | ||
} | ||
|
||
events_table.push_back(event_items); | ||
// log warning if there are events with `push` but without `pop` | ||
std::list<Event>::reverse_iterator rit = pushed_events.rbegin(); | ||
while (rit != pushed_events.rend()) { | ||
LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() | ||
<< "\', which will be ignored in profiling report."; | ||
++rit; | ||
} | ||
} | ||
|
||
// Print report | ||
PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12); | ||
} | ||
|
||
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table, | ||
std::string& sorted_domain, const size_t name_width, | ||
const size_t data_width) { | ||
// Output header information | ||
std::cout << "\n------------------------->" | ||
<< " Profiling Report " | ||
<< "<-------------------------\n\n"; | ||
std::cout << "Place: " << g_profiler_place << std::endl; | ||
std::cout << "Time unit: ms" << std::endl; | ||
std::cout << "Sorted by " << sorted_domain | ||
<< " in descending order in the same thread\n\n"; | ||
// Output events table | ||
std::cout.setf(std::ios::left); | ||
std::cout << std::setw(name_width) << "Event" << std::setw(data_width) | ||
<< "Calls" << std::setw(data_width) << "Total" | ||
<< std::setw(data_width) << "Min." << std::setw(data_width) | ||
<< "Max." << std::setw(data_width) << "Ave." << std::endl; | ||
for (size_t i = 0; i < events_table.size(); ++i) { | ||
for (size_t j = 0; j < events_table[i].size(); ++j) { | ||
EventItem& event_item = events_table[i][j]; | ||
std::cout << std::setw(name_width) << event_item.name | ||
<< std::setw(data_width) << event_item.calls | ||
<< std::setw(data_width) << event_item.total_time | ||
<< std::setw(data_width) << event_item.min_time | ||
<< std::setw(data_width) << event_item.max_time | ||
<< std::setw(data_width) << event_item.ave_time << std::endl; | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
std::cout << std::endl; | ||
} | ||
|
||
} // namespace platform | ||
} // namespace paddle |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
g_profiler_place
andg_state
should bethread_local
, just likeg_thread_id
.