Add the parsing part for the profiling tool #7043

Merged: 15 commits, Jan 8, 2018
184 changes: 170 additions & 14 deletions paddle/platform/profiler.cc
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/platform/profiler.h"
#include <iomanip>
#include <map>
#include "glog/logging.h"

namespace paddle {
namespace platform {

// The profiler state; the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// To record which timer the profiler used, CUDA or CPU.
static std::string g_profiler_place = "";
Contributor

g_profiler_place and g_state should be thread_local, just like g_thread_id.
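
A minimal sketch of the reviewer's suggestion (an assumption, not part of this diff), mirroring the thread_local declaration used for g_thread_id below:

// Hypothetical thread-local variants of the two globals above:
static thread_local ProfilerState g_state = ProfilerState::kDisabled;
static thread_local std::string g_profiler_place = "";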

// The thread-local event list can only be accessed by the specific thread
// The thread index of each thread
static thread_local int32_t g_thread_id;
@@ -43,10 +43,7 @@ inline uint64_t GetTimeInNsec() {

Event::Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx)
: kind_(kind),
name_(std::move(name)),
thread_id_(thread_id),
has_cuda_(false) {
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
Contributor

Why don't you use std::move here?
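
A sketch of the suggestion, i.e. restoring the removed member-initializer list above:

Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             DeviceContext* dev_ctx)
    : kind_(kind),
      name_(std::move(name)),  // move the by-value parameter instead of copying it
      thread_id_(thread_id),
      has_cuda_(false) {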

#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) {
@@ -72,19 +74,19 @@ std::string Event::kind() const {
PADDLE_THROW("Unknown EventKind.");
}

double Event::CpuElapsedUs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000.0);
double Event::CpuElapsedMs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
Contributor

It would be better to replace / (1000000.0) with * 0.000001 (see the sketch after this function).

}
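
A minimal sketch of that suggestion, converting nanoseconds to milliseconds by multiplication:

double Event::CpuElapsedMs(const Event& e) const {
  // Multiply by 1e-6 instead of dividing by 1e6; same result up to rounding.
  return (e.cpu_ns_ - cpu_ns_) * 0.000001;
}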

double Event::CudaElapsedUs(const Event& e) const {
double Event::CudaElapsedMs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms * 1000.0;
return ms;
#else
PADDLE_THROW("CUDA is not enabled");
#endif
Contributor

I cannot comment on line 110, so I am writing it here: line 110 should come before line 109.

@@ -113,21 +115,27 @@ inline EventList& GetEventList() {
}

void Mark(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
dev_ctx);
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
}

void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
}

void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
}

RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
dev_ctx_);
name_ = name;
PushEvent(name_, dev_ctx_);
}

RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
dev_ctx_);
PopEvent(name_, dev_ctx_);
}

void EnableProfiler(ProfilerState state) {
@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling ",
"EnableProfiler.");
g_state = state;
g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
#ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) {
// Generate some dummy events first to reduce the startup overhead.
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() {
return result;
}

void ParseEvents(std::vector<std::vector<Event>>& events,
EventSortingKey sorted_by) {
if (g_profiler_place == "") return;

std::string sorted_domain;
std::function<bool(EventItem&, EventItem&)> sorted_func;
switch (sorted_by) {
case EventSortingKey::kCalls:
sorted_domain = "number of calls";
sorted_func = [](EventItem& a, EventItem& b) {
return a.calls > b.calls;
};
break;
case EventSortingKey::kTotal:
sorted_domain = "total time";
sorted_func = [](EventItem& a, EventItem& b) {
return a.total_time > b.total_time;
};
break;
case EventSortingKey::kMin:
sorted_domain = "minimum time";
sorted_func = [](EventItem& a, EventItem& b) {
return a.min_time > b.min_time;
};
break;
case EventSortingKey::kMax:
sorted_domain = "maximum time";
sorted_func = [](EventItem& a, EventItem& b) {
return a.max_time > b.max_time;
};
break;
case EventSortingKey::kAve:
sorted_domain = "average time";
sorted_func = [](EventItem& a, EventItem& b) {
return a.ave_time > b.ave_time;
};
break;
default:
sorted_domain = "event end time";
}

std::vector<std::vector<EventItem>> events_table;
size_t max_name_width = 0;
for (size_t i = 0; i < events.size(); i++) {
std::list<Event> pushed_events;
std::vector<EventItem> event_items;
std::unordered_map<std::string, int> event_idx;

for (size_t j = 0; j < events[i].size(); j++) {
if (events[i][j].kind() == "push") {
pushed_events.push_back(events[i][j]);
} else if (events[i][j].kind() == "pop") {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend() &&
rit->name() != events[i][j].name()) {
++rit;
}

if (rit != pushed_events.rend()) {
double event_time = (g_profiler_place == "CUDA")
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);
std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
max_name_width = std::max(max_name_width, event_name.size());

if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
event_items[index].calls += 1;
// total time
event_items[index].total_time += event_time;
// min time
event_items[index].min_time =
std::min(event_time, event_items[index].min_time);
// max time
event_items[index].max_time =
std::max(event_time, event_items[index].max_time);
}

// remove the push marker from the list
pushed_events.erase((++rit).base());
} else {
LOG(WARNING) << "Cannot find the push marker of event \'"
<< events[i][j].name()
<< "\', which will be ignored in profiling report.";
}
}
}
// average time
for (auto& item : event_items) {
item.ave_time = item.total_time / item.calls;
}
// sort
if (sorted_by != EventSortingKey::kDefault) {
std::sort(event_items.begin(), event_items.end(), sorted_func);
}

events_table.push_back(event_items);
// log warning if there are events with `push` but without `pop`
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend()) {
LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
<< "\', which will be ignored in profiling report.";
++rit;
}
}

// Print report
PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
}

void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::cout << "Place: " << g_profiler_place << std::endl;
std::cout << "Time unit: ms" << std::endl;
std::cout << "Sorted by " << sorted_domain
<< " in descending order in the same thread\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
}
Contributor

Maybe ParseEvents and the printing could be two separate functions.

Contributor Author

Done

std::cout << std::endl;
}

} // namespace platform
} // namespace paddle
32 changes: 30 additions & 2 deletions paddle/platform/profiler.h
@@ -33,15 +33,16 @@ class Event {

std::string kind() const;
std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; }

#ifdef PADDLE_WITH_CUDA
cudaEvent_t event() const { return event_; }
int device() const { return device_; }
#endif

double CpuElapsedUs(const Event& e) const;
double CudaElapsedUs(const Event& e) const;
double CpuElapsedMs(const Event& e) const;
double CudaElapsedMs(const Event& e) const;

private:
EventKind kind_;
@@ -94,13 +95,19 @@ enum ProfilerState {

void Mark(const std::string& name, DeviceContext* dev_ctx);

void PushEvent(const std::string& name, DeviceContext* dev_ctx);

void PopEvent(const std::string& name, DeviceContext* dev_ctx);

struct RecordEvent {
explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);

~RecordEvent();

// The device context is used by Event to get the current cuda stream.
DeviceContext* dev_ctx_;
// Event name
std::string name_;
};

// Enable the profiling function.
@@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state);
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> DisableProfiler();

// The information of each event given in the profiling report
struct EventItem {
std::string name;
int calls;
double total_time;
double min_time;
double max_time;
double ave_time;
};

// Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };

// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
EventSortingKey sorted_by = EventSortingKey::kDefault);
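
// A minimal end-to-end usage sketch (hypothetical call site; dev_ctx stands
// for any DeviceContext*, as in the tests below):
//
//   EnableProfiler(ProfilerState::kCPU);
//   {
//     RecordEvent record_event("my_op", dev_ctx);
//     // ... code to be profiled ...
//   }
//   std::vector<std::vector<Event>> events = DisableProfiler();
//   // Parse the events and print the report, sorted by total time.
//   ParseEvents(events, EventSortingKey::kTotal);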

// Print results
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width);
} // namespace platform
} // namespace paddle
43 changes: 38 additions & 5 deletions paddle/platform/profiler_test.cc
@@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) {
counter++;
}
Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
}

#ifdef PADDLE_WITH_CUDA
@@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) {
counter++;
}
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
}
#endif

@@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) {
using paddle::platform::EventKind;
using paddle::platform::RecordEvent;
using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey;

ProfilerState state = ProfilerState::kCPU;
DeviceContext* dev_ctx = nullptr;
@@ -67,13 +68,45 @@ #endif
#endif
EnableProfiler(state);

/* Usage 1:
* PushEvent(evt_name, dev_ctx);
* ...
* code to be analyzed
* ...
* PopEvent(evt_name, dev_ctx);
*/
for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
PushEvent(name, dev_ctx);
int counter = 1;
while (counter != i * 1000) counter++;
PopEvent(name, dev_ctx);
}
}

/* Usage 2:
* {
* RecordEvent record_event(name, dev_ctx);
* ...
* code to be analyzed
* ...
* }
*/
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx);
int counter = 1;
while (counter != i * 1000) counter++;
}

// Bad Usage:
PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx);
std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
// Will remove parsing-related code from test later
ParseEvents(events, EventSortingKey::kTotal);

int cuda_startup_count = 0;
int start_profiler_count = 0;
int stop_profiler_count = 0;
@@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) {
if (events[i][j].name() == "push") {
EXPECT_EQ(events[i][j + 1].name(), "pop");
#ifdef PADDLE_WITH_CUDA
EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0);
EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
#else
EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0);
EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
#endif
}
}