diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 4e89e5c600bf7..4283724d28afc 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -13,12 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/platform/profiler.h" +#include +#include +#include "glog/logging.h" namespace paddle { namespace platform { // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; +// To record which timer the profiler used, CUDA or CPU. +static std::string g_profiler_place = ""; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; @@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() { Event::Event(EventKind kind, std::string name, uint32_t thread_id, DeviceContext* dev_ctx) - : kind_(kind), - name_(std::move(name)), - thread_id_(thread_id), - has_cuda_(false) { + : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { #ifdef PADDLE_WITH_CUDA auto* cuda_dev_ctx = static_cast(dev_ctx); if (cuda_dev_ctx) { @@ -72,11 +74,11 @@ std::string Event::kind() const { PADDLE_THROW("Unknown EventKind."); } -double Event::CpuElapsedUs(const Event& e) const { - return (e.cpu_ns_ - cpu_ns_) / (1000.0); +double Event::CpuElapsedMs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000000.0); } -double Event::CudaElapsedUs(const Event& e) const { +double Event::CudaElapsedMs(const Event& e) const { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE(e.has_cuda() && has_cuda()); PADDLE_ENFORCE(e.device() == device()); @@ -84,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const { PADDLE_ENFORCE(cudaEventSynchronize(e.event())); float ms; PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); - return ms * 1000.0; + return ms; #else PADDLE_THROW("CUDA is not enabled"); #endif @@ -113,21 +115,27 @@ inline EventList& GetEventList() { } void Mark(const std::string& name, DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id, - dev_ctx); + GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); +} + +void PushEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); +} + +void PopEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; - GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, - dev_ctx_); + name_ = name; + PushEvent(name_, dev_ctx_); } RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled) return; - GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id, - dev_ctx_); + PopEvent(name_, dev_ctx_); } void EnableProfiler(ProfilerState state) { @@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) { "The profiling state should be disabled when calling ", "EnableProfiler."); g_state = state; + g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy evenets first to reduce the startup overhead. @@ -169,5 +178,152 @@ std::vector> DisableProfiler() { return result; } +void ParseEvents(std::vector>& events, + EventSortingKey sorted_by) { + if (g_profiler_place == "") return; + + std::string sorted_domain; + std::function sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.ave_time > b.ave_time; + }; + break; + default: + sorted_domain = "event end time"; + } + + std::vector> events_table; + size_t max_name_width = 0; + for (size_t i = 0; i < events.size(); i++) { + std::list pushed_events; + std::vector event_items; + std::unordered_map event_idx; + + for (size_t j = 0; j < events[i].size(); j++) { + if (events[i][j].kind() == "push") { + pushed_events.push_back(events[i][j]); + } else if (events[i][j].kind() == "pop") { + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend() && + rit->name() != events[i][j].name()) { + ++rit; + } + + if (rit != pushed_events.rend()) { + double event_time = (g_profiler_place == "CUDA") + ? rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + + if (event_idx.find(event_name) == event_idx.end()) { + event_idx[event_name] = event_items.size(); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time}; + event_items.push_back(event_item); + } else { + int index = event_idx[event_name]; + event_items[index].calls += 1; + // total time + event_items[index].total_time += event_time; + // min time + event_items[index].min_time = + std::min(event_time, event_items[index].min_time); + // max time + event_items[index].max_time = + std::max(event_time, event_items[index].max_time); + } + + // remove the push marker from the list + pushed_events.erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << events[i][j].name() + << "\', which will be ignored in profiling report."; + } + } + } + // average time + for (auto& item : event_items) { + item.ave_time = item.total_time / item.calls; + } + // sort + if (sorted_by != EventSortingKey::kDefault) { + std::sort(event_items.begin(), event_items.end(), sorted_func); + } + + events_table.push_back(event_items); + // log warning if there are events with `push` but without `pop` + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend()) { + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; + ++rit; + } + } + + // Print report + PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12); +} + +void PrintProfilingReport(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::cout << "Place: " << g_profiler_place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." << std::endl; + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + EventItem& event_item = events_table[i][j]; + std::cout << std::setw(name_width) << event_item.name + << std::setw(data_width) << event_item.calls + << std::setw(data_width) << event_item.total_time + << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time << std::endl; + } + } + std::cout << std::endl; +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 47104ea9d08da..6df48ef8806e8 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -33,6 +33,7 @@ class Event { std::string kind() const; std::string name() const { return name_; } + uint32_t thread_id() const { return thread_id_; } bool has_cuda() const { return has_cuda_; } #ifdef PADDLE_WITH_CUDA @@ -40,8 +41,8 @@ class Event { int device() const { return device_; } #endif - double CpuElapsedUs(const Event& e) const; - double CudaElapsedUs(const Event& e) const; + double CpuElapsedMs(const Event& e) const; + double CudaElapsedMs(const Event& e) const; private: EventKind kind_; @@ -94,6 +95,10 @@ enum ProfilerState { void Mark(const std::string& name, DeviceContext* dev_ctx); +void PushEvent(const std::string& name, DeviceContext* dev_ctx); + +void PopEvent(const std::string& name, DeviceContext* dev_ctx); + struct RecordEvent { explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); @@ -101,6 +106,8 @@ struct RecordEvent { // The device context is used by Event to get the current cuda stream. DeviceContext* dev_ctx_; + // Event name + std::string name_; }; // Enable the profiling function. @@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state); // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> DisableProfiler(); +// The information of each event given in the profiling report +struct EventItem { + std::string name; + int calls; + double total_time; + double min_time; + double max_time; + double ave_time; +}; + +// Candidate keys to sort the profiling report +enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; + +// Parse the event list and output the profiling report +void ParseEvents(std::vector>&, + EventSortingKey sorted_by = EventSortingKey::kDefault); + +// Print results +void PrintProfilingReport(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width); } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 47cf7be146121..13dea713c71e1 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) { counter++; } Event stop_event(EventKind::kPopRange, "test", 0, nullptr); - EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); + EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); } #ifdef PADDLE_WITH_CUDA @@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) { counter++; } Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); - EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0); + EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); } #endif @@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) { using paddle::platform::EventKind; using paddle::platform::RecordEvent; using paddle::platform::ProfilerState; + using paddle::platform::EventSortingKey; ProfilerState state = ProfilerState::kCPU; DeviceContext* dev_ctx = nullptr; @@ -67,13 +68,45 @@ TEST(RecordEvent, RecordEvent) { #endif EnableProfiler(state); + /* Usage 1: + * PushEvent(evt_name, dev_ctx); + * ... + * code to be analyzed + * ... + * PopEvent(evt_name, dev_ctx); + */ + for (int loop = 0; loop < 3; ++loop) { + for (int i = 1; i < 5; ++i) { + std::string name = "op_" + std::to_string(i); + PushEvent(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + PopEvent(name, dev_ctx); + } + } + + /* Usage 2: + * { + * RecordEvent record_event(name, dev_ctx); + * ... + * code to be analyzed + * ... + * } + */ for (int i = 1; i < 5; ++i) { - std::string name = "op_" + std::to_string(i); + std::string name = "evs_op_" + std::to_string(i); RecordEvent record_event(name, dev_ctx); int counter = 1; while (counter != i * 1000) counter++; } + + // Bad Usage: + PushEvent("event_without_pop", dev_ctx); + PopEvent("event_without_push", dev_ctx); std::vector> events = paddle::platform::DisableProfiler(); + // Will remove parsing-related code from test later + ParseEvents(events, EventSortingKey::kTotal); + int cuda_startup_count = 0; int start_profiler_count = 0; int stop_profiler_count = 0; @@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) { if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); #ifdef PADDLE_WITH_CUDA - EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0); + EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); #else - EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0); + EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); #endif } }