From 55b17c11710a595dc64ec9123012f329c6a36d62 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Tue, 26 Dec 2017 12:28:45 +0000 Subject: [PATCH 01/10] Add the parsing part for the profiling tool --- paddle/platform/profiler.cc | 59 ++++++++++++++++++++++++++++++++ paddle/platform/profiler.h | 12 ++++++- paddle/platform/profiler_test.cc | 22 ++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 40b34b732c9d3..3d95097048c8c 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/platform/profiler.h" +#include namespace paddle { namespace platform { @@ -70,5 +71,63 @@ std::vector> DisableProfiler() { return result; } +void PushEvent(const std::string name, const platform::DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId, + dev_ctx); +} + +void PopEvent(const std::string name, const platform::DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPopRange, std::move(name), kThreadId, + dev_ctx); +} + +void ParseEvents(std::vector> events) { + std::map> events_table; + for (size_t i = 0; i < events.size(); i++) { + std::list pushed_events; + for (size_t j = 0; j < events[i].size(); j++) { + if (events[i][j].kind() == "push") { + pushed_events.push_back(events[i][j]); + } + if (events[i][j].kind() == "pop") { + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit->name() != events[i][j].name() && + rit != pushed_events.rend()) { + ++rit; + } + if (rit != pushed_events.rend()) { + Event pushed_event = *rit; + double cpu_time = rit->CpuElapsedUs(events[i][j]); + double cuda_time = 0; +#ifdef PADDLE_WITH_CUDA + cuda_time = rit->CudaElapsedUs(events[i][j]); +#endif + if (events_table.find(rit->name()) == events_table.end()) { + events_table[rit->name()] = std::make_tuple(1, cpu_time, cuda_time); + } else { + std::get<0>(events_table[rit->name()]) += 1; + std::get<1>(events_table[rit->name()]) += cpu_time; + std::get<2>(events_table[rit->name()]) += cuda_time; + } + // remove the start marker from the list + pushed_events.erase((++rit).base()); + } else { + std::cout << "Warning: can not find the start marker of event " + << events[i][j].name(); + } + } + } + } + // output events table + std::cout << "\nEvents\t\tCalls\t\tTotal CPU time\t\tTotal GPU time\n"; + for (std::map>::iterator it = + events_table.begin(); + it != events_table.end(); ++it) { + std::cout << it->first << "\t\t" << std::get<0>(it->second) << "\t\t" + << std::get<1>(it->second) << "\t\t" << std::get<2>(it->second) + << std::endl; + } +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 2242635024ca6..5f21ff8c1c393 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -173,25 +173,35 @@ inline void Mark(const std::string name, GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx); } +void PushEvent(const std::string name, + const platform::DeviceContext* dev_ctx = nullptr); + +void PopEvent(const std::string name, + const platform::DeviceContext* dev_ctx = nullptr); + struct RecordEvent { explicit RecordEvent(const std::string name, platform::DeviceContext* dev_ctx = nullptr) { if (kState == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; + name_ = name; GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId, dev_ctx_); } ~RecordEvent() { if (kState == ProfilerState::kDisabled) return; - GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId, + GetEventList().Record(EventKind::kPopRange, std::move(name_), kThreadId, dev_ctx_); } platform::DeviceContext* dev_ctx_; + std::string name_; }; void EnableProfiler(ProfilerState state); std::vector> DisableProfiler(); +void ParseEvents(std::vector>); + } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 5bd0a9d859930..b2f1dea46596c 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -67,8 +67,29 @@ TEST(RecordEvent, RecordEvent) { #endif EnableProfiler(state); + /* Usage 1: + * PushEvent(evt_name, dev_ctx); + * ... + * code to time + * ... + * PopEvent(evt_name, dev_ctx); + */ for (int i = 1; i < 5; ++i) { std::string name = "op_" + std::to_string(i); + PushEvent(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + PopEvent(name, dev_ctx); + } + + /* Usage 2: + * { + * RecordEvent record_event(name, dev_ctx); + * ... + * } + */ + for (int i = 1; i < 5; ++i) { + std::string name = "evs_op_" + std::to_string(i); RecordEvent record_event(name, dev_ctx); int counter = 1; while (counter != i * 1000) counter++; @@ -77,6 +98,7 @@ TEST(RecordEvent, RecordEvent) { int cuda_startup_count = 0; int start_profiler_count = 0; int stop_profiler_count = 0; + ParseEvents(events); for (size_t i = 0; i < events.size(); ++i) { for (size_t j = 0; j < events[i].size(); ++j) { if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; From 0f4410755fa2fd333aaf1f009388773b0c7bbd6c Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 5 Jan 2018 03:38:12 +0000 Subject: [PATCH 02/10] Confirm the contents in profiling report --- paddle/platform/profiler.cc | 52 ++++++++++++++++++++++---------- paddle/platform/profiler.h | 1 + paddle/platform/profiler_test.cc | 6 ++++ 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 64b8bd1485a80..f11c87f4df83d 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -3,7 +3,7 @@ licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -` + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/platform/profiler.h" +#include #include namespace paddle { @@ -183,7 +184,8 @@ void PopEvent(const std::string& name, DeviceContext* dev_ctx) { void ParseEvents(std::vector>& events) { // Event name :: counts :: ave :: min :: max :: total - std::map> events_table; + std::map> + events_table; for (size_t i = 0; i < events.size(); i++) { std::list pushed_events; for (size_t j = 0; j < events[i].size(); j++) { @@ -197,18 +199,28 @@ void ParseEvents(std::vector>& events) { ++rit; } if (rit != pushed_events.rend()) { - Event pushed_event = *rit; - double cpu_time = rit->CpuElapsedUs(events[i][j]); - double cuda_time = 0; #ifdef PADDLE_WITH_CUDA - cuda_time = rit->CudaElapsedUs(events[i][j]); + double event_time = rit->CudaElapsedUs(events[i][j]); +#else + double event_time = rit->CpuElapsedUs(events[i][j]); #endif - if (events_table.find(rit->name()) == events_table.end()) { - events_table[rit->name()] = std::make_tuple(1, cpu_time, cuda_time); + std::string event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + if (events_table.find(event_name) == events_table.end()) { + events_table[event_name] = + std::make_tuple(1, event_time, event_time, event_time, 0); } else { - std::get<0>(events_table[rit->name()]) += 1; - std::get<1>(events_table[rit->name()]) += cpu_time; - std::get<2>(events_table[rit->name()]) += cuda_time; + std::get<0>(events_table[event_name]) += 1; + // total time + std::get<1>(events_table[event_name]) += event_time; + // min time + if (std::get<2>(events_table[event_name]) > event_time) { + std::get<2>(events_table[event_name]) = event_time; + } + // max time + if (std::get<3>(events_table[event_name]) < event_time) { + std::get<3>(events_table[event_name]) = event_time; + } } // remove the start marker from the list pushed_events.erase((++rit).base()); @@ -220,13 +232,21 @@ void ParseEvents(std::vector>& events) { } } // output events table - std::cout << "\nEvents\t\tCalls\t\tTotal CPU time\t\tTotal GPU time\n"; - for (std::map>::iterator it = + std::cout << std::setw(20) << "Events" << std::setw(10) << "Calls" + << std::setw(10) << "Total" << std::setw(10) << "Min" + << std::setw(10) << "Max" << std::setw(10) << "Ave" << std::endl; + for (std::map>::iterator it = events_table.begin(); it != events_table.end(); ++it) { - std::cout << it->first << "\t\t" << std::get<0>(it->second) << "\t\t" - << std::get<1>(it->second) << "\t\t" << std::get<2>(it->second) - << std::endl; + // average time + std::get<4>(it->second) = std::get<1>(it->second) / std::get<0>(it->second); + std::cout << std::setw(20) << it->first << std::setw(10) + << std::get<0>(it->second) << std::setw(10) + << std::get<1>(it->second) << std::setw(10) + << std::get<2>(it->second) << std::setw(10) + << std::get<3>(it->second) << std::setw(10) + << std::get<4>(it->second) << std::endl; } } diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index eb36355f86d65..eb176421a9243 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -33,6 +33,7 @@ class Event { std::string kind() const; std::string name() const { return name_; } + uint32_t thread_id() const { return thread_id_; } bool has_cuda() const { return has_cuda_; } #ifdef PADDLE_WITH_CUDA diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index ff5e658f5bd55..7966b35a16edf 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -94,6 +94,12 @@ TEST(RecordEvent, RecordEvent) { int counter = 1; while (counter != i * 1000) counter++; } + for (int i = 1; i < 5; ++i) { + std::string name = "evs_op_" + std::to_string(i); + RecordEvent record_event(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + } std::vector> events = paddle::platform::DisableProfiler(); int cuda_startup_count = 0; int start_profiler_count = 0; From 2d94eca8a1f707830cabb6eb0afe8879c376ac5c Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 5 Jan 2018 05:45:45 +0000 Subject: [PATCH 03/10] Format profiling report --- paddle/platform/profiler.cc | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index f11c87f4df83d..52312d5a57654 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -186,6 +186,7 @@ void ParseEvents(std::vector>& events) { // Event name :: counts :: ave :: min :: max :: total std::map> events_table; + size_t max_name_width = 0; for (size_t i = 0; i < events.size(); i++) { std::list pushed_events; for (size_t j = 0; j < events[i].size(); j++) { @@ -206,6 +207,7 @@ void ParseEvents(std::vector>& events) { #endif std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); if (events_table.find(event_name) == events_table.end()) { events_table[event_name] = std::make_tuple(1, event_time, event_time, event_time, 0); @@ -232,21 +234,24 @@ void ParseEvents(std::vector>& events) { } } // output events table - std::cout << std::setw(20) << "Events" << std::setw(10) << "Calls" - << std::setw(10) << "Total" << std::setw(10) << "Min" - << std::setw(10) << "Max" << std::setw(10) << "Ave" << std::endl; + std::cout.setf(std::ios::left); + const int data_width = 12; + std::cout << std::setw(max_name_width + 4) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." << std::endl; for (std::map>::iterator it = events_table.begin(); it != events_table.end(); ++it) { // average time std::get<4>(it->second) = std::get<1>(it->second) / std::get<0>(it->second); - std::cout << std::setw(20) << it->first << std::setw(10) - << std::get<0>(it->second) << std::setw(10) - << std::get<1>(it->second) << std::setw(10) - << std::get<2>(it->second) << std::setw(10) - << std::get<3>(it->second) << std::setw(10) - << std::get<4>(it->second) << std::endl; + std::cout << std::setw(max_name_width + 4) << it->first + << std::setw(data_width) << std::get<0>(it->second) + << std::setw(data_width) << std::get<1>(it->second) + << std::setw(data_width) << std::get<2>(it->second) + << std::setw(data_width) << std::get<3>(it->second) + << std::setw(data_width) << std::get<4>(it->second) << std::endl; } } From 0aa03a822e17e905b75e2df5a6cadc31c4159cfc Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 5 Jan 2018 08:54:35 +0000 Subject: [PATCH 04/10] Enable sorting the profiling result by different keys --- paddle/platform/profiler.cc | 124 +++++++++++++++++++++++-------- paddle/platform/profiler.h | 17 ++++- paddle/platform/profiler_test.cc | 31 ++++---- 3 files changed, 124 insertions(+), 48 deletions(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 52312d5a57654..a35d6e94f072d 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -182,47 +182,88 @@ void PopEvent(const std::string& name, DeviceContext* dev_ctx) { dev_ctx); } -void ParseEvents(std::vector>& events) { - // Event name :: counts :: ave :: min :: max :: total - std::map> - events_table; +void ParseEvents(std::vector>& events, + EventSortingKey sorted_by) { + // Output header information + std::cout << "------------------------->" + << " Profiling Report " + << "<-------------------------" + << "\n\n"; +#ifdef PADDLE_WITH_CUDA + std::cout << "Place: GPU" << std::endl; +#else + std::cout << "Place: CPU" << std::endl; +#endif + std::cout << "Time unit: ms" << std::endl; + std::string sort_domain = "event end time"; + switch (sorted_by) { + case EventSortingKey::kCalls: + sort_domain = "number of calls"; + break; + case EventSortingKey::kTotal: + sort_domain = "total time"; + break; + case EventSortingKey::kMin: + sort_domain = "minimum time"; + break; + case EventSortingKey::kMax: + sort_domain = "maximum time"; + break; + case EventSortingKey::kAve: + sort_domain = "average time"; + break; + default: + if (sorted_by != EventSortingKey::kDefault) { + std::cout << "Warning: unkown sorting key. "; + sorted_by = EventSortingKey::kDefault; + } + } + std::cout << "Sorted by " << sort_domain + << " in descending order in the same thread\n\n"; + + // Parse events + std::vector> events_table; size_t max_name_width = 0; for (size_t i = 0; i < events.size(); i++) { std::list pushed_events; + std::vector event_items; + std::unordered_map event_idx; + for (size_t j = 0; j < events[i].size(); j++) { if (events[i][j].kind() == "push") { pushed_events.push_back(events[i][j]); - } - if (events[i][j].kind() == "pop") { + } else if (events[i][j].kind() == "pop") { std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit->name() != events[i][j].name() && rit != pushed_events.rend()) { ++rit; } if (rit != pushed_events.rend()) { +// get event time in ms #ifdef PADDLE_WITH_CUDA - double event_time = rit->CudaElapsedUs(events[i][j]); + double event_time = rit->CudaElapsedUs(events[i][j]) / 1000.0; #else - double event_time = rit->CpuElapsedUs(events[i][j]); + double event_time = rit->CpuElapsedUs(events[i][j]) / 1000.0; #endif std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); max_name_width = std::max(max_name_width, event_name.size()); - if (events_table.find(event_name) == events_table.end()) { - events_table[event_name] = - std::make_tuple(1, event_time, event_time, event_time, 0); + if (event_idx.find(event_name) == event_idx.end()) { + event_idx[event_name] = event_items.size(); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time}; + event_items.push_back(event_item); } else { - std::get<0>(events_table[event_name]) += 1; + int index = event_idx[event_name]; + event_items[index].calls += 1; // total time - std::get<1>(events_table[event_name]) += event_time; + event_items[index].total_time += event_time; // min time - if (std::get<2>(events_table[event_name]) > event_time) { - std::get<2>(events_table[event_name]) = event_time; - } + event_items[index].min_time = + std::min(event_time, event_items[index].min_time); // max time - if (std::get<3>(events_table[event_name]) < event_time) { - std::get<3>(events_table[event_name]) = event_time; - } + event_items[index].max_time = + std::max(event_time, event_items[index].max_time); } // remove the start marker from the list pushed_events.erase((++rit).base()); @@ -232,6 +273,29 @@ void ParseEvents(std::vector>& events) { } } } + // average time + for (auto& item : event_items) { + item.ave_time = item.total_time / item.calls; + } + // sort + if (sorted_by != EventSortingKey::kDefault) { + std::sort(event_items.begin(), event_items.end(), + [&](EventItem& a, EventItem& b) { + switch (sorted_by) { + case EventSortingKey::kCalls: + return a.calls > b.calls; + case EventSortingKey::kTotal: + return a.total_time > b.total_time; + case EventSortingKey::kMin: + return a.min_time > b.min_time; + case EventSortingKey::kMax: + return a.max_time > b.max_time; + default: + return a.ave_time > b.ave_time; + } + }); + } + events_table.push_back(event_items); } // output events table std::cout.setf(std::ios::left); @@ -240,18 +304,16 @@ void ParseEvents(std::vector>& events) { << "Calls" << std::setw(data_width) << "Total" << std::setw(data_width) << "Min." << std::setw(data_width) << "Max." << std::setw(data_width) << "Ave." << std::endl; - for (std::map>::iterator it = - events_table.begin(); - it != events_table.end(); ++it) { - // average time - std::get<4>(it->second) = std::get<1>(it->second) / std::get<0>(it->second); - std::cout << std::setw(max_name_width + 4) << it->first - << std::setw(data_width) << std::get<0>(it->second) - << std::setw(data_width) << std::get<1>(it->second) - << std::setw(data_width) << std::get<2>(it->second) - << std::setw(data_width) << std::get<3>(it->second) - << std::setw(data_width) << std::get<4>(it->second) << std::endl; + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + EventItem& event_item = events_table[i][j]; + std::cout << std::setw(max_name_width + 4) << event_item.name + << std::setw(data_width) << event_item.calls + << std::setw(data_width) << event_item.total_time + << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time << std::endl; + } } } diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index eb176421a9243..7504aff5cd9a6 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -117,7 +117,22 @@ void EnableProfiler(ProfilerState state); // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> DisableProfiler(); -void ParseEvents(std::vector>&); +// The information of each event given in the profiling report +struct EventItem { + std::string name; + int calls; + double total_time; + double min_time; + double max_time; + double ave_time; +}; + +// Candidate keys to sort the profiling report +enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; + +// Parse the event list and output the profiling report +void ParseEvents(std::vector>&, + EventSortingKey sorted_by = EventSortingKey::kDefault); } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 7966b35a16edf..68b75e7063779 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) { using paddle::platform::EventKind; using paddle::platform::RecordEvent; using paddle::platform::ProfilerState; + using paddle::platform::EventSortingKey; ProfilerState state = ProfilerState::kCPU; DeviceContext* dev_ctx = nullptr; @@ -70,22 +71,26 @@ TEST(RecordEvent, RecordEvent) { /* Usage 1: * PushEvent(evt_name, dev_ctx); * ... - * code to time + * code to analyze * ... * PopEvent(evt_name, dev_ctx); */ - for (int i = 1; i < 5; ++i) { - std::string name = "op_" + std::to_string(i); - PushEvent(name, dev_ctx); - int counter = 1; - while (counter != i * 1000) counter++; - PopEvent(name, dev_ctx); + for (int loop = 0; loop < 3; ++loop) { + for (int i = 1; i < 5; ++i) { + std::string name = "op_" + std::to_string(i); + PushEvent(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + PopEvent(name, dev_ctx); + } } /* Usage 2: * { * RecordEvent record_event(name, dev_ctx); * ... + * code to analyze + * ... * } */ for (int i = 1; i < 5; ++i) { @@ -94,19 +99,13 @@ TEST(RecordEvent, RecordEvent) { int counter = 1; while (counter != i * 1000) counter++; } - for (int i = 1; i < 5; ++i) { - std::string name = "evs_op_" + std::to_string(i); - RecordEvent record_event(name, dev_ctx); - int counter = 1; - while (counter != i * 1000) counter++; - } std::vector> events = paddle::platform::DisableProfiler(); + // Will remove from test before merging + ParseEvents(events, EventSortingKey::kTotal); + int cuda_startup_count = 0; int start_profiler_count = 0; int stop_profiler_count = 0; - - ParseEvents(events); - for (size_t i = 0; i < events.size(); ++i) { for (size_t j = 0; j < events[i].size(); ++j) { if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; From d7e56847c7063c0281e13cfda0cc6ba78761a618 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 5 Jan 2018 09:04:27 +0000 Subject: [PATCH 05/10] fix typos --- paddle/platform/profiler_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 68b75e7063779..d4fd4c885039d 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -71,7 +71,7 @@ TEST(RecordEvent, RecordEvent) { /* Usage 1: * PushEvent(evt_name, dev_ctx); * ... - * code to analyze + * code to be analyzed * ... * PopEvent(evt_name, dev_ctx); */ @@ -89,7 +89,7 @@ TEST(RecordEvent, RecordEvent) { * { * RecordEvent record_event(name, dev_ctx); * ... - * code to analyze + * code to be analyzed * ... * } */ From 5a0a4617536aa4c79a30f195f88653794fe98678 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 5 Jan 2018 09:57:28 +0000 Subject: [PATCH 06/10] Make time calc funcs return ms instead of us --- paddle/platform/profiler.cc | 13 ++++++------- paddle/platform/profiler.h | 4 ++-- paddle/platform/profiler_test.cc | 8 ++++---- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index a35d6e94f072d..ea16e56076761 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -74,11 +74,11 @@ std::string Event::kind() const { PADDLE_THROW("Unknown EventKind."); } -double Event::CpuElapsedUs(const Event& e) const { - return (e.cpu_ns_ - cpu_ns_) / (1000.0); +double Event::CpuElapsedMs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000000.0); } -double Event::CudaElapsedUs(const Event& e) const { +double Event::CudaElapsedMs(const Event& e) const { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE(e.has_cuda() && has_cuda()); PADDLE_ENFORCE(e.device() == device()); @@ -86,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const { PADDLE_ENFORCE(cudaEventSynchronize(e.event())); float ms; PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); - return ms * 1000.0; + return ms; #else PADDLE_THROW("CUDA is not enabled"); #endif @@ -239,11 +239,10 @@ void ParseEvents(std::vector>& events, ++rit; } if (rit != pushed_events.rend()) { -// get event time in ms #ifdef PADDLE_WITH_CUDA - double event_time = rit->CudaElapsedUs(events[i][j]) / 1000.0; + double event_time = rit->CudaElapsedMs(events[i][j]); #else - double event_time = rit->CpuElapsedUs(events[i][j]) / 1000.0; + double event_time = rit->CpuElapsedMs(events[i][j]); #endif std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 7504aff5cd9a6..d51ec6443217c 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -41,8 +41,8 @@ class Event { int device() const { return device_; } #endif - double CpuElapsedUs(const Event& e) const; - double CudaElapsedUs(const Event& e) const; + double CpuElapsedMs(const Event& e) const; + double CudaElapsedMs(const Event& e) const; private: EventKind kind_; diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index d4fd4c885039d..3800f3914f188 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) { counter++; } Event stop_event(EventKind::kPopRange, "test", 0, nullptr); - EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); + EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); } #ifdef PADDLE_WITH_CUDA @@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) { counter++; } Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); - EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0); + EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); } #endif @@ -114,9 +114,9 @@ TEST(RecordEvent, RecordEvent) { if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); #ifdef PADDLE_WITH_CUDA - EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0); + EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); #else - EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0); + EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); #endif } } From df3b250c7000269489522fcaa1f62e18e2f44ee8 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 5 Jan 2018 14:52:03 +0000 Subject: [PATCH 07/10] Fix bad_alloc bug & refine code in profiler --- paddle/platform/profiler.cc | 150 ++++++++++++++++--------------- paddle/platform/profiler.h | 4 + paddle/platform/profiler_test.cc | 6 +- 3 files changed, 88 insertions(+), 72 deletions(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index ea16e56076761..abb1e6a89cf9e 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -15,12 +15,16 @@ limitations under the License. */ #include "paddle/platform/profiler.h" #include #include +#include "gflags/gflags.h" +#include "glog/logging.h" namespace paddle { namespace platform { // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; +// To record which timer the profiler used, CUDA or CPU. +static std::string g_profiler_place = ""; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; @@ -45,10 +49,7 @@ inline uint64_t GetTimeInNsec() { Event::Event(EventKind kind, std::string name, uint32_t thread_id, DeviceContext* dev_ctx) - : kind_(kind), - name_(std::move(name)), - thread_id_(thread_id), - has_cuda_(false) { + : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { #ifdef PADDLE_WITH_CUDA auto* cuda_dev_ctx = static_cast(dev_ctx); if (cuda_dev_ctx) { @@ -115,22 +116,27 @@ inline EventList& GetEventList() { } void Mark(const std::string& name, DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id, - dev_ctx); + GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); +} + +void PushEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); +} + +void PopEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; name_ = name; - GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, - dev_ctx_); + PushEvent(name_, dev_ctx_); } RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled) return; - GetEventList().Record(EventKind::kPopRange, std::move(name_), g_thread_id, - dev_ctx_); + PopEvent(name_, dev_ctx_); } void EnableProfiler(ProfilerState state) { @@ -141,6 +147,7 @@ void EnableProfiler(ProfilerState state) { "The profiling state should be disabled when calling ", "EnableProfiler."); g_state = state; + g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy evenets first to reduce the startup overhead. @@ -172,56 +179,8 @@ std::vector> DisableProfiler() { return result; } -void PushEvent(const std::string& name, DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, - dev_ctx); -} - -void PopEvent(const std::string& name, DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kPopRange, std::move(name), g_thread_id, - dev_ctx); -} - void ParseEvents(std::vector>& events, EventSortingKey sorted_by) { - // Output header information - std::cout << "------------------------->" - << " Profiling Report " - << "<-------------------------" - << "\n\n"; -#ifdef PADDLE_WITH_CUDA - std::cout << "Place: GPU" << std::endl; -#else - std::cout << "Place: CPU" << std::endl; -#endif - std::cout << "Time unit: ms" << std::endl; - std::string sort_domain = "event end time"; - switch (sorted_by) { - case EventSortingKey::kCalls: - sort_domain = "number of calls"; - break; - case EventSortingKey::kTotal: - sort_domain = "total time"; - break; - case EventSortingKey::kMin: - sort_domain = "minimum time"; - break; - case EventSortingKey::kMax: - sort_domain = "maximum time"; - break; - case EventSortingKey::kAve: - sort_domain = "average time"; - break; - default: - if (sorted_by != EventSortingKey::kDefault) { - std::cout << "Warning: unkown sorting key. "; - sorted_by = EventSortingKey::kDefault; - } - } - std::cout << "Sorted by " << sort_domain - << " in descending order in the same thread\n\n"; - - // Parse events std::vector> events_table; size_t max_name_width = 0; for (size_t i = 0; i < events.size(); i++) { @@ -234,19 +193,19 @@ void ParseEvents(std::vector>& events, pushed_events.push_back(events[i][j]); } else if (events[i][j].kind() == "pop") { std::list::reverse_iterator rit = pushed_events.rbegin(); - while (rit->name() != events[i][j].name() && - rit != pushed_events.rend()) { + while (rit != pushed_events.rend() && + rit->name() != events[i][j].name()) { ++rit; } + if (rit != pushed_events.rend()) { -#ifdef PADDLE_WITH_CUDA - double event_time = rit->CudaElapsedMs(events[i][j]); -#else - double event_time = rit->CpuElapsedMs(events[i][j]); -#endif + double event_time = (g_state == ProfilerState::kCUDA) + ? rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); max_name_width = std::max(max_name_width, event_name.size()); + if (event_idx.find(event_name) == event_idx.end()) { event_idx[event_name] = event_items.size(); EventItem event_item = {event_name, 1, event_time, @@ -264,11 +223,13 @@ void ParseEvents(std::vector>& events, event_items[index].max_time = std::max(event_time, event_items[index].max_time); } + // remove the start marker from the list pushed_events.erase((++rit).base()); } else { - std::cout << "Warning: can not find the start marker of event " - << events[i][j].name(); + LOG(WARNING) << "Cannot find the push marker of event \'" + << events[i][j].name() + << "\', which will be ignored in profiling report."; } } } @@ -294,19 +255,65 @@ void ParseEvents(std::vector>& events, } }); } + events_table.push_back(event_items); + // To check whether there are events with `push` but without `pop` + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend()) { + if (rit->kind() == "push") { + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; + } + ++rit; + } } - // output events table + + // Print report + PrintProfilingReport(events_table, sorted_by, max_name_width + 4, 12); +} + +void PrintProfilingReport(std::vector>& events_table, + EventSortingKey sorted_by, const size_t name_width, + const size_t data_width) { + if (g_profiler_place == "") return; + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::cout << "Place: " << g_profiler_place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::string sort_domain = "event end time"; + switch (sorted_by) { + case EventSortingKey::kCalls: + sort_domain = "number of calls"; + break; + case EventSortingKey::kTotal: + sort_domain = "total time"; + break; + case EventSortingKey::kMin: + sort_domain = "minimum time"; + break; + case EventSortingKey::kMax: + sort_domain = "maximum time"; + break; + case EventSortingKey::kAve: + sort_domain = "average time"; + break; + default: + break; + } + std::cout << "Sorted by " << sort_domain + << " in descending order in the same thread\n\n"; + // Output events table std::cout.setf(std::ios::left); - const int data_width = 12; - std::cout << std::setw(max_name_width + 4) << "Event" << std::setw(data_width) + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) << "Calls" << std::setw(data_width) << "Total" << std::setw(data_width) << "Min." << std::setw(data_width) << "Max." << std::setw(data_width) << "Ave." << std::endl; for (size_t i = 0; i < events_table.size(); ++i) { for (size_t j = 0; j < events_table[i].size(); ++j) { EventItem& event_item = events_table[i][j]; - std::cout << std::setw(max_name_width + 4) << event_item.name + std::cout << std::setw(name_width) << event_item.name << std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.total_time << std::setw(data_width) << event_item.min_time @@ -314,6 +321,7 @@ void ParseEvents(std::vector>& events, << std::setw(data_width) << event_item.ave_time << std::endl; } } + std::cout << std::endl; } } // namespace platform diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index d51ec6443217c..f97a586787780 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -134,5 +134,9 @@ enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; void ParseEvents(std::vector>&, EventSortingKey sorted_by = EventSortingKey::kDefault); +// Print results +void PrintProfilingReport(std::vector>& events_table, + EventSortingKey sorted_by, const size_t name_width, + const size_t data_width); } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 3800f3914f188..13dea713c71e1 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -99,8 +99,12 @@ TEST(RecordEvent, RecordEvent) { int counter = 1; while (counter != i * 1000) counter++; } + + // Bad Usage: + PushEvent("event_without_pop", dev_ctx); + PopEvent("event_without_push", dev_ctx); std::vector> events = paddle::platform::DisableProfiler(); - // Will remove from test before merging + // Will remove parsing-related code from test later ParseEvents(events, EventSortingKey::kTotal); int cuda_startup_count = 0; From 7a4f3be9f3f44781f789b60620a77f3857a6cd15 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Sat, 6 Jan 2018 06:14:48 +0000 Subject: [PATCH 08/10] Fix profiler place bug --- paddle/platform/profiler.cc | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index abb1e6a89cf9e..239df23128ede 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -181,6 +181,7 @@ std::vector> DisableProfiler() { void ParseEvents(std::vector>& events, EventSortingKey sorted_by) { + if (g_profiler_place == "") return; std::vector> events_table; size_t max_name_width = 0; for (size_t i = 0; i < events.size(); i++) { @@ -199,7 +200,7 @@ void ParseEvents(std::vector>& events, } if (rit != pushed_events.rend()) { - double event_time = (g_state == ProfilerState::kCUDA) + double event_time = (g_profiler_place == "CUDA") ? rit->CudaElapsedMs(events[i][j]) : rit->CpuElapsedMs(events[i][j]); std::string event_name = @@ -224,7 +225,7 @@ void ParseEvents(std::vector>& events, std::max(event_time, event_items[index].max_time); } - // remove the start marker from the list + // remove the push marker from the list pushed_events.erase((++rit).base()); } else { LOG(WARNING) << "Cannot find the push marker of event \'" @@ -257,13 +258,11 @@ void ParseEvents(std::vector>& events, } events_table.push_back(event_items); - // To check whether there are events with `push` but without `pop` + // log warning if there are events with `push` but without `pop` std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit != pushed_events.rend()) { - if (rit->kind() == "push") { - LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() - << "\', which will be ignored in profiling report."; - } + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; ++rit; } } @@ -275,7 +274,6 @@ void ParseEvents(std::vector>& events, void PrintProfilingReport(std::vector>& events_table, EventSortingKey sorted_by, const size_t name_width, const size_t data_width) { - if (g_profiler_place == "") return; // Output header information std::cout << "\n------------------------->" << " Profiling Report " From d09503b2bef5b134b50e7a9069426276bcf73762 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 8 Jan 2018 04:29:57 +0000 Subject: [PATCH 09/10] Remove the redundant switch case statement --- paddle/platform/profiler.cc | 80 +++++++++++++++++++------------------ paddle/platform/profiler.h | 2 +- 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 239df23128ede..2562b2b5f07e0 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -182,6 +182,44 @@ std::vector> DisableProfiler() { void ParseEvents(std::vector>& events, EventSortingKey sorted_by) { if (g_profiler_place == "") return; + + std::string sorted_domain; + std::function sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](EventItem& a, EventItem& b) { + return a.ave_time > b.ave_time; + }; + break; + default: + sorted_domain = "event end time"; + } + std::vector> events_table; size_t max_name_width = 0; for (size_t i = 0; i < events.size(); i++) { @@ -240,21 +278,7 @@ void ParseEvents(std::vector>& events, } // sort if (sorted_by != EventSortingKey::kDefault) { - std::sort(event_items.begin(), event_items.end(), - [&](EventItem& a, EventItem& b) { - switch (sorted_by) { - case EventSortingKey::kCalls: - return a.calls > b.calls; - case EventSortingKey::kTotal: - return a.total_time > b.total_time; - case EventSortingKey::kMin: - return a.min_time > b.min_time; - case EventSortingKey::kMax: - return a.max_time > b.max_time; - default: - return a.ave_time > b.ave_time; - } - }); + std::sort(event_items.begin(), event_items.end(), sorted_func); } events_table.push_back(event_items); @@ -268,11 +292,11 @@ void ParseEvents(std::vector>& events, } // Print report - PrintProfilingReport(events_table, sorted_by, max_name_width + 4, 12); + PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12); } void PrintProfilingReport(std::vector>& events_table, - EventSortingKey sorted_by, const size_t name_width, + std::string& sorted_domain, const size_t name_width, const size_t data_width) { // Output header information std::cout << "\n------------------------->" @@ -280,27 +304,7 @@ void PrintProfilingReport(std::vector>& events_table, << "<-------------------------\n\n"; std::cout << "Place: " << g_profiler_place << std::endl; std::cout << "Time unit: ms" << std::endl; - std::string sort_domain = "event end time"; - switch (sorted_by) { - case EventSortingKey::kCalls: - sort_domain = "number of calls"; - break; - case EventSortingKey::kTotal: - sort_domain = "total time"; - break; - case EventSortingKey::kMin: - sort_domain = "minimum time"; - break; - case EventSortingKey::kMax: - sort_domain = "maximum time"; - break; - case EventSortingKey::kAve: - sort_domain = "average time"; - break; - default: - break; - } - std::cout << "Sorted by " << sort_domain + std::cout << "Sorted by " << sorted_domain << " in descending order in the same thread\n\n"; // Output events table std::cout.setf(std::ios::left); diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index f97a586787780..6df48ef8806e8 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -136,7 +136,7 @@ void ParseEvents(std::vector>&, // Print results void PrintProfilingReport(std::vector>& events_table, - EventSortingKey sorted_by, const size_t name_width, + std::string& sorted_domain, const size_t name_width, const size_t data_width); } // namespace platform } // namespace paddle From ea0280b4a17ae3862c127bcdda1e7900b5ba1e11 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 8 Jan 2018 04:33:46 +0000 Subject: [PATCH 10/10] Remove unused included header gflags --- paddle/platform/profiler.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 2562b2b5f07e0..4283724d28afc 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/platform/profiler.h" #include #include -#include "gflags/gflags.h" #include "glog/logging.h" namespace paddle {