Skip to content

Commit

Permalink
Add time range duration display (#41029)
Browse files Browse the repository at this point in the history
* no

* fix bugs

* fix doc according to review

* fix api doc format

* fix api doc according to review

* fix bug and add unit test

* fix record event bug

* optimize chrome tracing display

* fix bug

* add comment

* add unit test

* fix a bug

* fix

* fix

* fix format
  • Loading branch information
rainyfly authored Mar 31, 2022
1 parent b9da48d commit 6744754
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 69 deletions.
3 changes: 2 additions & 1 deletion paddle/fluid/platform/profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,9 @@ void RecordEvent::OriginalConstruct(const std::string &name,
void RecordEvent::End() {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook && is_pushed_ && is_enabled_) {
if (g_enable_nvprof_hook && is_pushed_) {
dynload::nvtxRangePop();
is_pushed_ = false;
}
#endif
#endif
Expand Down
166 changes: 119 additions & 47 deletions paddle/fluid/platform/profiler/chrometracing_logger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ limitations under the License. */

#include <cstdio>
#include <ctime>
#include <limits>

#include "glog/logging.h"

Expand Down Expand Up @@ -75,6 +76,26 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
// log all nodes except root node, root node is a helper node.
const std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes = node_trees.Traverse(true);
// find the earliest time in current timeline
start_time_ = std::numeric_limits<uint64_t>::max();
for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) {
if (it->second.begin() + 1 != it->second.end()) {
if ((*(it->second.begin() + 1))->StartNs() < start_time_) {
start_time_ = (*(it->second.begin() + 1))->StartNs();
}
} else {
auto runtimenode =
(*(it->second.begin()))->GetRuntimeTraceEventNodes().begin();
if (runtimenode !=
(*(it->second.begin()))->GetRuntimeTraceEventNodes().end()) {
if ((*runtimenode)->StartNs() < start_time_) {
start_time_ = (*runtimenode)->StartNs();
}
}
}
}

for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) {
for (auto hostnode = it->second.begin(); hostnode != it->second.end();
Expand Down Expand Up @@ -102,6 +123,13 @@ void ChromeTracingLogger::LogHostTraceEventNode(
if (!output_file_stream_) {
return;
}
std::string dur_display;
float dur = nsToMsFloat(host_node.Duration());
if (dur > 1.0) {
dur_display = string_format(std::string("%.3f ms"), dur);
} else {
dur_display = string_format(std::string("%.3f us"), dur * 1000);
}
switch (host_node.Type()) {
case TracerEventType::ProfileStep:
case TracerEventType::Forward:
Expand All @@ -110,42 +138,50 @@ void ChromeTracingLogger::LogHostTraceEventNode(
case TracerEventType::Optimization:
case TracerEventType::PythonOp:
case TracerEventType::PythonUserDefined:
// cname value comes from tracing.js reservedColorsByName variable

output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": "%lld(Python)",
"ts": %lld, "dur": %lld,
"name": "%s[%s]", "pid": %lld, "tid": "%lld(Python)",
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "thread_state_runnable",
"args": {
"start_ns": %lld,
"end_ns": %lld
"start_time": "%.3f us",
"end_time": "%.3f us"
}
},
)JSON"),
host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(),
nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()),
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())],
host_node.StartNs(), host_node.EndNs());
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_));
break;
default:
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %lld,
"name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "thread_state_runnable",
"args": {
"start_ns": %lld,
"end_ns": %lld
"start_time": "%.3f us",
"end_time": "%.3f us"
}
},
)JSON"),
host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(),
nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()),
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())],
host_node.StartNs(), host_node.EndNs());
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_));
break;
}

Expand All @@ -157,26 +193,35 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
if (!output_file_stream_) {
return;
}
float dur = nsToMsFloat(runtime_node.Duration());
std::string dur_display;
if (dur > 1.0) {
dur_display = string_format(std::string("%.3f ms"), dur);
} else {
dur_display = string_format(std::string("%.3f us"), dur * 1000);
}
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %lld,
"name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "thread_state_running",
"args": {
"correlation id": %d,
"start_ns": %lld,
"end_ns": %lld
"start_time": "%.3f us",
"end_time": "%.3f us"
}
},
)JSON"),
runtime_node.Name().c_str(), runtime_node.ProcessId(),
runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()),
nsToUs(runtime_node.Duration()),
runtime_node.Name().c_str(), dur_display.c_str(),
runtime_node.ProcessId(), runtime_node.ThreadId(),
nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()),
categary_name_[static_cast<int>(runtime_node.Type())],
runtime_node.CorrelationId(), runtime_node.StartNs(),
runtime_node.EndNs());
runtime_node.CorrelationId(),
nsToUsFloat(runtime_node.StartNs(), start_time_),
nsToUsFloat(runtime_node.EndNs(), start_time_));
pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()});

output_file_stream_ << string_format(
Expand All @@ -199,6 +244,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
if (!output_file_stream_) {
return;
}

switch (device_node.Type()) {
case TracerEventType::Kernel:
HandleTypeKernel(device_node);
Expand Down Expand Up @@ -265,17 +311,24 @@ void ChromeTracingLogger::HandleTypeKernel(
kernel_info.block_x, kernel_info.block_y, kernel_info.block_z,
blocks_per_sm);
#endif

float dur = nsToMsFloat(device_node.Duration());
std::string dur_display;
if (dur > 1.0) {
dur_display = string_format(std::string("%.3f ms"), dur);
} else {
dur_display = string_format(std::string("%.3f us"), dur * 1000);
}
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": %lld,
"ts": %lld, "dur": %lld,
"name": "%s[%s]", "pid": %lld, "tid": %lld,
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "rail_animation",
"args": {
"start_ns": %lld,
"end_ns": %lld,
"start_time": "%.3f us",
"end_time": "%.3f us",
"device": %d, "context": %d,
"stream": %d, "correlation id": %d,
"registers per thread": %d,
Expand All @@ -284,15 +337,16 @@ void ChromeTracingLogger::HandleTypeKernel(
"warps per SM": %f,
"grid": [%d, %d, %d],
"block": [%d, %d, %d],
"theoretical achieved occupancy %%": %f
"theoretical achieved occupancy %%": %.3f
}
},
)JSON"),
device_node.Name().c_str(), device_node.DeviceId(),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUs(device_node.Duration()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(),
device_node.CorrelationId(), kernel_info.registers_per_thread,
kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory,
Expand All @@ -308,53 +362,71 @@ void ChromeTracingLogger::HandleTypeMemcpy(
if (device_node.Duration() > 0) {
memory_bandwidth = memcpy_info.num_bytes * 1.0 / device_node.Duration();
}
float dur = nsToMsFloat(device_node.Duration());
std::string dur_display;
if (dur > 1.0) {
dur_display = string_format(std::string("%.3f ms"), dur);
} else {
dur_display = string_format(std::string("%.3f us"), dur * 1000);
}
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": %lld,
"ts": %lld, "dur": %lld,
"name": "%s[%s]", "pid": %lld, "tid": %lld,
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "rail_animation",
"args": {
"start_ns": %lld,
"end_ns": %lld,
"start_time": "%.3f us",
"end_time": "%.3f us",
"stream": %d, "correlation id": %d,
"bytes": %d, "memory bandwidth (GB/s)": %f
"bytes": %d, "memory bandwidth (GB/s)": %.3f
}
},
)JSON"),
device_node.Name().c_str(), device_node.DeviceId(),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUs(device_node.Duration()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
device_node.StartNs(), device_node.EndNs(), device_node.StreamId(),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(),
device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth);
}

void ChromeTracingLogger::HandleTypeMemset(
const DeviceTraceEventNode& device_node) {
MemsetEventInfo memset_info = device_node.MemsetInfo();
float dur = nsToMsFloat(device_node.Duration());
std::string dur_display;
if (dur > 1.0) {
dur_display = string_format(std::string("%.3f ms"), dur);
} else {
dur_display = string_format(std::string("%.3f us"), dur * 1000);
}
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": %lld,
"ts": %lld, "dur": %lld,
"name": "%s[%s]", "pid": %lld, "tid": %lld,
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "rail_animation",
"args": {
"start_ns": %lld,
"end_ns": %lld,
"start_time": "%.3f us",
"end_time": "%.3f us",
"device": %d, "context": %d,
"stream": %d, "correlation id": %d,
"bytes": %d, "value": %d
}
},
)JSON"),
device_node.Name().c_str(), device_node.DeviceId(),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUs(device_node.Duration()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(),
device_node.CorrelationId(), memset_info.num_bytes, memset_info.value);
}
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/platform/profiler/chrometracing_logger.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class ChromeTracingLogger : public BaseLogger {
static const char* categary_name_[];
std::set<std::pair<uint64_t, uint64_t>> pid_tid_set_;
std::set<std::pair<uint64_t, uint64_t>> deviceid_streamid_set_;
uint64_t start_time_;
};

} // namespace platform
Expand Down
11 changes: 10 additions & 1 deletion paddle/fluid/platform/profiler/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,16 @@ static std::string GetStringFormatLocalTime() {
return std::string(buf);
}

static int64_t nsToUs(int64_t ns) { return ns / 1000; }
static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) {
return (end_ns - start_ns) / 1000;
}

static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
return static_cast<float>(end_ns - start_ns) / 1000;
}
static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
return static_cast<float>(end_ns - start_ns) / 1000 / 1000;
}

#ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread,
Expand Down
10 changes: 10 additions & 0 deletions python/paddle/fluid/tests/unittests/test_newprofiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,16 @@ def my_sheduler1(num_step):
result = profiler.utils.load_profiler_result('./test_profiler_pb.pb')


class TestNvprof(unittest.TestCase):
def test_nvprof(self):
for i in range(10):
paddle.fluid.profiler._nvprof_range(i, 10, 20)
x_value = np.random.randn(2, 3, 3)
x = paddle.to_tensor(
x_value, stop_gradient=False, place=paddle.CPUPlace())
y = x / 2.0


class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
Expand Down
Loading

0 comments on commit 6744754

Please sign in to comment.