diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 9427702cbcfdc..5cd909b4e08f0 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -162,8 +162,9 @@ void RecordEvent::OriginalConstruct(const std::string &name, void RecordEvent::End() { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA - if (g_enable_nvprof_hook && is_pushed_ && is_enabled_) { + if (g_enable_nvprof_hook && is_pushed_) { dynload::nvtxRangePop(); + is_pushed_ = false; } #endif #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 4061e2d4d494d..0c5d90a9277a6 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include #include "glog/logging.h" @@ -75,6 +76,26 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { // log all nodes except root node, root node is a helper node. const std::map> thread2host_event_nodes = node_trees.Traverse(true); + // find the earliest time in current timeline + start_time_ = std::numeric_limits::max(); + for (auto it = thread2host_event_nodes.begin(); + it != thread2host_event_nodes.end(); ++it) { + if (it->second.begin() + 1 != it->second.end()) { + if ((*(it->second.begin() + 1))->StartNs() < start_time_) { + start_time_ = (*(it->second.begin() + 1))->StartNs(); + } + } else { + auto runtimenode = + (*(it->second.begin()))->GetRuntimeTraceEventNodes().begin(); + if (runtimenode != + (*(it->second.begin()))->GetRuntimeTraceEventNodes().end()) { + if ((*runtimenode)->StartNs() < start_time_) { + start_time_ = (*runtimenode)->StartNs(); + } + } + } + } + for (auto it = thread2host_event_nodes.begin(); it != thread2host_event_nodes.end(); ++it) { for (auto hostnode = it->second.begin(); hostnode != it->second.end(); @@ -102,6 +123,13 @@ void ChromeTracingLogger::LogHostTraceEventNode( if (!output_file_stream_) { return; } + std::string dur_display; + float dur = nsToMsFloat(host_node.Duration()); + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } switch (host_node.Type()) { case TracerEventType::ProfileStep: case TracerEventType::Forward: @@ -110,42 +138,50 @@ void ChromeTracingLogger::LogHostTraceEventNode( case TracerEventType::Optimization: case TracerEventType::PythonOp: case TracerEventType::PythonUserDefined: + // cname value comes from tracing.js reservedColorsByName variable + output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(Python)", - "ts": %lld, "dur": %lld, + "name": "%s[%s]", "pid": %lld, "tid": "%lld(Python)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_runnable", "args": { - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), + host_node.ThreadId(), nsToUs(host_node.StartNs()), + nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], - host_node.StartNs(), host_node.EndNs()); + nsToUsFloat(host_node.StartNs(), start_time_), + nsToUsFloat(host_node.EndNs(), start_time_)); break; default: output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, "dur": %lld, + "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_runnable", "args": { - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), + host_node.ThreadId(), nsToUs(host_node.StartNs()), + nsToUsFloat(host_node.Duration()), categary_name_[static_cast(host_node.Type())], - host_node.StartNs(), host_node.EndNs()); + nsToUsFloat(host_node.StartNs(), start_time_), + nsToUsFloat(host_node.EndNs(), start_time_)); break; } @@ -157,26 +193,35 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( if (!output_file_stream_) { return; } + float dur = nsToMsFloat(runtime_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, "dur": %lld, + "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "thread_state_running", "args": { "correlation id": %d, - "start_ns": %lld, - "end_ns": %lld + "start_time": "%.3f us", + "end_time": "%.3f us" } }, )JSON"), - runtime_node.Name().c_str(), runtime_node.ProcessId(), - runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), - nsToUs(runtime_node.Duration()), + runtime_node.Name().c_str(), dur_display.c_str(), + runtime_node.ProcessId(), runtime_node.ThreadId(), + nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], - runtime_node.CorrelationId(), runtime_node.StartNs(), - runtime_node.EndNs()); + runtime_node.CorrelationId(), + nsToUsFloat(runtime_node.StartNs(), start_time_), + nsToUsFloat(runtime_node.EndNs(), start_time_)); pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); output_file_stream_ << string_format( @@ -199,6 +244,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( if (!output_file_stream_) { return; } + switch (device_node.Type()) { case TracerEventType::Kernel: HandleTypeKernel(device_node); @@ -265,17 +311,24 @@ void ChromeTracingLogger::HandleTypeKernel( kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, blocks_per_sm); #endif - + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "device": %d, "context": %d, "stream": %d, "correlation id": %d, "registers per thread": %d, @@ -284,15 +337,16 @@ void ChromeTracingLogger::HandleTypeKernel( "warps per SM": %f, "grid": [%d, %d, %d], "block": [%d, %d, %d], - "theoretical achieved occupancy %%": %f + "theoretical achieved occupancy %%": %.3f } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), + device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, @@ -308,53 +362,71 @@ void ChromeTracingLogger::HandleTypeMemcpy( if (device_node.Duration() > 0) { memory_bandwidth = memcpy_info.num_bytes * 1.0 / device_node.Duration(); } + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "stream": %d, "correlation id": %d, - "bytes": %d, "memory bandwidth (GB/s)": %f + "bytes": %d, "memory bandwidth (GB/s)": %.3f } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), + device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StartNs(), device_node.EndNs(), device_node.StreamId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(), device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); } void ChromeTracingLogger::HandleTypeMemset( const DeviceTraceEventNode& device_node) { MemsetEventInfo memset_info = device_node.MemsetInfo(); + float dur = nsToMsFloat(device_node.Duration()); + std::string dur_display; + if (dur > 1.0) { + dur_display = string_format(std::string("%.3f ms"), dur); + } else { + dur_display = string_format(std::string("%.3f us"), dur * 1000); + } output_file_stream_ << string_format( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, - "ts": %lld, "dur": %lld, + "name": "%s[%s]", "pid": %lld, "tid": %lld, + "ts": %lld, "dur": %.3f, "ph": "X", "cat": "%s", + "cname": "rail_animation", "args": { - "start_ns": %lld, - "end_ns": %lld, + "start_time": "%.3f us", + "end_time": "%.3f us", "device": %d, "context": %d, "stream": %d, "correlation id": %d, "bytes": %d, "value": %d } }, )JSON"), - device_node.Name().c_str(), device_node.DeviceId(), + device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.StreamId(), nsToUs(device_node.StartNs()), - nsToUs(device_node.Duration()), + nsToUsFloat(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + nsToUsFloat(device_node.StartNs(), start_time_), + nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), memset_info.num_bytes, memset_info.value); } diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 20a924a54cabd..8977ab748c63a 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -50,6 +50,7 @@ class ChromeTracingLogger : public BaseLogger { static const char* categary_name_[]; std::set> pid_tid_set_; std::set> deviceid_streamid_set_; + uint64_t start_time_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index b471d6b79833a..06d1636c4617c 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -44,7 +44,16 @@ static std::string GetStringFormatLocalTime() { return std::string(buf); } -static int64_t nsToUs(int64_t ns) { return ns / 1000; } +static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) { + return (end_ns - start_ns) / 1000; +} + +static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) { + return static_cast(end_ns - start_ns) / 1000; +} +static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) { + return static_cast(end_ns - start_ns) / 1000 / 1000; +} #ifdef PADDLE_WITH_CUPTI float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py index c93e5dce86d5b..0088687b12563 100755 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -128,6 +128,16 @@ def my_sheduler1(num_step): result = profiler.utils.load_profiler_result('./test_profiler_pb.pb') +class TestNvprof(unittest.TestCase): + def test_nvprof(self): + for i in range(10): + paddle.fluid.profiler._nvprof_range(i, 10, 20) + x_value = np.random.randn(2, 3, 3) + x = paddle.to_tensor( + x_value, stop_gradient=False, place=paddle.CPUPlace()) + y = x / 2.0 + + class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 1acd583ee0f1a..291326478e91b 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -77,17 +77,19 @@ def begin(self): r""" Record the time of begining. - .. code-block:: python - :name: code-example2 + Examples: - import paddle - import paddle.profiler as profiler - record_event = profiler.RecordEvent("record_sub") - record_event.begin() - data1 = paddle.randn(shape=[3]) - data2 = paddle.randn(shape=[3]) - result = data1 - data2 - record_event.end() + .. code-block:: python + :name: code-example2 + + import paddle + import paddle.profiler as profiler + record_event = profiler.RecordEvent("record_sub") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 - data2 + record_event.end() """ if self.event_type not in _AllowedEventTypeList: warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ @@ -102,17 +104,19 @@ def end(self): r''' Record the time of ending. - .. code-block:: python - :name: code-example3 + Examples: - import paddle - import paddle.profiler as profiler - record_event = profiler.RecordEvent("record_mul") - record_event.begin() - data1 = paddle.randn(shape=[3]) - data2 = paddle.randn(shape=[3]) - result = data1 * data2 - record_event.end() + .. code-block:: python + :name: code-example3 + + import paddle + import paddle.profiler as profiler + record_event = profiler.RecordEvent("record_mul") + record_event.begin() + data1 = paddle.randn(shape=[3]) + data2 = paddle.randn(shape=[3]) + result = data1 * data2 + record_event.end() ''' if self.event: self.event.end()