From 7e44acd9e2250dec7c9abac06f7207aa63ec7f19 Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 17 Feb 2022 17:20:13 +0800 Subject: [PATCH 01/12] fix RecordEvent interface --- paddle/fluid/platform/profiler.cc | 32 ++++++++++--------- .../fluid/platform/profiler/event_tracing.h | 20 +++++++----- paddle/fluid/platform/profiler/trace_event.h | 12 +++++++ 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 8fecf444dc41b..866bf3c66aa2a 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -66,8 +66,8 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const char *name, const EventRole role, - uint32_t level) { +RecordEvent::RecordEvent(const char *name, const TracerEventType type, + uint32_t level, const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -86,11 +86,12 @@ RecordEvent::RecordEvent(const char *name, const EventRole role, is_enabled_ = true; shallow_copy_name_ = name; role_ = role; + type_ = type; start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const EventRole role, - uint32_t level) { +RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, + uint32_t level, const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -109,11 +110,13 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, is_enabled_ = true; name_ = new std::string(name); role_ = role; + type_ = type; start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const EventRole role, - const std::string &attr, uint32_t level) { +RecordEvent::RecordEvent(const std::string &name, const std::string &attr, + const TracerEventType type, uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -130,6 +133,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, return; } is_enabled_ = true; + type_ = type; name_ = new std::string(name); start_ns_ = PosixInNsec(); attr_ = new std::string(attr); @@ -164,17 +168,15 @@ void RecordEvent::End() { uint64_t end_ns = PosixInNsec(); if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) { if (LIKELY(shallow_copy_name_ != nullptr)) { - HostEventRecorder::GetInstance().RecordEvent(shallow_copy_name_, - start_ns_, end_ns, role_, - TracerEventType::NumTypes); + HostEventRecorder::GetInstance().RecordEvent( + shallow_copy_name_, start_ns_, end_ns, role_, type_); } else if (name_ != nullptr) { if (attr_ == nullptr) { - HostEventRecorder::GetInstance().RecordEvent( - *name_, start_ns_, end_ns, role_, TracerEventType::NumTypes); + HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, + role_, type_); } else { - HostEventRecorder::GetInstance().RecordEvent( - *name_, start_ns_, end_ns, role_, TracerEventType::NumTypes, - *attr_); + HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, + role_, type_, *attr_); delete attr_; } delete name_; @@ -301,7 +303,7 @@ void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void Mark(const std::string &name) { if (FLAGS_enable_host_event_recorder_hook) { HostEventRecorder::GetInstance().RecordEvent( - name, 0, 0, EventRole::kOrdinary, TracerEventType::NumTypes); + name, 0, 0, EventRole::kOrdinary, TracerEventType::UserDefined); return; } GetEventList().Record(EventType::kMark, name, g_thread_id); 
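For reference, the reworked interface is meant to be called as in the following sketch (illustrative only: the function and op names here are invented, but the constructor signatures and the enum values follow the header changes in this patch):

    #include "paddle/fluid/platform/profiler/event_tracing.h"

    void IllustrativeOpRun() {
      // Operator-level event: explicit TracerEventType plus trace level; the
      // role argument now comes last and still defaults to kOrdinary.
      paddle::platform::RecordEvent op_event(
          "my_op", paddle::platform::TracerEventType::Operator, 1);
      {
        // Inner stage of the same op, distinguished by type and role.
        paddle::platform::RecordEvent stage_event(
            "my_op::compute",
            paddle::platform::TracerEventType::OperatorInner, 1,
            paddle::platform::EventRole::kInnerOp);
        // ... kernel computation happens here ...
      }  // stage_event records its end timestamp on destruction.
    }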
diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index 2532077bcc3bd..9711159563a8f 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -34,16 +34,19 @@ struct RecordInstantEvent { // Chrome Trace Viewer Format: Duration Event/Complete Event class RecordEvent { public: - explicit RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary, - uint32_t level = 1); + explicit RecordEvent( + const std::string& name, + const TracerEventType type = TracerEventType::UserDefined, + uint32_t level = 1, const EventRole role = EventRole::kOrdinary); - explicit RecordEvent(const char* name, - const EventRole role = EventRole::kOrdinary, - uint32_t level = 1); + explicit RecordEvent( + const char* name, + const TracerEventType type = TracerEventType::UserDefined, + uint32_t level = 1, const EventRole role = EventRole::kOrdinary); - RecordEvent(const std::string& name, const EventRole role, - const std::string& attr, uint32_t level = 1); + RecordEvent(const std::string& name, const std::string& attr, + const TracerEventType type = TracerEventType::UserDefined, + uint32_t level = 1, const EventRole role = EventRole::kOrdinary); // Stop event tracing explicitly before the object goes out of scope. // Sometimes it's inconvenient to use RAII @@ -65,6 +68,7 @@ class RecordEvent { // different kernel invocations within an op. // std::string full_name_; EventRole role_{EventRole::kOrdinary}; + TracerEventType type_{TracerEventType::UserDefined}; std::string* attr_{nullptr}; bool finished_{false}; }; diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 3e4903f6ffb64..61f96218560ec 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -36,6 +36,18 @@ enum class TracerEventType { Memset = 6, // Used to mark records defined by the user UserDefined = 7, + // Used to mark operator detail (such as infer shape, compute) + OperatorInner = 8, + // Used to mark the forward stage of model training or testing + Forward = 9, + // Used to mark the backward stage of model training + Backward = 10, + // Used to mark the optimization stage of model training + Optimization = 11, + // Used to mark communication in distributed training + Communication = 12, + // Used to mark Python API calls + PythonOp = 13, // A flag to denote the number of current types NumTypes }; From c6fdca747011b315f5e22558bd66d4e9ee0d50ac Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 17 Feb 2022 18:17:16 +0800 Subject: [PATCH 02/12] modify default level to 4 --- paddle/fluid/platform/profiler/event_tracing.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index 9711159563a8f..167876e3d45ce 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -37,16 +37,16 @@ class RecordEvent { explicit RecordEvent( const std::string& name, const TracerEventType type = TracerEventType::UserDefined, - uint32_t level = 1, const EventRole role = EventRole::kOrdinary); + uint32_t level = 4, const EventRole role = EventRole::kOrdinary); explicit RecordEvent( const char* name, const TracerEventType type = TracerEventType::UserDefined, - uint32_t level = 1, const EventRole role = EventRole::kOrdinary); + uint32_t level = 4, const
EventRole role = EventRole::kOrdinary); RecordEvent(const std::string& name, const std::string& attr, const TracerEventType type = TracerEventType::UserDefined, - uint32_t level = 1, const EventRole role = EventRole::kOrdinary); + uint32_t level = 4, const EventRole role = EventRole::kOrdinary); // Stop event tracing explicitly before the object goes out of scope. // Sometimes it's inconvenient to use RAII From 6791fe071ffdb3f9475deedc1d7d8aa2bb3758c6 Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 17 Feb 2022 21:15:23 +0800 Subject: [PATCH 03/12] update interface usage --- .../fluid/framework/data_layout_transform.cc | 3 ++- paddle/fluid/framework/operator.cc | 16 +++++++++----- paddle/fluid/imperative/prepared_operator.cc | 22 +++++++++++-------- .../mkldnn/elementwise_add_mkldnn_op.cc | 10 +++++---- .../mkldnn/elementwise_sub_mkldnn_op.cc | 10 +++++---- paddle/fluid/operators/marker_op.cc | 7 +++--- paddle/fluid/operators/marker_op.cu | 7 +++--- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 5 +++-- .../mkldnn/conv_transpose_mkldnn_op.cc | 10 +++++---- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 10 +++++---- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 10 +++++---- paddle/fluid/platform/mkldnn_helper.h | 5 +++-- paddle/fluid/platform/mkldnn_reuse.h | 21 +++++++++++------- 13 files changed, 83 insertions(+), 53 deletions(-) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 1bf6f12e63cbb..16ed7194b6d09 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -183,7 +183,8 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("ext_reorder", - platform::EventRole::kUniqueOp); + platform::TracerEventType::UserDefined, + 2, platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); } else { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7ab4e2acecfcc..e8fbe54b34c74 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -32,6 +32,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/kernel_factory.h" @@ -261,10 +262,12 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // TODO(wangchaochaohu): refine code to use only one RecordEvent // in order to record different op type cost time // and different op name cost time, we set two events.
- platform::RecordEvent op_type_record_event(Type()); + platform::RecordEvent op_type_record_event( + Type(), platform::TracerEventType::Operator, 1); auto op_name = platform::OpName(outputs_, Type()); platform::RecordEvent op_name_record_event( - op_name, platform::EventRole::kUniqueOp); + op_name, platform::TracerEventType::Operator, 1, + platform::EventRole::kUniqueOp); RunImpl(scope, place); } @@ -1253,7 +1256,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, Scope* transfer_scope = nullptr; { platform::RecordEvent record_event("prepare_data", - platform::EventRole::kInnerOp); + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); if (need_prepare_data_) { transfer_scope = PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); @@ -1265,7 +1269,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (!all_kernels_must_compute_runtime_shape_) { platform::RecordEvent record_event("infer_shape", - platform::EventRole::kInnerOp); + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); this->Info().infer_shape_(&infer_shape_ctx); } @@ -1278,7 +1283,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // not Scope. Imperative mode only pass inputs and get outputs. { platform::RecordEvent record_event("compute", - platform::EventRole::kInnerOp); + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); if (run_pten_kernel_) { pten::KernelContext pt_kernel_context; // Do data transform before building KernelContext diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c56f82d0bc084..4e86220e154e6 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -27,7 +27,7 @@ #endif #include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); @@ -348,16 +348,18 @@ static void PreparedOpRunImpl( framework::Scope scope; { - platform::RecordEvent record_event(op.Type() + " infer_shape", - platform::EventRole::kInnerOp); + platform::RecordEvent record_event(op.Type() + "::infer_shape", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); DygraphInferShapeContext infer_shape_ctx( &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); } { - platform::RecordEvent record_event(op.Type() + " compute", - platform::EventRole::kInnerOp); + platform::RecordEvent record_event(op.Type() + "::compute", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); @@ -403,16 +405,18 @@ static void PreparedOpRunPtImpl( const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { { - platform::RecordEvent record_event(op.Type() + " infer_shape", - platform::EventRole::kInnerOp); + platform::RecordEvent record_event(op.Type() + "::infer_shape", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); DygraphInferShapeContext infer_shape_ctx( &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); } { - platform::RecordEvent record_event(op.Type() + " compute", - 
platform::EventRole::kInnerOp); + platform::RecordEvent record_event(op.Type() + "::compute", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); PreparePtenData(pt_kernel, pt_kernel_signature, ins); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 9305f42021192..fc90eda9f448c 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -57,8 +57,9 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); @@ -73,8 +74,9 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 642ee1feb7a5d..fe505fe2e51a8 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -57,8 +57,9 @@ class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); @@ -78,8 +79,9 @@ class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { reorder_attr.set_output_scales(0, scales); auto reorder_p = std::make_shared( *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/operators/marker_op.cc b/paddle/fluid/operators/marker_op.cc index 397e3bfc6ad26..277a730be9c30 100644 --- a/paddle/fluid/operators/marker_op.cc +++ b/paddle/fluid/operators/marker_op.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { @@ -63,8 +63,9 @@ class MarkerOpCPUKernel : public framework::OpKernel { auto marker_pos = ctx.Attr("marker_pos"); platform::RecordEvent record_event( - "MarkerCPU", platform::EventRole::kInnerOp, - "marker_" + marker_role + "_" + marker_pos); + "MarkerCPU", "marker_" + marker_role + "_" + marker_pos, + platform::TracerEventType::OperatorInner, 1, + platform::EventRole::kInnerOp); } }; } // namespace operators diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu index b918210389169..cfa5c6dc7a918 100644 --- a/paddle/fluid/operators/marker_op.cu +++ b/paddle/fluid/operators/marker_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { @@ -45,8 +45,9 @@ class MarkerOpCUDAKernel : public framework::OpKernel { auto* in_temp = A.mutable_data({32, 1}, ctx.GetPlace()); auto* out_temp = B.mutable_data({32, 1}, ctx.GetPlace()); platform::RecordEvent record_event( - "MarkerCUDA", platform::EventRole::kInnerOp, - "marker_" + marker_role + "_" + marker_pos); + "MarkerCUDA", "marker_" + marker_role + "_" + marker_pos, + platform::TracerEventType::OperatorInner, 1, + platform::EventRole::kInnerOp); SimpleMarkerKernel<<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, 32); } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index c5215751c8325..5774c3a16766a 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -976,8 +976,9 @@ class ConvMKLDNNGradOpKernel : public framework::OpKernel { handler.AcquireReorder(reorder_dst_memory_p, diff_weights_memory_p); { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, *diff_weights_memory_p, *reorder_dst_memory_p); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 4a3d1f455bd26..1d565839fc4ed 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -264,8 +264,9 @@ class ConvTransposeMKLDNNHandlerT dev_ctx.SetBlob(key_reorder_p, reorder_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); @@ -286,8 +287,9 @@ class ConvTransposeMKLDNNHandlerT auto reorder_p = std::static_pointer_cast( dev_ctx.GetBlob(key_reorder_p)); if (reorder_p != nullptr) { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", 
platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 153b0be6dad8f..7296a91f30d6f 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -284,8 +284,9 @@ class FCPrimitiveFactory { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder.execute(astream, src_mem, *dst_mem); astream.wait(); } @@ -312,8 +313,9 @@ class FCPrimitiveFactory { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder.execute(astream, {{DNNL_ARG_FROM, *src_mem}, {DNNL_ARG_TO, *dst_mem}}); astream.wait(); diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index bc2dbf5696813..31c41f89d8a06 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -116,8 +116,9 @@ class MulPrimitiveFactory { auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder.execute(astream, src_mem, dst_mem); astream.wait(); } @@ -277,8 +278,9 @@ class MulPrimitiveFactory { auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder.execute(astream, src_mem, dst_mem); astream.wait(); } diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 9dbfe7013fae8..7515d810e0b62 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "dnnl.hpp" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { #ifdef PADDLE_WITH_MKLDNN using MKLDNNMemoryFormat = dnnl::memory::format_tag; @@ -190,7 +190,8 @@ inline void Reorder(dnnl::memory src, dnnl::memory dst, auto reorder_prim = dnnl::reorder(src, dst); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::TracerEventType::UserDefined, + 2, platform::EventRole::kUniqueOp); reorder_prim.execute(astream, src, dst); astream.wait(); } diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 8d706263f029c..7b8ca1ca42860 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -197,7 +197,8 @@ class MKLDNNHandlerNoCachingT { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::TracerEventType::UserDefined, + 2, platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); @@ -221,8 +222,9 @@ class MKLDNNHandlerNoCachingT { std::make_shared(*user_memory_p, *target_memory_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); @@ -514,7 +516,8 @@ class MKLDNNHandlerT { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::TracerEventType::UserDefined, + 2, platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); @@ -558,8 +561,9 @@ class MKLDNNHandlerT { dev_ctx_.SetBlob(key_reorder_p, reorder_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); @@ -580,8 +584,9 @@ class MKLDNNHandlerT { auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); if (reorder_p != nullptr) { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); reorder_p->execute(astream, {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); astream.wait(); From 370e952b680f0562127b7462f2bd2ab46141b15b Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 17 Feb 2022 21:33:38 +0800 Subject: [PATCH 04/12] add const default trace level --- paddle/fluid/platform/profiler/event_tracing.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git 
a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index 167876e3d45ce..54c5b219310a9 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -21,12 +21,13 @@ limitations under the License. */ namespace paddle { namespace platform { +static constexpr uint32_t kDefaultTraceLevel = 4; // CPU event tracing. A trace marks something that happens but has no duration // associated with it. For example, thread starts working. // Chrome Trace Viewer Format: Instant Event struct RecordInstantEvent { explicit RecordInstantEvent(const char* name, TracerEventType type, - uint32_t level = 1); + uint32_t level = kDefaultTraceLevel); }; // CPU event tracing. A trace starts when an object of this class is created and // stops when the object is destroyed. // Chrome Trace Viewer Format: Duration Event/Complete Event class RecordEvent { public: explicit RecordEvent( const std::string& name, const TracerEventType type = TracerEventType::UserDefined, - uint32_t level = 4, const EventRole role = EventRole::kOrdinary); + uint32_t level = kDefaultTraceLevel, + const EventRole role = EventRole::kOrdinary); - explicit RecordEvent( - const char* name, - const TracerEventType type = TracerEventType::UserDefined, - uint32_t level = 4, const EventRole role = EventRole::kOrdinary); + explicit RecordEvent(const char* name, const TracerEventType type = + TracerEventType::UserDefined, + uint32_t level = kDefaultTraceLevel, + const EventRole role = EventRole::kOrdinary); RecordEvent(const std::string& name, const std::string& attr, const TracerEventType type = TracerEventType::UserDefined, - uint32_t level = 4, const EventRole role = EventRole::kOrdinary); + uint32_t level = kDefaultTraceLevel, + const EventRole role = EventRole::kOrdinary); // Stop event tracing explicitly before the object goes out of scope.
// Sometimes it's inconvenient to use RAII From decbf4e774f82a98b760cf96020a9483646c08bf Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 17 Feb 2022 21:58:44 +0800 Subject: [PATCH 05/12] update record event interface usage --- .../distributed/ps/service/brpc_ps_server.cc | 25 ++++++--- .../ps/service/communicator/communicator.cc | 53 ++++++++++++++----- .../ps/service/graph_brpc_server.cc | 2 +- .../distributed/ps/service/heter_client.cc | 6 ++- .../distributed/ps/service/heter_server.h | 6 ++- .../ps/service/ps_service/graph_py_service.cc | 2 +- .../framework/details/all_reduce_op_handle.cc | 5 +- .../bind_threaded_ssa_graph_executor.cc | 2 +- .../framework/details/broadcast_op_handle.cc | 5 +- .../details/eager_deletion_op_handle.cc | 5 +- .../fast_threaded_ssa_graph_executor.cc | 5 +- .../details/fetch_async_op_handle.cc | 4 +- .../framework/details/fetch_op_handle.cc | 5 +- .../details/fused_all_reduce_op_handle.cc | 5 +- .../details/fused_broadcast_op_handle.cc | 5 +- .../framework/details/reduce_op_handle.cc | 5 +- .../fluid/framework/details/rpc_op_handle.cc | 5 +- .../details/scale_loss_grad_op_handle.cc | 5 +- paddle/fluid/pybind/tensor_py.h | 2 +- 19 files changed, 101 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc index 58ce52552c9d2..fca7f8ac67419 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace google { namespace protobuf { @@ -188,7 +188,8 @@ void BrpcPsService::service(google::protobuf::RpcController *cntl_base, int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - platform::RecordEvent record_event("PsService->pull_dense"); + platform::RecordEvent record_event( + "PsService->pull_dense", platform::TracerEventType::Communication, 1); CHECK_TABLE_EXIST(table, request, response) if (request.params_size() < 1) { set_response_code( @@ -219,7 +220,9 @@ int32_t BrpcPsService::push_dense_param(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - platform::RecordEvent record_event("PsService->push_dense_param"); + platform::RecordEvent record_event("PsService->push_dense_param", + platform::TracerEventType::Communication, + 1); CHECK_TABLE_EXIST(table, request, response) thread_local std::string push_buffer; auto &req_io_buffer = cntl->request_attachment(); @@ -245,7 +248,8 @@ int32_t BrpcPsService::push_dense_param(Table *table, int32_t BrpcPsService::push_dense(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - platform::RecordEvent record_event("PsService->push_dense"); + platform::RecordEvent record_event( + "PsService->push_dense", platform::TracerEventType::Communication, 1); CHECK_TABLE_EXIST(table, request, response) auto req_buffer_size = request.data().size(); if (req_buffer_size < 1) { @@ -291,7 +295,9 @@ int32_t BrpcPsService::push_sparse_param(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - platform::RecordEvent
record_event("PsService->push_sparse_param"); + platform::RecordEvent record_event("PsService->push_sparse_param", + platform::TracerEventType::Communication, + 1); CHECK_TABLE_EXIST(table, request, response) auto &push_data = request.data(); if (push_data.size() < 1) { @@ -323,7 +329,8 @@ int32_t BrpcPsService::pull_geo_param(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - platform::RecordEvent record_event("PsService->pull_geo_param"); + platform::RecordEvent record_event( + "PsService->pull_geo_param", platform::TracerEventType::Communication, 1); CHECK_TABLE_EXIST(table, request, response) thread_local std::string push_sparse_request_buffer; @@ -346,7 +353,8 @@ int32_t BrpcPsService::pull_sparse(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - platform::RecordEvent record_event("PsService->pull_sparse"); + platform::RecordEvent record_event( + "PsService->pull_sparse", platform::TracerEventType::Communication, 1); CHECK_TABLE_EXIST(table, request, response) auto &req_io_buffer = cntl->request_attachment(); @@ -392,7 +400,8 @@ int32_t BrpcPsService::push_sparse(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - platform::RecordEvent record_event("PsService->push_sparse"); + platform::RecordEvent record_event( + "PsService->push_sparse", platform::TracerEventType::Communication, 1); CHECK_TABLE_EXIST(table, request, response) auto &push_data = request.data(); if (push_data.size() < 1) { diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index 99973ee8bdd74..b17fefaf1f51e 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" #define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" @@ -113,7 +113,9 @@ int Communicator::SetClients(std::vector &host_sign_list) { void Communicator::RpcRecvDense(const std::vector &varnames, int table_id, Scope *scope) { - platform::RecordEvent record_event("Communicator->RpcRecvDense"); + platform::RecordEvent record_event("Communicator->RpcRecvDense", + platform::TracerEventType::Communication, + 1); std::vector regions; regions.reserve(varnames.size()); for (auto &t : varnames) { @@ -169,7 +171,9 @@ void Communicator::RpcRecvDense(const std::vector &varnames, void Communicator::RpcSendDenseParam(const std::vector &varnames, int table_id, const Scope &scope) { - platform::RecordEvent record_event("Communicator->RpcSendDenseParam"); + platform::RecordEvent record_event("Communicator->RpcSendDenseParam", + platform::TracerEventType::Communication, + 1); auto place = platform::CPUPlace(); std::vector regions; for (auto &t : varnames) { @@ -206,7 +210,9 @@ void Communicator::RpcSendDenseParam(const std::vector &varnames, } void Communicator::RpcSendDense(const CommContext &ctx, const Scope &scope) { - platform::RecordEvent record_event("Communicator->RpcSendDense"); + platform::RecordEvent record_event("Communicator->RpcSendDense", + platform::TracerEventType::Communication, + 1); auto &var_names = ctx.origin_varnames; auto &table_id = ctx.table_id; auto dense_data = std::make_shared>(); @@ -250,7 +256,9 @@ void Communicator::RpcSendDense(const CommContext &ctx, const Scope &scope) { void Communicator::RpcSendSparseParam(const std::string &varname, int table_id, const Scope &scope) { - platform::RecordEvent record_event("Communicator->RpcSendSparseParam"); + platform::RecordEvent record_event("Communicator->RpcSendSparseParam", + platform::TracerEventType::Communication, + 1); size_t request_call_num = _worker_ptr->get_server_nums(); std::vector push_g_vec; @@ -287,7 +295,9 @@ void Communicator::RpcSendSparseParam(const std::string &varname, int table_id, void Communicator::RpcSendSparse(const std::string &var_name, int table_id, const Scope &scope) { - platform::RecordEvent record_event("Communicator->RpcSendSparse"); + platform::RecordEvent record_event("Communicator->RpcSendSparse", + platform::TracerEventType::Communication, + 1); size_t request_call_num = _worker_ptr->get_server_nums(); std::vector sparse_push_keys; std::vector push_g_vec; @@ -338,7 +348,9 @@ void Communicator::RpcSendSparse(const std::string &var_name, int table_id, void Communicator::RpcRecvSparse(const std::string &varname, int table_id, Scope *scope) { - platform::RecordEvent record_event("Communicator->RpcRecvSparse"); + platform::RecordEvent record_event("Communicator->RpcRecvSparse", + platform::TracerEventType::Communication, + 1); auto *send_var = scope->Var(varname); auto *tensor = send_var->GetMutable(); auto dim = tensor->dims()[1]; @@ -406,7 +418,9 @@ void Communicator::SendGlobalStep(const CommContext &ctx, int batches, if (batches == 0) { return; } - platform::RecordEvent record_event("Communicator->SendGlobalStep"); + platform::RecordEvent record_event("Communicator->SendGlobalStep", + platform::TracerEventType::Communication, + 1); auto &table_id = ctx.table_id; size_t request_call_num = _worker_ptr->get_server_nums(); @@ -994,7 +1008,8 @@ void 
SyncCommunicator::BarrierRecv() { void GeoCommunicator::Send(const std::vector &var_names, const framework::Scope &scope) { - platform::RecordEvent record_event("GeoCommunicator->Send"); + platform::RecordEvent record_event( + "GeoCommunicator->Send", platform::TracerEventType::Communication, 1); waiting_ = false; auto before_send = GetCurrentUS(); auto table_name = var_names[0]; @@ -1137,7 +1152,9 @@ void GeoCommunicator::InitDense(std::vector &varnames, } void GeoCommunicator::SendDense(const CommContext &send_ctx) { - platform::RecordEvent record_event("GeoCommunicator->SendDense"); + platform::RecordEvent record_event("GeoCommunicator->SendDense", + platform::TracerEventType::Communication, + 1); auto &var_names = send_ctx.origin_varnames; auto &table_id = send_ctx.table_id; for (auto &varname : var_names) { @@ -1178,7 +1195,9 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) { } void GeoCommunicator::RecvDense(const CommContext &send_ctx) { - platform::RecordEvent record_event("GeoCommunicator->RecvDense"); + platform::RecordEvent record_event("GeoCommunicator->RecvDense", + platform::TracerEventType::Communication, + 1); auto &table_id = send_ctx.table_id; auto &varnames = recv_varname_to_ctx_.at(table_id); // 1. recv from pserver @@ -1237,7 +1256,9 @@ void GeoCommunicator::InitSparse(const std::string &var_name, int table_id) { std::vector GeoCommunicator::MergeSparseIds( const std::string &send_varname) { - platform::RecordEvent record_event("GeoCommunicator->MergeSparseIds"); + platform::RecordEvent record_event("GeoCommunicator->MergeSparseIds", + platform::TracerEventType::Communication, + 1); size_t merge_num = 0, wait_times = 0; std::unordered_set sparse_ids; while (merge_num < static_cast(max_merge_var_num_)) { @@ -1269,7 +1290,9 @@ std::vector GeoCommunicator::MergeSparseIds( void GeoCommunicator::SendSparse(const std::string &varname, std::vector &sparse_ids, int table_id, int ep_idx) { - platform::RecordEvent record_event("GeoCommunicator->SendSparse"); + platform::RecordEvent record_event("GeoCommunicator->SendSparse", + platform::TracerEventType::Communication, + 1); if (sparse_ids.size() == 0) { return; } @@ -1346,7 +1369,9 @@ void GeoCommunicator::SendSparse(const std::string &varname, void GeoCommunicator::RecvSparse(const std::string &varname, int table_id, int ep_idx) { - platform::RecordEvent record_event("GeoCommunicator->RecvSparse"); + platform::RecordEvent record_event("GeoCommunicator->RecvSparse", + platform::TracerEventType::Communication, + 1); // 1. recv from pserver std::vector keys; std::vector values; diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 441f489fb3097..9dcda6f603202 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -21,7 +21,7 @@ #include "iomanip" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/framework/archive.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index 8aebae237360e..65472f1003bca 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -13,9 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/service/heter_client.h" - #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/split.h" DECLARE_int32(rpc_deadline); @@ -152,7 +152,9 @@ void HeterClient::SendAndRecvAsync( const std::string& message_name, const std::vector& send_var_name, const std::vector& recv_var_name, const std::string& mode) { - platform::RecordEvent record_event("HeterClient->SendAndRecvAsync"); + platform::RecordEvent record_event("HeterClient->SendAndRecvAsync", + platform::TracerEventType::Communication, + 1); const platform::DeviceContext* p_ctx = &ctx; const framework::Scope* p_scope = &scope; const std::string message_name_val = message_name; diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h index 86f83cb1fc4fe..8bf21ade1c217 100644 --- a/paddle/fluid/distributed/ps/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace google { namespace protobuf { @@ -213,7 +213,9 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler { int Handle(const MultiVarMsg* request, MultiVarMsg* response, brpc::Controller* cntl) override { - platform::RecordEvent record_event("RequestSendAndRecvHandler->Handle"); + platform::RecordEvent record_event("RequestSendAndRecvHandler->Handle", + platform::TracerEventType::Communication, + 1); FLAGS_eager_delete_tensor_gb = -1; // get microID from request diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index b2aece98071c1..088edcb75bbc6 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -18,7 +18,7 @@ #include "iomanip" #include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace distributed { std::vector GraphPyService::split(std::string& str, diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 1facbe850ee52..796a5c9010035 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DECLARE_bool(sync_nccl_allreduce); @@ -68,7 +68,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); WaitInputVarGenerated(); std::vector inputs = this->Inputs(); diff --git 
a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc index 0d8f71a7555ec..75baf15dc5ec9 100644 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_XPU) namespace paddle { diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index d058949ec6a19..905f71d698464 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -18,14 +18,15 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); if (places_.size() == 1) return; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index c760e7a98614c..b8def1f174647 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -128,7 +128,8 @@ void EagerDeletionOpHandle::RunImpl() { CallOnce(); } - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); std::deque> garbages; for (size_t i = 0; i < var_infos_.size(); ++i) { auto *var_info = var_infos_[i]; diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 75998e4582e2b..1cf69a1a3d652 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/details/fetch_async_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -65,7 +65,8 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { VLOG(3) << "enter FastThreadedSSAGraphExecutor Run"; std::unique_ptr event( - new platform::RecordEvent("FastThreadedSSAGraphExecutorPrepare")); + new platform::RecordEvent("FastThreadedSSAGraphExecutorPrepare", + 
platform::TracerEventType::UserDefined, 2)); std::unique_ptr>> op_deps = atomic_op_deps_.get(); PrepareAtomicOpDeps(); diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 69c39acc5fe59..8af22628f38d1 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -190,7 +191,8 @@ void FetchAsyncOpHandle::FetchMergedLodTensor( } void FetchAsyncOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); WaitInputVarGenerated(true); // get src vars diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 60e58fafa4198..a79f238e3bc8b 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -16,7 +16,7 @@ #include -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -128,7 +128,8 @@ static void TransData(const framework::LoDTensor &src_item, } void FetchOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); WaitInputVarGenerated(platform::CPUPlace()); tensors_.resize(inputs_.size()); diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 8f76de2393eaa..f524f89ce650f 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/device_memory_aligment.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DEFINE_bool(skip_fused_all_reduce_check, false, ""); DECLARE_bool(allreduce_record_one_event); @@ -68,7 +68,8 @@ FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { } void FusedAllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); VLOG(4) << this->DebugString(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc index 51ed1ca01b660..d10a1f4ee10b5 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc @@ -15,14 +15,15 @@ #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { namespace details { void FusedBroadcastOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); if (places_.size() == 1UL) return; diff 
--git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 4df42a7d93d19..ac5e0f10ff8ee 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" PADDLE_DEFINE_EXPORTED_bool( cpu_deterministic, false, @@ -46,7 +46,8 @@ void ReduceOpHandle::Wait( } void ReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); if (places_.size() == 1) return; // the input and output may have dummy var. diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 8d61a103f98be..d45252e4590c4 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -30,7 +30,8 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc, place_(place) {} void RPCOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); for (auto *in : inputs_) { auto &p = static_cast(in)->place(); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 3d877dbbde248..23e96875daa3d 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -16,7 +16,7 @@ #include -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace pten { class DenseTensor; @@ -88,7 +88,8 @@ std::string ScaleLossGradOpHandle::LossGradName() const { } void ScaleLossGradOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true); } diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index f1983175bdf94..051f58da3931a 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -35,7 +35,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" From c78cf026883e112a647a63e41b6ea3f1c65caffa Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 17 Feb 2022 22:02:24 +0800 Subject: [PATCH 06/12] update record event interface using --- paddle/fluid/framework/details/scope_buffered_monitor.cc | 8 +++++--- .../details/scope_buffered_ssa_graph_executor.cc | 8 +++++--- .../framework/details/sparse_all_reduce_op_handle.cc | 5 +++-- .../framework/details/threaded_ssa_graph_executor.cc | 5 +++-- paddle/fluid/framework/executor.cc | 1 + paddle/fluid/framework/ir/cost_model.h | 1 + paddle/fluid/framework/new_executor/interpretercore.cc | 8 ++++++-- .../new_executor/workqueue/nonblocking_threadpool.h | 3 ++- paddle/fluid/framework/parallel_executor.cc | 1 + paddle/fluid/framework/tensor_util.cc | 2 +- paddle/fluid/imperative/basic_engine.cc | 5 +++-- paddle/fluid/imperative/gradient_accumulator.cc | 2 +- paddle/fluid/imperative/layer.cc | 5 +++-- paddle/fluid/imperative/partial_grad_engine.cc | 2 +- paddle/fluid/imperative/tracer.cc | 4 +++- paddle/fluid/inference/api/analysis_predictor.cc | 1 + paddle/fluid/inference/api/api_impl.h | 2 +- paddle/fluid/inference/tensorrt/plugin/trt_plugin.h | 2 +- paddle/fluid/inference/tests/api/tester_helper.h | 2 +- 19 files changed, 43 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 2efe1c9555857..c3b53e8770a15 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/scope_buffered_monitor.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -91,7 +91,8 @@ void ScopeBufferedMonitor::Apply(const std::function &callback, bool has_fetch) { std::unique_ptr pre_local_exec_scopes_event( new platform::RecordEvent( - "ScopeBufferedMonitor::pre_local_exec_scopes_process")); + "ScopeBufferedMonitor::pre_local_exec_scopes_process", + platform::TracerEventType::UserDefined, 2)); for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { pre_local_exec_scopes_.at(scope_id).clear(); auto scopes = local_exec_scopes_.at(scope_id)->kids(); @@ -105,7 +106,8 @@ void ScopeBufferedMonitor::Apply(const std::function &callback, std::unique_ptr post_local_exec_scopes_event( new platform::RecordEvent( - "ScopeBufferedMonitor::post_local_exec_scopes_process")); + "ScopeBufferedMonitor::post_local_exec_scopes_process", + platform::TracerEventType::UserDefined, 2)); for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { post_local_exec_scopes_.at(scope_id).clear(); auto scopes = local_exec_scopes_.at(scope_id)->kids(); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 5d271d06b6922..d49630129757b 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -75,7 +75,8 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run( #endif if (drop_scope_counter_ == 0) { - platform::RecordEvent e("InitLocalVars"); + platform::RecordEvent e("InitLocalVars", + platform::TracerEventType::UserDefined, 2); InitVariables(); } @@ -164,7 +165,8 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { } void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) { - platform::RecordEvent drop_scope_event("DropLocalExeScopes"); + platform::RecordEvent drop_scope_event( + "DropLocalExeScopes", platform::TracerEventType::UserDefined, 2); drop_scope_counter_ = 0; if (need_wait) { for (auto &p : places_) { diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index f0de723c20b74..dee8e3718b262 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -25,7 +25,7 @@ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DECLARE_bool(sync_nccl_allreduce); @@ -66,7 +66,8 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( } void SparseAllReduceOpHandle::RunImplEncoded() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); diff --git 
a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c8a6cd25f0fcb..39683c9a0d868 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined PADDLE_WITH_PSCORE #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" @@ -56,7 +56,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( inline FetchResultType ThreadedSSAGraphExecutor::RunImpl( const std::vector &fetch_tensors, bool return_merged) { std::unique_ptr event( - new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare")); + new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", + platform::TracerEventType::UserDefined, 2)); std::unique_ptr op_deps = op_deps_futures_.get(); CopyOpDeps(); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 4e6a4d5360860..48850d4624a14 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/framework/ir/cost_model.h b/paddle/fluid/framework/ir/cost_model.h index 41567df2cb332..20d9be7e95c39 100644 --- a/paddle/fluid/framework/ir/cost_model.h +++ b/paddle/fluid/framework/ir/cost_model.h @@ -26,6 +26,7 @@ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/variant.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 8a5ec83b8b364..ca3ae7b6e5730 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -391,7 +391,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { : global_scope_->GetMutableScope(); auto op_with_kernel = dynamic_cast(op); { - platform::RecordEvent infershape_event("InferShape"); + platform::RecordEvent infershape_event( + "InferShape", platform::TracerEventType::OperatorInner, 1, + platform::EventRole::kInnerOp); // If it is OperatorBase, InferShape do nothing. 
if (op_with_kernel != nullptr) op_with_kernel->Info().infer_shape_( @@ -411,7 +413,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { } } { - platform::RecordEvent compute_event("Compute"); + platform::RecordEvent compute_event( + "Compute", platform::TracerEventType::OperatorInner, 1, + platform::EventRole::kInnerOp); if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); } else { diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 2ad76562c15dd..7b3916bafc93e 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -408,7 +408,8 @@ class ThreadPoolTempl { ec_.Notify(true); return false; } - platform::RecordEvent("SleepWaitForWork"); + platform::RecordEvent("SleepWaitForWork", + platform::TracerEventType::UserDefined, 2); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1a826f6bdd5e7..5b913ff2d21de 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -39,6 +39,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e510257c6106b..40fbca7a99fdb 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 973541e6dcc1b..75be0171970d3 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -29,7 +29,7 @@ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(sort_sum_gradient); @@ -410,7 +410,8 @@ void BasicEngine::Execute() { auto& inplace_grad_name_map = shared_cur_node->InplaceGradNameMap(); for (auto& cur_op : *shared_cur_node) { - platform::RecordEvent op_type_record_event(cur_op.Type()); + platform::RecordEvent op_type_record_event( + cur_op.Type(), platform::TracerEventType::Operator, 1); ++op_num; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 17ab1f1f7c53f..c5c999aaae08a 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_XPU #include "xpu/refactor/math.h" diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index ed455b7fd0314..ab42af9c6cbef 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/imperative/var_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -233,7 +233,8 @@ void VarBase::ClearGradient(bool set_to_zero) { grad_t->mutable_value()->clear(); } } else { - platform::RecordEvent record_event("ClearGradient"); + platform::RecordEvent record_event( + "ClearGradient", platform::TracerEventType::UserDefined, 2); auto* grad_t = grad_var_->MutableVar()->GetMutable(); if (grad_t->IsInitialized()) { diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index fcadd0046fe2c..57746035eb790 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -30,7 +30,7 @@ #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/pten/kernels/funcs/math_function.h" diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index a600720ef78ed..859075ed5e007 100644 --- a/paddle/fluid/imperative/tracer.cc +++ 
b/paddle/fluid/imperative/tracer.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" DECLARE_bool(use_mkldnn); @@ -169,7 +170,8 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { - platform::RecordEvent op_type_record_event(type); + platform::RecordEvent op_type_record_event( + type, platform::TracerEventType::Operator, 2); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7e4da57e9e7df..d956aa53911e9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -46,6 +46,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/pten/api/ext/op_meta_info.h" #ifdef PADDLE_WITH_MKLML diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index f0ce652beae11..74f2482c823b0 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -30,7 +30,7 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 6b2925a068bbd..9210cd48d078b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -24,7 +24,7 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace nvinfer1 { class ITensor; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 0757f5c505c61..38a4d3ae32b7e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -38,7 +38,7 @@ #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/utils/benchmark.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); From bdf756439c6c04cab3c7b38bec2d832a45261da3 Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 18 Feb 2022 14:50:21 +0800 Subject: [PATCH 07/12] update operator.cc --- paddle/fluid/framework/operator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e8fbe54b34c74..242bc38cbdeeb 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ 
-263,7 +263,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // in order to record different op type cost time // and different op name cost time, we set two events. platform::RecordEvent op_type_record_event( - Type(), platform::TracerEventType::Operator, 1); + Type().c_str(), platform::TracerEventType::Operator, 1); auto op_name = platform::OpName(outputs_, Type()); platform::RecordEvent op_name_record_event( op_name, platform::TracerEventType::Operator, 1, From 8cd1182bd643bb08e095242fbc65677245a03f95 Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 18 Feb 2022 16:01:39 +0800 Subject: [PATCH 08/12] update part2 --- .../framework/details/all_reduce_op_handle.cc | 5 ++-- .../framework/details/broadcast_op_handle.cc | 5 ++-- .../details/fetch_async_op_handle.cc | 2 +- .../framework/details/fetch_op_handle.cc | 2 +- .../details/fused_all_reduce_op_handle.cc | 4 ++-- .../details/fused_broadcast_op_handle.cc | 4 ++-- .../framework/details/reduce_op_handle.cc | 4 ++-- .../fluid/framework/details/rpc_op_handle.cc | 4 ++-- .../details/sparse_all_reduce_op_handle.cc | 2 ++ .../framework/new_executor/interpretercore.cc | 11 +++++---- .../fluid/imperative/gradient_accumulator.cc | 24 ++++--------------- paddle/fluid/imperative/tracer.cc | 14 ++--------- 12 files changed, 30 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 796a5c9010035..50c544b39c163 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -68,9 +68,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); - + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); WaitInputVarGenerated(); std::vector<VarHandleBase *> inputs = this->Inputs(); std::vector<VarHandleBase *> outputs = this->Outputs(); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 905f71d698464..ff2e66082e69d 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -25,9 +25,8 @@ namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); - + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); if (places_.size() == 1) return; // The input and output may have dummy vars.
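The call sites above show the reworked RecordEvent argument order: the event name comes first, then the TracerEventType, then a verbosity level, with the EventRole left as a trailing defaulted argument. A minimal sketch of the scoped-usage pattern follows; DoCollective() is a hypothetical caller, and only the RecordEvent arguments mirror the op-handle call sites in this patch:

    // Sketch: scoped profiling with the reworked RecordEvent interface.
    // DoCollective() is hypothetical; the argument order (name, type,
    // level[, role]) follows the op-handle call sites above.
    #include "paddle/fluid/platform/profiler/event_tracing.h"

    void DoCollective() {
      // Communication-perspective event at level 1; it ends automatically
      // when record_event goes out of scope (RAII).
      paddle::platform::RecordEvent record_event(
          "DoCollective", paddle::platform::TracerEventType::Communication, 1);
      // ... collective communication work ...
    }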
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 8af22628f38d1..0731663de95f3 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -192,7 +192,7 @@ void FetchAsyncOpHandle::FetchMergedLodTensor( void FetchAsyncOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::Operator, 1); WaitInputVarGenerated(true); // get src vars diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index a79f238e3bc8b..a4de29c8387ae 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -129,7 +129,7 @@ static void TransData(const framework::LoDTensor &src_item, void FetchOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::Operator, 1); WaitInputVarGenerated(platform::CPUPlace()); tensors_.resize(inputs_.size()); diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index f524f89ce650f..f4ca4907d48d0 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -68,8 +68,8 @@ FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { } void FusedAllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); VLOG(4) << this->DebugString(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc index d10a1f4ee10b5..2490f3d6102da 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc @@ -22,8 +22,8 @@ namespace framework { namespace details { void FusedBroadcastOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); if (places_.size() == 1UL) return; diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index ac5e0f10ff8ee..b5b052f1a1f56 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -46,8 +46,8 @@ void ReduceOpHandle::Wait( } void ReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); if (places_.size() == 1) return; // the input and output may have dummy var. 
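Two conventions run through these hunks: op-handle execution gets a semantic event type (Communication here, Operator for the fetch handles) at level 1, while auxiliary bookkeeping keeps TracerEventType::UserDefined at level 2. Where an event has to close before its enclosing scope ends, the series owns it through a std::unique_ptr, as with the ThreadedSSAGraphExecutorPrepare event earlier; a sketch of that pattern, with placeholder Prepare() and Execute() functions standing in for the real work:

    // Sketch: closing a profiling event early via unique_ptr ownership.
    // Prepare() and Execute() are placeholders, not Paddle APIs.
    #include <memory>
    #include "paddle/fluid/platform/profiler/event_tracing.h"

    void Prepare();
    void Execute();

    void RunOnce() {
      std::unique_ptr<paddle::platform::RecordEvent> prepare_event(
          new paddle::platform::RecordEvent(
              "RunOncePrepare",
              paddle::platform::TracerEventType::UserDefined, 2));
      Prepare();
      prepare_event.reset();  // the event ends here, before the long Execute()
      Execute();
    }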
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index d45252e4590c4..39bcf1d0f385f 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -30,8 +30,8 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc, place_(place) {} void RPCOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); for (auto *in : inputs_) { auto &p = static_cast(in)->place(); diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index dee8e3718b262..d198eb1459288 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -280,6 +280,8 @@ bool SparseAllReduceOpHandle::IsEncoded() { } void SparseAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); if (!IsEncoded()) { AllReduceOpHandle::RunImpl(); return; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 4b578326c3c4b..ca3ae7b6e5730 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -16,13 +16,16 @@ #include #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" -#include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" -#include "paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h" +#include "paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.h" +#endif + PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, "Use inplace in new executor"); PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, @@ -727,12 +730,12 @@ void InterpreterCore::CheckGC(const Instruction& instr) { } else { static_cast(gc_.get())->Add( - var_scope.Var(var_id), &gc_event_.at(instr_id), + var_scope.Var(var_id), gc_event_.at(instr_id), &instr.DeviceContext()); } #else static_cast(gc_.get())->Add( - var_scope.Var(var_id), &gc_event_.at(instr_id), + var_scope.Var(var_id), gc_event_.at(instr_id), &instr.DeviceContext()); #endif } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 0e06e0f7ee447..c5c999aaae08a 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -24,7 +24,6 @@ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" @@ -301,10 +300,13 @@ void TensorAdd(const VarType& 
src, VarType* dst) { "should be equal, Otherwise, the calculation results " "will be incorrect.")); +#ifdef PADDLE_WITH_XPU // if src and dst are in different place, copy dst to src's place if (dst_tensor->place() != place) { paddle::framework::TensorCopySync(*dst_tensor, place, dst_tensor); } +#endif + #define PADDLE_TENSOR_ADD(cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ TensorAddFunctor func( \ @@ -384,9 +386,9 @@ void TensorAdd(const VarType& src, VarType* dst) { operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor); operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor); PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignAdd( - dev_ctx->cnnl_handle(), static_cast(&alpha), + dev_ctx->cnnl_handle(), static_cast(&alpha), src_tensor_desc.get(), operators::GetBasePtr(&src_tensor), nullptr, 0, - static_cast(&beta), dst_tensor_desc.get(), + static_cast(&beta), dst_tensor_desc.get(), operators::GetBasePtr(dst_tensor))); return; } @@ -421,22 +423,6 @@ void TensorAdd(const VarType& src, VarType* dst) { src_tensor, dst_tensor, place); } } - if (data_type == framework::proto::VarType::BF16) { - if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) - return TensorAddImpl( - src_tensor, dst_tensor, place); -#else - PADDLE_THROW(platform::errors::Unimplemented( - "Gradient accumulation of data type (%s) on place (%s) is not " - "supported in imperative mode", - framework::DataTypeToString(data_type), place)); -#endif - } else if (platform::is_cpu_place(place)) { - return TensorAddImpl( - src_tensor, dst_tensor, place); - } - } PADDLE_THROW(platform::errors::Unimplemented( "Gradient accumulation of data type (%s) on place (%s) is not " "supported in imperative mode", diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index f3828d560dc38..859075ed5e007 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -36,8 +36,6 @@ thread_local bool Tracer::has_grad_ = true; thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; -thread_local pten::DataType Tracer::amp_dtype_ = pten::DataType::FLOAT32; - static std::shared_ptr g_current_tracer(nullptr); const std::shared_ptr& GetCurrentTracer() { return g_current_tracer; } @@ -204,18 +202,10 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, NameVarMap new_ins = ins; if (amp_level_ == AmpLevel::O1) { VLOG(5) << "Auto mixed precision run operator: " << type; - if (amp_dtype_ == pten::DataType::FLOAT16) { - new_ins = AutoCastInputs(type, ins); - } else if (amp_dtype_ == pten::DataType::BFLOAT16) { - new_ins = AutoCastBF16Inputs(type, ins); - } + new_ins = AutoCastInputs(type, ins); } else if (amp_level_ == AmpLevel::O2) { VLOG(5) << "Pure fp16 run operator: " << type; - if (amp_dtype_ == pten::DataType::FLOAT16) { - new_ins = CastPureFp16Inputs(type, ins); - } else if (amp_dtype_ == pten::DataType::BFLOAT16) { - new_ins = CastPureBf16Inputs(type, ins); - } + new_ins = CastPureFp16Inputs(type, ins); } try { From d915a6077431ea05e8efdcaa9c6f482b1b38312a Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 18 Feb 2022 16:02:09 +0800 Subject: [PATCH 09/12] update part1 --- paddle/fluid/framework/details/all_reduce_op_handle.cc | 5 ++--- paddle/fluid/framework/details/broadcast_op_handle.cc | 5 ++--- paddle/fluid/framework/details/fetch_async_op_handle.cc | 2 +- paddle/fluid/framework/details/fetch_op_handle.cc | 2 +- paddle/fluid/framework/details/fused_all_reduce_op_handle.cc | 4 ++-- paddle/fluid/framework/details/fused_broadcast_op_handle.cc | 4 
++-- paddle/fluid/framework/details/reduce_op_handle.cc | 4 ++-- paddle/fluid/framework/details/rpc_op_handle.cc | 4 ++-- 8 files changed, 14 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 796a5c9010035..50c544b39c163 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -68,9 +68,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); - + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); WaitInputVarGenerated(); std::vector inputs = this->Inputs(); std::vector outputs = this->Outputs(); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 905f71d698464..ff2e66082e69d 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -25,9 +25,8 @@ namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); - + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); if (places_.size() == 1) return; // The input and output may have dummy vars. diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 8af22628f38d1..0731663de95f3 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -192,7 +192,7 @@ void FetchAsyncOpHandle::FetchMergedLodTensor( void FetchAsyncOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::Operator, 1); WaitInputVarGenerated(true); // get src vars diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index a79f238e3bc8b..a4de29c8387ae 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -129,7 +129,7 @@ static void TransData(const framework::LoDTensor &src_item, void FetchOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::Operator, 1); WaitInputVarGenerated(platform::CPUPlace()); tensors_.resize(inputs_.size()); diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index f524f89ce650f..f4ca4907d48d0 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -68,8 +68,8 @@ FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { } void FusedAllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); VLOG(4) << this->DebugString(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc index d10a1f4ee10b5..2490f3d6102da 100644 
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc @@ -22,8 +22,8 @@ namespace framework { namespace details { void FusedBroadcastOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); if (places_.size() == 1UL) return; diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index ac5e0f10ff8ee..b5b052f1a1f56 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -46,8 +46,8 @@ void ReduceOpHandle::Wait( } void ReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); if (places_.size() == 1) return; // the input and output may have dummy var. diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index d45252e4590c4..39bcf1d0f385f 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -30,8 +30,8 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc, place_(place) {} void RPCOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); for (auto *in : inputs_) { auto &p = static_cast(in)->place(); From 5b45b6aac5b26af4c97e90b1513d6982c6017498 Mon Sep 17 00:00:00 2001 From: chenjian Date: Sat, 19 Feb 2022 14:20:00 +0800 Subject: [PATCH 10/12] fix include profiler.h header in ps server --- paddle/fluid/distributed/ps/service/brpc_ps_server.cc | 2 +- paddle/fluid/distributed/ps/service/graph_brpc_server.cc | 2 +- paddle/fluid/distributed/ps/service/heter_server.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc index fca7f8ac67419..747b0cbb325d0 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler.h" namespace google { namespace protobuf { diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 9dcda6f603202..441f489fb3097 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -21,7 +21,7 @@ #include "iomanip" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/framework/archive.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h index 8bf21ade1c217..a14fb5f6cc04a 100644 --- 
a/paddle/fluid/distributed/ps/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN -#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler.h" namespace google { namespace protobuf { From f3392f8f62b8bcdd79eed1bcd051560a365b4210 Mon Sep 17 00:00:00 2001 From: chenjian Date: Sat, 19 Feb 2022 14:28:32 +0800 Subject: [PATCH 11/12] fix include profiler.h header in ps server --- .../fluid/distributed/ps/service/communicator/communicator.cc | 2 +- paddle/fluid/distributed/ps/service/heter_client.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index c4e1f5e1f5f2c..6dbe8ba3a0b90 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" #define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index 65472f1003bca..d6287cda6d443 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/split.h" DECLARE_int32(rpc_deadline); From d79be472c8fa8eceeb44e7bacd156a8a361bc136 Mon Sep 17 00:00:00 2001 From: chenjian Date: Sat, 19 Feb 2022 15:53:36 +0800 Subject: [PATCH 12/12] fix profiler.h header --- .../framework/new_executor/interpretercore.cc | 11 ++++----- .../fluid/imperative/gradient_accumulator.cc | 24 +++++++++++++++---- paddle/fluid/imperative/tracer.cc | 14 +++++++++-- .../fluid/inference/api/analysis_predictor.cc | 1 - 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index ca3ae7b6e5730..4b578326c3c4b 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -16,16 +16,13 @@ #include #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" -#include "paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.h" +#include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" +#include "paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
-#include "paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.h" -#endif - PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, "Use inplace in new executor"); PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, @@ -730,12 +727,12 @@ void InterpreterCore::CheckGC(const Instruction& instr) { } else { static_cast(gc_.get())->Add( - var_scope.Var(var_id), gc_event_.at(instr_id), + var_scope.Var(var_id), &gc_event_.at(instr_id), &instr.DeviceContext()); } #else static_cast(gc_.get())->Add( - var_scope.Var(var_id), gc_event_.at(instr_id), + var_scope.Var(var_id), &gc_event_.at(instr_id), &instr.DeviceContext()); #endif } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 9aa54a0f891ab..168923e819daa 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" @@ -300,13 +301,10 @@ void TensorAdd(const VarType& src, VarType* dst) { "should be equal, Otherwise, the calculation results " "will be incorrect.")); -#ifdef PADDLE_WITH_XPU // if src and dst are in different place, copy dst to src's place if (dst_tensor->place() != place) { paddle::framework::TensorCopySync(*dst_tensor, place, dst_tensor); } -#endif - #define PADDLE_TENSOR_ADD(cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ TensorAddFunctor func( \ @@ -386,9 +384,9 @@ void TensorAdd(const VarType& src, VarType* dst) { operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor); operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor); PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignAdd( - dev_ctx->cnnl_handle(), static_cast(&alpha), + dev_ctx->cnnl_handle(), static_cast(&alpha), src_tensor_desc.get(), operators::GetBasePtr(&src_tensor), nullptr, 0, - static_cast(&beta), dst_tensor_desc.get(), + static_cast(&beta), dst_tensor_desc.get(), operators::GetBasePtr(dst_tensor))); return; } @@ -423,6 +421,22 @@ void TensorAdd(const VarType& src, VarType* dst) { src_tensor, dst_tensor, place); } } + if (data_type == framework::proto::VarType::BF16) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) + return TensorAddImpl( + src_tensor, dst_tensor, place); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); +#endif + } else if (platform::is_cpu_place(place)) { + return TensorAddImpl( + src_tensor, dst_tensor, place); + } + } PADDLE_THROW(platform::errors::Unimplemented( "Gradient accumulation of data type (%s) on place (%s) is not " "supported in imperative mode", diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 859075ed5e007..f3828d560dc38 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -36,6 +36,8 @@ thread_local bool Tracer::has_grad_ = true; thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; +thread_local pten::DataType Tracer::amp_dtype_ = pten::DataType::FLOAT32; + static std::shared_ptr g_current_tracer(nullptr); const std::shared_ptr& GetCurrentTracer() { 
return g_current_tracer; } @@ -202,10 +204,18 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, NameVarMap new_ins = ins; if (amp_level_ == AmpLevel::O1) { VLOG(5) << "Auto mixed precision run operator: " << type; - new_ins = AutoCastInputs(type, ins); + if (amp_dtype_ == pten::DataType::FLOAT16) { + new_ins = AutoCastInputs(type, ins); + } else if (amp_dtype_ == pten::DataType::BFLOAT16) { + new_ins = AutoCastBF16Inputs(type, ins); + } } else if (amp_level_ == AmpLevel::O2) { VLOG(5) << "Pure fp16 run operator: " << type; - new_ins = CastPureFp16Inputs(type, ins); + if (amp_dtype_ == pten::DataType::FLOAT16) { + new_ins = CastPureFp16Inputs(type, ins); + } else if (amp_dtype_ == pten::DataType::BFLOAT16) { + new_ins = CastPureBf16Inputs(type, ins); + } } try { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d956aa53911e9..7e4da57e9e7df 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -46,7 +46,6 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/pten/api/ext/op_meta_info.h" #ifdef PADDLE_WITH_MKLML
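After patch 12 the headers and the BF16/AMP code paths are back in their final form, and the profiling interface settles on (name, type, level, role). As a closing sketch, this is how an operator's phases are marked per the interpretercore.cc call sites earlier in the series; RunOneOp() itself is illustrative, not a Paddle function:

    // Sketch: phase-level events inside an operator, mirroring the
    // InferShape/Compute call sites in interpretercore.cc.
    #include "paddle/fluid/platform/profiler/event_tracing.h"

    void RunOneOp() {
      {
        // Inner-op event: OperatorInner type, level 1, kInnerOp role.
        paddle::platform::RecordEvent infershape_event(
            "InferShape", paddle::platform::TracerEventType::OperatorInner, 1,
            paddle::platform::EventRole::kInnerOp);
        // ... shape inference ...
      }
      {
        paddle::platform::RecordEvent compute_event(
            "Compute", paddle::platform::TracerEventType::OperatorInner, 1,
            paddle::platform::EventRole::kInnerOp);
        // ... kernel dispatch and launch ...
      }
    }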