Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into zyf_slice

zyfncg committed Aug 31, 2021
2 parents 6666c94 + 561841d commit 29c3e5b
Showing 87 changed files with 6,460 additions and 1,108 deletions.
6 changes: 3 additions & 3 deletions paddle/fluid/framework/details/build_strategy.h
@@ -113,15 +113,15 @@ struct BuildStrategy {
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// should not be sparse types
paddle::optional<bool> fuse_all_optimizer_ops_{false};
paddle::optional<bool> fuse_all_reduce_ops_{boost::none};
paddle::optional<bool> fuse_all_reduce_ops_{paddle::none};
// fuse_relu_depthwise_conv can fuse the `relu ->
// depthwise_conv`
bool fuse_relu_depthwise_conv_{false};
// NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
// faster. Because fusing broadcast OP equals delaying the execution of all
// broadcast Ops, in this case, all nccl streams are used only for reduce
// operations for a period of time.
paddle::optional<bool> fuse_broadcast_ops_{boost::none};
paddle::optional<bool> fuse_broadcast_ops_{paddle::none};
// replace batch_norm with sync_batch_norm.
bool sync_batch_norm_{false};

@@ -135,7 +135,7 @@ struct BuildStrategy {
// By default, memory_optimize would be opened if gc is disabled, and
// be closed if gc is enabled.
// Users can forcely enable/disable memory_optimize by setting True/False.
paddle::optional<bool> memory_optimize_{boost::none};
paddle::optional<bool> memory_optimize_{paddle::none};

// Turn on inplace by default.
bool enable_inplace_{true};
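The three hunks above only swap `boost::none` for `paddle::none` as the empty state of `paddle::optional<bool>`. Below is a minimal sketch of the tri-state flag pattern these members rely on, assuming `paddle::optional` mirrors the familiar `boost::optional` interface and that `paddle/utils/optional.h` is the header providing it:

```cpp
#include <iostream>
#include "paddle/utils/optional.h"  // assumed location of paddle::optional

int main() {
  // Unset by default: neither true nor false, so callers can apply a policy.
  paddle::optional<bool> fuse_all_reduce_ops{paddle::none};

  if (!fuse_all_reduce_ops) {
    std::cout << "flag not set; pick a default from the other strategy bits\n";
  }

  fuse_all_reduce_ops = true;  // a user explicitly enables the fusion
  if (fuse_all_reduce_ops && *fuse_all_reduce_ops) {
    std::cout << "fusion explicitly enabled\n";
  }
  return 0;
}
```

The tri-state form matters because, as the comments in the hunk note, memory_optimize defaults on only when GC is disabled; an unset optional lets the framework distinguish "user said nothing" from "user said false".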
6 changes: 3 additions & 3 deletions paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2249,9 +2249,9 @@ PDNode *patterns::MultipleQuantize::operator()() {
PDNode *patterns::QuantizePlacement::operator()(
const std::unordered_set<std::string> &quantize_enabled_op_types) {
std::unordered_set<std::string> supported_op_types =
std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
"fc", "matmul", "pool2d", "prior_box",
"reshape2", "transpose2", "fusion_gru"});
std::unordered_set<std::string>(
{"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d",
"prior_box", "reshape2", "transpose2", "fusion_gru", "multi_gru"});
if (!quantize_enabled_op_types.empty()) {
supported_op_types = quantize_enabled_op_types;
}
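The hunk adds "multi_gru" to the default whitelist and reflows the initializer. The override rule itself, where a non-empty user-supplied set replaces the defaults wholesale rather than intersecting with them, can be sketched standalone (illustrative only; `ResolveQuantizableOps` is not a function in the codebase):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>

// Illustrative helper mirroring QuantizePlacement's override rule above;
// not part of the actual pass.
std::unordered_set<std::string> ResolveQuantizableOps(
    const std::unordered_set<std::string>& quantize_enabled_op_types) {
  std::unordered_set<std::string> supported_op_types = {
      "concat",     "conv2d",     "elementwise_add", "fc",
      "matmul",     "pool2d",     "prior_box",       "reshape2",
      "transpose2", "fusion_gru", "multi_gru"};
  // A non-empty user list replaces the defaults entirely.
  if (!quantize_enabled_op_types.empty()) {
    supported_op_types = quantize_enabled_op_types;
  }
  return supported_op_types;
}

int main() {
  for (const auto& op : ResolveQuantizableOps({"conv2d", "pool2d"})) {
    std::cout << op << "\n";  // prints only conv2d and pool2d
  }
  return 0;
}
```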
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -832,7 +832,7 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern);
GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern);

if (!AreScalesPresentForNodes({x, weight_h, weight_x})) {
if (!AreScalesPresentForNodes({x, weight_x})) {
LogCannotQuantizeOp(op);
return;
}
33 changes: 26 additions & 7 deletions paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -758,7 +758,9 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w,
Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b,
Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out) {
Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out,
Node* softmax_qk, Node* eltadd0, Node* eltadd1, Node* eltadd2,
Node* matmul_qk) {
auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale"));

// mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H)
@@ -876,19 +878,35 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
weight_max = std::max(weight_max, weight_scale2);
multihead_op_desc.SetAttr("weight_scale", weight_max);

if (mul0_op_desc->HasAttr("out_threshold")) {
auto* add0_op_desc = eltadd0->Op();
auto* add1_op_desc = eltadd1->Op();
auto* add2_op_desc = eltadd2->Op();
if (add0_op_desc->HasAttr("out_threshold")) {
auto out_scale0 =
BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold"));
BOOST_GET_CONST(float, add0_op_desc->GetAttr("out_threshold"));
auto out_scale1 =
BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold"));
BOOST_GET_CONST(float, add1_op_desc->GetAttr("out_threshold"));
auto out_scale2 =
BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold"));
BOOST_GET_CONST(float, add2_op_desc->GetAttr("out_threshold"));
auto out_scale_max = std::max(out_scale0, out_scale1);
out_scale_max = std::max(out_scale_max, out_scale2);
multihead_op_desc.SetAttr("out_threshold", out_scale_max);
multihead_op_desc.SetAttr("fc_out_threshold", out_scale_max);
}
}

auto* softmax_qk_op_desc = softmax_qk->Op();
auto* matmul_qk_op_desc = matmul_qk->Op();
if (matmul_qk_op_desc->HasAttr("X_scale")) {
multihead_op_desc.SetAttr("qkv2context_plugin_int8", true);
if (softmax_qk_op_desc->HasAttr("out_threshold")) {
auto qkv_plugin_scale = BOOST_GET_CONST(
float, softmax_qk_op_desc->GetAttr("out_threshold"));
multihead_op_desc.SetAttr("dp_probs", qkv_plugin_scale);
}
} else {
multihead_op_desc.SetAttr("qkv2context_plugin_int8", false);
}

auto* multihead = graph->CreateOpNode(&multihead_op_desc);

IR_NODE_LINK_TO(input0, multihead);
@@ -990,7 +1008,8 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
}
fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
reshape2_0, reshape2_qkv_out, scale, scale_out);
reshape2_0, reshape2_qkv_out, scale, scale_out, softmax_qk,
eltadd0, eltadd1, eltadd2, matmul_qk);

std::unordered_set<const Node*> marked_nodes({eltadd0,
eltadd1,
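The new nodes plumbed through `fuse_creater` drive an int8-plugin decision: if the QK matmul carries an `X_scale` attribute, the fused op is marked `qkv2context_plugin_int8` and the softmax's `out_threshold` is forwarded as `dp_probs`. A stand-alone sketch of that branch, using a plain map in place of the real `OpDesc` attribute API (only the attribute keys come from the diff; the helper and types are hypothetical):

```cpp
#include <iostream>
#include <map>
#include <string>

// Hypothetical attribute bag standing in for OpDesc.
using AttrMap = std::map<std::string, float>;

void DecideInt8Plugin(const AttrMap& matmul_qk_attrs,
                      const AttrMap& softmax_qk_attrs, AttrMap* fused_attrs) {
  if (matmul_qk_attrs.count("X_scale")) {
    (*fused_attrs)["qkv2context_plugin_int8"] = 1.0f;
    if (softmax_qk_attrs.count("out_threshold")) {
      // Forward the softmax output scale to the plugin as dp_probs.
      (*fused_attrs)["dp_probs"] = softmax_qk_attrs.at("out_threshold");
    }
  } else {
    (*fused_attrs)["qkv2context_plugin_int8"] = 0.0f;
  }
}

int main() {
  AttrMap fused;
  DecideInt8Plugin({{"X_scale", 0.5f}}, {{"out_threshold", 0.25f}}, &fused);
  std::cout << fused["dp_probs"] << "\n";  // 0.25
  return 0;
}
```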
2 changes: 1 addition & 1 deletion paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -2,7 +2,7 @@ cc_library(workqueue SRCS workqueue.cc)
cc_library(interpretercore SRCS interpretercore.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor workqueue device_event device_event_gpu)
graph_to_program_pass variable_helper timer monitor workqueue ${DEVICE_EVENT_LIBS})
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
76 changes: 28 additions & 48 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -12,16 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/interpretercore.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/new_executor/interpretercore_gc_helper.h"

#if defined(PADDLE_WITH_CUDA)
using ::paddle::platform::kCUDA;
USE_EVENT(kCUDA);
#endif

#include <unordered_set>

#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/new_executor/interpretercore_gc_helper.h"

namespace paddle {
namespace framework {

@@ -74,27 +70,26 @@ std::vector<size_t> ParseEventVarIds(const Instruction& cur_instr,
}

void AssociateInputWithEvents(
const std::vector<size_t>& new_event_var_id, Instruction* next_instr,
std::map<size_t, std::shared_ptr<platform::CudaEvent>>* var_id2event,
const platform::Place& place, const std::vector<size_t>& new_event_var_id,
Instruction* next_instr,
std::map<size_t, std::shared_ptr<platform::DeviceEvent>>* var_id2event,
bool is_sync) {
#ifdef PADDLE_WITH_CUDA
for (auto var_id : new_event_var_id) {
if (var_id2event->count(var_id) == 0) {
auto cuda_event = std::make_shared<platform::CudaEvent>(
platform::get_cuda_flags(false, false, false));
var_id2event->emplace(var_id, std::move(cuda_event));
auto device_event = std::make_shared<platform::DeviceEvent>(
place, platform::GenerateDeviceEventFlag());
var_id2event->emplace(var_id, std::move(device_event));
}
// Add events for next_instr.inputs
next_instr->intput_events_.emplace_back(var_id, var_id2event->at(var_id),
is_sync);
}
#endif
}

void ParseDirectAndEventRunOps(
const std::vector<OpFuncNode>& op_func_nodes,
const platform::Place& place, const std::vector<OpFuncNode>& op_func_nodes,
const std::vector<size_t>& downstream_ops, size_t op_index,
std::map<size_t, std::shared_ptr<platform::CudaEvent>>* var_id2event,
std::map<size_t, std::shared_ptr<platform::DeviceEvent>>* var_id2event,
std::vector<Instruction>* instructions) {
auto& op_func_type = op_func_nodes[op_index].type_;
auto& cur_instr = instructions->at(op_index);
@@ -119,24 +114,22 @@ void ParseDirectAndEventRunOps(

bool is_sync =
(op_func_nodes[next_op_id].type_ == OpFuncType::kQueueSync);
AssociateInputWithEvents(new_event_var_ids, &next_instr, var_id2event,
is_sync);
AssociateInputWithEvents(place, new_event_var_ids, &next_instr,
var_id2event, is_sync);

if (is_sync) { // GPU -> CPU
next_instruction.synchronize_run_.emplace_back(next_op_id);
} else { // GPU -> GPU(different stream)
next_instruction.event_wait_run_.emplace_back(next_op_id);
}
}
#ifdef PADDLE_WITH_CUDA
// Create events for these cross-stream vars
VLOG(3) << cur_instr.kernel_func_.operator_base_->Type()
<< " event_var_ids.size: " << event_var_ids.size();
for (auto var_id : event_var_ids) {
cur_instr.output_events_.emplace_back(var_id, var_id2event->at(var_id),
false /*not used*/);
}
#endif
}
}
} // namespace
@@ -263,12 +256,10 @@ void InterpreterCore::Convert() {
}

for (size_t i = 0; i < vec_instruction_.size(); ++i) {
#if defined(PADDLE_WITH_CUDA)
int device_type = static_cast<int>(paddle::platform::DeviceType::CUDA);
paddle::platform::DeviceOption dev_opt(
device_type, BOOST_GET_CONST(platform::CUDAPlace, place_).device);
gc_event_.emplace_back(dev_opt);
#endif
// int device_type = static_cast<int>(paddle::platform::DeviceType::CUDA);
// paddle::platform::DeviceOption dev_opt(
// device_type, BOOST_GET_CONST(platform::CUDAPlace, place_).device);
gc_event_.emplace_back(place_);

std::vector<size_t> vec_temp;
for (auto& item : vec_instruction_[i].output_index_) {
@@ -287,8 +278,8 @@
}
}

ParseDirectAndEventRunOps(vec_func_list_, filter_next, i, &var_id2event_,
&vec_instruction_);
ParseDirectAndEventRunOps(place_, vec_func_list_, filter_next, i,
&var_id2event_, &vec_instruction_);

// checkout ouput
for (auto& item : vec_instruction_[i].output_index_) {
@@ -466,7 +457,7 @@ void InterpreterCore::CheckGC(size_t instr_id,
#if defined(PADDLE_WITH_CUDA)
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
gc_event_[instr_id].Record(place, dev_ctx);
gc_event_[instr_id].Record(dev_ctx);
gc_queue_->AddTask(
[ container = garbages_.release(), event = &gc_event_[instr_id] ]() {
while (!event->Query()) {
@@ -483,7 +474,7 @@
#if defined(PADDLE_WITH_CUDA)
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
gc_event_[instr_id].Record(place, dev_ctx);
gc_event_[instr_id].Record(dev_ctx);
gc_queue_->AddTask(
[ container = garbages_.release(), event = &gc_event_[instr_id] ]() {
while (!event->Query()) {
@@ -857,34 +848,23 @@ void InterpreterCore::RecordEventInstruction(const Instruction& instruction,
// If InterpreterCore in on CPUPlace, do nothing.
if (platform::is_cpu_place(place_)) return;

#ifdef PADDLE_WITH_CUDA
const platform::CUDADeviceContext* dev_ctx =
reinterpret_cast<const platform::CUDADeviceContext*>(
instruction.dev_ctx_);
for (auto& event : instruction.output_events_) {
VLOG(3) << "Record event in out_var_id: " << event.var_id_;
event.event_->Record(*(dev_ctx->context()->Stream()));
event.event_->Record(instruction.dev_ctx_);
}
#endif
}

void InterpreterCore::WaitOrSync(const std::vector<EventInter>& events,
const platform::DeviceContext* dev_ctx) {
#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx =
reinterpret_cast<const platform::CUDADeviceContext*>(dev_ctx);

for (auto& event : events) {
if (event.is_sync_) {
VLOG(3) << "host sync wait in_var_id " << event.var_id_;
event.event_->Synchronize();
for (auto& event_iter : events) {
if (event_iter.is_sync_) {
VLOG(3) << "host sync wait in_var_id " << event_iter.var_id_;
event_iter.event_->Wait(platform::kCPU, dev_ctx);
} else {
VLOG(3) << "stream async wait in_var_id " << event.var_id_;
cuda_dev_ctx->context()->Stream()->WaitEvent(
event.event_->GetRawCudaEvent());
VLOG(3) << "stream async wait in_var_id " << event_iter.var_id_;
event_iter.event_->Wait(platform::kCUDA, dev_ctx);
}
}
#endif
}

void InterpreterCore::StreamWaitEventOrSync(const Instruction& instruction) {
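Taken together, these hunks replace the CUDA-only `CudaEvent` path with the place-agnostic `DeviceEvent` API. A condensed sketch of the producer/consumer pattern the interpreter now follows; only the `DeviceEvent` calls visible in the diff (`GenerateDeviceEventFlag`, `Record`, `Wait`) are used, and the surrounding setup is assumed rather than taken from the commit:

```cpp
#include <memory>
#include "paddle/fluid/platform/device_event.h"

namespace plat = paddle::platform;

// Sketch only: one event guards one cross-stream variable, as in
// AssociateInputWithEvents above.
void SketchCrossStreamSync(const plat::Place& place,
                           const plat::DeviceContext* producer_ctx,
                           const plat::DeviceContext* consumer_ctx,
                           bool consumer_on_cpu) {
  auto event = std::make_shared<plat::DeviceEvent>(
      place, plat::GenerateDeviceEventFlag());

  // Producer records the event on its stream after writing the variable.
  event->Record(producer_ctx);

  // Consumer either synchronizes the host (GPU -> CPU) or makes its own
  // stream wait on the event (GPU -> GPU on a different stream).
  event->Wait(consumer_on_cpu ? plat::kCPU : plat::kCUDA, consumer_ctx);
}
```

The same abstraction is what lets the `#ifdef PADDLE_WITH_CUDA` guards and the explicit `CUDADeviceContext` casts disappear from `WaitOrSync` and the GC path above.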
3 changes: 1 addition & 2 deletions paddle/fluid/framework/new_executor/interpretercore.h
@@ -26,7 +26,6 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_event.h"
#include "paddle/fluid/platform/event.h"

namespace paddle {
namespace framework {
@@ -101,7 +100,7 @@ class InterpreterCore {
bool is_build_;

std::vector<std::string> feed_names_;
std::map<size_t, std::shared_ptr<platform::CudaEvent>> var_id2event_;
std::map<size_t, std::shared_ptr<platform::DeviceEvent>> var_id2event_;

std::vector<paddle::platform::DeviceEvent> gc_event_;
std::unique_ptr<GarbageQueue> garbages_;
6 changes: 4 additions & 2 deletions paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -19,6 +19,7 @@
#include <vector>

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_event_base.h"
#include "paddle/fluid/platform/event.h"

namespace paddle {
@@ -56,11 +57,12 @@ struct NextInstruction {
};

struct EventInter {
explicit EventInter(size_t var_id, std::shared_ptr<platform::CudaEvent> event,
explicit EventInter(size_t var_id,
std::shared_ptr<platform::DeviceEvent> event,
bool is_sync)
: var_id_(var_id), event_(event), is_sync_(is_sync) {}
size_t var_id_;
std::shared_ptr<platform::CudaEvent> event_;
std::shared_ptr<platform::DeviceEvent> event_;
bool is_sync_;
};

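With the struct now holding a `platform::DeviceEvent`, an `EventInter` entry can be built for any place, not just CUDA. A hypothetical construction helper (not part of the commit), assuming the flag helper and headers shown elsewhere in this diff:

```cpp
#include <memory>
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_event_base.h"  // assumed to declare DeviceEvent

// Hypothetical helper: packages a variable id with a freshly created
// DeviceEvent for the given place.
paddle::framework::EventInter MakeEventInter(
    size_t var_id, const paddle::platform::Place& place, bool is_sync) {
  auto event = std::make_shared<paddle::platform::DeviceEvent>(
      place, paddle::platform::GenerateDeviceEventFlag());
  return paddle::framework::EventInter(var_id, event, is_sync);
}
```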
1 change: 0 additions & 1 deletion paddle/fluid/inference/CMakeLists.txt
@@ -78,7 +78,6 @@ set(SHARED_INFERENCE_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/io_utils.cc
${mkldnn_quantizer_src_file}
${PADDLE_CUSTOM_OP_SRCS})

# shared inference library deps
13 changes: 11 additions & 2 deletions paddle/fluid/inference/api/CMakeLists.txt
@@ -20,10 +20,9 @@ endif(APPLE)
add_subdirectory(details)

if(WITH_MKLDNN)
set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc)
set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc)
cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
set(mkldnn_quantizer_src_file ${mkldnn_quantizer_src} PARENT_SCOPE)
set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE)
endif()

@@ -71,6 +70,16 @@ elseif (WIN32)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()

if(WITH_TESTING AND WITH_MKLDNN)
if (NOT APPLE AND NOT WIN32)
cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR})
elseif (WIN32)
cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
endif()

if(WITH_TESTING AND TEST test_api_impl)
if(NOT APPLE)
set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120)