[WIP] For printing more logs 6148 CI failures #21802

Closed
Changes from all commits (108 commits)
bc6372e
For printing more logs in issue 21594
lidanqing-intel Dec 17, 2019
ddc6b88
add dmesg to generate memory info
lidanqing-intel Dec 18, 2019
d97e28f
move position of dmesg
lidanqing-intel Dec 19, 2019
28a1e21
only leave one test
lidanqing-intel Dec 19, 2019
2745075
one test failed in printing memory status
lidanqing-intel Dec 19, 2019
821b910
check the docker status
lidanqing-intel Dec 20, 2019
5248a88
Merge branch 'develop' into log-ci-failures
lidanqing-intel Dec 20, 2019
bee812d
change qat_performance with mobilenet, change batch_size of qat2_resn…
lidanqing-intel Dec 21, 2019
c134ef8
check 5117 docker memory
lidanqing-intel Dec 23, 2019
abb8fd1
Merge branch 'develop' into log-ci-failures
lidanqing-intel Dec 26, 2019
911e4c7
Fix the bug in sequence_scatter unitest (#21960)
xyzhou-puck Dec 27, 2019
4bb8df9
check api approveals (#21964)
tianshuo78520a Dec 27, 2019
0347946
fix multi-thread error of fc_gru_fuse_pass.cc, test=develop (#21841)
Shixiaowei02 Dec 27, 2019
c3e1954
make reverse op support negative axis (#21925)
mapingshuo Dec 27, 2019
dca0758
remove params in Tracer object (in dygraph) (#20815)
JepsonWong Dec 27, 2019
cee2ccb
add shuffle batch op (#21674)
wilhelmzh Dec 27, 2019
ba51043
remove parameter block in to_variable, test=develop (#21933)
zhiqiu Dec 27, 2019
fd9b00d
test=develop, remove unused variable (#21974)
silingtong123 Dec 27, 2019
8859ddd
Refine multihead kernel, align block to 32 (#21961)
zhaoyuchen2018 Dec 27, 2019
196e20d
Fix multi-threads memory out of bounds error for passes (#21920)
FrostML Dec 29, 2019
362bfec
add xly py35 dockerfile (#21861)
tianshuo78520a Dec 30, 2019
e0d8b8f
use large input size in test_add_position_encoding_op.py (#21962)
lfchener Dec 30, 2019
b7697f6
fix broadcast bug;test=develop (#21898)
danleifeng Dec 30, 2019
c15d420
fix no grad check for pool_max & spectral_norm (#21971)
heavengate Dec 30, 2019
f4013ec
enable kldiv_loss_op unittests (#21990)
heavengate Dec 30, 2019
cf7b325
fix conv2d_transpose op unittest (#21927)
FDInSky Dec 30, 2019
fa8bab9
add fp64 check_grad for conv2d, conv3d (#21931)
FDInSky Dec 30, 2019
c3527f5
Annotate NormOP test to skip grad check (#21894)
willthefrog Dec 30, 2019
e66f92d
Modify demo_ci to support Windows, prepare for PR_Windows_Inference (…
zhwesky2010 Dec 30, 2019
fbb4217
fix no hint problem when use ENFORCE for cuda, test=develop (#21994)
chenwhql Dec 30, 2019
35ff156
Add error message for cublas inItizalize failed (#21995)
chenwhql Dec 30, 2019
64baee4
polish code test=develop (#22014)
wangchaochaohu Dec 31, 2019
33f1306
update layers used in mnist dygraph model, test=develop (#21947)
zhiqiu Dec 31, 2019
911eef4
enhance checking validity of apply_gradient (#21870)
Aurelius84 Dec 31, 2019
86c40e2
Expose fluid.dygraph.TracedLayer API (#21518)
sneaxiy Dec 31, 2019
a0b5337
increase data shape in topk_avg_pooling unittest (#22010)
Aurelius84 Dec 31, 2019
3ec289a
fix sync_batch_norm hang in fleet (#21838)
wangxicoding Dec 31, 2019
b68f4e0
Fix the doc of scatter_nd (#22015)
ForFishes Dec 31, 2019
34e26c9
remove @unittest.skip in test_parallel_executor_mnist.py (#22019)
silingtong123 Dec 31, 2019
835ba4e
add is_empty_grad_op for OpTest (#22022)
Dec 31, 2019
6b4c33e
fix decay param in DecayAdagrad test=develop (#22026)
Aurelius84 Dec 31, 2019
9a2204e
Uniform append_backward & gradients parameter_list type to Variable (…
chenwhql Jan 1, 2020
2e90822
polish default error msg & cublas error hint, test=develop (#22032)
chenwhql Jan 1, 2020
0aab257
fix scipy bug (#22024)
tianshuo78520a Jan 1, 2020
c7b03d3
open test_parallel_executor_fetch_feed for PR_Windows_CI. test=develo…
liym27 Jan 2, 2020
3e6b1db
fix bug of checking op desc: not DUPLICABLE but DISPENSABLE (#22039)
liym27 Jan 2, 2020
b852ef7
add no_check_set check for op unittests (#21611)
cryoco Jan 2, 2020
56414c7
move private weight fields to public ones (#21982)
songyouwei Jan 2, 2020
c53b62e
fix integer overflow in match_matrix (#22036)
Aurelius84 Jan 2, 2020
985e4ba
fix unittest for spp op, test=develop (#22030)
dyning Jan 2, 2020
95d79b6
update error log for batch_norm_grad (#22017)
ceci3 Jan 2, 2020
8103012
convert input vars' dtype for range op (#22028)
mapingshuo Jan 3, 2020
aa2ed0d
fix generate_proposal_labesl op (#21793)
FDInSky Jan 3, 2020
f5f65a7
OP unit test specification: compiletime-runtime consistency check (#2…
DannyIsFunny Jan 3, 2020
c1fea3e
Fix example in log_loss doc (#22052)
Jan 3, 2020
6192108
[DNNL] 3D Fully-Connected (#21746)
Sand3r- Jan 3, 2020
d483207
Add the first implememtation of fusion_group op (#19621)
Xreki Jan 3, 2020
5b88378
add shape description in accuracy api doc (#22011)
slf12 Jan 3, 2020
3f653c8
register NoNeedBufferVarsInference for max_pool_grad_op, test=develop…
zhiqiu Jan 3, 2020
7f4abaf
register int/int64_t/float16 in pow/square kernel,test=develop (#22023)
SunAhong1993 Jan 3, 2020
c112b64
Update MKL-DNN to 1.1 (#21754)
grygielski Jan 4, 2020
1c39efb
Enable test conv2d ngraph (#22074)
LeoZhao-Intel Jan 4, 2020
34c5712
polish cross_entropy ENFORCE (#22056)
heavengate Jan 4, 2020
3fab86f
add package for centerloss (#21806)
ForFishes Jan 4, 2020
12b2b4b
Add gradient check to reduce_max and reduce_min ops (#21985)
qjing666 Jan 4, 2020
7d8d459
control flow: support optimizer called (#21851)
liym27 Jan 4, 2020
bf978ff
all cases use large shape (#22084)
zhupengyang Jan 4, 2020
e1d666f
Hotfix to MKL-DNN pool2d tests (#22086)
grygielski Jan 5, 2020
ad8a9cb
[MKL-DNN] Pool & LRN Grad Ops NHWC support (#21747)
jczaja Jan 5, 2020
7fb817d
add distributed_strategy (#21710)
123malin Jan 5, 2020
85ba527
all cases use large shape (#22065)
zhupengyang Jan 6, 2020
8b3ef2a
all cases use large shape (#22091)
zhupengyang Jan 6, 2020
e519a1f
add bigger shape test example for elementwise_pow OP (#22063)
cjt222 Jan 6, 2020
d9f5d1e
ag allocator by default, test=develop (#21837)
sneaxiy Jan 6, 2020
370a8e9
Remove unit tests that are no longer maintained (#22081)
silingtong123 Jan 6, 2020
385ed8a
fix warning in test_install_check.py, test=develop (#22087)
JepsonWong Jan 6, 2020
288b1e7
Fix segmentation fault bug in reduce ops (#22079)
qjing666 Jan 6, 2020
700fdb1
MKL-DNN 1.1 for Windows (#22089)
grygielski Jan 6, 2020
2a47cc5
dygraph LayerList container (#21734)
songyouwei Jan 6, 2020
9587249
polish allocator strategy doc, test=develop, test=document_fix (#22095)
sneaxiy Jan 6, 2020
dd43615
Add ParallelExecutor Test for Cond API and Fix PE Checks Shape Bug (#…
zhhsplendid Jan 6, 2020
1ce6ab9
Refine doc of data based on requirement. (#22098)
zhhsplendid Jan 6, 2020
b0b27ff
[MKL-DNN] Conv grad and Batch Norm grad NHWC support (#22088)
jczaja Jan 6, 2020
0a51098
Add TRT support for BERT (#21135)
cryoco Jan 6, 2020
fab4b07
support elu_op double grad (#21822)
LDOUBLEV Jan 6, 2020
6c20e7c
test=develop, remove unused parameter from class RuntimeInferShapeCon…
silingtong123 Jan 6, 2020
cce9af0
all cases use large shape (#22106)
zhupengyang Jan 7, 2020
ba8414d
replace CUDNN_ENFORCE with PADDLE_ENFORCE_CUDA_SUCCESS, test=develop …
chenwhql Jan 7, 2020
f385c34
fix fleet collective api run on cpu, test=develop (#22064)
wangxicoding Jan 7, 2020
14aebc7
add erf op (#21785)
Jan 7, 2020
5c33919
Fix grad clip (#21784)
MrChengmo Jan 7, 2020
de56887
add Note in the doc of old control flow ops. test=develop,test=docume…
liym27 Jan 7, 2020
5de6a19
fix test_bilinear_tensor_product_op timeout (#22120)
zhupengyang Jan 7, 2020
434f7b9
Fix the global_step & continuous applying error in EMA (#22090)
Jan 7, 2020
4b4a9cc
fix format in operator.cc (#22101)
bingyanghuang Jan 7, 2020
3b84584
fix Variable's gradient api in framework.py, test=develop (#21577)
JepsonWong Jan 7, 2020
418abc9
Update pyramid related OP (#21372)
MrChengmo Jan 7, 2020
b9a6354
Solve elu unitest fail (#22123)
LDOUBLEV Jan 7, 2020
f220be4
Enable CI check to match PADDLE_ENFORCE_CUDA_SUCCESS (#22122)
chenwhql Jan 7, 2020
88cd52a
For printing more logs in issue 21594
lidanqing-intel Dec 17, 2019
82f7d5e
add dmesg to generate memory info
lidanqing-intel Dec 18, 2019
4abac31
move position of dmesg
lidanqing-intel Dec 19, 2019
3896ca8
only leave one test
lidanqing-intel Dec 19, 2019
682bc77
one test failed in printing memory status
lidanqing-intel Dec 19, 2019
02d320d
check the docker status
lidanqing-intel Dec 20, 2019
ad041e9
check 5117 docker memory
lidanqing-intel Dec 23, 2019
10e9463
Merge branch 'log-ci-failures' of https://github.com/lidanqing-intel/…
lidanqing-intel Jan 7, 2020
1f1768e
For checking vgg16, vgg19 time out failure
lidanqing-intel Jan 7, 2020
22 changes: 17 additions & 5 deletions cmake/external/mkldnn.cmake
@@ -19,7 +19,7 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
SET(MKLDNN_TAG 518a316a8cd6deb82dc7866bc04bd0355a25c3a4)
SET(MKLDNN_TAG 52c3052df8ec1d5b8b45cb6c350a952840eabd42)

# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
@@ -71,13 +71,13 @@ ExternalProject_Add(
-DMKLROOT=${MKLML_ROOT}
-DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
-DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
-DMKLDNN_BUILD_TESTS=OFF -DMKLDNN_BUILD_EXAMPLES=OFF
-DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
)
if(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
else(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libdnnl.so" CACHE FILEPATH "mkldnn library." FORCE)
endif(WIN32)

ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
@@ -98,9 +98,21 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
# it can be directly contained in wheel or capi
if(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_INSTALL_DIR}/bin/dnnl.dll ${MKLDNN_SHARED_LIB})
add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > ${MKLDNN_INSTALL_DIR}/bin/exports.txt)
add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def)
add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def)
add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
COMMAND for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def)
add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib /machine:x64)
else(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
SET(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libmkldnn.so.1)
SET(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1)
ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB})
ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD
4 changes: 2 additions & 2 deletions cmake/generic.cmake
@@ -396,7 +396,7 @@ function(cc_test_run TARGET_NAME)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
# No unit test should exceed 10 minutes.
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 1800)
endif()
endfunction()

@@ -743,7 +743,7 @@ function(py_test TARGET_NAME)
endif()

# No unit test should exceed 10 minutes.
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 1800)
endif()
endfunction()

2 changes: 1 addition & 1 deletion cmake/inference_lib.cmake
@@ -86,7 +86,7 @@ function(copy_part_of_thrid_party TARGET DST)
set(dst_dir "${DST}/third_party/install/mkldnn")
if(WIN32)
copy(${TARGET}
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB}
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB}
DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib)
else()
copy(${TARGET}
4 changes: 3 additions & 1 deletion cmake/operators.cmake
@@ -116,7 +116,9 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "multihead_matmul_op")
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"multihead_matmul_op" "fusion_group_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
6 changes: 2 additions & 4 deletions paddle/fluid/framework/data_layout_transform.cc
@@ -185,10 +185,8 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
}
// For exepected NHWC data format we need to reshape the Output tensor
// As MKL-DNN description was in NCHW and paddle is expecting NHWC
if (out_layout == DataLayout::kNHWC) {
std::rotate(out_tz.begin() + 1, out_tz.begin() + 2, out_tz.end());
out->Resize(framework::make_ddim(out_tz));
}
platform::MatchShapeToLayout(out, in_layout, out_layout);

out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(MKLDNNMemoryFormat::undef);
9 changes: 2 additions & 7 deletions paddle/fluid/framework/data_transform.cc
@@ -58,13 +58,8 @@ void TransformData(const OpKernelType &expected_kernel_type,
out.ShareDataWith(input_tensor);
// For NHWC data we need reshape of tensors as MKL-DNN
// is expecting NHWC dims description order
if (lin == DataLayout::kNHWC) {
auto nchw_dims = paddle::framework::vectorize<int>(out.dims());
std::rotate(nchw_dims.begin() + 1, nchw_dims.end() - 1,
nchw_dims.end());
out.Resize(framework::make_ddim(nchw_dims));
paddle::platform::set_cur_paddle_data_layout(lin);
}
platform::MatchShapeToLayout(&out, lin, lout);
paddle::platform::set_cur_paddle_data_layout(lin);
out.set_layout(DataLayout::kMKLDNN);
out.set_format(out_format);
} else {
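Both removed blocks above did the same layout-dependent dim rotation inline with std::rotate; the PR routes them through platform::MatchShapeToLayout instead. Below is a minimal standalone sketch of that rotation, assuming MatchShapeToLayout keeps the same semantics; the helper names and shape values are illustrative, not Paddle API.

#include <algorithm>
#include <cstdio>
#include <vector>

// Rotate an NCHW dims vector {N, C, H, W} into NHWC order {N, H, W, C}
// (what data_layout_transform.cc did inline for kNHWC outputs).
std::vector<int> NchwToNhwc(std::vector<int> dims) {
  std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end());
  return dims;
}

// Rotate an NHWC dims vector {N, H, W, C} back into NCHW order {N, C, H, W}
// (what data_transform.cc did inline before handing the tensor to MKL-DNN).
std::vector<int> NhwcToNchw(std::vector<int> dims) {
  std::rotate(dims.begin() + 1, dims.end() - 1, dims.end());
  return dims;
}

int main() {
  std::vector<int> nchw = {8, 3, 224, 224};
  for (int d : NchwToNhwc(nchw)) std::printf("%d ", d);  // prints: 8 224 224 3
  std::printf("\n");
  std::vector<int> nhwc = {8, 224, 224, 3};
  for (int d : NhwcToNchw(nhwc)) std::printf("%d ", d);  // prints: 8 3 224 224
  std::printf("\n");
  return 0;
}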
1 change: 0 additions & 1 deletion paddle/fluid/framework/data_transform.h
@@ -39,6 +39,5 @@ void TransformData(const OpKernelType &expected_kernel_type,
*/
void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
Variable *out_var);

} // namespace framework
} // namespace paddle
27 changes: 18 additions & 9 deletions paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -41,7 +41,7 @@ struct Param {
std::string LSTMOUT = "at.lstmout.new";
};

void PrepareParameters(Graph* graph, const Param& param);
void PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op);

void FindWhileOp(Graph* graph) {
GraphPatternDetector gpd;
@@ -98,7 +98,7 @@ void FindWhileOp(Graph* graph) {
auto* hidden_init = graph->RetrieveNode(8);

auto* lstm_op = graph->CreateOpNode(&op_desc);
PrepareParameters(graph, param);
PrepareParameters(graph, param, lstm_op);

IR_NODE_LINK_TO(X, lstm_op);
IR_NODE_LINK_TO(cell_init, lstm_op);
@@ -133,20 +133,29 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out);

void PrepareParameters(Graph* graph, const Param& param) {
void PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op) {
// Check parameters
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto& scope = graph->Get<Scope>(kParamScopeAttr);

// Create new parameters.
// AddInput
scope.Var(param.LSTMWeight)->GetMutable<LoDTensor>();
scope.Var(param.LSTMBias)->GetMutable<LoDTensor>();
scope.Var(param.Hidden)->GetMutable<LoDTensor>();
scope.Var(param.Cell)->GetMutable<LoDTensor>();
scope.Var(param.AttentionedX)->GetMutable<LoDTensor>();
scope.Var(param.AttentionFCOut)->GetMutable<LoDTensor>();
scope.Var(param.LSTMX)->GetMutable<LoDTensor>();
scope.Var(param.LSTMOUT)->GetMutable<LoDTensor>();
// AddOutput
#define IR_NODE(x) \
VarDesc key_##x(param.x); \
key_##x.SetPersistable(false); \
auto* node_##x = graph->CreateVarNode(&key_##x); \
IR_NODE_LINK_TO(lstm_op, node_##x);

IR_NODE(Hidden);
IR_NODE(Cell);
IR_NODE(AttentionedX);
IR_NODE(AttentionFCOut);
IR_NODE(LSTMX);
IR_NODE(LSTMOUT);
#undef IR_NODE

#define GATE_W(name__) \
auto* W_##name__##_w0 = scope.FindVar(#name__ ".w_0"); \
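For readability, here is what one invocation of the IR_NODE macro defined above in PrepareParameters expands to: each temporary output is now materialized as a non-persistable graph var node linked to lstm_op, rather than a scope variable. This is only a manual macro expansion of the code in the diff, not an additional change.

// Manual expansion of IR_NODE(Hidden), using the identifiers from the diff above.
VarDesc key_Hidden(param.Hidden);        // desc for the temporary output var
key_Hidden.SetPersistable(false);        // temp outputs are not parameters
auto* node_Hidden = graph->CreateVarNode(&key_Hidden);
IR_NODE_LINK_TO(lstm_op, node_Hidden);   // wire it as an output of the fused op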
40 changes: 23 additions & 17 deletions paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -127,47 +127,53 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
embedding_data, k, weightx_data, n, beta, embeddings_data, n);
op_desc.SetInput("Embeddings", {embeddings});

// Create temp variables.
const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
const std::string BatchedCellPreAct =
patterns::UniqueKey("BatchedCellPreAct");
const std::string BatchedGate = patterns::UniqueKey("BatchedGate");

scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();

op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden->Name()});
op_desc.SetOutput("Cell", {cell->Name()});
op_desc.SetOutput("XX", {xx->Name()});
op_desc.SetOutput("BatchedGate", {BatchedGate});
op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
op_desc.SetOutput("BatchedInput", {BatchedInput});
op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr
op_desc.SetAttr("use_seq", true);

PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto& scope = graph->Get<Scope>(kParamScopeAttr);
// Create temp variables.
#define OP_SET_OUT(x) \
const std::string x = patterns::UniqueKey(#x); \
op_desc.SetOutput(#x, {x}); \
scope.Var(x)->GetMutable<LoDTensor>()
op_desc.SetOutput(#x, {x});

OP_SET_OUT(BatchedGate);
OP_SET_OUT(BatchCellPreAct);
OP_SET_OUT(BatchedInput);
OP_SET_OUT(BatchedCell);
OP_SET_OUT(BatchedHidden);
OP_SET_OUT(ReorderedH0);
OP_SET_OUT(ReorderedC0);
#undef OP_SET_OUT

auto* op = graph->CreateOpNode(&op_desc);

IR_NODE_LINK_TO(input, op);
IR_NODE_LINK_TO(weight_x, op);
IR_NODE_LINK_TO(weight_h, op);
IR_NODE_LINK_TO(bias, op);
IR_NODE_LINK_TO(op, hidden);

#define IR_NODE(x) \
VarDesc key_##x(x); \
key_##x.SetPersistable(false); \
auto* node_##x = graph->CreateVarNode(&key_##x); \
IR_NODE_LINK_TO(op, node_##x);

IR_NODE(BatchedGate);
IR_NODE(BatchCellPreAct);
IR_NODE(BatchedInput);
IR_NODE(BatchedCell);
IR_NODE(BatchedHidden);
IR_NODE(ReorderedH0);
IR_NODE(ReorderedC0);
#undef IR_NODE

return op;
};

7 changes: 4 additions & 3 deletions paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -92,14 +92,15 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
// This is to add padding for dimension 128 on concern of MKL performance
auto* scope = param_scope();
auto* weight = scope->FindVar(w->Name())->GetMutable<LoDTensor>();
auto place = weight->place();
bool use_gpu = Get<bool>("use_gpu");
auto* weight_data = weight->data<float>();
auto weight_dims = weight->dims();
int weight_num = product(weight_dims);
int w_h = weight_dims[0];
int w_w = weight_dims[1];
if (!use_gpu) {
bool use_gpu = Has("use_gpu") ? Get<bool>("use_gpu") : false;
bool use_fc_padding =
Has("use_fc_padding") ? Get<bool>("use_fc_padding") : true;
if (!use_gpu && use_fc_padding) {
if (w_h % 128 == 0 && w_w % 128 == 0) {
auto* weight_data_tmp = new float[weight_num];
for (int i = 0; i < w_h; i++) {
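The pass now guards its attribute reads with Has() before Get(), falling back to defaults when a caller never registered use_gpu or use_fc_padding. A small self-contained sketch of that guard pattern follows; the Attrs class here is illustrative only, not the Paddle Pass API.

#include <cassert>
#include <iostream>
#include <map>
#include <string>

// Illustrative attribute container: Get() on a missing key is an error,
// so optional attributes must be checked with Has() first.
class Attrs {
 public:
  void Set(const std::string& name, bool value) { attrs_[name] = value; }
  bool Has(const std::string& name) const { return attrs_.count(name) > 0; }
  bool Get(const std::string& name) const {
    assert(Has(name) && "attribute must be set before Get()");
    return attrs_.at(name);
  }

 private:
  std::map<std::string, bool> attrs_;
};

int main() {
  Attrs pass_attrs;  // neither attribute set, as in a caller that skips them
  bool use_gpu = pass_attrs.Has("use_gpu") ? pass_attrs.Get("use_gpu") : false;
  bool use_fc_padding =
      pass_attrs.Has("use_fc_padding") ? pass_attrs.Get("use_fc_padding") : true;
  std::cout << "use_gpu=" << use_gpu
            << " use_fc_padding=" << use_fc_padding << '\n';
  return 0;
}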
8 changes: 6 additions & 2 deletions paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -92,8 +92,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
}
#undef GET_NODE

#define NEW_IMTERMEDIATE_OUT(key) \
scope.Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
#define NEW_IMTERMEDIATE_OUT(key) \
VarDesc key(NEW_NAME(key)); \
key.SetPersistable(false); \
auto* key##_node = graph->CreateVarNode(&key); \
IR_NODE_LINK_TO(op, key##_node);

NEW_IMTERMEDIATE_OUT(ReorderedH0);
NEW_IMTERMEDIATE_OUT(XX);
NEW_IMTERMEDIATE_OUT(BatchedInput);
45 changes: 25 additions & 20 deletions paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -74,50 +74,55 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
op_desc.SetInput("Bias", {new_bias_var});
}

// Create temp variables.
const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
const std::string BatchedCellPreAct =
patterns::UniqueKey("BatchedCellPreAct");
const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
const std::string CheckedCell = patterns::UniqueKey("CheckedCell");

scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();

op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden->Name()});
op_desc.SetOutput("Cell", {cell->Name()});
op_desc.SetOutput("XX", {xx->Name()});
op_desc.SetOutput("BatchedGate", {BatchedGate});
op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
op_desc.SetOutput("BatchedInput", {BatchedInput});
op_desc.SetOutput("CheckedCell", {CheckedCell});
op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr
op_desc.SetAttr("use_seq", true);

PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto& scope = graph->Get<Scope>(kParamScopeAttr);
// Create temp variables.
#define OP_SET_OUT(x) \
const std::string x = patterns::UniqueKey(#x); \
op_desc.SetOutput(#x, {x}); \
scope.Var(x)->GetMutable<LoDTensor>()
op_desc.SetOutput(#x, {x});

OP_SET_OUT(BatchedGate);
OP_SET_OUT(BatchedCellPreAct);
OP_SET_OUT(BatchedInput);
OP_SET_OUT(CheckedCell);
OP_SET_OUT(BatchedCell);
OP_SET_OUT(BatchedHidden);
OP_SET_OUT(ReorderedH0);
OP_SET_OUT(ReorderedC0);
#undef OP_SET_OUT

auto* op = graph->CreateOpNode(&op_desc);

IR_NODE_LINK_TO(input, op);
IR_NODE_LINK_TO(weight_x, op);
IR_NODE_LINK_TO(weight_h, op);
IR_NODE_LINK_TO(bias, op);
IR_NODE_LINK_TO(op, hidden);

#define IR_NODE(x) \
VarDesc key_##x(x); \
key_##x.SetPersistable(false); \
auto* node_##x = graph->CreateVarNode(&key_##x); \
IR_NODE_LINK_TO(op, node_##x);

IR_NODE(BatchedGate);
IR_NODE(BatchedCellPreAct);
IR_NODE(BatchedInput);
IR_NODE(CheckedCell);
IR_NODE(BatchedCell);
IR_NODE(BatchedHidden);
IR_NODE(ReorderedH0);
IR_NODE(ReorderedC0);
#undef IR_NODE

return op;
};

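Put together, each temporary LSTM output now goes through two steps: OP_SET_OUT registers a uniquely named output on op_desc, and IR_NODE later creates the matching non-persistable var node and links it to the fused op. The following sketch simply expands both macros for BatchedGate using the identifiers from the diff above.

// OP_SET_OUT(BatchedGate) expands to:
const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
op_desc.SetOutput("BatchedGate", {BatchedGate});
// ... graph->CreateOpNode(&op_desc) produces `op`, then IR_NODE(BatchedGate):
VarDesc key_BatchedGate(BatchedGate);
key_BatchedGate.SetPersistable(false);
auto* node_BatchedGate = graph->CreateVarNode(&key_BatchedGate);
IR_NODE_LINK_TO(op, node_BatchedGate);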
@@ -82,7 +82,7 @@ void FusionGroupPass::InsertFusionGroupOp(
input_names.push_back(n->Name());
external_nodes.insert(n);
}
op_desc.SetInput("Xs", input_names);
op_desc.SetInput("Inputs", input_names);

std::vector<std::string> output_names;
for (auto* n : output_vars_of_subgraph) {
5 changes: 4 additions & 1 deletion paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -214,7 +214,9 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
op_desc.SetInput("FCWeight", {fc_w->Name()});
op_desc.SetInput("FCBias", {fc_bias->Name()});
const std::string fc_out_tmp = fc_out->Name() + ".tmp";
param_scope()->Var(fc_out_tmp)->GetMutable<framework::LoDTensor>();
VarDesc fc_out_key(fc_out_tmp);
fc_out_key.SetPersistable(false);
auto* fc_out_node = graph->CreateVarNode(&fc_out_key);
op_desc.SetOutput("FCOut", {fc_out_tmp});
op_desc.SetOutput("Out", {fc_out->Name()});
op_desc.SetAttr("fc_activation", act->Op()->Type());
@@ -227,6 +229,7 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
IR_NODE_LINK_TO(sequence_expand0_in, op_node);
IR_NODE_LINK_TO(sequence_expand1_in, op_node);
IR_NODE_LINK_TO(op_node, fc_out);
IR_NODE_LINK_TO(op_node, fc_out_node);

// Clean nodes.
std::unordered_set<const Node*> marked_nodes;