diff --git a/CMakeLists.txt b/CMakeLists.txt
index b1554fba5e1fa..fa87cc14f2668 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,6 +131,7 @@ option(WITH_LITE   "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_NCCL   "Compile PaddlePaddle with NCCL support"             ON)
 option(WITH_CRYPTO   "Compile PaddlePaddle with crypto support"         ON)
 option(WITH_ARM   "Compile PaddlePaddle with arm support"         OFF)
+option(WITH_MUSL        "Compile with musl libc instead of glibc"  OFF)
 
 # PY_VERSION
 if(NOT PY_VERSION)
diff --git a/README.md b/README.md
index d14d0ef001481..580ebca8ef308 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.4.post97
+pip install paddlepaddle-gpu==1.8.5.post97
 
 ```
 It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website.
diff --git a/README_cn.md b/README_cn.md
index e4544a3eff6e5..ee8cfbef1cef9 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -30,7 +30,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.4.post97
+pip install paddlepaddle-gpu==1.8.5.post97
 
 ```
 更多安装信息详见官网 [安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index cf458d9770675..fc984f5e560ef 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -51,6 +51,16 @@ if(WIN32)
   endif(NOT MSVC)
 endif(WIN32)
 
+if(WITH_MUSL)
+    add_definitions(-DPADDLE_WITH_MUSL)
+    # A musl build forces MKL and GPU support off; report each override.
+    message(STATUS "Set compile option WITH_MKL=OFF when WITH_MUSL=ON")
+    SET(WITH_MKL OFF)
+
+    message(STATUS "Set compile option WITH_GPU=OFF when WITH_MUSL=ON")
+    SET(WITH_GPU OFF)
+endif()
+
 if(WITH_PSLIB)
     add_definitions(-DPADDLE_WITH_PSLIB)
 endif()
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 3da550519bae2..1da47bba7b6a5 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if(NOT LINUX OR NOT WITH_MKL)
-  message("Paddle-lite will not build because the required Linux and MKL do not exist.")
+if(NOT LINUX)
+  message("Paddle-lite will not build because the required Linux do not exist.")
   set(WITH_LITE OFF)
   return()
 endif()
@@ -42,30 +42,30 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   endif()
 
   # No quotes, so cmake can resolve it as a command with arguments.
-  set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j)
-  set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON
-                         -DLITE_WITH_CUDA=${WITH_GPU}
-                         -DWITH_MKLDNN=OFF
-                         -DLITE_WITH_X86=ON
-                         -DLITE_WITH_PROFILE=OFF
-                         -DWITH_LITE=OFF
-                         -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF
-                         -DWITH_PYTHON=OFF
-                         -DWITH_TESTING=OFF
-                         -DLITE_BUILD_EXTRA=ON
-                         -DCUDNN_ROOT=${CUDNN_ROOT}
-                         -DLITE_WITH_STATIC_CUDA=OFF
-                         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
-                         -DLITE_WITH_XPU=${LITE_WITH_XPU}
-                         -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
-                         -DLITE_WITH_ARM=OFF)
-
-  ExternalProject_Add(
+  if(WITH_ARM)
+    set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j)
+    message(STATUS "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
+    set(LITE_OPTIONAL_ARGS -DWITH_MKL=OFF
+                           -DLITE_WITH_CUDA=OFF
+                           -DWITH_MKLDNN=OFF
+                           -DLITE_WITH_X86=OFF
+                           -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON
+                           -DLITE_WITH_PROFILE=OFF
+                           -DARM_TARGET_OS=armlinux
+                           -DWITH_LITE=ON
+                           -DWITH_PYTHON=OFF
+                           -DWITH_TESTING=OFF
+                           -DLITE_BUILD_EXTRA=ON
+                           -DLITE_WITH_XPU=${LITE_WITH_XPU}
+                           -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
+                           -DLITE_WITH_ARM=ON)
+    ExternalProject_Add(
       ${LITE_PROJECT}
       ${EXTERNAL_PROJECT_LOG_ARGS}
       GIT_REPOSITORY      "https://github.com/PaddlePaddle/Paddle-Lite.git"
       GIT_TAG             ${LITE_GIT_TAG}
       PREFIX              ${LITE_SOURCES_DIR}
+      PATCH_COMMAND       ${CMAKE_COMMAND} -E make_directory ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code COMMAND ${CMAKE_COMMAND} -E touch ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc
       UPDATE_COMMAND      ""
       BUILD_COMMAND       ${LITE_BUILD_COMMAND}
       INSTALL_COMMAND     ""
@@ -81,7 +81,51 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
                           -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                           ${EXTERNAL_OPTIONAL_ARGS}
                           ${LITE_OPTIONAL_ARGS}
-  )
+    )
+    set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8)
+  else()
+    set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j)
+    set(LITE_OUTPUT_BIN_DIR inference_lite_lib)
+    set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON
+                           -DLITE_WITH_CUDA=${WITH_GPU}
+                           -DWITH_MKLDNN=OFF
+                           -DLITE_WITH_X86=ON
+                           -DLITE_WITH_PROFILE=OFF
+                           -DWITH_LITE=OFF
+                           -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF
+                           -DWITH_PYTHON=OFF
+                           -DWITH_TESTING=OFF
+                           -DLITE_BUILD_EXTRA=ON
+                           -DCUDNN_ROOT=${CUDNN_ROOT}
+                           -DLITE_WITH_STATIC_CUDA=OFF
+                           -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
+                           -DLITE_WITH_XPU=${LITE_WITH_XPU}
+                           -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
+                           -DLITE_WITH_ARM=OFF)
+
+    ExternalProject_Add(
+        ${LITE_PROJECT}
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY      "https://github.com/PaddlePaddle/Paddle-Lite.git"
+        GIT_TAG             ${LITE_GIT_TAG}
+        PREFIX              ${LITE_SOURCES_DIR}
+        UPDATE_COMMAND      ""
+        BUILD_COMMAND       ${LITE_BUILD_COMMAND}
+        INSTALL_COMMAND     ""
+        CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                            -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
+                            -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                            -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                            -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                            -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                            ${EXTERNAL_OPTIONAL_ARGS}
+                            ${LITE_OPTIONAL_ARGS}
+    )
+  endif()
   ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR)
   ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR)
   set(LITE_BINARY_DIR ${BINARY_DIR})
@@ -103,8 +147,8 @@ function(external_lite_libs alias path)
   endif()
 endfunction()
 
-external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
-set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so)
+set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so)
 
 add_definitions(-DPADDLE_WITH_LITE)
 add_definitions(-DLITE_WITH_LOG)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index c0adda0da31ae..e3ac8624a809a 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR     ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 SET(MKLDNN_REPOSITORY     https://github.com/oneapi-src/oneDNN.git)
-SET(MKLDNN_TAG            64a48f9565aa72f6359917b3406328075a409939)
+SET(MKLDNN_TAG            361725600224f41b7347a1c6bee9b04d1e6c14d7)
 
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index f4603051a0e7e..d5ef6d85b578f 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -131,7 +131,7 @@ function(copy_part_of_thrid_party TARGET DST)
     if (LITE_BINARY_DIR)
         set(dst_dir "${DST}/third_party/install/lite")
         copy(${TARGET}
-                SRCS ${LITE_BINARY_DIR}/inference_lite_lib/*
+                SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/*
                 DSTS ${dst_dir})
     endif()
 endfunction()
diff --git a/cmake/init.cmake b/cmake/init.cmake
index 902dfb11fc0af..5f36a9adf1ae6 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -28,5 +28,6 @@ endif()
 
 if(WIN32)
     set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Os -DNDEBUG")
 endif()
 
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index bb5e2e1369a84..d31943289d7a1 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -123,7 +123,9 @@ cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
 
-cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost)
+cc_library(op_version_proto SRCS op_version_proto.cc DEPS framework_proto boost)
+
+cc_library(op_version_registry SRCS op_version_registry.cc DEPS op_version_proto framework_proto boost)
 cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry)
 
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 108cd9ac6d1c0..8563b5b6d3695 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -203,7 +203,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
   // As MKL-DNN description was in NCHW and paddle is expecting NHWC
   platform::MatchShapeToLayout(out, in_layout, out_layout);
 
-  out->set_layout(out_layout);
+  out->set_layout(DataLayout::kNCHW);
   // reset format since the out tensor will be feed to non-MKLDNN OPkernel
   out->set_format(MKLDNNMemoryFormat::undef);
 }
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index 3a40de6988f29..70693a5df2609 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -117,6 +117,9 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
     auto *tran_lod_tensor = out_var->GetMutable<LoDTensor>();
     tran_lod_tensor->set_lod(in_lod_tensor.lod());
     tran_lod_tensor->set_layout(in_lod_tensor.layout());
+#ifdef PADDLE_WITH_MKLDNN
+    tran_lod_tensor->set_format(in_lod_tensor.format());
+#endif
     tran_lod_tensor->ShareDataWith(tensor);
   } else if (in_var.IsType<SelectedRows>()) {
     auto &in_selected_rows = in_var.Get<SelectedRows>();
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 21e28d7ac86d0..881ef30ffe690 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -98,6 +98,7 @@ message AsyncConfig {
   optional int32 send_wait_times = 7 [ default = 1 ];
   optional bool runtime_split_send_recv = 8 [ default = false ];
   optional bool launch_barrier = 9 [ default = true ];
+  optional string heter_worker_device_guard = 10 [ default = 'cpu' ];
 }
 
 message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 29312370b3448..c33d71b3b0a9c 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -179,29 +179,15 @@ message BlockDesc {
   optional int32 forward_block_idx = 5 [ default = -1 ];
 }
 
-// CompatibleInfo is used to determine if a feature is compatible and
-// provides the information.
-message CompatibleInfo {
-  enum Type {
-    COMPATIBLE = 0;
-    DEFINITELY_NOT = 1;
-    POSSIBLE = 2;
-    BUG_FIX = 3;
-    PRECISION_CHANGE = 4;
-  }
-  required string version = 1;
-  required Type type = 2;
-}
-
-// In some cases, Paddle Fluid may perform operator definition iterations,
-// and the operator uses OpCompatibleMap for compatibility testing.
-message OpCompatibleMap {
-  message OpCompatiblePair {
+// In some cases, Paddle may perform operator definition iterations,
+// and the operator uses OpVersionMap for compatibility testing.
+message OpVersion { required int32 version = 1; }
+message OpVersionMap {
+  message OpVersionPair {
     required string op_name = 1;
-    required CompatibleInfo compatible_info = 2;
+    required OpVersion op_version = 2;
   }
-  repeated OpCompatiblePair pair = 1;
-  optional string default_required_version = 2;
+  repeated OpVersionPair pair = 1;
 }
 
 // Please refer to
@@ -210,8 +196,8 @@ message OpCompatibleMap {
 // TODO(panyx0718): A model can have multiple programs. Need a
 // way to distinguish them. Maybe ID or name?
 message ProgramDesc {
-  reserved 2; // For backward compatibility.
+  reserved 2, 3; // For backward compatibility.
   repeated BlockDesc blocks = 1;
   optional Version version = 4;
-  optional OpCompatibleMap op_compatible_map = 3;
+  optional OpVersionMap op_version_map = 5;
 }
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 96952e20c2158..ed2863e8bf798 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1882,9 +1882,9 @@ PDNode *patterns::MultipleQuantize::operator()() {
 PDNode *patterns::QuantizePlacement::operator()(
     const std::unordered_set<std::string> &quantize_enabled_op_types) {
   std::unordered_set<std::string> supported_op_types =
-      std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
-                                       "fc", "matmul", "pool2d", "prior_box",
-                                       "relu", "reshape2", "transpose2"});
+      std::unordered_set<std::string>(
+          {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d",
+           "prior_box", "relu", "reshape2", "transpose2", "fusion_gru"});
   if (!quantize_enabled_op_types.empty()) {
     supported_op_types = quantize_enabled_op_types;
   }
@@ -1894,7 +1894,8 @@ PDNode *patterns::QuantizePlacement::operator()(
 
 PDNode *patterns::Bfloat16Placement::operator()(
     const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
-  std::unordered_set<std::string> supported_op_types{"conv2d"};
+  std::unordered_set<std::string> supported_op_types =
+      std::unordered_set<std::string>({"conv2d", "fusion_gru"});
   if (!bfloat16_enabled_op_types.empty()) {
     supported_op_types = bfloat16_enabled_op_types;
   }
@@ -2280,6 +2281,23 @@ PDNode *patterns::MatmulTransposeReshapePattern::operator()() {
   return reshape_out;
 }
 
+PDNode *patterns::FusionGru::operator()() {
+  auto op = pattern->NewNode(op_repr())->assert_is_op("fusion_gru");
+  auto x = pattern->NewNode(x_repr())->AsInput()->assert_is_op_input(
+      "fusion_gru", "X");
+  auto weight_h = pattern->NewNode(weight_h_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("fusion_gru", "WeightH");
+  auto weight_x = pattern->NewNode(weight_x_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("fusion_gru", "WeightX");
+  auto out = pattern->NewNode(out_repr())
+                 ->AsOutput()
+                 ->assert_is_op_output("fusion_gru", "Hidden");
+  op->LinksFrom({x, weight_h, weight_x}).LinksTo({out});
+  return out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 7116b8a2a6f35..15f6ea1541d58 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1312,6 +1312,21 @@ struct MatmulTransposeReshapePattern : public PatternBase {
   PATTERN_DECL_NODE(reshape_out_xshape);
 };
 
+// fusion_gru op
+// Forward pass for fusion_gru.
+// fusion_gru out is a result of the operator.
+struct FusionGru : public PatternBase {
+  FusionGru(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "fusion_gru") {}
+
+  PDNode* operator()();
+  PATTERN_DECL_NODE(op);
+  PATTERN_DECL_NODE(x);
+  PATTERN_DECL_NODE(weight_h);
+  PATTERN_DECL_NODE(weight_x);
+  PATTERN_DECL_NODE(out);
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 0254b5e757351..58931f3ed3872 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -63,8 +63,9 @@ enum { U8_MAX = 255, S8_MAX = 127 };
 
 void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
                                     std::string input_name, double scale_to_one,
-                                    bool is_unsigned,
-                                    std::string scale_attr_name) const {
+                                    bool is_input_unsigned,
+                                    std::string scale_attr_name, float shift,
+                                    std::string shift_attr_name) const {
   auto inputs = op->Op()->InputNames();
   bool name_found =
       std::find(inputs.begin(), inputs.end(), input_name) != inputs.end();
@@ -72,7 +73,7 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
                     platform::errors::InvalidArgument(
                         "Var(%s) isn't the input of the %s operator.",
                         input_name, op->Op()->Type()));
-  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
+  unsigned max = is_input_unsigned ? U8_MAX : S8_MAX;
   float scale = scale_to_one * max;
 
   // Create quantize output variable
@@ -86,7 +87,8 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
   q_desc.SetOutput("Output",
                    std::vector<std::string>({quantize_out_node->Name()}));
   q_desc.SetAttr("Scale", scale);
-  q_desc.SetAttr("is_negative_input", !is_unsigned);
+  q_desc.SetAttr("Shift", shift);
+  q_desc.SetAttr("is_negative_input", !is_input_unsigned);
 
   q_desc.SetAttr("output_format",
                  Has("data_layout") ? Get<std::string>("data_layout") : "NHWC");
@@ -103,11 +105,13 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
   IR_NODE_LINK_TO(quantize_out_node, op);
 
   if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+  if (!shift_attr_name.empty()) op->Op()->SetAttr(shift_attr_name, shift);
 }
 
 void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
-                                     bool are_unsigned,
-                                     std::string scale_attr_name) const {
+                                     bool are_inputs_unsigned,
+                                     std::string scale_attr_name, float shift,
+                                     std::string shift_attr_name) const {
   auto inputs = op->inputs;
   auto output = op->outputs[0];
   PADDLE_ENFORCE_GE(inputs.size(), 1,
@@ -127,7 +131,7 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
   std::vector<std::string> quantize_out_node_names(inputs.size());
 
   double scale_out = GetScaleValueForNode(output);
-  unsigned max = are_unsigned ? U8_MAX : S8_MAX;
+  unsigned max = are_inputs_unsigned ? U8_MAX : S8_MAX;
   float scale = scale_out * max;
 
   for (size_t i = 0; i < inputs.size(); i++) {
@@ -137,10 +141,11 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
     quantize_out_node_names[i] = quantize_out_nodes[i]->Name();
 
     q_desc.SetAttr("Scale", scale);
+    q_desc.SetAttr("Shift", shift);
     q_desc.SetInput("Input", std::vector<std::string>({inputs[i]->Name()}));
     q_desc.SetOutput("Output",
                      std::vector<std::string>({quantize_out_node_names[i]}));
-    q_desc.SetAttr("is_negative_input", !are_unsigned);
+    q_desc.SetAttr("is_negative_input", !are_inputs_unsigned);
     auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
 
     // link quantize op
@@ -154,6 +159,7 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
   op->Op()->SetInput(input_name, quantize_out_node_names);
 
   if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+  if (!shift_attr_name.empty()) op->Op()->SetAttr(shift_attr_name, shift);
 }
 
 void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
@@ -782,6 +788,62 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
                   quantize_elementwise_add_count);
 }
 
+void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::FusionGru pattern{gpd.mutable_pattern(), name_scope_};
+  pattern();
+
+  int quantize_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize fusion_gru op";
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, pattern);
+
+    // skip if should not be quantized
+    if (!platform::HasOpINT8DataType(op->Op())) {
+      LogQuantizationDisabled(op);
+      return;
+    }
+
+    GET_IR_NODE_FROM_SUBGRAPH(x, x, pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(weight_h, weight_h, pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern);
+
+    if (!AreScalesPresentForNodes(op, {x, weight_h, weight_x})) {
+      LogCannotQuantizeOp(op);
+      return;
+    }
+
+    bool is_x_unsigned{false};
+    auto input_x_scale = GetScaleValueForNode(x, &is_x_unsigned);
+
+    double input_x_shift{128.};
+    if (is_x_unsigned) input_x_shift = 0.;
+
+    QuantizeInput(g, op, x, "X", input_x_scale, is_x_unsigned, "Scale_data",
+                  input_x_shift, "Shift_data");
+
+    auto weight_scale_tensor = GetScaleTensorForNode(weight_x);
+    EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data<double>(),
+                                     weight_scale_tensor.numel(), 1};
+    eigen_tensor *= static_cast<double>(S8_MAX);
+    std::vector<float> scale_weights{
+        weight_scale_tensor.data<double>(),
+        weight_scale_tensor.data<double>() + weight_scale_tensor.numel()};
+
+    op->Op()->SetAttr("Scale_weights", scale_weights);
+    // return fp32 data
+    op->Op()->SetAttr("force_fp32_output", true);
+
+    ++quantize_count;
+  };
+  gpd(graph, handler);
+  AddStatis(quantize_count);
+
+  PrettyLogDetail("---    quantized %d fusion_gru ops", quantize_count);
+}
+
 void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Quantizing the graph.";
   PADDLE_ENFORCE_NOT_NULL(
@@ -801,6 +863,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   QuantizeReshape(graph);
   QuantizeMatmul(graph);
   QuantizeElementwiseAdd(graph);
+  QuantizeFusionGru(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index bd87b31b781ec..0d4c424901081 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -49,31 +49,26 @@ class CPUQuantizePass : public FusePassBase {
   void ApplyImpl(ir::Graph* graph) const override;
 
   void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
-
   void QuantizeFc(Graph* graph) const;
-
   void QuantizePool(Graph* graph) const;
-
   void QuantizeConcat(Graph* graph) const;
-
   void QuantizePriorBox(Graph* graph) const;
-
   void QuantizeTranspose(Graph* graph) const;
-
   void QuantizeReshape(Graph* graph) const;
-
   void QuantizeMatmul(Graph* graph) const;
-
   void QuantizeElementwiseAdd(Graph* graph) const;
+  void QuantizeFusionGru(Graph* graph) const;
 
   void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
-                     double scale_to_one, bool is_unsigned,
-                     std::string scale_attr_name = "") const;
+                     double scale_to_one, bool is_input_unsigned,
+                     std::string scale_attr_name = "", float shift = 0.0,
+                     std::string shift_attr_name = "") const;
 
   // quantize all inputs of given name with the same (minimum) scale
   void QuantizeInputs(Graph* g, Node* op, std::string input_name,
-                      bool are_unsigned,
-                      std::string scale_attr_name = "") const;
+                      bool are_inputs_unsigned,
+                      std::string scale_attr_name = "", float shift = 0.0,
+                      std::string shift_attr_name = "") const;
 
   void DequantizeOutput(Graph* g, Node* op, Node* output,
                         std::string output_name, double scale_to_one,
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index a66e9f0e93898..65be404dfef2f 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -91,6 +91,16 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetAttr("Scale_x", 1.0f);
     op->SetAttr("Scale_y", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
+  } else if (type == "fusion_gru") {
+    op->SetInput("X", {inputs[0]});
+    op->SetInput("Bias", {inputs[1]});
+    op->SetInput("WeightX", {inputs[2]});
+    op->SetInput("WeightH", {inputs[3]});
+    op->SetOutput("Hidden", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+    op->SetAttr("Scale_data", 1.0f);
+    op->SetAttr("Shift_data", 0.0f);
+    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
   }
 }
 
@@ -389,6 +399,77 @@ TEST(CpuQuantizePass, transpose) {
                     quant_count, dequant_count, added_nodes_count, 2.0f * 127);
 }
 
+static const std::initializer_list<std::string> variable_names_fusion_gru = {
+    "x", "wx", "wh", "b", "h"};
+
+// x->Fusion_gru->h
+ProgramDesc BuildProgramDescFusionGru() {
+  ProgramDesc prog;
+  for (auto& v : variable_names_fusion_gru) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    if (v.find("wx") == 0 || v.find("wh") == 0 || v.find("b") == 0) {
+      var->SetPersistable(true);
+    }
+  }
+  // SetOp reads fusion_gru inputs in the order {X, Bias, WeightX, WeightH}.
+  SetOp(&prog, "fusion_gru", "Fusion_gru", {"x", "b", "wx", "wh"}, {"h"}, true,
+        "int8");
+
+  return prog;
+}
+
+void MainTestFusionGru(const ProgramDesc& prog, int gru_count, int quant_count,
+                       int dequant_count, int added_nodes_count, float scale,
+                       float shift) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int original_nodes_num, current_nodes_num;
+  PreparePass(&graph, prog, variable_names_fusion_gru, &original_nodes_num,
+              &current_nodes_num);
+
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int gru_nodes_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "fusion_gru") {
+        gru_nodes_count++;
+
+        auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name"));
+        EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_data")), scale)
+            << "Scale_data for node '" + op_name + "'.";
+        EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Shift_data")), shift)
+            << "Shift_data for node '" + op_name + "'.";
+        EXPECT_EQ(BOOST_GET_CONST(std::vector<float>,
+                                  op->GetAttr("Scale_weights"))[0],
+                  scale)
+            << "Scale_weights for node '" + op_name + "'.";
+        EXPECT_EQ(BOOST_GET_CONST(bool, op->GetAttr("force_fp32_output")), true)
+            << "force_fp32_output for node '" + op_name + "'.";
+      } else if (op->Type() == "quantize") {
+        quantize_nodes_count++;
+      } else if (op->Type() == "dequantize") {
+        dequantize_nodes_count++;
+      }
+    }
+  }
+  EXPECT_EQ(gru_nodes_count, gru_count);
+  EXPECT_EQ(quantize_nodes_count, quant_count);
+  EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
+}
+
+TEST(CpuQuantizePass, fusion_gru) {
+  // x->Fusion_gru->h
+  int gru_count = 1;
+  int quant_count = 1;
+  int dequant_count = 0;
+  // 1 Quant + 1 IN + 0 DeQuant + 0 OUT
+  int added_nodes_count = 1 + 1 + 0 + 0;
+  MainTestFusionGru(BuildProgramDescFusionGru(), gru_count, quant_count,
+                    dequant_count, added_nodes_count, 2. * 127, 128.);
+}
+
 static const std::initializer_list<std::string> variable_names_reshape = {
     "a", "w1", "b", "c", "d", "e", "f"};
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
index 54ab244a99bd4..d6146f264ab8d 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -76,6 +76,8 @@ void CPUQuantizeSquashPass::DequantQuantSquash(
         BOOST_GET_CONST(float, dequant_op->Op()->GetAttr("Scale"));
     float quant_scale =
         BOOST_GET_CONST(float, quant_op->Op()->GetAttr("Scale"));
+    float dequant_shift = dequant_op->Op()->GetAttrIfExists<float>("Shift");
+    float quant_shift = quant_op->Op()->GetAttrIfExists<float>("Shift");
     PADDLE_ENFORCE_NE(
         nodes_keep_counter->find(dequant_out), nodes_keep_counter->end(),
         platform::errors::NotFound("The dequant output node is not found."));
@@ -83,7 +85,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash(
     // check if dequantize op should be kept or removed, decrease the counter
     bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;
 
-    if (dequant_scale == quant_scale) {
+    if (dequant_scale == quant_scale && dequant_shift == quant_shift) {
       // squash dequantize-quantize to nothing
       auto quant_out_var_name = quant_out->Name();
       auto next_op_inputs = next_op_desc->InputNames();
@@ -110,7 +112,9 @@ void CPUQuantizeSquashPass::DequantQuantSquash(
       desc.SetInput("Input", std::vector<std::string>({dequant_in->Name()}));
       desc.SetOutput("Output", std::vector<std::string>({quant_out->Name()}));
       desc.SetAttr("Scale_in", dequant_scale);
+      desc.SetAttr("Shift_in", dequant_shift);
       desc.SetAttr("Scale_out", quant_scale);
+      desc.SetAttr("Shift_out", quant_shift);
 
       auto requant_op = g->CreateOpNode(&desc);
 
@@ -293,6 +297,7 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const {
         }));
     auto* first_quant_out = first_quant_op->outputs[0];
     float scale = first_quant_op->Op()->GetAttrIfExists<float>("Scale");
+    float shift = first_quant_op->Op()->GetAttrIfExists<float>("Shift");
 
     PADDLE_ENFORCE_NE(scale, 0,
                       platform::errors::InvalidArgument(
@@ -302,7 +307,8 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const {
       auto quant_op = prev_out->outputs[iter];
       if (quant_op->IsOp() && quant_op->Op()->Type() == "quantize" &&
           quant_op->id() != first_quant_op->id() &&
-          quant_op->Op()->GetAttrIfExists<float>("Scale") == scale) {
+          quant_op->Op()->GetAttrIfExists<float>("Scale") == scale &&
+          quant_op->Op()->GetAttrIfExists<float>("Shift") == shift) {
         auto quant_out = quant_op->outputs[0];
         auto last_op = quant_out->outputs[0];
 
diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc
index 826e14dedb76d..93826fc97b196 100644
--- a/paddle/fluid/framework/op_compatible_info.cc
+++ b/paddle/fluid/framework/op_compatible_info.cc
@@ -182,40 +182,5 @@ OpCompatibleType OpCompatibleMap::IsRequireMiniVersion(
   }
 }
 
-bool OpCompatibleMap::ConvertToProto(proto::OpCompatibleMap* desc) const {
-  desc->Clear();
-  desc->set_default_required_version(default_required_version_);
-  for (auto pair : op_compatible_map_) {
-    const CompatibleInfo& info = pair.second;
-    auto* pair_desc = desc->add_pair();
-    pair_desc->set_op_name(pair.first);
-    auto* info_desc = pair_desc->mutable_compatible_info();
-    info_desc->set_version(info.required_version_);
-    info_desc->set_type(
-        static_cast<proto::CompatibleInfo_Type>(info.compatible_type_));
-  }
-  return true;
-}
-
-bool OpCompatibleMap::ReadFromProto(const proto::OpCompatibleMap& desc) {
-  std::string version = desc.default_required_version();
-  if (version.empty()) {
-    LOG(INFO) << "The default operator required version is missing."
-                 " Please update the model version.";
-    return false;
-  }
-  op_compatible_map_.clear();
-  default_required_version_ = desc.default_required_version();
-  for (int i = 0; i < desc.pair_size(); ++i) {
-    const auto& pair_desc = desc.pair(i);
-    auto info_desc = pair_desc.compatible_info();
-    CompatibleInfo info(info_desc.version(),
-                        static_cast<OpCompatibleType>(info_desc.type()));
-    std::pair<std::string, CompatibleInfo> pair(pair_desc.op_name(), info);
-    op_compatible_map_.insert(pair);
-  }
-  return true;
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h
index 01fbdef99cbbc..6f86b8b64ed21 100644
--- a/paddle/fluid/framework/op_compatible_info.h
+++ b/paddle/fluid/framework/op_compatible_info.h
@@ -58,14 +58,6 @@ class OpCompatibleMap {
   OpCompatibleType IsRequireMiniVersion(std::string op_name,
                                         std::string current_version) const;
 
-  // Convert the entire OpCompatibleMap to Proto, which can be serialized
-  // to the model file as part of the ProgramDesc.
-  bool ConvertToProto(proto::OpCompatibleMap* desc) const;
-
-  // Read and reset the entire object from proto, which can be read from
-  // the model file as part of the program.
-  bool ReadFromProto(const proto::OpCompatibleMap& desc);
-
   const std::string& GetDefaultRequiredVersion() const {
     return default_required_version_;
   }
diff --git a/paddle/fluid/framework/op_compatible_info_test.cc b/paddle/fluid/framework/op_compatible_info_test.cc
index 98f3f5071ad28..cf210ed8ab2d5 100644
--- a/paddle/fluid/framework/op_compatible_info_test.cc
+++ b/paddle/fluid/framework/op_compatible_info_test.cc
@@ -28,12 +28,6 @@ TEST(test_op_compatible_info, test_op_compatible) {
   auto comp_map = OpCompatibleMap();
   comp_map.InitOpCompatibleMap();
 
-  // Ensure save-load consistency.
-  auto program_desc = ProgramDesc();
-  proto::OpCompatibleMap* proto_map = program_desc.OpCompatibleMap();
-  comp_map.ConvertToProto(proto_map);
-  comp_map.ReadFromProto(*proto_map);
-
   ASSERT_NE(comp_map.GetDefaultRequiredVersion(), std::string());
   ASSERT_NE(comp_map.GetOpCompatibleInfo("sequence_pad").required_version_,
             std::string());
diff --git a/paddle/fluid/framework/op_version_proto.cc b/paddle/fluid/framework/op_version_proto.cc
new file mode 100644
index 0000000000000..696e322380740
--- /dev/null
+++ b/paddle/fluid/framework/op_version_proto.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_version_proto.h"
diff --git a/paddle/fluid/framework/op_version_proto.h b/paddle/fluid/framework/op_version_proto.h
new file mode 100644
index 0000000000000..1a876f43d2f00
--- /dev/null
+++ b/paddle/fluid/framework/op_version_proto.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include "paddle/fluid/framework/framework.pb.h"
+
+namespace paddle {
+namespace framework {
+namespace compatible {
+namespace pb {
+
+// Wrapper around a proto::OpVersion message. It stores the raw pointer it is
+// given and never deletes it.
+class OpVersion {
+ public:
+  explicit OpVersion(proto::OpVersion* desc) : desc_{desc} {}
+  // Stores `version` in the wrapped message.
+  void SetVersionID(uint32_t version) { desc_->set_version(version); }
+
+ private:
+  proto::OpVersion* desc_;
+};
+
+// Map-like wrapper around a proto::OpVersionMap message, which holds
+// (op_name, op_version) pairs. The raw pointer is stored and never deleted.
+class OpVersionMap {
+ public:
+  explicit OpVersionMap(proto::OpVersionMap* desc) : desc_{desc} {}
+  // Returns a wrapper for the first pair whose op_name equals `key`, found by
+  // linear scan. If there is none, a new pair named `key` is appended first.
+  OpVersion operator[](const std::string& key) {
+    for (int i = 0; i < desc_->pair_size(); ++i) {
+      if (desc_->pair(i).op_name() == key) {
+        return OpVersion(desc_->mutable_pair(i)->mutable_op_version());
+      }
+    }
+    auto* pair = desc_->add_pair();
+    pair->set_op_name(key);
+    return OpVersion(pair->mutable_op_version());
+  }
+
+ private:
+  proto::OpVersionMap* desc_;
+};
+
+}  // namespace pb
+}  // namespace compatible
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/op_version_registry.cc b/paddle/fluid/framework/op_version_registry.cc
index 11b7224e68340..9a67c160f0233 100644
--- a/paddle/fluid/framework/op_version_registry.cc
+++ b/paddle/fluid/framework/op_version_registry.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h
index fea043a0ff311..5ddaf1bd8d8ce 100644
--- a/paddle/fluid/framework/op_version_registry.h
+++ b/paddle/fluid/framework/op_version_registry.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ limitations under the License. */
 
 #include <boost/any.hpp>
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/op_version_proto.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -159,12 +160,16 @@ class OpVersionRegistrar {
     op_version_map_.insert({op_type, OpVersion()});
     return op_version_map_[op_type];
   }
+  // Read-only access to every registered (op type -> OpVersion) entry.
+  // Const-qualified, like GetVersionID, because it does not modify the map.
+  const std::unordered_map<std::string, OpVersion>& GetVersionMap() const {
+    return op_version_map_;
+  }
   uint32_t GetVersionID(const std::string& op_type) const {
     auto it = op_version_map_.find(op_type);
     if (it == op_version_map_.end()) {
       return 0;
     }
-
     return it->second.GetVersionID();
   }
 
@@ -175,6 +180,16 @@ class OpVersionRegistrar {
   OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete;
 };
 
+// Writes the version id of every entry in `src` into `dst` under the same
+// key. `dst` is dereferenced unconditionally, so it must not be null.
+inline void SaveOpVersions(
+    const std::unordered_map<std::string, OpVersion>& src,
+    pb::OpVersionMap* dst) {
+  for (const auto& pair : src) {
+    (*dst)[pair.first].SetVersionID(pair.second.GetVersionID());
+  }
+}
+
 class OpVersionComparator {
  public:
   virtual bool operator()() = 0;
diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc
index d6b18751cefe5..2b173c9571588 100644
--- a/paddle/fluid/framework/op_version_registry_test.cc
+++ b/paddle/fluid/framework/op_version_registry_test.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index d37a16a3e7d9f..0faa870f50565 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -39,8 +39,8 @@ proto::ProgramDesc *ProgramDesc::Proto() {
   return &desc_;
 }
 
-proto::OpCompatibleMap *ProgramDesc::OpCompatibleMap() {
-  return desc_.mutable_op_compatible_map();
+proto::OpVersionMap *ProgramDesc::OpVersionMap() {
+  return desc_.mutable_op_version_map();
 }
 
 int64_t ProgramDesc::Version() const { return desc_.version().version(); }
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index 5cafc9111da67..8b1aac95fc288 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -58,7 +58,7 @@ class ProgramDesc {
 
   proto::ProgramDesc *Proto();
 
-  proto::OpCompatibleMap *OpCompatibleMap();
+  proto::OpVersionMap *OpVersionMap();
 
   int64_t Version() const;
 
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index a073dbd733f0b..4fe01aff79e52 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -38,6 +38,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
 
   dst->Resize(src.dims());
   dst->set_layout(src.layout());
+#ifdef PADDLE_WITH_MKLDNN
+  dst->set_format(src.format());
+#endif
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
   auto dst_ptr = dst->mutable_data(dst_place, src.type());
@@ -237,6 +240,9 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   src.check_memory_size();
   dst->Resize(src.dims());
   dst->set_layout(src.layout());
+#ifdef PADDLE_WITH_MKLDNN
+  dst->set_format(src.format());
+#endif
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
   auto dst_ptr = dst->mutable_data(dst_place, src.type());
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index f85e1f6511656..6d35d3395ba60 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -88,7 +88,7 @@ if(NOT APPLE AND NOT WIN32)
   set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
   # check symbol hidden
   FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
-    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    "execute_process(COMMAND sh -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
     " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n"
     "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
     "  message(FATAL_ERROR \"Check symbol failed.\")\n"
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index e78d5ef017b7f..2c454893a6203 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -252,7 +252,11 @@ void LiteSubgraphPass::SetUpEngine(
   } else if (use_xpu) {
     target_type = TARGET(kXPU);
   } else {
+#ifdef PADDLE_WITH_ARM
+    target_type = TARGET(kARM);
+#else
     target_type = TARGET(kX86);
+#endif
   }
 
   paddle::lite_api::PrecisionType precision_type =
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 6c68b385bcbc0..98bee2d4bb471 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -192,11 +192,6 @@ bool AnalysisPredictor::PrepareProgram(
     // If config_.ir_optim() is False, parameters is loaded in LoadParameters(),
     // still need to create other persistable variables.
     // So in both case, create persistable variables at first.
-    if (!CheckOperatorCompatible()) {
-      LOG(WARNING) << "WARNING: Results may be DIFF! "
-                      "Please use the corresponding version of the model and "
-                      "prediction library, and do not use the develop branch.";
-    }
     executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
 
     // if enable_ir_optim_ is false,
@@ -998,40 +993,6 @@ std::string AnalysisPredictor::GetSerializedProgram() const {
   return inference_program_->Proto()->SerializeAsString();
 }
 
-bool AnalysisPredictor::CheckOperatorCompatible() {
-  if (!inference_program_) {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Inference program version check failed because the program does not "
-        "exist."));
-    return false;
-  }
-  bool res = true;
-  op_compatible_map_.ReadFromProto(*inference_program_->OpCompatibleMap());
-  const auto &version = framework::DumpVersion(framework::kCurProgramVersion);
-  LOG(INFO) << "MODEL VERSION: "
-            << framework::DumpVersion(inference_program_->Version());
-  LOG(INFO) << "PREDICTOR VERSION: " << version;
-  std::set<std::string> op_types;
-  for (size_t i = 0; i < inference_program_->Size(); ++i) {
-    const auto &block = inference_program_->Block(i);
-    for (const auto *op : block.AllOps()) {
-      op_types.insert(op->Type());
-    }
-  }
-  for (const auto type : op_types) {
-    auto compatible_type =
-        op_compatible_map_.IsRequireMiniVersion(type, version);
-    if (compatible_type != framework::OpCompatibleType::compatible) {
-      if (!framework::kCurProgramVersion) {
-        LOG(WARNING) << " - Version incompatible ("
-                     << static_cast<int>(compatible_type) << ") " << type;
-      }
-      res = false;
-    }
-  }
-  return res;
-}
-
 // Add SaveOptimModel
 void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
   // save model
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index c4a7173b0104b..269f2fd80bb47 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -335,13 +335,6 @@ class AnalysisPredictor : public PaddlePredictor {
   /// AnalysisPredictor::ZeroCopyRun() now.
   ///
   void MkldnnPostReset();
-  ///
-  /// \brief Compute compatibility based on model version information and
-  /// operator version information
-  ///
-  /// \return Compatible information
-  ///
-  bool CheckOperatorCompatible();
 
 #if PADDLE_WITH_TENSORRT
   ///
diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh
index b6b7d1f20baf7..a0f64796576c8 100755
--- a/paddle/fluid/inference/check_symbol.sh
+++ b/paddle/fluid/inference/check_symbol.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 
 lib=$1
 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc
index 33661594b926f..7b909b3f84205 100644
--- a/paddle/fluid/inference/lite/tensor_utils.cc
+++ b/paddle/fluid/inference/lite/tensor_utils.cc
@@ -46,6 +46,7 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) {
   switch (type) {
     case TargetType::kHost:
     case TargetType::kX86:
+    case TargetType::kARM:
       return platform::CPUPlace();
     case TargetType::kCUDA:
       return platform::CUDAPlace(id);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index ac0a04b9a116d..4a386ac1d81c5 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -164,6 +164,7 @@ class OpConverter {
       const std::unordered_set<std::string>& parameters,
       const std::vector<std::string>& outputs, TensorRTEngine* engine) {
     engine->InitNetwork();
+    bool all_dynamic_shape_set = true;
     for (auto& input : inputs) {
       if (parameters.count(input)) continue;
       auto* var = block_desc->FindVar(input);
@@ -181,6 +182,13 @@ class OpConverter {
         auto max_input_shape = engine->max_input_shape()[input];
         auto optim_input_shape = engine->optim_input_shape()[input];
         size_t ranks = min_input_shape.size();
+        if (ranks == 0) {
+          all_dynamic_shape_set = false;
+          LOG(INFO) << "trt input [" << input.c_str()
+                    << "] dynamic shape info not set, please check and retry.";
+          // keep checking the remaining inputs so all are reported
+          continue;
+        }
         std::vector<int64_t> input_shape;
         input_shape.push_back(-1);
         for (size_t i = 1; i < ranks; i++) {
@@ -207,6 +215,10 @@ class OpConverter {
             Vec2TRT_Dims(var_shape, input));
       }
     }
+    PADDLE_ENFORCE_EQ(all_dynamic_shape_set, true,
+                      platform::errors::InvalidArgument(
+                          "some trt inputs dynamic shape info not set, "
+                          "check the INFO log above for more details."));
     framework::proto::BlockDesc* block_proto = block_desc->Proto();
     ConvertBlock(*block_proto, parameters, scope, engine);
     for (auto& output : outputs) {
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 723e989be8de8..252bca2d5522e 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -65,6 +65,7 @@ DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
 DEFINE_bool(warmup, false,
             "Use warmup to calculate elapsed_time more accurately. "
             "To reduce CI time, it sets false in default.");
+DEFINE_int32(warmup_iters, 1, "Number of batches to process during warmup.");
 
 DEFINE_bool(enable_profile, false, "Turn on profiler for fluid");
 DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance.");
@@ -364,15 +365,28 @@ void PredictionWarmUp(PaddlePredictor *predictor,
   if (FLAGS_zero_copy) {
     ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
   }
-  outputs->resize(1);
+  int iterations = 1;
+  if (FLAGS_warmup_iters > 1)
+    iterations = std::min(FLAGS_warmup_iters, static_cast<int>(inputs.size()));
+  outputs->resize(iterations);
   Timer warmup_timer;
-  warmup_timer.tic();
+  double elapsed_time = 0;
   if (!FLAGS_zero_copy) {
-    predictor->Run(inputs[0], &(*outputs)[0], batch_size);
+    for (int i = 0; i < iterations; ++i) {
+      warmup_timer.tic();
+      predictor->Run(inputs[i], &(*outputs)[i], batch_size);
+      elapsed_time += warmup_timer.toc();
+    }
   } else {
-    predictor->ZeroCopyRun();
+    for (int i = 0; i < iterations; ++i) {
+      warmup_timer.tic();
+      predictor->ZeroCopyRun();
+      elapsed_time += warmup_timer.toc();
+    }
   }
-  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1, data_type);
+  auto batch_latency = elapsed_time / iterations;
+  PrintTime(batch_size, 1, num_threads, tid, batch_latency, iterations,
+            data_type);
   if (FLAGS_enable_profile) {
     paddle::platform::ResetProfiler();
   }
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 53e6f4aa6e41b..5fa8f6bab8cca 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -144,4 +144,5 @@ cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_o
 
 if(WITH_MKLDNN)
 include(mkldnn/inplace_op_tests.cmake)
+include(mkldnn/nhwc_op_tests.cmake)
 endif()
diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc
index 1903b9e30d800..26ad09cc265f1 100644
--- a/paddle/fluid/operators/activation_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc
@@ -41,7 +41,7 @@ struct CudnnActivationFunctor {
     TensorDescriptor x_desc, out_desc;
     x_desc.set(x);
     out_desc.set(GET_DATA_SAFELY(out, "Output", "Out", "CudnnActivation"));
-    PADDLE_ENFORCE(platform::dynload::cudnnActivationForward(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationForward(
         ctx_.cudnn_handle(), act_desc.desc(),
         platform::CudnnDataType<T>::kOne(), x_desc.desc(), x.data<T>(),
         platform::CudnnDataType<T>::kZero(), out_desc.desc(),
@@ -67,7 +67,7 @@ struct CudnnActivationGradFunctor {
     out_desc.set(out);
     dout_desc.set(dout);
     dx_desc.set(GET_DATA_SAFELY(dx, "Output", "X@GRAD", "CudnnActivationGrad"));
-    PADDLE_ENFORCE(platform::dynload::cudnnActivationBackward(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationBackward(
         ctx_.cudnn_handle(), act_desc.desc(),
         platform::CudnnDataType<T>::kOne(), out_desc.desc(), out.data<T>(),
         dout_desc.desc(), dout.data<T>(), x_desc.desc(), x.data<T>(),
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
index fca3c531b4055..8bd76a9886c62 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
@@ -103,7 +103,7 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
         .AddCustomChecker([](float decr_ratio) {
           PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
                             platform::errors::InvalidArgument(
-                                "'incr_ratio' should be between 0 and 1, but "
+                                "'decr_ratio' should be between 0 and 1, but "
                                 "the received is %f",
                                 decr_ratio));
         });
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 3cb3f1d48bfa7..4bf4ba1120df0 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -117,7 +117,8 @@ void BeamSearchDecodeFunctor::apply() const {
 
 template <>
 void BeamSearchDecodeFunctor::apply<bool>() const {
-  PADDLE_THROW("beam search decode op does not support bool!");
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "beam search decode op does not support bool!"));
 }
 
 class BeamSearchDecodeOp : public framework::OperatorBase {
diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc
index b742b4c0deea8..b00604155d67e 100644
--- a/paddle/fluid/operators/bilateral_slice_op.cc
+++ b/paddle/fluid/operators/bilateral_slice_op.cc
@@ -50,20 +50,25 @@ class BilateralSliceOp : public framework::OperatorWithKernel {
     int64_t input_chans = input_dims[1];
 
     int64_t output_chans;
-    if (has_offset) {
-      PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0,
-                        platform::errors::InvalidArgument(
-                            "Slicing with affine offset, coefficients grid "
-                            "should have n_out*(n_in+1) channels, but got %d",
-                            coeffs_chans));
-      output_chans = coeffs_chans / (input_chans + 1);
+    if ((!ctx->IsRuntime()) && ((coeffs_chans < 0) || (input_chans < 0))) {
+      output_chans = -1;
     } else {
-      PADDLE_ENFORCE_EQ((coeffs_chans % input_chans), 0,
-                        platform::errors::InvalidArgument(
-                            "Slicing without affine offset, coefficients grid "
-                            "should have n_out*n_in channels, but got %d .",
-                            coeffs_chans));
-      output_chans = coeffs_chans / input_chans;
+      if (has_offset) {
+        PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0,
+                          platform::errors::InvalidArgument(
+                              "Slicing with affine offset, coefficients grid "
+                              "should have n_out*(n_in+1) channels, but got %d",
+                              coeffs_chans));
+        output_chans = coeffs_chans / (input_chans + 1);
+      } else {
+        PADDLE_ENFORCE_EQ(
+            (coeffs_chans % input_chans), 0,
+            platform::errors::InvalidArgument(
+                "Slicing without affine offset, coefficients grid "
+                "should have n_out*n_in channels, but got %d .",
+                coeffs_chans));
+        output_chans = coeffs_chans / input_chans;
+      }
     }
 
     std::vector<int64_t> output_dims;
diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h
index bee3ab37448e8..555130fe85268 100644
--- a/paddle/fluid/operators/chunk_eval_op.h
+++ b/paddle/fluid/operators/chunk_eval_op.h
@@ -146,7 +146,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
       tag_end = -1;
       tag_single = -1;
     } else {
-      PADDLE_THROW("Unknown chunk scheme.");
+      PADDLE_THROW(platform::errors::InvalidArgument("Unknown chunk scheme."));
     }
     other_chunk_type = num_chunk_types = context.Attr<int>("num_chunk_types");
     excluded_chunk_types.insert(
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index b85e740ada9bd..b8ecbe8ab4a9f 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <set>
+
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -70,6 +72,23 @@ class WhileOp : public framework::OperatorBase {
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
     auto *program = block->Program();
+    bool is_test = Attr<bool>("is_test");
+
+    std::set<std::string> no_copy_var_names;
+    if (!is_test) {
+      const std::vector<framework::OpDesc *> &all_ops = block->AllOps();
+      for (const framework::OpDesc *op : all_ops) {
+        const framework::VariableNameMap &input_var_names = op->Inputs();
+        const framework::VariableNameMap &output_var_names = op->Outputs();
+        for (auto &ipt : input_var_names) {
+          for (const std::string &var_name : ipt.second) {
+            if (StrInVaraiableNameMap(var_name, output_var_names)) {
+              no_copy_var_names.insert(var_name);
+            }
+          }
+        }
+      }
+    }
 
     auto step_scopes =
         scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
@@ -89,7 +108,6 @@ class WhileOp : public framework::OperatorBase {
                           "The Output(StepScope) of WhileOp should be empty."));
 
     bool cond_data = GetCondData(cond);
-    bool is_test = Attr<bool>("is_test");
     auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
     VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
 
@@ -98,8 +116,32 @@ class WhileOp : public framework::OperatorBase {
       while (cond_data) {
         auto &current_scope = scope.NewScope();
         step_scopes->push_back(&current_scope);
+
+        std::vector<std::string> rename_vars;
+        for (const std::string &input_var_name : Inputs(kX)) {
+          if (no_copy_var_names.find(input_var_name) ==
+              no_copy_var_names.end()) {
+            std::string input_var_rename = input_var_name + kSuffix;
+            framework::Variable *input_var = scope.FindVar(input_var_name);
+            if (input_var->IsType<framework::LoDTensor>()) {
+              rename_vars.push_back(input_var_rename);
+              auto input_var_tensor = input_var->Get<LoDTensor>();
+              auto *rename_input_var_tensor =
+                  current_scope.Var(input_var_rename)->GetMutable<LoDTensor>();
+              framework::TensorCopy(input_var_tensor, dev_place,
+                                    rename_input_var_tensor);
+              rename_input_var_tensor->set_lod(input_var_tensor.lod());
+            }
+          }
+        }
         executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
                                     true);
+
+        for (auto &var_rename : rename_vars) {
+          std::string input_var_name =
+              var_rename.substr(0, var_rename.size() - strlen(kSuffix));
+          current_scope.Rename(var_rename, input_var_name);
+        }
         cond_data =
             GetCondData(scope.FindVar(Input(kCondition))->Get<LoDTensor>());
       }
@@ -312,6 +354,10 @@ class WhileGradOp : public framework::OperatorBase {
         //    continue;
         //  }
 
+        auto var_iter =
+            std::find(outside_og_names.begin(), outside_og_names.end(),
+                      pg_ig_names[param_id]);
+
         // zero gradient variable in step 0
         if (cur_scope_iter == step_scopes->rbegin()) {
           auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
@@ -326,7 +372,8 @@ class WhileGradOp : public framework::OperatorBase {
                         "or LoDTensor, but the received var[%s] is %s.",
                         inside_grad_name, framework::ToTypeName(var->Type())));
 
-          if (var->IsType<LoDTensor>()) {
+          if ((var_iter == outside_og_names.end()) &&
+              var->IsType<LoDTensor>()) {
             auto &inside_tensor = var->Get<framework::LoDTensor>();
             framework::AttributeMap attrs;
             attrs["dtype"] = inside_tensor.type();
@@ -343,13 +390,18 @@ class WhileGradOp : public framework::OperatorBase {
                 ->set_lod(inside_tensor.lod());
           }
         }
-        auto new_inside_name = cur_scope.Rename(inside_grad_name);
-        auto sum_op = framework::OpRegistry::CreateOp(
-            "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}},
-            {{"Out", {pg_ig_names[param_id]}}},
-            framework::AttributeMap{{"use_mkldnn", {false}}});
-        sum_op->Run(cur_scope, dev_place);
-        cur_scope.Rename(new_inside_name, inside_grad_name);
+        auto var_outside = scope.FindVar(pg_ig_names[param_id]);
+        if ((var_iter == outside_og_names.end()) ||
+            ((var_iter != outside_og_names.end()) &&
+             var_outside->IsType<framework::LoDTensorArray>())) {
+          auto new_inside_name = cur_scope.Rename(inside_grad_name);
+          auto sum_op = framework::OpRegistry::CreateOp(
+              "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}},
+              {{"Out", {pg_ig_names[param_id]}}},
+              framework::AttributeMap{{"use_mkldnn", {false}}});
+          sum_op->Run(cur_scope, dev_place);
+          cur_scope.Rename(new_inside_name, inside_grad_name);
+        }
       }
       dev_ctx.Wait();
       const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
index a3fe71f3ec8b3..b8e9f9f36ac81 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -232,5 +232,16 @@ bool GetCondData(const framework::LoDTensor &cond) {
   return cpu_cond->data<bool>()[0];
 }
 
+// Returns true if `name` occurs in any name list stored in `var_names`.
+bool StrInVaraiableNameMap(const std::string &name,
+                           const framework::VariableNameMap &var_names) {
+  return std::any_of(
+      var_names.begin(), var_names.end(),
+      [&name](const framework::VariableNameMap::value_type &entry) {
+        const auto &names = entry.second;
+        return std::find(names.begin(), names.end(), name) != names.end();
+      });
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h
index d2e9953e6477a..8b4a14570b1ef 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.h
+++ b/paddle/fluid/operators/controlflow/while_op_helper.h
@@ -38,6 +38,7 @@ static constexpr char kX[] = "X";
 static constexpr char kXGRAD[] = "X@GRAD";
 static constexpr char kOutputs[] = "Out";
 static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
+static constexpr char kSuffix[] = "@TMP_COPY";
 
 void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
     const framework::ProgramDesc &program, int block_id,
@@ -50,5 +51,8 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
 
 bool GetCondData(const framework::LoDTensor &cond);
 
+bool StrInVaraiableNameMap(const std::string &,
+                           const framework::VariableNameMap &);
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index 82954bc109a74..31f0c26a3f3a1 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -25,7 +26,6 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTM");
-    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTM");
     OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTM");
     OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTM");
 
@@ -122,7 +122,13 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("W",
              "(Tensor) the learnable hidden-hidden weights."
              " The shape is (N), where N is total weight size of the LSTM. "
-             " cudnn concatenate all the weight to one Tensor");
+             " cudnn concatenate all the weight to one Tensor")
+        .AsDispensable();
+    AddInput("WeightList",
+             "(vector<Tensor>), stores weight and bias data when the weight "
+             "use the list format. ")
+        .AsDispensable()
+        .AsDuplicable();
     AddInput("SequenceLength",
              "(Tensor) When the input data is padding, "
              "set this parameter. This parameter represents "
@@ -216,7 +222,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTMGrad");
-    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTMGrad");
     OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTMGrad");
     OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTMGrad");
 
@@ -228,7 +233,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
     };
 
     SetOutGradDim("Input");
-    SetOutGradDim("W");
+    if (ctx->HasInputs("WeightList")) {
+      ctx->SetOutputsDim(framework::GradVarName("WeightList"),
+                         ctx->GetInputsDim("WeightList"));
+    }
     SetOutGradDim("InitH");
     SetOutGradDim("InitC");
   }
@@ -251,7 +259,9 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("Input", this->Input("Input"));
     op->SetInput("InitH", this->Input("InitH"));
     op->SetInput("InitC", this->Input("InitC"));
-    op->SetInput("W", this->Input("W"));
+    if (this->HasInput("WeightList")) {
+      op->SetInput("WeightList", this->Input("WeightList"));
+    }
     if (this->HasInput("SequenceLength")) {
       op->SetInput("SequenceLength", this->Input("SequenceLength"));
     }
@@ -262,8 +272,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput(framework::GradVarName("LastC"), this->OutputGrad("LastC"));
     op->SetInput(framework::GradVarName("LastH"), this->OutputGrad("LastH"));
 
+    if (this->HasInput("WeightList")) {
+      op->SetOutput(framework::GradVarName("WeightList"),
+                    this->InputGrad("WeightList", false));
+    }
+
     op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
-    op->SetOutput(framework::GradVarName("W"), this->InputGrad("W"));
     op->SetOutput(framework::GradVarName("InitH"), this->InputGrad("InitH"));
     op->SetOutput(framework::GradVarName("InitC"), this->InputGrad("InitC"));
     op->SetAttrMap(this->Attrs());
@@ -274,8 +288,8 @@ template <typename T>
 class NotImpleKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(
-        "CPU is not support for this kernel now. Will be add in the future");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "CPU is not support for this kernel now. Will be add in the future"));
   }
 };
 
@@ -290,3 +304,20 @@ REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
 
 REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>);
 REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel<float>);
+
+// cudnn_lstm version checkpoint. TODO(Shixiaowei02) Add ModifyInput support
+REGISTER_OP_VERSION(cudnn_lstm)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade cudnn_lstm add a new input [WeightList] and modify input [W] to dispensable.)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewInput(
+                "WeightList",
+                "The WeightList stores weight and bias data. WeightList is "
+                "dispensable.")
+            .NewInput("SequenceLength",
+                      "When the input data is padding, set this parameter. "
+                      "SequenceLength is dispensable.")
+            .NewOutput("StateOut", "Store the global drop state when training")
+            .NewOutput("Reserve",
+                       "A temporary output Tensor to store the reserve_data"));
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 6ac75b78d7058..bea7d9c02ca7d 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -30,6 +30,66 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
 
+// True when consecutive tensors in weight_list are adjacent in memory.
+template <typename T, typename Type>
+bool is_continuous(const Type &weight_list) {
+  // `i + 1 < size()` (not `i < size() - 1`): size_t would wrap on empty lists.
+  for (size_t i = 0; i + 1 < weight_list.size(); ++i) {
+    auto *in_data = weight_list[i]->template data<T>();
+    auto *in_after_data = weight_list[i + 1]->template data<T>();
+    auto in_size = weight_list[i]->numel();
+    if (in_data + in_size != in_after_data) return false;
+  }
+  return true;
+}
+
+// Returns the total element count over every tensor in weight_list.
+int size_sum(const std::vector<const Tensor *> &weight_list) {
+  int total = 0;
+  for (const Tensor *weight : weight_list) {
+    total += weight->numel();
+  }
+  return total;
+}
+
+// Copies each tensor of weight_list, in order, back-to-back into `weight`.
+template <typename T>
+void weight_to_tensor(const platform::Place &place, cudaStream_t stream,
+                      const std::vector<const Tensor *> &weight_list,
+                      Tensor *weight) {
+  T *packed = weight->data<T>();
+  int weight_offset = 0;
+  for (const Tensor *src : weight_list) {
+    const T *src_data = src->data<T>();
+    auto src_numel = src->numel();
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()),
+                 packed + weight_offset,
+                 BOOST_GET_CONST(platform::CUDAPlace, src->place()), src_data,
+                 src_numel * sizeof(T), stream);
+    weight_offset += src_numel;
+  }
+}
+
+// Scatters the packed `weight` buffer into the tensors of *weight_grad,
+// sizing each slice by the matching entry of weight_input.
+template <typename T>
+void weight_to_tensor_list(const platform::Place &place, cudaStream_t stream,
+                           std::vector<Tensor *> *weight_grad,
+                           const std::vector<const Tensor *> &weight_input,
+                           const Tensor *weight) {
+  const T *packed = weight->data<T>();
+  int weight_offset = 0;
+  for (size_t idx = 0; idx < weight_input.size(); ++idx) {
+    Tensor *dst = (*weight_grad)[idx];
+    auto slice_numel = weight_input[idx]->numel();
+    T *dst_data = dst->mutable_data<T>(place);
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dst->place()), dst_data,
+                 BOOST_GET_CONST(platform::CUDAPlace, weight->place()),
+                 packed + weight_offset, slice_numel * sizeof(T), stream);
+    weight_offset += slice_numel;
+  }
+}
+
 template <typename T>
 void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle,
                   const int &seq_length, ScopedRNNBase *rnn, const T *x_data,
@@ -75,8 +135,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     const Tensor *init_h = ctx.Input<Tensor>("InitH");
     const Tensor *init_c = ctx.Input<Tensor>("InitC");
 
-    auto w = ctx.Input<Tensor>("W");
-
     Tensor *out = ctx.Output<Tensor>("Out");
     Tensor *last_h = ctx.Output<Tensor>("LastH");
     Tensor *last_c = ctx.Output<Tensor>("LastC");
@@ -87,8 +145,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     const T *init_h_data = init_h->data<T>();
     const T *init_c_data = init_c->data<T>();
 
-    const T *w_data = w->data<T>();
-
     T *out_data = out->mutable_data<T>(ctx.GetPlace());
     T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
     T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
@@ -113,11 +169,45 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     int seq_length = x->dims()[0];
     int batch_size = x->dims()[1];
     int input_size = x->dims()[2];
-    int weight_numel = w->numel();
     bool state_initialized = state_out->IsInitialized() ? true : false;
 
     size_t workspace_size;
     size_t reserve_size;
+    Tensor weight_whole;
+    T *w_data = nullptr;
+    int weight_numel;
+    bool w_initialized = false;
+    auto place = ctx.GetPlace();
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
+                      ctx.device_context())
+                      .stream();
+    if (is_test && ctx.HasInput("W")) {
+      auto *W = ctx.Input<Tensor>("W");
+      w_initialized = W->IsInitialized() ? true : false;
+      weight_numel = W->numel();
+    }
+    if (!w_initialized) {
+      auto weight_list = ctx.MultiInput<framework::Tensor>("WeightList");
+      bool continuous =
+          is_continuous<T, std::vector<const Tensor *>>(weight_list);
+      weight_numel = size_sum(weight_list);
+
+      if (!continuous) {
+        LOG_FIRST_N(WARNING, 2)
+            << "If the memory space of the Input WeightList is not "
+               "continuous, less efficient calculation will be "
+               "called. Please call coalesce_tensor op to make the "
+               "input memory continuous.";
+        weight_whole.mutable_data<T>({weight_numel}, place);
+        weight_to_tensor<T>(place, stream, weight_list, &weight_whole);
+        w_data = weight_whole.data<T>();
+      } else {
+        w_data = const_cast<T *>(weight_list[0]->data<T>());
+      }
+    } else {
+      auto *W = ctx.Input<Tensor>("W");
+      w_data = const_cast<T *>(W->data<T>());
+    }
 
     ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
                       num_layers, dropout_prob, seed, weight_numel,
@@ -136,6 +226,12 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
       LSTMInferece<T>(has_seq_length, handle, seq_length, &rnn, x_data,
                       init_h_data, init_c_data, w_data, out_data, last_h_data,
                       last_c_data, &workspace_data_, workspace_size);
+      if (!w_initialized && ctx.HasInput("W") && ctx.HasInput("WeightList")) {
+        auto *W = const_cast<Tensor *>(ctx.Input<Tensor>("W"));
+        auto weight_list = ctx.MultiInput<framework::Tensor>("WeightList");
+        W->mutable_data<T>({weight_numel}, place);
+        weight_to_tensor<T>(place, stream, weight_list, W);
+      }
     } else {
       if (!has_seq_length) {
         // for train
@@ -176,11 +272,11 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto *input = ctx.Input<Tensor>("Input");
-    auto *weight = ctx.Input<Tensor>("W");
     auto *init_h = ctx.Input<Tensor>("InitH");
     auto *init_c = ctx.Input<Tensor>("InitC");
     auto *reserve = ctx.Input<Tensor>("Reserve");
     auto *state_out = ctx.Input<Tensor>("StateOut");
+    auto weight_list = ctx.MultiInput<Tensor>("WeightList");
 
     auto *out = ctx.Input<Tensor>("Out");
     auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -188,9 +284,10 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("LastC"));
 
     auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
     auto *init_h_grad = ctx.Output<Tensor>(framework::GradVarName("InitH"));
     auto *init_c_grad = ctx.Output<Tensor>(framework::GradVarName("InitC"));
+    auto weight_grad_list = ctx.MultiOutput<framework::Tensor>(
+        framework::GradVarName("WeightList"));
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
@@ -199,7 +296,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto init_h_dims = init_h->dims();
     auto init_c_dims = init_c->dims();
 
-    auto *weight_data = weight->data<T>();
     auto *init_h_data = init_h->data<T>();
     auto *init_c_data = init_c->data<T>();
     auto *out_data = out->data<T>();
@@ -207,18 +303,50 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto *last_h_grad_data = last_h_grad->data<T>();
     auto *last_c_grad_data = last_c_grad->data<T>();
 
+    auto place = ctx.GetPlace();
+    int weight_numel = size_sum(weight_list);
+    bool continuous =
+        is_continuous<T, std::vector<const Tensor *>>(weight_list);
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
+                      ctx.device_context())
+                      .stream();
+    Tensor weight_whole;
+    T *weight_data = nullptr;
+
+    if (!continuous) {
+      weight_whole.mutable_data<T>({weight_numel}, place);
+      weight_to_tensor<T>(place, stream, weight_list, &weight_whole);
+      weight_data = weight_whole.data<T>();
+    } else {
+      weight_data = const_cast<T *>(weight_list[0]->data<T>());
+    }
+
+    Tensor weight_grad;
     math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
-    weight_grad->mutable_data<T>(ctx.GetPlace());
-    zero(dev_ctx, weight_grad, static_cast<T>(0.0));
+    weight_grad.mutable_data<T>({weight_numel}, ctx.GetPlace());
+    zero(dev_ctx, &weight_grad, static_cast<T>(0.0));
+    T *weight_grad_data = weight_grad.data<T>();
+
+    int offset = 0;
+    for (size_t i = 0; i < weight_grad_list.size(); ++i) {
+      size_t len = weight_grad_list[i]->numel();
+      auto dim = weight_grad_list[i]->dims();
+      weight_grad_list[i]
+          ->ShareDataWith(weight_grad.Slice(static_cast<int64_t>(offset),
+                                            static_cast<int64_t>(offset + len)))
+          .Resize(dim);
+      offset += len;
+    }
 
     in_grad->mutable_data<T>(input_dims, ctx.GetPlace());
     auto *in_grad_data = in_grad->data<T>();
 
-    init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
-    auto *init_h_grad_data = init_h_grad->data<T>();
+    if (init_h_grad) init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
+    auto *init_h_grad_data = init_h_grad ? init_h_grad->data<T>() : nullptr;
 
-    init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
-    auto *init_c_grad_data = init_c_grad->data<T>();
+    if (init_c_grad) init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
+    auto *init_c_grad_data = init_c_grad ? init_c_grad->data<T>() : nullptr;
 
     float dropout_prob = ctx.Attr<float>("dropout_prob");
     bool is_bidirec = ctx.Attr<bool>("is_bidirec");
@@ -236,7 +364,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     int seq_length = input_dims[0];
     int batch_size = input->dims()[1];
     int input_size = input->dims()[2];
-    int weight_numel = weight->numel();
 
     size_t workspace_size;
     size_t reserve_size;
@@ -268,8 +395,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
           handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data<T>(),
           rnn.init_h_desc(), init_h->data<T>(), rnn.y_descs(), out->data<T>(),
           workspace_data_.data<uint8_t>(), workspace_size, rnn.weight_desc(),
-          weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
-          reserve_size));
+          weight_grad_data, const_cast<uint8_t *>(reserve_data), reserve_size));
     } else {
 #if CUDNN_VERSION >= 7201
       // for train
@@ -288,7 +414,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
           handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data<T>(),
           rnn.init_h_desc(), init_h->data<T>(), rnn.y_seq_desc(),
           out->data<T>(), workspace_data_.data<uint8_t>(), workspace_size,
-          rnn.weight_desc(), weight_grad->data<T>(),
+          rnn.weight_desc(), weight_grad_data,
           const_cast<uint8_t *>(reserve_data), reserve_size));
 #else
       PADDLE_THROW(platform::errors::Unavailable(
diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu
index 8d79626aa8785..80490af33a1f9 100644
--- a/paddle/fluid/operators/edit_distance_op.cu
+++ b/paddle/fluid/operators/edit_distance_op.cu
@@ -111,8 +111,9 @@ class EditDistanceGPUKernel : public framework::OpKernel<T> {
 
     if (normalized) {
       for (size_t i = 1; i < ref_lod.size(); ++i) {
-        PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1],
-                       "Reference string %d is empty.", i);
+        PADDLE_ENFORCE_GT(ref_lod[i], ref_lod[i - 1],
+                          platform::errors::InvalidArgument(
+                              "Reference string %d is empty.", i));
       }
     }
 
diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h
index 3e1aec7ceeec7..ef290c2eff2be 100644
--- a/paddle/fluid/operators/edit_distance_op.h
+++ b/paddle/fluid/operators/edit_distance_op.h
@@ -58,8 +58,9 @@ class EditDistanceKernel : public framework::OpKernel<T> {
 
     if (normalized) {
       for (size_t i = 1; i < ref_lod.size(); ++i) {
-        PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1],
-                       "Reference string %d is empty.", i);
+        PADDLE_ENFORCE_GT(ref_lod[i], ref_lod[i - 1],
+                          platform::errors::InvalidArgument(
+                              "Reference string %d is empty.", i));
       }
     }
     auto num_strs = hyp_lod.size() - 1;
@@ -106,10 +107,11 @@ class EditDistanceKernel : public framework::OpKernel<T> {
       }
 
       if (normalized) {
-        PADDLE_ENFORCE(n > 0,
-                       "The reference string (#%d) cannot be empty "
-                       "when Attr(normalized) is enabled.",
-                       n);
+        PADDLE_ENFORCE_GT(n, 0UL,
+                          platform::errors::InvalidArgument(
+                              "The reference string (#%d) cannot be empty "
+                              "when Attr(normalized) is enabled.",
+                              n));
         distance = distance / n;
       }
       out[num] = distance;
diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc
index 870464efed2b1..25b83ed93f729 100644
--- a/paddle/fluid/operators/expand_as_op.cc
+++ b/paddle/fluid/operators/expand_as_op.cc
@@ -89,8 +89,9 @@ class ExpandAsGradOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true);
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true);
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAs");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "ExpandAs");
 
     auto x_dims = ctx->GetInputDim("X");
     auto x_grad_name = framework::GradVarName("X");
diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h
index b189aa6f12274..cbaeb0c4e4256 100644
--- a/paddle/fluid/operators/expand_as_op.h
+++ b/paddle/fluid/operators/expand_as_op.h
@@ -61,7 +61,10 @@ class ExpandAsKernel : public framework::OpKernel<T> {
     switch (rank) {
       REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED)
       default:
-        PADDLE_THROW("Only support tensor with rank being between 1 and 6.");
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Only support tensor with rank being between 1 and 6. But received "
+            "tensor X's rank = %d.",
+            rank));
     }
   }
 
@@ -77,13 +80,19 @@ class ExpandAsKernel : public framework::OpKernel<T> {
     auto x_dims = in0->dims();
     auto y_dims = target_tensor->dims();
     for (int i = 0; i < y_dims.size(); ++i) {
-      PADDLE_ENFORCE_NE(x_dims[i], 0, "X(input) should not have 0 dim");
+      PADDLE_ENFORCE_NE(
+          x_dims[i], 0UL,
+          platform::errors::InvalidArgument(
+              "X(input) should not have 0 dim. But received x_dims[%d] = 0.",
+              i));
       bcast_dims[i] = y_dims[i] / x_dims[i];
       bcast_dims_remainder += y_dims[i] % x_dims[i];
     }
-    PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0,
-                      "X(input) could not be broadcast together with remapped "
-                      "shape(expand tensor's shape)");
+    PADDLE_ENFORCE_EQ(
+        bcast_dims_remainder, 0UL,
+        platform::errors::InvalidArgument(
+            "X(input) could not be broadcast together with remapped "
+            "shape(expand tensor's shape)"));
     framework::DDim out_dims(in_dims);
     for (size_t i = 0; i < bcast_dims.size(); ++i) {
       out_dims[i] *= bcast_dims[i];
@@ -137,7 +146,10 @@ class ExpandAsGradKernel : public framework::OpKernel<T> {
       switch (dims) {
         REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED)
         default:
-          PADDLE_THROW("Only support tensor with rank being between 1 and 6.");
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Only support tensor with rank being between 1 and 6. But "
+              "received tensor's rank = %d.",
+              dims));
       }
     }
   }
@@ -149,12 +161,6 @@ class ExpandAsGradKernel : public framework::OpKernel<T> {
                         const std::vector<int>& reduce_dims_vec) const {
     size_t reshape_size = reshape_dims_vec.size();
     size_t reduce_size = reduce_dims_vec.size();
-    PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(),
-                      "Inconsistent size between template Dims and "
-                      "reshape dimensions.");
-    PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(),
-                      "Inconsistent size between template Dims and "
-                      "reduce dimensions.");
     auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
     out0->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index e9b4c7dacf8b4..04fa8db9a5a6f 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -404,6 +404,10 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
                                 "the received is %d",
                                 bit_length));
         });
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
     AddComment(R"DOC(
 The scale of FakeChannelWiseQuantize operator is a vector.
 In detail, each channel of the input X has a scale value.
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 2f5afbe0eedf9..94a75f930beba 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -146,16 +146,19 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
 
     auto* out = context.Output<framework::Tensor>("Out");
     auto* out_scale = context.Output<framework::Tensor>("OutScale");
-    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
     out->mutable_data<T>(context.GetPlace());
 
     int bit_length = context.Attr<int>("bit_length");
     int bin_cnt = std::pow(2, bit_length - 1) - 1;
     int quant_axis = context.Attr<int>("quant_axis");
+    bool is_test = context.Attr<bool>("is_test");
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
-                                                 out_scale_data);
+    if (!is_test) {
+      T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
+      FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
+                                                   out_scale_data);
+    }
     ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
         dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
   }
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 847b24f4f0b0b..d791b2bcfd09f 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -23,64 +23,80 @@ class FCOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
-                      "X(Input) of Fully Connected should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Out(Output) of Fully Connected should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                      "W(Input) of Fully Connected should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "FC");
+    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "FC");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FC");
 
-    auto in_dims = ctx->GetInputDim("Input");
     auto w_dims = ctx->GetInputDim("W");
     bool padding_weights = ctx->Attrs().Get<bool>("padding_weights");
+    PADDLE_ENFORCE_EQ(
+        w_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The input Weight of fc is expected to be a 2-D tensor. "
+            "But received the number of Weight's dimensions is %d, "
+            "Weight's shape is %s.",
+            w_dims.size(), w_dims));
 
     if (ctx->HasInput("Bias")) {
       auto bias_dims = ctx->GetInputDim("Bias");
       auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
+
+      PADDLE_ENFORCE_LE(
+          bias_dims.size(), 2,
+          platform::errors::InvalidArgument(
+              "The input Bias of fc is expected to be a 1-D or 2-D tensor. But "
+              "received the number of Bias's dimensions is %d, "
+              "Bias's shape is %s.",
+              bias_dims.size(), bias_dims));
+
+      PADDLE_ENFORCE_EQ(
+          bias_dims[bias_dims.size() - 1], w_dims1,
+          platform::errors::InvalidArgument(
+              "The last dimension of input Bias is expected be equal "
+              "to the actual width of input Weight. But received the last "
+              "dimension of Bias is %d, Bias's shape is %s; "
+              "the actual width of Weight is %d, Weight's shape is %s.",
+              bias_dims[bias_dims.size() - 1], bias_dims, w_dims1, w_dims));
+
       if (bias_dims.size() == 2) {
-        PADDLE_ENFORCE_EQ(bias_dims[0], 1,
-                          platform::errors::InvalidArgument(
-                              "The shape of Bias is invalid."
-                              "The height of Bias should be 1."
-                              "But received height of Bias is %d.",
-                              bias_dims[0]));
-        PADDLE_ENFORCE_EQ(
-            bias_dims[1], w_dims1,
-            platform::errors::InvalidArgument(
-                "The shape of Bias is invalid."
-                "The width of Bias should be equal to width of Weight."
-                "But received width of Bias is %d and width of Weight is %d.",
-                bias_dims[1], w_dims1));
-      } else if (bias_dims.size() == 1) {
         PADDLE_ENFORCE_EQ(
-            bias_dims[0], w_dims1,
+            bias_dims[0], 1,
             platform::errors::InvalidArgument(
-                "The shape of Bias is invalid."
-                "The height of Bias should be equal to the width of weight."
-                "But received height of Bias is %d and width of Weight is %d.",
-                bias_dims[0], w_dims1));
+                "The first dimension of input Bias is expected to be 1, "
+                "but received %d, Bias's shape is %s.",
+                bias_dims[0], bias_dims));
       }
     }
 
+    auto in_dims = ctx->GetInputDim("Input");
+    int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
+    PADDLE_ENFORCE_LT(
+        in_num_col_dims, in_dims.size(),
+        platform::errors::InvalidArgument(
+            "The attribute in_num_col_dims used to flatten Input to "
+            "a 2-D tensor, is expected to be less than the number of "
+            "Input's dimensions. But recieved in_num_col_dims is %d, "
+            "the number of Input's dimensions is %d, Input's shape is %s.",
+            in_num_col_dims, in_dims.size(), in_dims));
+
     auto& activation_type = ctx->Attrs().Get<std::string>("activation_type");
     if (!activation_type.empty()) {
       PADDLE_ENFORCE_EQ(activation_type, "relu",
-                        "Activation %s is not supportetd in fc now.",
-                        activation_type.c_str());
+                        platform::errors::InvalidArgument(
+                            "The attribute activation_type of fc is expected "
+                            "to be \"relu\", but received %s.",
+                            activation_type.c_str()));
     }
+
     if (ctx->Attrs().Get<bool>("use_mkldnn")) {
       PADDLE_ENFORCE_EQ(
           in_dims.size() >= 2 && in_dims.size() <= 4, true,
           platform::errors::Unimplemented(
-              "Fully Connected input should be 2D, 3D or 4D tensor."));
+              "The Input of fc is expected to be a 2-D, 3-D or 4-D tensor when "
+              "use_mkldnn is set. But recieved the number of Input's "
+              "dimensions is %d, Input's shape is %s.",
+              in_dims.size(), in_dims));
     }
-    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
-                      "Fully Connected weights should be 2-D tensor.");
-    int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
-    PADDLE_ENFORCE_GT(
-        in_dims.size(), in_num_col_dims,
-        "The input tensor Input's rank of FCOp should be larger than "
-        "in_num_col_dims.");
 
     std::vector<int64_t> output_dims;
     FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims,
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index 907f61196d61b..6258dd0a3868f 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -32,11 +32,15 @@ inline void FCOutputSize(const framework::DDim& in_dims,
   auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
   auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0];
   auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
-  PADDLE_ENFORCE_EQ(in_mat_dims[1], w_dims0,
-                    platform::errors::InvalidArgument(
-                        "Fully Connected input and weigth size do not match. "
-                        "input width: %d,weight height: %d",
-                        in_mat_dims[1], w_dims0));
+  PADDLE_ENFORCE_EQ(
+      in_mat_dims[1], w_dims0,
+      platform::errors::InvalidArgument(
+          "The input's second dimension and weight's first dimension is "
+          "expected to be the same. But received input's second dimension is "
+          "%d, input's shape is %s; weight's first dimension is %d, weight's "
+          "shape is %s.",
+          in_mat_dims[1], in_mat_dims, w_dims0,
+          framework::make_ddim({w_dims0, w_dims1})));
 
   out_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
   for (int i = 0; i < in_num_col_dims; ++i) {
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
index 2c0c5f9ec0afa..c61b9a9e48854 100644
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
+++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
@@ -276,7 +276,8 @@ static void RunFunctors(const framework::ExecutionContext &ctx,
         ctx, paddle::operators::math::MulFunctor<T>(),
         paddle::operators::math::SigmoidFunctor<T>(), in_x, in_y, outputs);
   } else {
-    PADDLE_THROW("%s has not been implemented.", funcs_str);
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "%s has not been implemented.", funcs_str));
   }
 }
 
@@ -374,7 +375,8 @@ static void RunGradFunctors(
         paddle::operators::math::SigmoidGradFunctor<T>(), in_x, in_y, in_out,
         in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
   } else {
-    PADDLE_THROW("%s has not been implemented.", funcs_str);
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "%s has not been implemented.", funcs_str));
   }
 }
 
@@ -386,16 +388,21 @@ class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
                                  "X", "FusedElemwiseActivation");
     auto &in_y = GET_DATA_SAFELY(ctx.Input<framework::Tensor>("Y"), "Input",
                                  "Y", "FusedElemwiseActivation");
-    PADDLE_ENFORCE(ctx.HasOutput("Out"), "The output(Out) should not be empty");
+
+    PADDLE_ENFORCE_EQ(ctx.HasOutput("Out"), true,
+                      platform::errors::InvalidArgument(
+                          "The output(Out) should not be empty"));
     auto output = ctx.Output<framework::Tensor>("Out");
 
     std::vector<framework::Tensor *> outputs;
     outputs.emplace_back(output);
 
     if (ctx.Attr<bool>("save_intermediate_out")) {
-      PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"),
-                     "The save_intermediate_out is enable, so the "
-                     "IntermediateOut should not be empty.");
+      PADDLE_ENFORCE_EQ(ctx.HasOutput("IntermediateOut"), true,
+                        platform::errors::InvalidArgument(
+                            "The save_intermediate_out is enabled, so the "
+                            "IntermediateOut should not be empty."));
+
       auto intermediate_out = ctx.Output<framework::Tensor>("IntermediateOut");
       outputs.emplace_back(intermediate_out);
     } else {
@@ -411,13 +418,18 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto in_y = ctx.Input<framework::Tensor>("Y");
-    PADDLE_ENFORCE(in_y != nullptr, "Input(Y) should not be nullptr.");
+    PADDLE_ENFORCE_NE(in_y, nullptr, platform::errors::InvalidArgument(
+                                         "Input(Y) should not be nullptr."));
     auto in_out = ctx.Input<framework::Tensor>("Out");
-    PADDLE_ENFORCE(in_out != nullptr, "Input(Out) should not be nullptr.");
+    PADDLE_ENFORCE_NE(
+        in_out, nullptr,
+        platform::errors::InvalidArgument("Input(Out) should not be nullptr."));
     auto in_out_grad =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(in_out_grad != nullptr,
-                   "Input(Out@Grad) should not be nullptr.");
+    PADDLE_ENFORCE_NE(in_out_grad, nullptr,
+                      platform::errors::InvalidArgument(
+                          "Input(Out@Grad) should not be nullptr."));
+
     framework::Tensor *in_x =
         const_cast<framework::Tensor *>(ctx.Input<framework::Tensor>("X"));
     framework::Tensor *x_grad =
@@ -437,24 +449,28 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
       // recompute.
       in_intermediate_out = const_cast<framework::Tensor *>(
           ctx.Input<framework::Tensor>("IntermediateOut"));
-      PADDLE_ENFORCE(in_intermediate_out != nullptr,
-                     "The option of 'save_intermediate_out' is opened, "
-                     "so the number of 'Out' should be two.");
+      PADDLE_ENFORCE_NE(in_intermediate_out, nullptr,
+                        platform::errors::InvalidArgument(
+                            "The option of 'save_intermediate_out' is opened,"
+                            " so the number of 'Out' should be two."));
     } else {
       if (!InputXCanBeAbsent(functor_list)) {
-        PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be null.");
+        PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument(
+                                             "Input(X) should not be null."));
       }
     }
 
     // Get in_x
     if (ctx.HasInput("X")) {
-      PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be nullptr.");
+      PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument(
+                                           "Input(X) should not be null."));
     } else {
       // If functor_list contains elementwise_add, the backward doesn't use
       // in_x, in_y and in_out.
-      PADDLE_ENFORCE(InputXCanBeAbsent(functor_list),
-                     "Only when the compoundfunctor contains "
-                     "elementwise_add_grad, the 'X' could be absent.");
+      PADDLE_ENFORCE_EQ(InputXCanBeAbsent(functor_list), true,
+                        platform::errors::InvalidArgument(
+                            "Only when the compoundfunctor contains "
+                            "elementwise_add_grad, the 'X' could be absent."));
       in_x = const_cast<framework::Tensor *>(in_out_grad);
     }
 
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index aeaec84ba5c94..8713d58034241 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -204,9 +204,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       auto *table_t = context.Input<SelectedRows>("W");
       table_dim = table_t->value().dims();
     } else {
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::PermissionDenied(
           "The parameter W of a LookupTable "
-          "must be either LoDTensor or SelectedRows");
+          "must be either LoDTensor or SelectedRows."));
     }
 
     bool is_sparse = context.Attr<bool>("is_sparse");
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc
index ea7d6a93d1b28..08909bcb6fcb9 100644
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc
+++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc
@@ -22,47 +22,73 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"), true,
-        "Input(X) of fused_fc_elementwise_layernorm should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("W"), true,
-        "Input(W) of fused_fc_elementwise_layernorm should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Y"), true,
-        "Input(Y) of fused_fc_elementwise_layernorm should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        "Output(Out) of fused_fc_elementwise_layernorm should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
+                   "FusedFcElementwiseLayernorm");
+    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W",
+                   "FusedFcElementwiseLayernorm");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y",
+                   "FusedFcElementwiseLayernorm");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "FusedFcElementwiseLayernorm");
 
     auto w_dims = ctx->GetInputDim("W");
-    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
-                      "Fully Connected input should be 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        w_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The input Weight of fc is expected to be a 2-D tensor. "
+            "But received the number of Weight's dimensions is %d, "
+            "Weight's shape is %s.", w_dims.size(), w_dims));
 
     if (ctx->HasInput("Bias0")) {
       auto bias0_dims = ctx->GetInputDim("Bias0");
+
+      PADDLE_ENFORCE_LE(bias0_dims.size(), 2,
+                        platform::errors::InvalidArgument(
+                            "The input Bias of fc is expected to be an 1-D or "
+                            "2-D tensor. But received the number of Bias's "
+                            "dimensions is %d, Bias's shape is %s.",
+                            bias0_dims.size(), bias0_dims));
+
+      PADDLE_ENFORCE_EQ(
+          bias0_dims[bias0_dims.size() - 1], w_dims[1],
+          platform::errors::InvalidArgument(
+              "The last dimension of input Bias is expected to be equal "
+              "to the actual width of input Weight. But received the last "
+              "dimension of Bias is %d, Bias's shape is %s; "
+              "the actual width of Weight is %d, Weight's shape is %s.",
+              bias0_dims[bias0_dims.size() - 1], bias0_dims, w_dims[1],
+              w_dims));
+
       if (bias0_dims.size() == 2) {
-        PADDLE_ENFORCE_EQ(bias0_dims[0], 1,
-                          "The shape of Bias must be [1, dim].");
-        PADDLE_ENFORCE_EQ(bias0_dims[1], w_dims[1],
-                          "The shape of Bias must be [1, dim].");
-      } else if (bias0_dims.size() == 1) {
-        PADDLE_ENFORCE_EQ(bias0_dims[0], w_dims[1],
-                          "The shape of Bias must be [1, dim].");
+        PADDLE_ENFORCE_EQ(
+            bias0_dims[0], 1,
+            platform::errors::InvalidArgument(
+                "The first dimension of input Bias is expected to be 1, "
+                "but received %d, Bias's shape is %s.",
+                bias0_dims[0], bias0_dims));
       }
     }
 
     auto x_dims = ctx->GetInputDim("X");
     int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), x_num_col_dims,
-        "The input tensor Input's rank of FCOp should be larger than "
-        "in_num_col_dims.");
+    PADDLE_ENFORCE_LT(
+        x_num_col_dims, x_dims.size(),
+        platform::errors::InvalidArgument(
+            "The attribute x_num_col_dims used to flatten input X to "
+            "a 2-D tensor, is expected to be less than the number of "
+            "input X's dimensions. But received x_num_col_dims is %d, "
+            "the number of input X's dimensions is %d, input X's shape is %s.",
+            x_num_col_dims, x_dims.size(), x_dims));
 
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     PADDLE_ENFORCE_EQ(
         x_mat_dims[1], w_dims[0],
-        "Fully Connected input and weigth size do not match. %s, %s");
+        platform::errors::InvalidArgument(
+            "The input's second dimension and weight's first dimension is "
+            "expected to be the same. But received input's second dimension is "
+            "%d, input's shape is %s; weight's first dimension is %d, weight's "
+            "shape is %s.",
+            x_mat_dims[1], x_mat_dims, w_dims[0], w_dims));
 
     std::vector<int64_t> fc_out_dims;
     for (int i = 0; i < x_num_col_dims; ++i) {
@@ -71,29 +97,67 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel {
     fc_out_dims.push_back(w_dims[1]);
 
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(framework::make_ddim(fc_out_dims), y_dims);
+    PADDLE_ENFORCE_EQ(framework::make_ddim(fc_out_dims), y_dims,
+                      platform::errors::InvalidArgument(
+                          "The output's shape of fc is expected to be equal to "
+                          "that of input Y. But received output's shape of fc "
+                          "is %s, input Y's shape is %s.",
+                          framework::make_ddim(fc_out_dims), y_dims));
 
     auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
     PADDLE_ENFORCE_LT(
         begin_norm_axis, y_dims.size(),
-        "'begin_norm_axis' must be less than the rank of Input(Y).");
+        platform::errors::InvalidArgument(
+            "The attribute begin_norm_axis used to flatten input Y to a 2-D "
+            "tensor, is expected to be less than the number of input Y's "
+            "dimensions. But received begin_norm_axis is %d, the number of "
+            "input Y's dimensions is %d, input Y's shape is %s.",
+            begin_norm_axis, y_dims.size(), y_dims));
 
     auto y_mat_dim = framework::flatten_to_2d(y_dims, begin_norm_axis);
     int64_t dim_0 = y_mat_dim[0];
     int64_t dim_1 = y_mat_dim[1];
     if (ctx->HasInput("Scale")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
+      auto scale_dims = ctx->GetInputDim("Scale");
+      PADDLE_ENFORCE_EQ(scale_dims.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "The input Scale is expected to be an 1-D tensor. "
+                            "But received the number of input Scale's "
+                            "dimensions is %d, input Scale's shape is %s.",
+                            scale_dims.size(), scale_dims));
 
       if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], dim_1,
-                          "scale should with right");
+        PADDLE_ENFORCE_EQ(
+            scale_dims[0], dim_1,
+            platform::errors::InvalidArgument(
+                "The first dimension of input Scale is expected to be equal to "
+                "the second dimension of input Y after flattened. "
+                "But received the first dimension of input Scale is %d, input "
+                "Scale's shape is %s; the second dimension of flattened input "
+                "Y is %d, input Y's shape is %s, flattened axis is %d.",
+                scale_dims[0], scale_dims, dim_1, y_dims, begin_norm_axis));
       }
     }
     if (ctx->HasInput("Bias1")) {
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1").size(), 1);
+      auto bias1_dims = ctx->GetInputDim("Bias1");
+      PADDLE_ENFORCE_EQ(
+          bias1_dims.size(), 1,
+          platform::errors::InvalidArgument(
+              "The input Bias1 is expected to be an 1-D tensor. "
+              "But received the number of input Bias1's dimensions is %d, "
+              "input Bias1's shape is %s.",
+              bias1_dims.size(), bias1_dims));
+
       if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1")[0], dim_1,
-                          "bias should with right");
+        PADDLE_ENFORCE_EQ(
+            bias1_dims[0], dim_1,
+            platform::errors::InvalidArgument(
+                "The first dimension of input Bias1 is expected to be equal to "
+                "the second dimension of input Y after flattened. "
+                "But received the first dimension of input Bias1 is %d, input "
+                "Bias1's shape is %s; the second dimension of flattened input "
+                "Y is %d, input Y's shape is %s, flattened axis is %d.",
+                bias1_dims[0], bias1_dims, dim_1, y_dims, begin_norm_axis));
       }
     }
 
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index 5fad1b116de64..e51d94e4b1e05 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -86,7 +86,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
 
       // Weights for int8 kernel are of a type s8
       const auto weights_dt =
-          is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32;
+          is_INT8 ? dnnl::memory::data_type::s8 : MKLDNNGetDataType<T>();
 
       // oneDNN RNN dimensions
       const int64_t D = 1;  // Directions
@@ -95,7 +95,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
 
       // Create memory descriptors
       auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
-                                    MKLDNNMemoryFormat::any);
+                                    MKLDNNMemoryFormat::ntc);
       auto weight_x_md =
           MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
       auto weight_h_md =
@@ -103,7 +103,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
       auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
                                    MKLDNNMemoryFormat::ldgo);
       auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
-                                     MKLDNNMemoryFormat::any);
+                                     MKLDNNMemoryFormat::ntc);
       auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
                                  MKLDNNMemoryFormat::ldnc);
 
@@ -226,6 +226,8 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
   }
 
   // TODO(grygielski) H0 is for now persistable
+  // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
+  // not support it yet)
   std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
     const std::string h0_key = memory_key_ + "@h0";
     auto memory_p =
@@ -397,14 +399,14 @@ template <typename T>
 class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const bool is_INT8 = std::is_same<T, uint8_t>::value;
+    const bool is_bf16 = std::is_same<T, paddle::platform::bfloat16>::value;
     const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
 
-    // TODO(grygielski) Add option for bfloat
-    if (!is_INT8 || force_fp32_output) {
+    // BF16 does not support force output
+    if (!is_bf16 && force_fp32_output) {
       RunKernel<float>(ctx);
     } else {
-      RunKernel<uint8_t>(ctx);
+      RunKernel<T>(ctx);
     }
   }
 
@@ -495,4 +497,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
                    ops::FusionGRUMKLDNNKernel<float>,
+                   ops::FusionGRUMKLDNNKernel<paddle::platform::bfloat16>,
                    ops::FusionGRUMKLDNNKernel<uint8_t>);
diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc
index 03279a9b2c15b..1018adcd930a4 100644
--- a/paddle/fluid/operators/instance_norm_op.cc
+++ b/paddle/fluid/operators/instance_norm_op.cc
@@ -181,10 +181,22 @@ class InstanceNormKernel<platform::CPUDeviceContext, T>
     auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
     auto *place = dev_ctx.eigen_device();
 
+    Eigen::DSizes<int, 2> shape(NxC, sample_size);
+// Once eigen on Windows is updated, the if branch can be removed.
+#ifndef EIGEN_HAS_INDEX_LIST
     Eigen::DSizes<int, 2> bcast(1, sample_size);
     Eigen::DSizes<int, 2> C_shape(C, 1);
     Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
-    Eigen::DSizes<int, 2> shape(NxC, sample_size);
+    Eigen::DSizes<int, 1> rdims(1);
+#else
+    Eigen::IndexList<Eigen::type2index<1>, int> bcast;
+    bcast.set(1, sample_size);
+    Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
+    C_shape.set(0, C);
+    Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
+    NxC_shape.set(0, NxC);
+    Eigen::IndexList<Eigen::type2index<1>> rdims;
+#endif
 
     math::SetConstant<platform::CPUDeviceContext, T> set_constant;
 
@@ -201,8 +213,6 @@ class InstanceNormKernel<platform::CPUDeviceContext, T>
     auto x_e = framework::EigenVector<T>::Flatten(*x);
     auto x_arr = x_e.reshape(shape);
 
-    Eigen::DSizes<int, 1> rdims(1);
-
     saved_mean_e.device(*place) = x_arr.mean(rdims);
     auto saved_variance_arr =
         (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;
@@ -316,14 +326,25 @@ class InstanceNormGradKernel<platform::CPUDeviceContext, T>
     auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
     auto *place = dev_ctx.eigen_device();
 
+    Eigen::DSizes<int, 2> rshape(NxC, sample_size);
+    Eigen::DSizes<int, 2> param_shape(N, C);
+    Eigen::DSizes<int, 2> shape(NxC, sample_size);
+#ifndef EIGEN_HAS_INDEX_LIST
     Eigen::DSizes<int, 1> rdims(0);
     Eigen::DSizes<int, 1> mean_rdims(1);
-    Eigen::DSizes<int, 2> rshape(NxC, sample_size);
     Eigen::DSizes<int, 2> bcast(1, sample_size);
     Eigen::DSizes<int, 2> C_shape(C, 1);
     Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
-    Eigen::DSizes<int, 2> param_shape(N, C);
-    Eigen::DSizes<int, 2> shape(NxC, sample_size);
+#else
+    Eigen::IndexList<Eigen::type2index<0>> rdims;
+    Eigen::IndexList<Eigen::type2index<1>> mean_rdims;
+    Eigen::IndexList<Eigen::type2index<1>, int> bcast;
+    bcast.set(1, sample_size);
+    Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
+    C_shape.set(0, C);
+    Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
+    NxC_shape.set(0, NxC);
+#endif
 
     math::SetConstant<platform::CPUDeviceContext, T> set_constant;
 
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
index 488cbc6d517fc..d4f3fc5d7a622 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -27,9 +27,10 @@ static inline T NormalizeL1(T* x, size_t len) {
   // (This comment is from the old LinearChainCRFLayer.)
   // Right now, we just bet that sum won't be zero. If this really happens, we
   // will figure out what should be done then.
-  PADDLE_ENFORCE(sum,
-                 "The unnormalized probabilities of all possible unfinished "
-                 "sequences must be greater than 0.");
+  PADDLE_ENFORCE_GT(
+      sum, 0., platform::errors::InvalidArgument(
+                   "The unnormalized probabilities of all possible unfinished "
+                   "sequences must be greater than 0."));
   T s = 1. / sum;
   for (size_t i = 0; i < len; ++i) x[i] *= s;
   return sum;
@@ -84,13 +85,19 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
       const Tensor* label_length = ctx.Input<framework::Tensor>("Length");
       length_data = label_length->data<int64_t>();
       seq_num = label_length->numel();
-      PADDLE_ENFORCE_EQ(seq_num, emission_dims[0],
-                        "the size of Input(length) must be equal to "
-                        "emission_dims[0].");
+      PADDLE_ENFORCE_EQ(
+          seq_num, emission_dims[0],
+          platform::errors::InvalidArgument(
+              "the size of Input(length) must be equal to "
+              "emission_dims[0]. But input_size = %d, emission_dims[0] = %d.",
+              seq_num, emission_dims[0]));
       auto label_dims = label->dims();
-      PADDLE_ENFORCE_EQ(seq_num, label_dims[0],
-                        "the size of Input(length) must be equal to "
-                        "label_dims[0].");
+      PADDLE_ENFORCE_EQ(
+          seq_num, label_dims[0],
+          platform::errors::InvalidArgument(
+              "the size of Input(length) must be equal to "
+              "label_dims[0]. But input_size = %d, label_dims[0] = %d.",
+              seq_num, label_dims[0]));
 
       batch_size = emission_dims[0] * emission_dims[1];
       tag_num = emission_dims[2];
@@ -102,7 +109,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
       math::set_constant(ctx.device_context(), alpha, 0.0);
     } else {
       in_lod = ctx.Input<LoDTensor>("Label")->lod();
-      PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence.");
+      PADDLE_ENFORCE_NE(in_lod.size(), 0,
+                        platform::errors::InvalidArgument(
+                            "Input(Label) must be a sequence."));
       seq_num = in_lod[0].size() - 1;
       batch_size = emission_dims[0];
       tag_num = emission_dims[1];
@@ -204,7 +213,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     const int64_t* lbl = label.data<int64_t>();
     PADDLE_ENFORCE_LT(
         static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
-        "An invalid tag label that execesses the largest tag number.");
+        platform::errors::InvalidArgument(
+            "An invalid tag label that exceeds the largest tag number."));
 
     // Calculate the nominator part, which depends on the label sequence.
     ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] +
@@ -254,7 +264,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
           {emission_dims[0] * emission_dims[1], emission_dims[2]});
     } else {
       in_lod = ctx.Input<LoDTensor>("Label")->lod();
-      PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence.");
+      PADDLE_ENFORCE_NE(in_lod.size(), 0,
+                        platform::errors::InvalidArgument(
+                            "Input(Label) must be a sequence."));
       seq_num = static_cast<int64_t>(in_lod[0].size() - 1);
     }
 
diff --git a/paddle/fluid/operators/load_op_xpu.cc b/paddle/fluid/operators/load_op_xpu.cc
new file mode 100644
index 0000000000000..e56586552e498
--- /dev/null
+++ b/paddle/fluid/operators/load_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/load_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::XPUDeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::XPUDeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::XPUDeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::XPUDeviceContext, int64_t>);
+
+#endif  // PADDLE_WITH_XPU
diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
index c325c0892ed81..917482589fcf3 100644
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
@@ -23,23 +23,31 @@ class LstmUnitOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("C_prev"),
-                   "Input(C_prev) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("C"),
-                   "Output(C) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("H"),
-                   "Output(H) of LSTM should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lstm_unit");
+    OP_INOUT_CHECK(ctx->HasInput("C_prev"), "Input", "C_prev", "lstm_unit");
+    OP_INOUT_CHECK(ctx->HasOutput("C"), "Output", "C", "lstm_unit");
+    OP_INOUT_CHECK(ctx->HasOutput("H"), "Output", "H", "lstm_unit");
 
     auto x_dims = ctx->GetInputDim("X");
     auto c_prev_dims = ctx->GetInputDim("C_prev");
 
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "Input(X)'s rank must be 2. Received %d instead.", x_dims.size()));
     if (ctx->IsRuntime()) {
       PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0],
-                        "Batch size of inputs and states must be equal");
+                        platform::errors::InvalidArgument(
+                            "Batch size of inputs and states must be equal, "
+                            "but received %d (inputs) "
+                            "vs %d (states).",
+                            x_dims[0], c_prev_dims[0]));
       PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4,
-                        "Dimension of FC should equal to prev state * 4");
+                        platform::errors::InvalidArgument(
+                            "Dimension of FC should equal to prev state * 4, "
+                            "but received %d (dimension of FC) "
+                            "vs %d (prev state * 4).",
+                            x_dims[1], c_prev_dims[1] * 4));
     }
 
     int b_size = c_prev_dims[0];  // batch size
@@ -85,10 +93,10 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")),
-                   "Input(C@GRAD) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")),
-                   "Input(H@GRAD) should not be null");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("C")), "Input",
+                   framework::GradVarName("C"), "lstm_unit");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("H")), "Input",
+                   framework::GradVarName("H"), "lstm_unit");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
     ctx->SetOutputDim(framework::GradVarName("C_prev"),
                       ctx->GetInputDim("C_prev"));
diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
index 810b83cb535fe..3949a066e0868 100644
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -93,8 +93,9 @@ template <typename T>
 class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
 
     auto* x_tensor = ctx.Input<framework::Tensor>("X");
     auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
@@ -124,8 +125,9 @@ template <typename T>
 class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
 
     auto x_tensor = ctx.Input<Tensor>("X");
     auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h
index 3fe7bda39b68d..99ae654d7ef0c 100644
--- a/paddle/fluid/operators/lstm_unit_op.h
+++ b/paddle/fluid/operators/lstm_unit_op.h
@@ -39,8 +39,9 @@ template <typename DeviceContext, typename T>
 class LstmUnitKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_cpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CPUPlace."));
 
     auto* x_tensor = ctx.Input<framework::Tensor>("X");
     auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
@@ -82,8 +83,9 @@ template <typename DeviceContext, typename T>
 class LstmUnitGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_cpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CPUPlace."));
 
     auto x_tensor = ctx.Input<Tensor>("X");
     auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index f0a727f34fec7..a2d1d5295be82 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -91,7 +91,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
     else if (act_type == math::detail::ActivationType::kReLU)
       ReluFunctor<T>()(d, x, y);
     else
-      PADDLE_THROW("unsupported activation type");
+      PADDLE_THROW(
+          platform::errors::InvalidArgument("unsupported activation type"));
   }
 
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -263,7 +264,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     else if (act_type == math::detail::ActivationType::kReLU)
       ReluGradFunctor<T>()(d, x, y, dy, dx);
     else
-      PADDLE_THROW("unsupported activation type");
+      PADDLE_THROW(
+          platform::errors::InvalidArgument("unsupported activation type"));
   }
 
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc
index 40cea7483f397..fec738378a64c 100644
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
@@ -60,19 +60,25 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
           if (adaptive) {
             hstart = AdaptStartIndex(ph, input_height, output_height);
             hend = AdaptEndIndex(ph, input_height, output_height);
-          } else {
-            hstart = ph * stride_height - padding_height;
-            hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
           }
           for (int pw = 0; pw < output_width; ++pw) {
+            int pool_size = 1;
             if (adaptive) {
               wstart = AdaptStartIndex(pw, input_width, output_width);
               wend = AdaptEndIndex(pw, input_width, output_width);
             } else {
+              hstart = ph * stride_height - padding_height;
               wstart = pw * stride_width - padding_width;
-              wend = std::min(wstart + ksize_width, input_width);
+              hend = std::min(hstart + ksize_height,
+                              input_height + padding_height);
+              wend =
+                  std::min(wstart + ksize_width, input_width + padding_width);
+              pool_size = (hend - hstart) * (wend - wstart);
+
               wstart = std::max(wstart, 0);
+              hstart = std::max(hstart, 0);
+              hend = std::min(hend, input_height);
+              wend = std::min(wend, input_width);
             }
 
             T ele = pool_process.initial();
@@ -81,9 +87,10 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                 pool_process.compute(input_data[h * input_width + w], &ele);
               }
             }
-            int pool_size = (exclusive || adaptive)
-                                ? (hend - hstart) * (wend - wstart)
-                                : ksize_height * ksize_width;
+            if (exclusive || adaptive) {
+              pool_size = (hend - hstart) * (wend - wstart);
+            }
+
             pool_process.finalize(static_cast<T>(pool_size), &ele);
             output_data[ph * output_width + pw] = ele;
           }
@@ -137,19 +144,25 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             if (adaptive) {
               hstart = AdaptStartIndex(ph, input_height, output_height);
               hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
             }
             for (int pw = 0; pw < output_width; ++pw) {
+              int pool_size = 1;
               if (adaptive) {
                 wstart = AdaptStartIndex(pw, input_width, output_width);
                 wend = AdaptEndIndex(pw, input_width, output_width);
               } else {
+                hstart = ph * stride_height - padding_height;
                 wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
+                hend = std::min(hstart + ksize_height,
+                                input_height + padding_height);
+                wend =
+                    std::min(wstart + ksize_width, input_width + padding_width);
+                pool_size = (hend - hstart) * (wend - wstart);
+
                 wstart = std::max(wstart, 0);
+                hstart = std::max(hstart, 0);
+                hend = std::min(hend, input_height);
+                wend = std::min(wend, input_width);
               }
 
               T ele = pool_process.initial();
@@ -158,9 +171,9 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                   pool_process.compute(input_data[h * input_width + w], &ele);
                 }
               }
-              int pool_size = (exclusive || adaptive)
-                                  ? (hend - hstart) * (wend - wstart)
-                                  : ksize_height * ksize_width;
+              if (exclusive || adaptive) {
+                pool_size = (hend - hstart) * (wend - wstart);
+              }
               pool_process.finalize(static_cast<T>(pool_size), &ele);
               output_data[ph * output_width + pw] = ele;
             }
@@ -178,19 +191,25 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             if (adaptive) {
               hstart = AdaptStartIndex(ph, input_height, output_height);
               hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
             }
             for (int pw = 0; pw < output_width; ++pw) {
+              int pool_size = 1;
               if (adaptive) {
                 wstart = AdaptStartIndex(pw, input_width, output_width);
                 wend = AdaptEndIndex(pw, input_width, output_width);
               } else {
+                hstart = ph * stride_height - padding_height;
                 wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
+                hend = std::min(hstart + ksize_height,
+                                input_height + padding_height);
+                wend =
+                    std::min(wstart + ksize_width, input_width + padding_width);
+                pool_size = (hend - hstart) * (wend - wstart);
+
                 wstart = std::max(wstart, 0);
+                hstart = std::max(hstart, 0);
+                hend = std::min(hend, input_height);
+                wend = std::min(wend, input_width);
               }
               T ele = pool_process.initial();
               for (int h = hstart; h < hend; ++h) {
@@ -201,10 +220,9 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                       &ele);
                 }
               }
-              int pool_size = (exclusive || adaptive)
-                                  ? (hend - hstart) * (wend - wstart)
-                                  : ksize_height * ksize_width;
-
+              if (exclusive || adaptive) {
+                pool_size = (hend - hstart) * (wend - wstart);
+              }
               pool_process.finalize(static_cast<T>(pool_size), &ele);
               output_data[ph * output_width * output_channels +
                           pw * output_channels + c] = ele;
@@ -262,23 +280,29 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
           if (adaptive) {
             hstart = AdaptStartIndex(ph, input_height, output_height);
             hend = AdaptEndIndex(ph, input_height, output_height);
-          } else {
-            hstart = ph * stride_height - padding_height;
-            hend = std::min(hstart + ksize_height, input_height);
-            hstart = std::max(hstart, 0);
           }
           for (int pw = 0; pw < output_width; ++pw) {
+            int pool_size = 1;
             if (adaptive) {
               wstart = AdaptStartIndex(pw, input_width, output_width);
               wend = AdaptEndIndex(pw, input_width, output_width);
             } else {
+              hstart = ph * stride_height - padding_height;
               wstart = pw * stride_width - padding_width;
-              wend = std::min(wstart + ksize_width, input_width);
+              hend = std::min(hstart + ksize_height,
+                              input_height + padding_height);
+              wend =
+                  std::min(wstart + ksize_width, input_width + padding_width);
+              pool_size = (hend - hstart) * (wend - wstart);
+
               wstart = std::max(wstart, 0);
+              hstart = std::max(hstart, 0);
+              hend = std::min(hend, input_height);
+              wend = std::min(wend, input_width);
+            }
+            if (exclusive || adaptive) {
+              pool_size = (hend - hstart) * (wend - wstart);
             }
-            int pool_size = (exclusive || adaptive)
-                                ? (hend - hstart) * (wend - wstart)
-                                : ksize_height * ksize_width;
             float scale = 1.0 / pool_size;
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
@@ -346,23 +370,29 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             if (adaptive) {
               hstart = AdaptStartIndex(ph, input_height, output_height);
               hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
             }
             for (int pw = 0; pw < output_width; ++pw) {
+              int pool_size = 1;
               if (adaptive) {
                 wstart = AdaptStartIndex(pw, input_width, output_width);
                 wend = AdaptEndIndex(pw, input_width, output_width);
               } else {
+                hstart = ph * stride_height - padding_height;
                 wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
+                hend = std::min(hstart + ksize_height,
+                                input_height + padding_height);
+                wend =
+                    std::min(wstart + ksize_width, input_width + padding_width);
+                pool_size = (hend - hstart) * (wend - wstart);
+
                 wstart = std::max(wstart, 0);
+                hstart = std::max(hstart, 0);
+                hend = std::min(hend, input_height);
+                wend = std::min(wend, input_width);
+              }
+              if (exclusive || adaptive) {
+                pool_size = (hend - hstart) * (wend - wstart);
               }
-              int pool_size = (exclusive || adaptive)
-                                  ? (hend - hstart) * (wend - wstart)
-                                  : ksize_height * ksize_width;
               float scale = 1.0 / pool_size;
               for (int h = hstart; h < hend; ++h) {
                 for (int w = wstart; w < wend; ++w) {
@@ -391,23 +421,29 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             if (adaptive) {
               hstart = AdaptStartIndex(ph, input_height, output_height);
               hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
             }
             for (int pw = 0; pw < output_width; ++pw) {
+              int pool_size = 1;
               if (adaptive) {
                 wstart = AdaptStartIndex(pw, input_width, output_width);
                 wend = AdaptEndIndex(pw, input_width, output_width);
               } else {
+                hstart = ph * stride_height - padding_height;
                 wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
+                hend = std::min(hstart + ksize_height,
+                                input_height + padding_height);
+                wend =
+                    std::min(wstart + ksize_width, input_width + padding_width);
+                pool_size = (hend - hstart) * (wend - wstart);
+
                 wstart = std::max(wstart, 0);
+                hstart = std::max(hstart, 0);
+                hend = std::min(hend, input_height);
+                wend = std::min(wend, input_width);
+              }
+              if (exclusive || adaptive) {
+                pool_size = (hend - hstart) * (wend - wstart);
               }
-              int pool_size = (exclusive || adaptive)
-                                  ? (hend - hstart) * (wend - wstart)
-                                  : ksize_height * ksize_width;
               float scale = 1.0 / pool_size;
               for (int h = hstart; h < hend; ++h) {
                 for (int w = wstart; w < wend; ++w) {
@@ -672,34 +708,43 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     int dstart, dend;
     int hstart, hend;
     int wstart, wend;
+
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
         for (int pd = 0; pd < output_depth; ++pd) {
           if (adaptive) {
             dstart = AdaptStartIndex(pd, input_depth, output_depth);
             dend = AdaptEndIndex(pd, input_depth, output_depth);
-          } else {
-            dstart = pd * stride_depth - padding_depth;
-            dend = std::min(dstart + ksize_depth, input_depth);
-            dstart = std::max(dstart, 0);
           }
+
           for (int ph = 0; ph < output_height; ++ph) {
             if (adaptive) {
               hstart = AdaptStartIndex(ph, input_height, output_height);
               hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
             }
+
             for (int pw = 0; pw < output_width; ++pw) {
+              int pool_size = 1;
               if (adaptive) {
                 wstart = AdaptStartIndex(pw, input_width, output_width);
                 wend = AdaptEndIndex(pw, input_width, output_width);
               } else {
+                dstart = pd * stride_depth - padding_depth;
+                dend =
+                    std::min(dstart + ksize_depth, input_depth + padding_depth);
+                hstart = ph * stride_height - padding_height;
+                hend = std::min(hstart + ksize_height,
+                                input_height + padding_height);
                 wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
+                wend =
+                    std::min(wstart + ksize_width, input_width + padding_width);
+                pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+                dstart = std::max(dstart, 0);
+                hstart = std::max(hstart, 0);
                 wstart = std::max(wstart, 0);
+                dend = std::min(dend, input_depth);
+                hend = std::min(hend, input_height);
+                wend = std::min(wend, input_width);
               }
               int output_idx = (pd * output_height + ph) * output_width + pw;
               T ele = pool_process.initial();
@@ -712,10 +757,9 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                   }
                 }
               }
-              int pool_size =
-                  (exclusive || adaptive)
-                      ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                      : ksize_depth * ksize_height * ksize_width;
+              if (exclusive || adaptive) {
+                pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+              }
               pool_process.finalize(static_cast<T>(pool_size), &ele);
               output_data[output_idx] = ele;
             }
@@ -767,7 +811,6 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
     int dstart, dend;
     int hstart, hend;
     int wstart, wend;
-
     if (!channel_last) {
       const int input_stride = input_depth * input_height * input_width;
       const int output_stride = output_depth * output_height * output_width;
@@ -777,29 +820,40 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             if (adaptive) {
               dstart = AdaptStartIndex(pd, input_depth, output_depth);
               dend = AdaptEndIndex(pd, input_depth, output_depth);
-            } else {
-              dstart = pd * stride_depth - padding_depth;
-              dend = std::min(dstart + ksize_depth, input_depth);
-              dstart = std::max(dstart, 0);
             }
+
             for (int ph = 0; ph < output_height; ++ph) {
               if (adaptive) {
                 hstart = AdaptStartIndex(ph, input_height, output_height);
                 hend = AdaptEndIndex(ph, input_height, output_height);
-              } else {
-                hstart = ph * stride_height - padding_height;
-                hend = std::min(hstart + ksize_height, input_height);
-                hstart = std::max(hstart, 0);
               }
+
               for (int pw = 0; pw < output_width; ++pw) {
+                int pool_size = 1;
                 if (adaptive) {
                   wstart = AdaptStartIndex(pw, input_width, output_width);
                   wend = AdaptEndIndex(pw, input_width, output_width);
                 } else {
+                  dstart = pd * stride_depth - padding_depth;
+                  dend = std::min(dstart + ksize_depth,
+                                  input_depth + padding_depth);
+                  hstart = ph * stride_height - padding_height;
+                  hend = std::min(hstart + ksize_height,
+                                  input_height + padding_height);
                   wstart = pw * stride_width - padding_width;
-                  wend = std::min(wstart + ksize_width, input_width);
+                  wend = std::min(wstart + ksize_width,
+                                  input_width + padding_width);
+
+                  pool_size =
+                      (dend - dstart) * (hend - hstart) * (wend - wstart);
+                  dstart = std::max(dstart, 0);
+                  hstart = std::max(hstart, 0);
                   wstart = std::max(wstart, 0);
+                  dend = std::min(dend, input_depth);
+                  hend = std::min(hend, input_height);
+                  wend = std::min(wend, input_width);
                 }
+
                 int output_idx = (pd * output_height + ph) * output_width + pw;
                 T ele = pool_process.initial();
                 for (int d = dstart; d < dend; ++d) {
@@ -811,10 +865,10 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                     }
                   }
                 }
-                int pool_size =
-                    (exclusive || adaptive)
-                        ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                        : ksize_depth * ksize_height * ksize_width;
+                if (exclusive || adaptive) {
+                  pool_size =
+                      (dend - dstart) * (hend - hstart) * (wend - wstart);
+                }
                 pool_process.finalize(static_cast<T>(pool_size), &ele);
                 output_data[output_idx] = ele;
               }
@@ -835,28 +889,38 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             if (adaptive) {
               dstart = AdaptStartIndex(pd, input_depth, output_depth);
               dend = AdaptEndIndex(pd, input_depth, output_depth);
-            } else {
-              dstart = pd * stride_depth - padding_depth;
-              dend = std::min(dstart + ksize_depth, input_depth);
-              dstart = std::max(dstart, 0);
             }
+
             for (int ph = 0; ph < output_height; ++ph) {
               if (adaptive) {
                 hstart = AdaptStartIndex(ph, input_height, output_height);
                 hend = AdaptEndIndex(ph, input_height, output_height);
-              } else {
-                hstart = ph * stride_height - padding_height;
-                hend = std::min(hstart + ksize_height, input_height);
-                hstart = std::max(hstart, 0);
               }
+
               for (int pw = 0; pw < output_width; ++pw) {
+                int pool_size = 1;
                 if (adaptive) {
                   wstart = AdaptStartIndex(pw, input_width, output_width);
                   wend = AdaptEndIndex(pw, input_width, output_width);
                 } else {
+                  dstart = pd * stride_depth - padding_depth;
+                  dend = std::min(dstart + ksize_depth,
+                                  input_depth + padding_depth);
+                  hstart = ph * stride_height - padding_height;
+                  hend = std::min(hstart + ksize_height,
+                                  input_height + padding_height);
                   wstart = pw * stride_width - padding_width;
-                  wend = std::min(wstart + ksize_width, input_width);
+                  wend = std::min(wstart + ksize_width,
+                                  input_width + padding_width);
+
+                  pool_size =
+                      (dend - dstart) * (hend - hstart) * (wend - wstart);
+                  dstart = std::max(dstart, 0);
+                  hstart = std::max(hstart, 0);
                   wstart = std::max(wstart, 0);
+                  dend = std::min(dend, input_depth);
+                  hend = std::min(hend, input_height);
+                  wend = std::min(wend, input_width);
                 }
 
                 T ele = pool_process.initial();
@@ -871,10 +935,10 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                     }
                   }
                 }
-                int pool_size =
-                    (exclusive || adaptive)
-                        ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                        : ksize_depth * ksize_height * ksize_width;
+                if (exclusive || adaptive) {
+                  pool_size =
+                      (dend - dstart) * (hend - hstart) * (wend - wstart);
+                }
                 pool_process.finalize(static_cast<T>(pool_size), &ele);
                 int output_idx =
                     ((pd * output_height + ph) * output_width + pw) *
@@ -943,34 +1007,42 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
           if (adaptive) {
             dstart = AdaptStartIndex(pd, input_depth, output_depth);
             dend = AdaptEndIndex(pd, input_depth, output_depth);
-          } else {
-            dstart = pd * stride_depth - padding_depth;
-            dend = std::min(dstart + ksize_depth, input_depth);
-            dstart = std::max(dstart, 0);
           }
+
           for (int ph = 0; ph < output_height; ++ph) {
             if (adaptive) {
               hstart = AdaptStartIndex(ph, input_height, output_height);
               hend = AdaptEndIndex(ph, input_height, output_height);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              hend = std::min(hstart + ksize_height, input_height);
-              hstart = std::max(hstart, 0);
             }
+
             for (int pw = 0; pw < output_width; ++pw) {
+              int pool_size = 1;
               if (adaptive) {
                 wstart = AdaptStartIndex(pw, input_width, output_width);
                 wend = AdaptEndIndex(pw, input_width, output_width);
               } else {
+                dstart = pd * stride_depth - padding_depth;
+                dend =
+                    std::min(dstart + ksize_depth, input_depth + padding_depth);
+                hstart = ph * stride_height - padding_height;
+                hend = std::min(hstart + ksize_height,
+                                input_height + padding_height);
                 wstart = pw * stride_width - padding_width;
-                wend = std::min(wstart + ksize_width, input_width);
+                wend =
+                    std::min(wstart + ksize_width, input_width + padding_width);
+
+                pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+                dstart = std::max(dstart, 0);
+                hstart = std::max(hstart, 0);
                 wstart = std::max(wstart, 0);
+                dend = std::min(dend, input_depth);
+                hend = std::min(hend, input_height);
+                wend = std::min(wend, input_width);
               }
 
-              int pool_size =
-                  (exclusive || adaptive)
-                      ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                      : ksize_depth * ksize_height * ksize_width;
+              if (exclusive || adaptive) {
+                pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+              }
               float scale = 1.0 / pool_size;
               for (int d = dstart; d < dend; ++d) {
                 for (int h = hstart; h < hend; ++h) {
@@ -1046,34 +1118,44 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             if (adaptive) {
               dstart = AdaptStartIndex(pd, input_depth, output_depth);
               dend = AdaptEndIndex(pd, input_depth, output_depth);
-            } else {
-              dstart = pd * stride_depth - padding_depth;
-              dend = std::min(dstart + ksize_depth, input_depth);
-              dstart = std::max(dstart, 0);
             }
+
             for (int ph = 0; ph < output_height; ++ph) {
               if (adaptive) {
                 hstart = AdaptStartIndex(ph, input_height, output_height);
                 hend = AdaptEndIndex(ph, input_height, output_height);
-              } else {
-                hstart = ph * stride_height - padding_height;
-                hend = std::min(hstart + ksize_height, input_height);
-                hstart = std::max(hstart, 0);
               }
+
               for (int pw = 0; pw < output_width; ++pw) {
+                int pool_size = 1;
                 if (adaptive) {
                   wstart = AdaptStartIndex(pw, input_width, output_width);
                   wend = AdaptEndIndex(pw, input_width, output_width);
                 } else {
+                  dstart = pd * stride_depth - padding_depth;
+                  dend = std::min(dstart + ksize_depth,
+                                  input_depth + padding_depth);
+                  hstart = ph * stride_height - padding_height;
+                  hend = std::min(hstart + ksize_height,
+                                  input_height + padding_height);
                   wstart = pw * stride_width - padding_width;
-                  wend = std::min(wstart + ksize_width, input_width);
+                  wend = std::min(wstart + ksize_width,
+                                  input_width + padding_width);
+
+                  pool_size =
+                      (dend - dstart) * (hend - hstart) * (wend - wstart);
+                  dstart = std::max(dstart, 0);
+                  hstart = std::max(hstart, 0);
                   wstart = std::max(wstart, 0);
+                  dend = std::min(dend, input_depth);
+                  hend = std::min(hend, input_height);
+                  wend = std::min(wend, input_width);
                 }
 
-                int pool_size =
-                    (exclusive || adaptive)
-                        ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                        : ksize_depth * ksize_height * ksize_width;
+                if (exclusive || adaptive) {
+                  pool_size =
+                      (dend - dstart) * (hend - hstart) * (wend - wstart);
+                }
                 float scale = 1.0 / pool_size;
                 for (int d = dstart; d < dend; ++d) {
                   for (int h = hstart; h < hend; ++h) {
@@ -1108,34 +1190,44 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             if (adaptive) {
               dstart = AdaptStartIndex(pd, input_depth, output_depth);
               dend = AdaptEndIndex(pd, input_depth, output_depth);
-            } else {
-              dstart = pd * stride_depth - padding_depth;
-              dend = std::min(dstart + ksize_depth, input_depth);
-              dstart = std::max(dstart, 0);
             }
+
             for (int ph = 0; ph < output_height; ++ph) {
               if (adaptive) {
                 hstart = AdaptStartIndex(ph, input_height, output_height);
                 hend = AdaptEndIndex(ph, input_height, output_height);
-              } else {
-                hstart = ph * stride_height - padding_height;
-                hend = std::min(hstart + ksize_height, input_height);
-                hstart = std::max(hstart, 0);
               }
+
               for (int pw = 0; pw < output_width; ++pw) {
+                int pool_size = 1;
                 if (adaptive) {
                   wstart = AdaptStartIndex(pw, input_width, output_width);
                   wend = AdaptEndIndex(pw, input_width, output_width);
                 } else {
+                  dstart = pd * stride_depth - padding_depth;
+                  dend = std::min(dstart + ksize_depth,
+                                  input_depth + padding_depth);
+                  hstart = ph * stride_height - padding_height;
+                  hend = std::min(hstart + ksize_height,
+                                  input_height + padding_height);
                   wstart = pw * stride_width - padding_width;
-                  wend = std::min(wstart + ksize_width, input_width);
+                  wend = std::min(wstart + ksize_width,
+                                  input_width + padding_width);
+
+                  pool_size =
+                      (dend - dstart) * (hend - hstart) * (wend - wstart);
+                  dstart = std::max(dstart, 0);
+                  hstart = std::max(hstart, 0);
                   wstart = std::max(wstart, 0);
+                  dend = std::min(dend, input_depth);
+                  hend = std::min(hend, input_height);
+                  wend = std::min(wend, input_width);
                 }
 
-                int pool_size =
-                    (exclusive || adaptive)
-                        ? (dend - dstart) * (hend - hstart) * (wend - wstart)
-                        : ksize_depth * ksize_height * ksize_width;
+                if (exclusive || adaptive) {
+                  pool_size =
+                      (dend - dstart) * (hend - hstart) * (wend - wstart);
+                }
                 float scale = 1.0 / pool_size;
                 for (int d = dstart; d < dend; ++d) {
                   for (int h = hstart; h < hend; ++h) {
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 809164df2056c..129298edafcf9 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -348,6 +348,181 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx,
   return dim;
 }
 
+template <typename DeviceContext, typename T>
+class MatMulDoubleGradKernel : public framework::OpKernel<T> {
+ public:
+  void MatMul(const framework::ExecutionContext &context,
+              const framework::Tensor &a, bool trans_a,
+              const framework::Tensor &b, bool trans_b, bool flag,
+              framework::Tensor *out) const {
+    out->mutable_data<T>(context.GetPlace());
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
+    auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
+    // head_number is only read from the attributes in MKLML CPU-only builds.
+    int head_number = 1;
+#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
+    head_number = context.Attr<int>("head_number");
+#endif
+    // A 3-D a times a b of rank <= 2 folds a's batch into its height.
+    if (head_number <= 1 && a.dims().size() == 3 && b.dims().size() <= 2) {
+      // Done only if trans_a is false; a transpose would cost too much time.
+      if (!trans_a) {
+        mat_dim_a.height_ *= mat_dim_a.batch_size_;
+        mat_dim_a.batch_size_ = 0;
+      }
+    }
+    blas.MatMul(a, mat_dim_a, b, mat_dim_b,
+                static_cast<T>(context.Attr<float>("alpha")), out,
+                static_cast<T>(flag));  // NOTE(review): presumably beta
+  }
+  // Writes the product of a and b to out, folding 3-D operands for a 2-D out.
+  void CalcInputGrad(const framework::ExecutionContext &context,
+                     const framework::Tensor &a, bool trans_a,
+                     bool is_fold_init_dims_a, const framework::Tensor &b,
+                     bool trans_b, bool is_fold_init_dims_b, bool flag,
+                     framework::Tensor *out) const {
+    if (out == nullptr) return;
+    bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
+                        out->dims().size() == 2;
+    if (!need_combine) {
+      MatMul(context, a, trans_a, b, trans_b, flag, out);
+    } else {
+      auto &ctx = context.template device_context<DeviceContext>();
+      MatMul(context, is_fold_init_dims_a
+                          ? FoldInitDims(a)
+                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
+             trans_a, is_fold_init_dims_b
+                          ? FoldInitDims(b)
+                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
+             trans_b, flag, out);
+    }
+  }
+  // Computes DX, DY and DDOut from X, Y, DOut and the optional DDX, DDY.
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto x = *context.Input<framework::Tensor>("X");
+    auto y = *context.Input<framework::Tensor>("Y");
+    auto dout = *context.Input<framework::LoDTensor>("DOut");
+    auto *ddx = context.Input<framework::LoDTensor>("DDX");
+    auto *ddy = context.Input<framework::LoDTensor>("DDY");
+
+    auto *dx = context.Output<framework::LoDTensor>("DX");
+    auto *dy = context.Output<framework::LoDTensor>("DY");
+    auto *ddout = context.Output<framework::LoDTensor>("DDOut");
+
+    bool transpose_x = context.Attr<bool>("transpose_X");
+    bool transpose_y = context.Attr<bool>("transpose_Y");
+
+    ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
+    // Temporarily give each output the reshaped dims; restored at the end.
+    framework::DDim dx_dims;
+    if (dx) {
+      dx_dims = dx->dims();
+      if (dx_dims != x.dims()) {
+        dx->Resize(x.dims());
+      }
+    }
+
+    framework::DDim dy_dims;
+    if (dy) {
+      dy_dims = dy->dims();
+      if (dy_dims != y.dims()) {
+        dy->Resize(y.dims());
+      }
+    }
+
+    framework::DDim ddout_dims;
+    if (ddout) {
+      ddout_dims = ddout->dims();
+      if (ddout_dims != dout.dims()) {
+        ddout->Resize(dout.dims());
+      }
+    }
+    // Set once DDOut holds the DDX term; then passed as flag for the DDY term.
+    bool ddout_flag = false;
+    if (ddx) {
+      auto ddx_mat = *ddx;
+      if (ddx_mat.dims() != x.dims()) {
+        ddx_mat.Resize(x.dims());
+      }
+      if (dy) {
+        if (transpose_x && transpose_y) {
+          // dy = dout' * ddx'
+          CalcInputGrad(context, dout, true, true, ddx_mat, true, false, false,
+                        dy);
+        } else if (transpose_x) {
+          // dy = ddx * dout
+          CalcInputGrad(context, ddx_mat, false, false, dout, false, true,
+                        false, dy);
+        } else if (transpose_y) {
+          // dy = dout' * ddx
+          CalcInputGrad(context, dout, true, true, ddx_mat, false, true, false,
+                        dy);
+        } else {
+          // dy = ddx' * dout
+          CalcInputGrad(context, ddx_mat, true, true, dout, false, true, false,
+                        dy);
+        }
+      }
+      // DDX term of ddout: ddx and y, transposed per transpose_X/transpose_Y.
+      if (ddout) {
+        CalcInputGrad(context, ddx_mat, transpose_x, true, y, transpose_y,
+                      false, ddout_flag, ddout);
+        ddout_flag = true;
+      }
+    }
+
+    if (ddy) {
+      auto ddy_mat = *ddy;
+      if (ddy_mat.dims() != y.dims()) {
+        ddy_mat.Resize(y.dims());
+      }
+      if (dx) {
+        if (transpose_x && transpose_y) {
+          // dx = ddy' * dout'
+          CalcInputGrad(context, ddy_mat, true, true, dout, true, false, false,
+                        dx);
+        } else if (transpose_x) {
+          // dx = ddy * dout'
+          CalcInputGrad(context, ddy_mat, false, false, dout, true, false,
+                        false, dx);
+        } else if (transpose_y) {
+          // dx = dout * ddy
+          CalcInputGrad(context, dout, false, false, ddy_mat, false, true,
+                        false, dx);
+        } else {
+          // dx = dout * ddy'
+          CalcInputGrad(context, dout, false, false, ddy_mat, true, false,
+                        false, dx);
+        }
+      }
+      // DDY term of ddout: x and ddy, transposed per transpose_X/transpose_Y.
+      if (ddout) {
+        CalcInputGrad(context, x, transpose_x, true, ddy_mat, transpose_y,
+                      false, ddout_flag, ddout);
+      }
+    }
+    // Restore the dims the outputs had before the temporary Resize above.
+    if (dx) {
+      if (dx_dims != x.dims()) {
+        dx->Resize(dx_dims);
+      }
+    }
+
+    if (dy) {
+      if (dy_dims != y.dims()) {
+        dy->Resize(dy_dims);
+      }
+    }
+
+    if (ddout) {
+      if (ddout_dims != dout.dims()) {
+        ddout->Resize(ddout_dims);
+      }
+    }
+  }
+};
+
 class MatMulOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -647,6 +822,61 @@ class MatMulOpGradMaker : public framework::SingleGradOpMaker<T> {
     retv->SetAttrMap(this->Attrs());
   }
 };
+
+class MatMulOpDoubleGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Operator class registered as matmul_grad_grad; it only infers shapes.
+ protected:
+  void InferShape(framework::InferShapeContext *context) const override {
+    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "matmul");
+    OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul");
+    OP_INOUT_CHECK(context->HasInput("DOut"), "Input", "DOut", "matmul");
+    // DX takes the dims of X, but only when DDY is available.
+    if (context->HasOutput("DX") && context->HasInput("DDY")) {
+      context->ShareDim("X", "DX");
+    }
+    // DY takes the dims of Y, but only when DDX is available.
+    if (context->HasOutput("DY") && context->HasInput("DDX")) {
+      context->ShareDim("Y", "DY");
+    }
+    // DDOut takes the dims of DOut when at least one of DDX/DDY is available.
+    if (context->HasOutput("DDOut") &&
+        (context->HasInput("DDY") || context->HasInput("DDX"))) {
+      context->ShareDim("DOut", "DDOut");
+    }
+  }
+};
+
+template <typename T>
+class MatMulOpDoubleGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+  // Describes the matmul_grad_grad op that differentiates matmul_grad.
+ protected:
+  void Apply(GradOpPtr<T> retv) const override {
+    retv->SetType("matmul_grad_grad");
+    retv->SetInput("X", this->Input("X"));
+    retv->SetInput("Y", this->Input("Y"));
+    retv->SetInput("DOut", this->Input(framework::GradVarName("Out")));
+    retv->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
+    retv->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y")));
+    // Same variables as the DDX/DDY inputs; used below to test for emptiness.
+    auto ddx = this->OutputGrad(framework::GradVarName("X"));
+    auto ddy = this->OutputGrad(framework::GradVarName("Y"));
+    // DDOut needs DDX or DDY; DX needs DDY; DY needs DDX. Else left empty.
+    if (!ddx.empty() || !ddy.empty()) {
+      retv->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
+    }
+    retv->SetOutput(
+        "DX", ddy.empty() ? this->EmptyInputGrad() : this->InputGrad("X"));
+    retv->SetOutput(
+        "DY", ddx.empty() ? this->EmptyInputGrad() : this->InputGrad("Y"));
+    // The attributes of this op are passed through unchanged.
+    retv->SetAttrMap(this->Attrs());
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -654,7 +884,10 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker,
                   ops::MatMulOpGradMaker<paddle::framework::OpDesc>,
                   ops::MatMulOpGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad);
+REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad,
+                  ops::MatMulOpDoubleGradMaker<paddle::framework::OpDesc>,
+                  ops::MatMulOpDoubleGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(matmul_grad_grad, ops::MatMulOpDoubleGrad);
 REGISTER_OP_CPU_KERNEL(
     matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MatMulKernel<paddle::platform::CPUDeviceContext, double>);
@@ -663,6 +896,11 @@ REGISTER_OP_CPU_KERNEL(
     ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, double>);
 
+REGISTER_OP_CPU_KERNEL(
+    matmul_grad_grad,
+    ops::MatMulDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
+
 #ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL(
     matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>,
@@ -675,4 +913,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::MatMulGradKernel<paddle::platform::CUDADeviceContext,
                           paddle::platform::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul_grad_grad,
+    ops::MatMulDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MatMulDoubleGradKernel<paddle::platform::CUDADeviceContext, double>);
 #endif
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
index 7db2e9421b5ca..6d8d18a3d126e 100644
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -83,6 +83,18 @@ class MaxOutOp : public framework::OperatorWithKernel {
                                      "Attr(groups) of Op(maxout) should be "
                                      "larger than 1. But received %d.",
                                      groups));
+    PADDLE_ENFORCE_EQ(
+        axis == 1 || axis == -1 || axis == 3, true,
+        platform::errors::InvalidArgument(
+            "axis only supported 1, -1 or 3, but received axis is: %d", axis));
+    PADDLE_ENFORCE_EQ(in_x_dims.size(), 4,
+                      platform::errors::InvalidArgument(
+                          "x's dims should be 4, but received x's dims is: %d",
+                          in_x_dims.size()));
+
+    if (axis < 0) {
+      axis += in_x_dims.size();
+    }
     PADDLE_ENFORCE_EQ(
         in_x_dims[axis] % groups, 0,
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h
index ec3897e4044ad..64b538fc5d5bd 100644
--- a/paddle/fluid/operators/maxout_op.h
+++ b/paddle/fluid/operators/maxout_op.h
@@ -31,6 +31,9 @@ class MaxOutKernel : public framework::OpKernel<T> {
     Tensor* out = context.Output<Tensor>("Out");
     int groups = context.template Attr<int>("groups");
     int axis = context.template Attr<int>("axis");
+    if (axis < 0) {
+      axis += in_x->dims().size();
+    }
 
     math::MaxOutFunctor<DeviceContext, T> maxout_forward;
     maxout_forward(context.template device_context<DeviceContext>(), *in_x, out,
@@ -49,6 +52,10 @@ class MaxOutGradKernel : public framework::OpKernel<T> {
     Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
     int groups = context.template Attr<int>("groups");
     int axis = context.template Attr<int>("axis");
+    if (axis < 0) {
+      axis += in_x->dims().size();
+    }
+
     auto& device_ctx = context.template device_context<DeviceContext>();
     math::SetConstant<DeviceContext, T> zero;
     if (in_x_grad) {
diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
new file mode 100644
index 0000000000000..c0aa00e79341e
--- /dev/null
+++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
@@ -0,0 +1,127 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "paddle/fluid/operators/metrics/accuracy_op.h"
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+
+// XPU kernel of the accuracy op. The int64 Indices and Label inputs are
+// narrowed to int32 on the host and copied back to the device before
+// xpu::accuracy is called with the int32 buffers.
+template <typename DeviceContext, typename T>
+class AccuracyXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* inference = ctx.Input<Tensor>("Out");
+    auto* indices = ctx.Input<Tensor>("Indices");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");
+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
+    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
+    const int64_t* indices_data = indices->data<int64_t>();
+    const int64_t* label_data = label->data<int64_t>();
+    size_t num_samples = inference->dims()[0];
+    size_t class_dim = inference->dims()[1];
+    if (num_samples == 0) {
+      return;
+    }
+    const size_t num_indices = num_samples * class_dim;
+    size_t indices_int32_size = num_indices * sizeof(int);
+    size_t indices_int64_size = num_indices * sizeof(int64_t);
+    size_t label_int32_size = num_samples * sizeof(int);
+    size_t label_int64_size = num_samples * sizeof(int64_t);
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    const auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace());
+
+    // Owning handles for the device buffers: xpu_free runs when they leave
+    // scope, so nothing leaks if a later PADDLE_ENFORCE throws.
+    auto xpu_deleter = [](int* ptr) { xpu_free(ptr); };
+    using XPUIntBuffer = std::unique_ptr<int, decltype(xpu_deleter)>;
+
+    int* indices_int32_raw = nullptr;
+    PADDLE_ENFORCE_EQ(
+        xpu_malloc(reinterpret_cast<void**>(&indices_int32_raw),
+                   indices_int32_size),
+        XPU_SUCCESS,
+        platform::errors::ResourceExhausted(
+            "\n\nOut of memory error on XPU, Cannot allocate %s memory"
+            " on XPU. \n\nPlease check whether there is any other process "
+            "using XPU.\n",
+            string::HumanReadableSize(indices_int32_size)));
+    XPUIntBuffer indices_int32_device(indices_int32_raw, xpu_deleter);
+
+    int* label_int32_raw = nullptr;
+    PADDLE_ENFORCE_EQ(
+        xpu_malloc(reinterpret_cast<void**>(&label_int32_raw),
+                   label_int32_size),
+        XPU_SUCCESS,
+        platform::errors::ResourceExhausted(
+            "\n\nOut of memory error on XPU, Cannot allocate %s memory"
+            " on XPU. \n\nPlease check whether there is any other process "
+            "using XPU.\n",
+            string::HumanReadableSize(label_int32_size)));
+    XPUIntBuffer label_int32_device(label_int32_raw, xpu_deleter);
+
+    // Host staging buffers; std::vector releases them on every exit path.
+    std::vector<int> indices_int32_host(num_indices);
+    std::vector<int64_t> indices_int64_host(num_indices);
+    std::vector<int> label_int32_host(num_samples);
+    std::vector<int64_t> label_int64_host(num_samples);
+    dev_ctx.Wait();
+    memory::Copy(platform::CPUPlace(), indices_int64_host.data(), xpu_place,
+                 indices_data, indices_int64_size);
+    memory::Copy(platform::CPUPlace(), label_int64_host.data(), xpu_place,
+                 label_data, label_int64_size);
+    for (size_t i = 0; i < num_samples; ++i) {
+      label_int32_host[i] = static_cast<int>(label_int64_host[i]);
+      for (size_t j = 0; j < class_dim; ++j) {
+        indices_int32_host[i * class_dim + j] =
+            static_cast<int>(indices_int64_host[i * class_dim + j]);
+      }
+    }
+    memory::Copy(xpu_place, indices_int32_device.get(), platform::CPUPlace(),
+                 indices_int32_host.data(), indices_int32_size);
+    memory::Copy(xpu_place, label_int32_device.get(), platform::CPUPlace(),
+                 label_int32_host.data(), label_int32_size);
+    int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device.get(),
+                          label_int32_device.get(), num_samples, class_dim,
+                          correct_data, total_data, accuracy_data);
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU kernel error!"));
+    // Keep the original ordering: synchronize first, then the buffers are
+    // released when the owning handles go out of scope.
+    dev_ctx.Wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    accuracy,
+    ops::AccuracyXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake
new file mode 100644
index 0000000000000..232626df02e50
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake
@@ -0,0 +1,2 @@
+# Unit test that runs pool2d followed by transpose with use_mkldnn on NHWC input.
+cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op pooling transpose_op scope device_context enforce executor)
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index bf12c61a4d9b1..72d2f779f800b 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -126,6 +126,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, data_dims,
                   strides, ksize);
 
+    platform::PoolingMKLDNNHandler<T>::ComputeAdaptivePoolParameters(
+        ctx, paddle::framework::vectorize(in_x->dims()), ksize, strides);
+
     auto& dev_ctx =
         ctx.template device_context<platform::MKLDNNDeviceContext>();
 
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
new file mode 100644
index 0000000000000..e7caeef85f5f9
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <random>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
+
+USE_OP(pool2d);
+USE_OP_DEVICE_KERNEL(pool2d, MKLDNN);
+USE_OP(transpose);
+USE_OP_DEVICE_KERNEL(transpose, MKLDNN);
+
+namespace paddle {
+namespace operators {
+
+struct InputVars {
+  std::string name;              // name of the variable in the scope
+  framework::LoDTensor *tensor;  // tensor obtained from that scope variable
+};
+
+TEST(test_pool2d_transpose_nhwc, cpu_place) {
+  framework::DDim dims({1, 4, 8, 512});           // NHWC shape
+  framework::DDim expected_dims({1, 7, 512, 3});  // expected shape of z
+  platform::CPUPlace p;
+  framework::Scope scope;
+  // x is the input, y the pool2d output and z the transposed output.
+  InputVars input_name = {"x",
+                          scope.Var("x")->GetMutable<framework::LoDTensor>()};
+  // Fill x with values drawn uniformly from [10, 20)
+  std::uniform_real_distribution<float> dist(static_cast<float>(10.0),
+                                             static_cast<float>(20.0));
+  std::mt19937 engine;
+  size_t numel = static_cast<size_t>(framework::product(dims));
+  input_name.tensor->Resize(dims);
+  auto data_ptr = input_name.tensor->mutable_data<float>(p);
+  for (size_t i = 0; i < numel; ++i) {
+    data_ptr[i] = dist(engine);
+  }
+
+  scope.Var("y")->GetMutable<framework::LoDTensor>();
+  auto *z = scope.Var("z")->GetMutable<framework::LoDTensor>();
+
+  auto &pool = platform::DeviceContextPool::Instance();
+
+  // Make pool2d followed by transpose. pool2d reads x and writes y using
+  // max pooling with a 2x2 window on NHWC data.
+  auto ksize = std::vector<int>(2, 2);
+  auto op_pool = framework::OpRegistry::CreateOp(
+      "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}},
+      {{"pooling_type", {std::string("max")}},
+       {"ksize", {ksize}},
+       {"data_format", {std::string("NHWC")}},
+       {"use_mkldnn", {true}}});
+  // transpose reads y and writes z with the axis permutation {0, 2, 3, 1}.
+  auto axis = std::vector<int>(4, 0);
+  axis[1] = 2;
+  axis[2] = 3;
+  axis[3] = 1;
+  auto op_transpose = framework::OpRegistry::CreateOp(
+      "transpose", {{"X", {"y"}}}, {{"Out", {"z"}}},
+      {{"axis", {axis}}, {"use_mkldnn", {true}}});
+
+  op_pool->Run(scope, p);
+  op_transpose->Run(scope, p);
+  pool.Get(p)->Wait();
+
+  // Verify the shape of the transposed output z
+  PADDLE_ENFORCE_EQ(z->dims(), expected_dims,
+                    platform::errors::InvalidArgument(
+                        "Computed shape does not match expected shape"));
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index 1c75424fae7ef..8748078109f16 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -104,25 +104,29 @@ class NCEKernel : public framework::OpKernel<T> {
 
         PADDLE_ENFORCE_EQ(
             dist_probs->numel(), num_total_classes,
-            "ShapeError: The number of elements in Input(CustomDistProbs) "
-            "should be equal to the number of total classes. But Received: "
-            "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) "
-            "= %d.",
-            dist_probs->numel(), num_total_classes);
+            platform::errors::InvalidArgument(
+                "ShapeError: The number of elements in Input(CustomDistProbs) "
+                "should be equal to the number of total classes. But Received: "
+                "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) "
+                "= %d.",
+                dist_probs->numel(), num_total_classes));
         PADDLE_ENFORCE_EQ(
             dist_alias->numel(), num_total_classes,
-            "ShapeError: The number of elements in Input(CustomDistAlias) "
-            "should be equal to the number of total classes. But Received: "
-            "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) "
-            "= %d.",
-            dist_alias->numel(), num_total_classes);
+            platform::errors::InvalidArgument(
+                "ShapeError: The number of elements in Input(CustomDistAlias) "
+                "should be equal to the number of total classes. But Received: "
+                "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) "
+                "= %d.",
+                dist_alias->numel(), num_total_classes));
         PADDLE_ENFORCE_EQ(
             dist_alias_probs->numel(), num_total_classes,
-            "ShapeError: The number of elements in Input(CustomDistAliasProbs) "
-            "should be equal to the number of total classes. But Received: "
-            "Input(CustomDistAliasProbs).numel() = %d, "
-            "Attr(num_total_classes) = %d.",
-            dist_alias_probs->numel(), num_total_classes);
+            platform::errors::InvalidArgument(
+                "ShapeError: The number of elements in "
+                "Input(CustomDistAliasProbs) "
+                "should be equal to the number of total classes. But Received: "
+                "Input(CustomDistAliasProbs).numel() = %d, "
+                "Attr(num_total_classes) = %d.",
+                dist_alias_probs->numel(), num_total_classes));
 
         const float *probs_data = dist_probs->data<float>();
         const int *alias_data = dist_alias->data<int>();
@@ -140,10 +144,11 @@ class NCEKernel : public framework::OpKernel<T> {
 
     for (int x = 0; x < sample_labels->numel(); x++) {
       PADDLE_ENFORCE_GE(sample_labels_data[x], 0,
-                        "ValueError: Every sample label should be "
-                        "non-negative. But received: "
-                        "Input(SampleLabels)[%d] = %d",
-                        x, sample_labels_data[x]);
+                        platform::errors::InvalidArgument(
+                            "ValueError: Every sample label should be "
+                            "non-negative. But received: "
+                            "Input(SampleLabels)[%d] = %d",
+                            x, sample_labels_data[x]));
     }
 
     auto sample_out = context.Output<Tensor>("SampleLogits");
@@ -311,25 +316,29 @@ class NCEGradKernel : public framework::OpKernel<T> {
 
         PADDLE_ENFORCE_EQ(
             dist_probs->numel(), num_total_classes,
-            "ShapeError: The number of elements in Input(CustomDistProbs) "
-            "should be equal to the number of total classes. But Received: "
-            "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) "
-            "= %d.",
-            dist_probs->numel(), num_total_classes);
+            platform::errors::InvalidArgument(
+                "ShapeError: The number of elements in Input(CustomDistProbs) "
+                "should be equal to the number of total classes. But Received: "
+                "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) "
+                "= %d.",
+                dist_probs->numel(), num_total_classes));
         PADDLE_ENFORCE_EQ(
             dist_alias->numel(), num_total_classes,
-            "ShapeError: The number of elements in Input(CustomDistAlias) "
-            "should be equal to the number of total classes. But Received: "
-            "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) "
-            "= %d.",
-            dist_alias->numel(), num_total_classes);
+            platform::errors::InvalidArgument(
+                "ShapeError: The number of elements in Input(CustomDistAlias) "
+                "should be equal to the number of total classes. But Received: "
+                "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) "
+                "= %d.",
+                dist_alias->numel(), num_total_classes));
         PADDLE_ENFORCE_EQ(
             dist_alias_probs->numel(), num_total_classes,
-            "ShapeError: The number of elements in Input(CustomDistAliasProbs) "
-            "should be equal to the number of total classes. But Received: "
-            "Input(CustomDistAliasProbs).numel() = %d, "
-            "Attr(num_total_classes) = %d.",
-            dist_alias_probs->numel(), num_total_classes);
+            platform::errors::InvalidArgument(
+                "ShapeError: The number of elements in "
+                "Input(CustomDistAliasProbs) "
+                "should be equal to the number of total classes. But Received: "
+                "Input(CustomDistAliasProbs).numel() = %d, "
+                "Attr(num_total_classes) = %d.",
+                dist_alias_probs->numel(), num_total_classes));
 
         const float *probs_data = dist_probs->data<float>();
         const int *alias_data = dist_alias->data<int>();
diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
index e42c4666e110f..75d1b36c7d6a8 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -37,13 +37,15 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
     if (ctx->HasInput("AccumulatePositivePair") ||
         ctx->HasInput("AccumulateNegativePair") ||
         ctx->HasInput("AccumulateNeutralPair")) {
-      PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") &&
-                         ctx->HasInput("AccumulateNegativePair") &&
-                         ctx->HasInput("AccumulateNeutralPair"),
-                     "All optional inputs(AccumulatePositivePair, "
-                     "AccumulateNegativePair, AccumulateNeutralPair) of "
-                     "PositiveNegativePairOp are required if one of them is "
-                     "specified.");
+      PADDLE_ENFORCE_EQ(
+          ctx->HasInput("AccumulatePositivePair") &&
+              ctx->HasInput("AccumulateNegativePair") &&
+              ctx->HasInput("AccumulateNeutralPair"),
+          true, platform::errors::InvalidArgument(
+                    "All optional inputs(AccumulatePositivePair, "
+                    "AccumulateNegativePair, AccumulateNeutralPair) of "
+                    "PositiveNegativePairOp are required if one of them "
+                    "is specified."));
       PADDLE_ENFORCE_EQ(
           ctx->GetInputDim("AccumulatePositivePair"), scalar_dim,
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
index 7cd164bfd3a3d..9d2639c10301d 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
@@ -32,7 +32,7 @@ class LogsumexpOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_LE(x_rank, 4,
                       platform::errors::InvalidArgument(
                           "The input tensor X's dimensions of logsumexp "
-                          "should be less equal than 4. But received X's "
+                          "should be less than or equal to 4. But received X's "
                           "dimensions = %d, X's shape = [%s].",
                           x_rank, x_dims));
     auto axis = ctx->Attrs().Get<std::vector<int>>("axis");
     auto axis = ctx->Attrs().Get<std::vector<int>>("axis");
@@ -45,20 +45,18 @@ class LogsumexpOp : public framework::OperatorWithKernel {
             axis.size()));
 
     for (size_t i = 0; i < axis.size(); i++) {
-      PADDLE_ENFORCE_LT(
-          axis[i], x_rank,
-          platform::errors::InvalidArgument(
-              "axis[%d] should be in the "
-              "range [-dimension(X), dimension(X)] "
-              "where dimesion(X) is %d. But received axis[i] = %d.",
-              i, x_rank, axis[i]));
-      PADDLE_ENFORCE_GE(
-          axis[i], -x_rank,
-          platform::errors::InvalidArgument(
-              "axis[%d] should be in the "
-              "range [-dimension(X), dimension(X)] "
-              "where dimesion(X) is %d. But received axis[i] = %d.",
-              i, x_rank, axis[i]));
+      PADDLE_ENFORCE_LT(axis[i], x_rank,
+                        platform::errors::InvalidArgument(
+                            "axis[%d] should be in the "
+                            "range [-D, D), where D is the dimensions of X and "
+                            "D is %d. But received axis[%d] = %d.",
+                            i, x_rank, i, axis[i]));
+      PADDLE_ENFORCE_GE(axis[i], -x_rank,
+                        platform::errors::InvalidArgument(
+                            "axis[%d] should be in the "
+                            "range [-D, D), where D is the dimensions of X and "
+                            "D is %d. But received axis[%d] = %d.",
+                            i, x_rank, i, axis[i]));
       if (axis[i] < 0) {
         axis[i] += x_rank;
       }
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index e03824ca8c3f4..05bb37ee421ff 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -49,7 +49,8 @@ inline std::vector<int> get_new_shape(
             "the element's shape must be [1]. But received the element's shape "
             "is [%s]",
             tensor->dims()));
-    if (platform::is_gpu_place(tensor->place())) {
+    if (platform::is_gpu_place(tensor->place()) ||
+        platform::is_xpu_place(tensor->place())) {
       framework::Tensor temp;
       TensorCopySync(*tensor, platform::CPUPlace(), &temp);
 
@@ -362,7 +363,8 @@ class ReshapeKernel {
       if (shape_tensor) {
         auto *shape_data = shape_tensor->data<int>();
         framework::Tensor cpu_shape_tensor;
-        if (platform::is_gpu_place(shape_tensor->place())) {
+        if (platform::is_gpu_place(shape_tensor->place()) ||
+            platform::is_xpu_place(shape_tensor->place())) {
           TensorCopySync(*shape_tensor, platform::CPUPlace(),
                          &cpu_shape_tensor);
           shape_data = cpu_shape_tensor.data<int>();
@@ -375,9 +377,22 @@ class ReshapeKernel {
 
     out->Resize(out_dims);
     out->mutable_data(ctx.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, ctx.GetPlace(),
-        ctx.template device_context<platform::DeviceContext>(), out);
+
+#ifdef PADDLE_WITH_XPU
+    if (platform::is_xpu_place(ctx.GetPlace())) {
+      auto &dev_ctx =
+          ctx.template device_context<paddle::platform::XPUDeviceContext>();
+      xpu::memcpy_device(
+          dev_ctx.x_context(), out->data<void>(), in->data<void>(),
+          in->numel() * paddle::framework::SizeOfType(in->type()));
+    } else {
+#endif
+      framework::TensorCopy(
+          *in, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), out);
+#ifdef PADDLE_WITH_XPU
+    }
+#endif
     out->Resize(out_dims);
   }
 };
@@ -644,3 +659,15 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad_grad, float,
                                 ops::ReshapeDoubleGradKernel, plat::float16,
                                 ops::ReshapeDoubleGradKernel);
 #endif
+
+#ifdef PADDLE_WITH_XPU
+REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel, plat::float16,
+                               ops::ReshapeKernel);
+REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel, plat::float16,
+                               ops::ReshapeGradKernel);
+#endif
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
new file mode 100644
index 0000000000000..4002be8100152
--- /dev/null
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/scale_op.h"
+#include <string>
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+
+// XPU kernel of the `scale` operator. Reads the `scale`, `bias` and
+// `bias_after_scale` attributes and forwards the element-wise computation to
+// xpu::scale. Input(X) may be a LoDTensor or a SelectedRows.
+template <typename DeviceContext, typename T>
+class ScaleXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_var = ctx.InputVar("X");
+    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
+    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
+    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
+    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
+    auto* out_var = ctx.OutputVar("Out");
+    // For a SelectedRows input that is not computed in place, copy the row
+    // index and height to the output before its value tensor is filled.
+    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
+      auto& in_slr = in_var->Get<framework::SelectedRows>();
+      auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
+      out_slr->set_rows(in_slr.rows());
+      out_slr->set_height(in_slr.height());
+    }
+    auto* out =
+        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
+    // Allocate the output on the same place as the input.
+    out->mutable_data<T>(in->place());
+    PADDLE_ENFORCE_EQ(
+        in->dims(), out->dims(),
+        platform::errors::InvalidArgument("In and out should have the same dim,"
+                                          " expected %s, but got %s.",
+                                          in->dims().to_str().c_str(),
+                                          out->dims().to_str().c_str()));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    // Use T for the buffer types so they always match the registered kernel
+    // type instead of silently assuming float.
+    int r = xpu::scale(dev_ctx.x_context(), in->numel(), scale, bias,
+                       bias_after_scale, in->data<T>(), out->data<T>());
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU kernel error!"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    scale, ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc
index 99e8064d2446f..5f976685c982b 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc
@@ -59,20 +59,22 @@ class SequenceConvOp : public framework::OperatorWithKernel {
             filter_dims[0], context_length * in_dims[1]));
 
     if (ctx->Attrs().Get<bool>("paddingTrainable")) {
-      PADDLE_ENFORCE(
-          ctx->HasInput("PaddingData"),
-          "Input(PaddingData) of SequenceConvOp should not be null.");
+      OP_INOUT_CHECK(ctx->HasInput("PaddingData"), "Input", "PaddingData",
+                     "sequence_conv");
       framework::DDim padding_dim = ctx->GetInputDim("PaddingData");
       int up_pad = std::max(0, -context_start);
       int down_pad = std::max(0, context_start + context_length - 1);
       int total_pad = up_pad + down_pad;
       int input_width = static_cast<int>(in_dims[1]);
+      bool start_equals_zero = context_start == 0;
+      bool length_equals_one = context_length == 1;
+      bool start_length = start_equals_zero && length_equals_one;
 
-      if (context_start == 0 && context_length == 1) {
-        PADDLE_THROW(
-            "If context_start is 0 and context_length is 1, paddingTrainable "
-            "should be false.");
-      }
+      PADDLE_ENFORCE_EQ(
+          start_length, false,
+          platform::errors::InvalidArgument(
+              "If context_start is 0 and context_length is 1, paddingTrainable "
+              "should be false."));
       PADDLE_ENFORCE_EQ(
           padding_dim.size(), 2,
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
index 1dbddfa709d72..758ff01b1e7ec 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -43,8 +43,11 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
               "Output LoDTensor of SequenceEnumerate operator.");
     AddAttr<int>("win_size", "(int) The enumerate sequence window size.")
         .AddCustomChecker([](const int& win_size) {
-          PADDLE_ENFORCE(win_size >= 2,
-                         "The window size should be not less than 2.");
+          PADDLE_ENFORCE_GE(win_size, 2,
+                            platform::errors::InvalidArgument(
+                                "The window size should be not less than 2. "
+                                "Received window size is %d.",
+                                win_size));
         });
     AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
         .SetDefault(0);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
index d5deb7582c7c0..6d8f60ce932ab 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
@@ -58,7 +58,10 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
 
     PADDLE_ENFORCE_EQ(
         static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
-        "The actual input data's size mismatched with LoD information.");
+        platform::errors::InvalidArgument(
+            "The actual input data's size mismatched with LoD information. "
+            "Received input data size is %d (actual) vs %d (LoD information).",
+            static_cast<uint64_t>(in_dims[0]), in_lod[0].back()));
 
     /* Generate enumerate sequence set */
     auto stream = context.cuda_device_context().stream();
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
index 4807521bc0d92..d104d33caebb3 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h
@@ -29,21 +29,31 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> {
     int win_size = context.Attr<int>("win_size");
     auto pad_value = static_cast<T>(context.Attr<int>("pad_value"));
 
-    PADDLE_ENFORCE_EQ(in->lod().empty(), false,
-                      "Input(X) Tensor of SequenceEnumerateOp does not contain "
-                      "LoD information.");
+    PADDLE_ENFORCE_EQ(
+        in->lod().empty(), false,
+        platform::errors::InvalidArgument(
+            "Input(X) Tensor of SequenceEnumerateOp does not contain "
+            "LoD information."));
 
     auto in_dims = in->dims();
     auto lod0 = in->lod()[0];
     PADDLE_ENFORCE_EQ(
         static_cast<uint64_t>(in_dims[0]), lod0.back(),
-        "The actual input data's size mismatched with LoD information.");
+        platform::errors::InvalidArgument(
+            "The actual input data's size mismatched with LoD information. "
+            "Received input data size is %d (actual) vs %d (LoD information).",
+            static_cast<uint64_t>(in_dims[0]), lod0.back()));
     PADDLE_ENFORCE_EQ(
         in_dims.size(), 2UL,
-        "Input(X) of SequenceEnumerate operator's rank should be 2.");
+        platform::errors::InvalidArgument(
+            "Input(X) of SequenceEnumerate operator's rank should be 2. "
+            "Received %d instead.",
+            in_dims.size()));
     PADDLE_ENFORCE_EQ(in_dims[1], 1,
-                      "Input(X) of SequenceEnumerate operator's 2nd "
-                      "dimension should be 1.");
+                      platform::errors::InvalidArgument(
+                          "Input(X) of SequenceEnumerate operator's 2nd "
+                          "dimension should be 1. Received %d instead.",
+                          in_dims[1]));
 
     // Generate enumerate sequence set
     auto in_data = in->data<T>();
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
index b8912dd4c7960..b06b1f755a22b 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
@@ -69,8 +69,10 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
                  "= max(Input(X)).")
         .SetDefault(-1)
         .AddCustomChecker([](const int& v) {
-          PADDLE_ENFORCE(v < 0 || v >= 1,
-                         "Attr(maxlen) must be less than 0 or larger than 1");
+          PADDLE_ENFORCE_EQ(
+              v < 0 || v >= 1, true,
+              platform::errors::InvalidArgument(
+                  "Attr(maxlen) must be less than 0 or not less than 1."));
         });
     AddAttr<int>("out_dtype", "Output data type");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
index 8fe68deca66aa..37f9caf76ceba 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
@@ -42,14 +42,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
                                         "Input(X) Tensor of SequencePoolOp "
                                         "does not contain LoD information."));
     PADDLE_ENFORCE_LE(lod_level, 2UL,
-                      "The lod level of input shall be no more than 2.");
+                      platform::errors::InvalidArgument(
+                          "The lod level of input shall be no more than 2. "
+                          "Received lod level is %d.",
+                          lod_level));
     PADDLE_ENFORCE_GE(
         dims[0],
         /*batch size = */ static_cast<int64_t>(lod[lod_level - 1].size() - 1),
-        "The first dimension of Input(X) must be large than batch size.");
+        platform::errors::InvalidArgument(
+            "The first dimension of Input(X) must be no less than batch size. "
+            "But received first dimension of Input(X) is %d, while batch "
+            "size is %d.",
+            dims[0], static_cast<int64_t>(lod[lod_level - 1].size() - 1)));
     if (lod_level > 1UL) {
       PADDLE_ENFORCE_EQ(lod[0][lod[0].size() - 1], lod[1].size() - 1,
-                        "The input lod information is illegal.");
+                        platform::errors::InvalidArgument(
+                            "The input lod information is illegal."));
       framework::LoD out_lod;
       out_lod.push_back(lod[0]);
       out->set_lod(out_lod);
diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc
new file mode 100644
index 0000000000000..2e9092a643253
--- /dev/null
+++ b/paddle/fluid/operators/shape_op_xpu.cc
@@ -0,0 +1,21 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/shape_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel<bool>, ops::ShapeKernel<int>,
+                       ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
+                       ops::ShapeKernel<double>);
+
+#endif
diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc
new file mode 100644
index 0000000000000..44fd555544e7f
--- /dev/null
+++ b/paddle/fluid/operators/sign_op_xpu.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/sign_op.h"
+#include "paddle/fluid/platform/xpu_header.h"
+namespace paddle {
+namespace operators {
+
+// XPU kernel of the `sign` operator: runs xpu::activation_forward with
+// Activation_t::SIGN over Input(X) and writes the result to Output(Out).
+template <typename DeviceContext, typename T>
+class SignXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    // Allocate the output on the same place as the input.
+    out->mutable_data<T>(in->place());
+    auto xpu_context = context.device_context<DeviceContext>().x_context();
+    int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN,
+                                    in->numel(), in->data<T>(), out->data<T>());
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU kernel error!"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    sign, ops::SignXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc
new file mode 100644
index 0000000000000..29740000aeb4c
--- /dev/null
+++ b/paddle/fluid/operators/softmax_op_xpu.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+// Forward XPU kernel of the `softmax` operator. Only softmax over the last
+// dimension of Input(X) is accepted; the sizes n (before axis) and d (from
+// axis on) are passed to xpu::softmax2d_forward.
+template <typename DeviceContext, typename T>
+class SoftmaxXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    const int rank = x->dims().size();
+    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+    PADDLE_ENFORCE_EQ(axis == -1 || axis == rank - 1, true,
+                      platform::errors::InvalidArgument(
+                          "xpu softmax kernel only support last dimension of x "
+                          "(axis==-1 or axis==x_dims-1), but received axis: "
+                          "%d, x's shape: %s.",
+                          axis, x->dims()));
+
+    // allocate memory on device.
+    out->mutable_data<T>(context.GetPlace());
+
+    const int n = SizeToAxis(axis, x->dims());
+    const int d = SizeFromAxis(axis, x->dims());
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    // NOTE(review): the last argument (d <= 2048) presumably selects a
+    // small-dimension code path in the XPU library; confirm against its API.
+    int r = xpu::softmax2d_forward(dev_ctx.x_context(), x->data<T>(),
+                                   out->data<T>(), n, d, d <= 2048);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(softmax2d_forward) return wrong "
+                                   "value[%d], please check whether "
+                                   "Baidu Kunlun Card is properly installed.",
+                                   r));
+  }
+};
+
+// Backward XPU kernel of the `softmax` operator. Computes the gradient of
+// Input(X) from Output(Out) and the gradient of Out by calling
+// xpu::softmax2d_backward with the sizes n and d derived from `axis`.
+template <typename DeviceContext, typename T>
+class SoftmaxGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Input<Tensor>("Out");
+    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
+    const int rank = dx->dims().size();
+    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
+
+    // allocate memory on device.
+    dx->mutable_data<T>(context.GetPlace());
+
+    const int n = SizeToAxis(axis, dx->dims());
+    const int d = SizeFromAxis(axis, dx->dims());
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int r = xpu::softmax2d_backward(dev_ctx.x_context(), out->data<T>(),
+                                    dout->data<T>(), dx->data<T>(), n, d);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(softmax2d_backward) return wrong "
+                                   "value[%d], please check whether "
+                                   "Baidu Kunlun Card is properly installed.",
+                                   r));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    softmax, ops::SoftmaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    softmax_grad,
+    ops::SoftmaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif  // PADDLE_WITH_XPU
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index ba56e5e36f985..3ac7a5a127b37 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -357,7 +357,8 @@ static void HardLabelSoftmaxWithCrossEntropy(
     CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
     CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
     default:
-      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Block Dimension must be 2^n in softmax_with_cross_entropy_op."));
       break;
   }
 #undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
@@ -397,7 +398,8 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
     CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
     CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
     default:
-      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Block Dimension must be 2^n in softmax_with_cross_entropy_op."));
       break;
   }
 
@@ -408,8 +410,10 @@ template <typename T>
 class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(context.GetPlace()), true,
+        platform::errors::Unavailable("softmax_with_cross_entropy operator's "
+                                      "CUDA kernel only runs on GPU device."));
     const Tensor* logits = context.Input<Tensor>("Logits");
     const Tensor* labels = context.Input<Tensor>("Label");
     Tensor* softmax = context.Output<Tensor>("Softmax");
@@ -469,8 +473,10 @@ template <typename T>
 class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(context.GetPlace()), true,
+        platform::errors::Unavailable("softmax_with_cross_entropy operator's "
+                                      "CUDA kernel only runs on GPU device."));
     const Tensor* labels = context.Input<Tensor>("Label");
     const T* loss_grad_data =
         context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 93d8f42ce2175..479973a5daa5f 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -249,6 +249,19 @@ class Squeeze2GradOp : public framework::OperatorWithKernel {
   }
 };
 
+template <typename T>
+class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("squeeze");
+    grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
 // FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze,
 // the XShape is used to carry the shape and lod of X which will be used in
 // squeeze_grad, in this way, the framework can reuse the memory of X
@@ -279,8 +292,22 @@ class Squeeze2GradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(SequeezeInplaceInferer, {"X", "Out"});
-DECLARE_INPLACE_OP_INFERER(SequeezeGradInplaceInferer,
+template <typename T>
+class Squeeze2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("squeeze2");
+    grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    grad_op->SetOutput("XShape", this->Input("XShape"));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
+DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"});
+DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer,
                            {framework::GradVarName("Out"),
                             framework::GradVarName("X")});
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X");
@@ -292,14 +319,18 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                   ops::SqueezeGradOpMaker<paddle::framework::OpDesc>,
                   ops::SqueezeGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp,
+                  ops::SqueezeDoubleGradOpMaker<paddle::framework::OpDesc>,
+                  ops::SqueezeDoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::SqueezeGradNoNeedBufferVarsInferer);
 
 REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
                   ops::Squeeze2GradOpMaker<paddle::framework::OpDesc>,
                   ops::Squeeze2GradOpMaker<paddle::imperative::OpBase>,
-                  ops::SequeezeInplaceInferer);
+                  ops::SqueezeInplaceInferer);
 REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
-                  ops::SequeezeGradInplaceInferer);
+                  ops::Squeeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
+                  ops::Squeeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
+                  ops::SqueezeGradInplaceInferer);
 
 REGISTER_OP_CPU_KERNEL(
     squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc
new file mode 100644
index 0000000000000..14928061d23dd
--- /dev/null
+++ b/paddle/fluid/operators/sum_op_xpu.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/sum_op.h"
+#include <vector>
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+// XPU kernel for sum: adds all non-empty LoDTensor inputs "X" into "Out" with
+// xpu::sum_batch. Input buffers are handed over as const float pointers.
+template <typename DeviceContext, typename T>
+class SumXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto in_vars = context.MultiInputVar("X");
+    auto out_var = context.OutputVar("Out");
+    auto *out = context.Output<LoDTensor>("Out");
+    bool in_place = out_var == in_vars[0];  // Out is the same var as X[0]
+    int N = in_vars.size();
+    PADDLE_ENFORCE_EQ(
+        out_var->IsType<framework::LoDTensor>(), true,
+        platform::errors::InvalidArgument("XPU only supports LoDTensor"));
+    if (!in_place) {
+      out->mutable_data<T>(context.GetPlace());
+    }
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    std::vector<const float *> ptrs(N, nullptr);
+    int valid_count = 0;  // number of non-empty inputs stored in ptrs
+    for (int i = 0; i < N; ++i) {
+      PADDLE_ENFORCE_EQ(
+          in_vars[i]->IsType<framework::LoDTensor>(), true,
+          platform::errors::InvalidArgument("XPU only supports LoDTensor"));
+      auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
+      if (in_t.numel() == 0) continue;  // empty inputs contribute nothing
+      ptrs[valid_count] = reinterpret_cast<const float *>(in_t.data<T>());
+      valid_count++;
+    }
+    int r = xpu::sum_batch(dev_ctx.x_context(), ptrs.data(), out->data<T>(),
+                           valid_count, out->numel());
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU kernel error!"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    sum, ops::SumXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 946fa6305d737..0e870937ec1a5 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -61,6 +61,19 @@ class TransposeOp : public framework::OperatorWithKernel {
     }
 
     framework::DDim out_dims(x_dims);
+#ifdef PADDLE_WITH_MKLDNN
+    // Here we need to match dims to paddle layout
+    // as we are producing non-oneDNN result
+    if ((x_dims.size() >= 3) &&
+        (paddle::platform::MKLDNNDeviceContext::tls()
+             .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC)) {
+      auto dims = framework::vectorize<int>(x_dims);
+      std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end());
+      x_dims = x_dims.reshape(dims);
+      VLOG(3)
+          << "Rotating Shape in Transpose from: kMKLDNN to: kNHWC output_shape";
+    }
+#endif
     for (size_t i = 0; i < axis_size; i++) {
       out_dims[i] = x_dims[axis[i]];
     }
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index ee1361e361830..0e58e1391cfab 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -228,6 +228,19 @@ class UnsqueezeGradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
+// Double-grad maker: the grad of unsqueeze_grad is itself an "unsqueeze" op.
+template <typename T>
+class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("unsqueeze");
+    grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
 // FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on
 // unsqueeze, the XShape is used to carry the shape and lod of X which
 // will be used in unsqueeze_grad, in this way, the framework can reuse
@@ -304,6 +317,20 @@ class Unsqueeze2GradOp : public framework::OperatorWithKernel {
   }
 };
 
+// Double-grad maker: the grad of unsqueeze2_grad is itself an "unsqueeze2" op.
+template <typename T>
+class Unsqueeze2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("unsqueeze2");
+    grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    grad_op->SetOutput("XShape", this->Input("XShape"));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
 DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"});
 DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer,
                            {framework::GradVarName("Out"),
@@ -317,6 +344,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                   ops::UnsqueezeGradOpMaker<paddle::framework::OpDesc>,
                   ops::UnsqueezeGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
+                  ops::UnsqueezeDoubleGradOpMaker<paddle::framework::OpDesc>,
+                  ops::UnsqueezeDoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::UnsqueezeGradOpNoNeedBufferVarInferer);
 
 REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
@@ -324,6 +353,8 @@ REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
                   ops::Unsqueeze2GradOpMaker<paddle::imperative::OpBase>,
                   ops::UnsqueezeInplaceInferer);
 REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
+                  ops::Unsqueeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
+                  ops::Unsqueeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::UnsqueezeGradInplaceInferer);
 
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index e379832593c78..2df1f291f9f8c 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -164,6 +164,13 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
         // AVX512F: EBX Bit 16
         int avx512f_mask = (1 << 16);
         return (reg[1] & avx512f_mask) != 0;
+      } else if (cpu_isa == avx512_core) {
+        unsigned int avx512f_mask = (1U << 16);   // AVX512F:  EBX bit 16
+        unsigned int avx512dq_mask = (1U << 17);  // AVX512DQ: EBX bit 17
+        unsigned int avx512bw_mask = (1U << 30);  // AVX512BW: EBX bit 30
+        unsigned int avx512vl_mask = (1U << 31);  // AVX512VL: EBX bit 31
+        return ((reg[1] & avx512f_mask) && (reg[1] & avx512dq_mask) &&
+                (reg[1] & avx512bw_mask) && (reg[1] & avx512vl_mask));
       }
     }
 #endif
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index a3ae9e48eea30..165321d9c87ff 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -47,6 +47,10 @@ limitations under the License. */
 #include <type_traits>
 #include <utility>
 
+#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL)
+#include <execinfo.h>
+#endif
+
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "glog/logging.h"
 #include "paddle/fluid/platform/errors.h"
@@ -236,13 +240,14 @@ inline std::string SimplifyDemangleStr(std::string str) {
 }
 
 inline std::string GetCurrentTraceBackString() {
-  static constexpr int TRACE_STACK_LIMIT = 100;
   std::ostringstream sout;
 
   sout << "\n\n--------------------------------------\n";
   sout << "C++ Traceback (most recent call last):";
   sout << "\n--------------------------------------\n";
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL)
+  static constexpr int TRACE_STACK_LIMIT = 100;
+
   void* call_stack[TRACE_STACK_LIMIT];
   auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
   auto symbols = backtrace_symbols(call_stack, size);
@@ -261,7 +266,7 @@ inline std::string GetCurrentTraceBackString() {
   }
   free(symbols);
 #else
-  sout << "Windows not support stack backtrace yet.\n";
+  sout << "Not support stack backtrace yet.\n";
 #endif
   return sout.str();
 }
diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h
index 32b7efc04c1f2..fb5cf9fb31915 100644
--- a/paddle/fluid/platform/macros.h
+++ b/paddle/fluid/platform/macros.h
@@ -25,6 +25,8 @@ limitations under the License. */
   classname& operator=(classname&&) = delete
 #endif
 
+#ifndef PADDLE_WITH_MUSL
 #if defined(__FLT_MAX__)
 #define FLT_MAX __FLT_MAX__
 #endif  // __FLT_MAX__
+#endif  // PADDLE_WITH_MUSL
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index b012a103ea303..d8dd166f325c8 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -14,7 +14,9 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
+#include <iostream>
 #include <memory>
+#include <sstream>
 #include <string>
 #include <utility>
 #include <vector>
@@ -81,12 +83,30 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in,
     return;
   }
 
+  // Formats dims as "[d0,d1,...]" for logging; yields "" when dims is empty.
+  auto print_dims = [](const std::vector<int>& dims) {
+    std::ostringstream oss;
+    if (!dims.empty()) {
+      oss << "[";
+      // Emit all but the last element, each followed by ",". A plain loop is
+      // used so std::ostream_iterator (declared in <iterator>) is not needed.
+      for (size_t i = 0; i + 1 < dims.size(); ++i) {
+        oss << dims[i] << ",";
+      }
+      // Now add the last element with no delimiter
+      oss << dims.back() << "]";
+    }
+    return oss.str();
+  };
+
   switch (from) {
     case framework::DataLayout::kMKLDNN:
       if (to == framework::DataLayout::kNHWC) {
         auto dims = framework::vectorize<int>(tensor_in->dims());
         std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end());
         tensor_in->Resize(framework::make_ddim(dims));
+        VLOG(3) << "Rotating Shape from: kMKLDNN to: kNHWC output_shape"
+                << print_dims(dims);
       }
       break;
     case framework::DataLayout::kNHWC:
@@ -94,6 +114,8 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in,
         auto dims = framework::vectorize<int>(tensor_in->dims());
         std::rotate(dims.begin() + 1, dims.end() - 1, dims.end());
         tensor_in->Resize(framework::make_ddim(dims));
+        VLOG(3) << "Rotating Shape from: kNHWC to: kMKLDNN output_shape"
+                << print_dims(dims);
       }
       break;
     default:
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index d1c5480c0f543..785627a09fb27 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -853,6 +853,9 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT<T, mkldnn::pooling_forward,
         CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
                           mkldnn_paddings[1]);
       }
+
+      ComputeAdaptivePoolParameters(ctx, src_tz, ksize, strides);
+
       this->AcquireForwardPrimitiveDescriptor(
           is_test ? mkldnn::prop_kind::forward_inference
                   : mkldnn::prop_kind::forward_training,
@@ -919,6 +922,27 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT<T, mkldnn::pooling_forward,
     return mem_p;
   }
 
+  static void ComputeAdaptivePoolParameters(
+      const paddle::framework::ExecutionContext& ctx,
+      const std::vector<int64_t>& src_tz, std::vector<int64_t>& ksize,
+      std::vector<int64_t>& strides) {
+    // Nothing to adjust unless the op carries adaptive == true.
+    if (!ctx.Attr<bool>("adaptive")) return;
+    // (jczaja): oneDNN is supporting only unchangable in size pool window
+    PADDLE_ENFORCE_EQ(
+        src_tz[src_tz.size() - 1] % ksize[1], 0,
+        platform::errors::Unimplemented(
+            "Input dim must be divisible by corressponding ksize dim."));
+    PADDLE_ENFORCE_EQ(
+        src_tz[src_tz.size() - 2] % ksize[0], 0,
+        platform::errors::Unimplemented(
+            "Input dim must be divisible by corressponding ksize dim."));
+    const int64_t window_0 = src_tz[src_tz.size() - 2] / ksize[0];
+    const int64_t window_1 = src_tz[src_tz.size() - 1] / ksize[1];
+    ksize[0] = strides[0] = window_0;
+    ksize[1] = strides[1] = window_1;
+  }
+
  private:
   static inline int ComputeCeiledOutput(int input_size, int kernel_size,
                                         int padding, int stride) {
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index c1b81159aca97..c5e8ff807a2d3 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -14,19 +14,18 @@
 
 #pragma once
 
-#include <cstdio>
-#include <stdexcept>
-
 #include <time.h>
+
+#include <cstdio>
 #include <memory>
+#include <stdexcept>
 #include <string>
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "glog/logging.h"
 
 #if !defined(_WIN32)
-#include <dlfcn.h>     //  dladdr
-#include <execinfo.h>  // backtrace
+#include <dlfcn.h>  // dladdr
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <algorithm>  // std::accumulate
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index be4d90597e1e1..c8e5048421cca 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -481,8 +481,8 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("disable_trt_plugin_fp16") = false)
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine,
-           py::arg("zero_copy") = false,
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
+           py::arg("zero_copy") = false,
            py::arg("passes_filter") = std::vector<std::string>(),
            py::arg("ops_filter") = std::vector<std::string>())
       .def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled)
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 9bc603c0ecc2c..ee6e541c9e6c6 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -49,6 +49,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
      {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}},
     {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}},
     {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}},
+    {"hierarchical_sigmoid",
+     {"X", "W", "Label", "PathTable", "PathCode", "Bias"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 0929febc4d46f..0ee725c302215 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -36,9 +36,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_compatible_info.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
@@ -142,6 +142,17 @@ bool IsCompiledWithMKLDNN() {
 #endif
 }
 
+// Reports whether bfloat16 can be used: false without an MKLDNN build,
+// otherwise whatever MayIUse says about the avx512_core ISA on this CPU.
+bool SupportsBfloat16() {
+#ifdef PADDLE_WITH_MKLDNN
+  // The answer is decided at runtime by the CPU feature probe.
+  return platform::MayIUse(platform::cpu_isa_t::avx512_core);
+#else
+  return false;
+#endif
+}
+
 bool IsCompiledWithBrpc() {
 #ifndef PADDLE_WITH_DISTRIBUTE
   return false;
@@ -421,10 +432,12 @@ PYBIND11_MODULE(core_noavx, m) {
     return map_output;
   });
 
-  m.def("save_op_compatible_info", [](framework::ProgramDesc &desc) {
-    framework::OpCompatibleMap op_compatible_map;
-    op_compatible_map.InitOpCompatibleMap();
-    return op_compatible_map.ConvertToProto(desc.OpCompatibleMap());
+  m.def("save_op_version_info", [](framework::ProgramDesc &desc) {
+    framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()};
+    framework::compatible::SaveOpVersions(
+        framework::compatible::OpVersionRegistrar::GetInstance()
+            .GetVersionMap(),
+        &pb_vmap);
   });
 
   m.def(
@@ -1302,9 +1315,6 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
   py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
-    **Note**:
-        For multi-card tasks, please use `FLAGS_selected_gpus` environment variable to set the visible GPU device.
-        The next version will fix the problem with `CUDA_VISIBLE_DEVICES` environment variable.
 
     CUDAPlace is a descriptor of a device.
     It represents a GPU device allocated or to be allocated with Tensor or LoDTensor.
@@ -1323,8 +1333,10 @@ All parameter, weight, gradient are variables in Paddle.
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          gpu_place = fluid.CUDAPlace(0)
+          import paddle
+
+          place = paddle.CUDAPlace(0)
+          paddle.disable_static(place)
 
         )DOC")
       .def("__init__",
@@ -1661,6 +1673,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
+  m.def("supports_bfloat16", SupportsBfloat16);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
   m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) {
diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt
index d587081fbac8a..ad4bc20f9f0b1 100644
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
@@ -4,37 +4,26 @@ function(train_test TARGET_NAME)
     set(multiValueArgs ARGS)
     cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-    set(arg_list "")
-    if(train_test_ARGS)
-        foreach(arg ${train_test_ARGS})
-            list(APPEND arg_list "_${arg}")
-        endforeach()
+    if (NOT APPLE AND NOT WIN32)
+        cc_test(test_train_${TARGET_NAME}
+                SRCS test_train_${TARGET_NAME}.cc
+                DEPS paddle_fluid_shared
+                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
     else()
-        list(APPEND arg_list "_")
+        cc_test(test_train_${TARGET_NAME}
+                SRCS test_train_${TARGET_NAME}.cc
+                DEPS paddle_fluid_api
+                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+    endif()
+    set_tests_properties(test_train_${TARGET_NAME}
+            PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model)
+    if(NOT WIN32 AND NOT APPLE)
+        set_tests_properties(test_train_${TARGET_NAME}
+                PROPERTIES TIMEOUT 150)
     endif()
-    foreach(arg ${arg_list})
-        string(REGEX REPLACE "^_$" "" arg "${arg}")
-        if (NOT APPLE AND NOT WIN32)
-            cc_test(test_train_${TARGET_NAME}${arg}
-                    SRCS test_train_${TARGET_NAME}.cc
-                    DEPS paddle_fluid_shared
-                    ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
-        else()
-            cc_test(test_train_${TARGET_NAME}${arg}
-                    SRCS test_train_${TARGET_NAME}.cc
-                    DEPS paddle_fluid_api
-                    ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
-        endif()
-        set_tests_properties(test_train_${TARGET_NAME}${arg}
-                PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model)
-        if(NOT WIN32 AND NOT APPLE)
-            set_tests_properties(test_train_${TARGET_NAME}${arg}
-                    PROPERTIES TIMEOUT 150)
-        endif()
-    endforeach()
 endfunction(train_test)
 
 
 if(WITH_TESTING)
-  train_test(recognize_digits ARGS mlp conv)
+  train_test(recognize_digits)
 endif()
diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc
index e7b698e1a34e2..fb993439bb8e4 100644
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
@@ -32,16 +32,15 @@ DEFINE_string(dirname, "", "Directory of the train model.");
 
 namespace paddle {
 
-void Train() {
-  CHECK(!FLAGS_dirname.empty());
+void Train(std::string model_dir) {
   framework::InitDevices(false);
   const auto cpu_place = platform::CPUPlace();
   framework::Executor executor(cpu_place);
   framework::Scope scope;
 
   auto train_program = inference::Load(
-      &executor, &scope, FLAGS_dirname + "__model_combined__.main_program",
-      FLAGS_dirname + "__params_combined__");
+      &executor, &scope, model_dir + "__model_combined__.main_program",
+      model_dir + "__params_combined__");
 
   std::string loss_name = "";
   for (auto op_desc : train_program->Block(0).AllOps()) {
@@ -87,6 +86,10 @@ void Train() {
   EXPECT_LT(last_loss, first_loss);
 }
 
-TEST(train, recognize_digits) { Train(); }
+TEST(train, recognize_digits) {
+  CHECK(!FLAGS_dirname.empty());
+  Train(FLAGS_dirname + "recognize_digits_mlp.train.model/");
+  Train(FLAGS_dirname + "recognize_digits_conv.train.model/");
+}
 
 }  // namespace paddle
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 84713d513fb68..0af32da4e690b 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -235,7 +235,6 @@
 from .framework import no_grad  #DEFINE_ALIAS
 from .framework import save  #DEFINE_ALIAS
 from .framework import load  #DEFINE_ALIAS
-from .framework import SaveLoadConfig  #DEFINE_ALIAS
 from .framework import DataParallel  #DEFINE_ALIAS
 
 from .framework import NoamDecay  #DEFINE_ALIAS
@@ -272,6 +271,7 @@
 
 from . import jit
 from . import static
+from . import amp
 
 # high-level api
 from .hapi import Model
diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py
new file mode 100644
index 0000000000000..32587938512c4
--- /dev/null
+++ b/python/paddle/amp/__init__.py
@@ -0,0 +1,18 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .auto_cast import auto_cast
+from .grad_scaler import GradScaler
+
+__all__ = ['auto_cast', 'GradScaler']
diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py
new file mode 100644
index 0000000000000..e33f6e2afc846
--- /dev/null
+++ b/python/paddle/amp/auto_cast.py
@@ -0,0 +1,52 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.dygraph.amp import amp_guard
+
+__all__ = ['auto_cast']
+
+
+def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
+    """
+    Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
+    If enabled, the input data type (float32 or float16) of each operator is decided
+    by autocast algorithm for better performance.
+
+    Commonly, it is used together with `paddle.amp.GradScaler` to achieve Auto-Mixed-Precision in
+    dynamic graph mode.
+
+    Args:
+        enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
+        custom_white_list(set|list, optional): The custom white_list.
+        custom_black_list(set|list, optional): The custom black_list.
+
+    Examples:
+
+     .. code-block:: python
+
+        import paddle
+
+        conv2d = paddle.nn.Conv2d(3, 2, 3, bias_attr=False)
+        data = paddle.rand([10, 3, 32, 32])
+
+        with paddle.amp.auto_cast():
+            conv = conv2d(data)
+            print(conv.dtype) # FP16
+
+        with paddle.amp.auto_cast(enable=False):
+            conv = conv2d(data)
+            print(conv.dtype) # FP32
+
+    """
+    return amp_guard(enable, custom_white_list, custom_black_list)
diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py
new file mode 100644
index 0000000000000..9476f3765b3bc
--- /dev/null
+++ b/python/paddle/amp/grad_scaler.py
@@ -0,0 +1,136 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.dygraph.amp import AmpScaler
+
+__all__ = ['GradScaler']
+
+
+class GradScaler(AmpScaler):
+    """
+    GradScaler is used for Auto-Mixed-Precision training/inferring in dynamic graph
+    mode. It controls the scaling of loss and helps avoid numerical overflow.
+    The object of this class has two methods `scale()`, `minimize()`.
+
+    `scale()` is used to multiply the loss by a scale ratio.
+    `minimize()` is similar to `Optimizer.minimize()`, performs parameters updating.
+
+    Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
+    dynamic graph mode.
+
+    Args:
+        enable(bool, optional): Enable loss scaling or not. Default is True.
+        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
+        incr_ratio(float, optional): The multiplier to use when increasing the loss
+                        scaling. Default is 2.0.
+        decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
+                        the loss scaling. Default is 0.5.
+        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
+                                steps with finite gradients. Default is 1000.
+        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
+                                    accumulated steps with nan or inf gradients. Default is 1.
+        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
+    Returns:
+        A GradScaler object.
+
+    Examples:
+
+     .. code-block:: python
+
+        import paddle
+
+        model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True)
+        optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
+        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+        data = paddle.rand([10, 3, 32, 32])
+        with paddle.amp.auto_cast():
+            conv = model(data)
+            loss = paddle.reduce_mean(conv)
+            scaled = scaler.scale(loss)  # scale the loss
+            scaled.backward()            # do backward
+            scaler.minimize(optimizer, scaled)  # update parameters
+    """
+
+    def __init__(self,
+                 enable=True,
+                 init_loss_scaling=2.**15,
+                 incr_ratio=2.0,
+                 decr_ratio=0.5,
+                 incr_every_n_steps=1000,
+                 decr_every_n_nan_or_inf=1,
+                 use_dynamic_loss_scaling=True):
+        super(GradScaler, self).__init__(enable, init_loss_scaling, incr_ratio,
+                                         decr_ratio, incr_every_n_steps,
+                                         decr_every_n_nan_or_inf,
+                                         use_dynamic_loss_scaling)
+
+    def scale(self, var):
+        """
+        Multiplies a Tensor by the scale factor and returns scaled outputs.
+        If this instance of :class:`GradScaler` is not enabled, outputs are returned unmodified.
+
+        Args:
+            var (Tensor):  The tensor to scale.
+        Returns:
+            The scaled tensor or original tensor.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True)
+                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
+                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+                data = paddle.rand([10, 3, 32, 32])
+                with paddle.amp.auto_cast():
+                    conv = model(data)
+                    loss = paddle.reduce_mean(conv)
+                    scaled = scaler.scale(loss)  # scale the loss
+                    scaled.backward()            # do backward
+                    scaler.minimize(optimizer, scaled)  # update parameters
+        """
+        return super(GradScaler, self).scale(var)
+
+    def minimize(self, optimizer, *args, **kwargs):
+        """
+        This function is similar to `Optimizer.minimize()`, which performs parameters updating.
+
+        If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
+        Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters.
+
+        Finally, the loss scaling ratio is updated.
+
+        Args:
+            optimizer(Optimizer):  The optimizer used to update parameters.
+            args:  Arguments, which will be forwarded to `optimizer.minimize()`.
+            kwargs: Keyword arguments, which will be forwarded to `optimizer.minimize()`.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True)
+                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
+                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+                data = paddle.rand([10, 3, 32, 32])
+                with paddle.amp.auto_cast():
+                    conv = model(data)
+                    loss = paddle.reduce_mean(conv)
+                    scaled = scaler.scale(loss)  # scale the loss
+                    scaled.backward()            # do backward
+                    scaler.minimize(optimizer, scaled)  # update parameters
+        """
+        return super(GradScaler, self).minimize(optimizer, *args, **kwargs)
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 1fc29ad042883..c7798b15c67fe 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -744,13 +744,13 @@ def adaptive_localsgd(self):
             strategy.adaptive_localsgd = True # by default this is false
 
         """
-        return self.strategy.localsgd
+        return self.strategy.adaptive_localsgd
 
     @adaptive_localsgd.setter
     @is_strict_auto
     def adaptive_localsgd(self, flag):
         if isinstance(flag, bool):
-            self.strategy.localsgd = flag
+            self.strategy.adaptive_localsgd = flag
         else:
             print("WARNING: adaptive_localsgd should have value of bool type")
 
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 3fdd6e9248303..7eb3a5659654a 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -187,6 +187,8 @@ def init(self, role_maker=None, is_collective=False):
 
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
+            if self.worker_num() == 1:
+                return
             if parallel_helper._is_parallel_ctx_initialized():
                 warnings.warn(
                     "The dygraph parallel environment has been initialized.")
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index deba3b4a17d1b..ce9826d7e59ae 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -530,13 +530,6 @@ def _get_heter_worker_endpoint(self):
         return self._heter_trainer_endpoints[(self._current_id) %
                                              self._heter_worker_num()]
 
-    def _get_heter_worker_device(self):
-        """
-        Returns:
-            string: heter_trainer's device of current node, e.g: CPU/GPU/XPU
-        """
-        return self._heter_trainer_device.upper()
-
 
 class PaddleCloudRoleMaker(RoleMakerBase):
     def __init__(self, is_collective=False, **kwargs):
@@ -677,88 +670,99 @@ def _is_heter_worker(self):
         return self._role == Role.HETER_WORKER
 
     def _ps_env(self):
-        try:
-            # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
-            # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
-            self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST")
-
-            if self._server_endpoints is None:
-                # back to non_distributed execution.
-                self._server_endpoints = ""
-                self._trainers_num = 1
-                self._role = Role.WORKER
-                self._current_id = 0
-                self._nodes_num = 1
-                self._heter_trainers_num = 0
-                self._heter_trainer_endpoints = None
-                self._non_distributed = True
-                return
-
-            self._server_endpoints = self._server_endpoints.split(",")
-
-            self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
-            if self._worker_endpoints:
-                self._worker_endpoints = self._worker_endpoints.split(",")
-            else:
-                self._worker_endpoints = []
+        # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
+        # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
+        self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST", None)
+
+        if self._server_endpoints is None:
+            # back to non_distributed execution.
+            self._server_endpoints = ""
+            self._trainers_num = 1
+            self._role = Role.WORKER
+            self._current_id = 0
+            self._nodes_num = 1
+            self._heter_trainers_num = 0
+            self._heter_trainer_endpoints = None
+            self._non_distributed = True
+            return
+
+        self._server_endpoints = self._server_endpoints.split(",")
+
+        self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None)
+        if self._worker_endpoints != None:
+            self._worker_endpoints = self._worker_endpoints.split(",")
+        else:
+            self._worker_endpoints = []
+
+        trainers_num = os.getenv("PADDLE_TRAINERS_NUM", None)
+        if trainers_num == None:
+            raise ValueError(
+                "Can not find PADDLE_TRAINERS_NUM, please check your environment."
+            )
+        trainers_num = int(trainers_num)
 
-            trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
-            training_role = os.environ["TRAINING_ROLE"]
+        training_role = os.getenv("TRAINING_ROLE", None)
+        if training_role == None:
+            raise ValueError(
+                "Can not find TRAINING_ROLE, please check your environment.")
 
-            if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
+        if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
+            raise ValueError(
+                "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
+                format(training_role))
+
+        # For heter parameter server env setting
+        heter_trainer_eplist = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST",
+                                         "")
+        if heter_trainer_eplist != "":
+            try:
+                heter_trainer_eplist = os.environ[
+                    "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")
+            except:
                 raise ValueError(
-                    "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
-                    format(training_role))
-
-            # For heter parameter server env setting
-            heter_trainer_eplist = os.getenv(
-                "PADDLE_HETER_TRAINER_IP_PORT_LIST", None)
-            heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE",
-                                             None)
-            if heter_trainer_eplist and heter_trainer_device:
-                try:
-                    heter_trainer_eplist = os.environ[
-                        "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")
-                except:
-                    raise ValueError(
-                        "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ."
-                    )
-
-                self._is_heter_parameter_server_mode = True
-                heter_trainers_num = len(heter_trainer_eplist)
-                current_node_device = heter_trainer_device.upper()
-                if current_node_device not in ["CPU", "GPU", "XPU"]:
-                    raise ValueError(
-                        "Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)".
-                        format(heter_trainer_device))
-                self._heter_trainer_device = current_node_device
-            else:
-                self._is_heter_parameter_server_mode = False
-                heter_trainers_num = 0
-
-            if training_role == "TRAINER":
-                role = Role.WORKER
-                current_id = int(os.environ["PADDLE_TRAINER_ID"])
-                if len(self._worker_endpoints) > 0:
-                    self._cur_endpoint = self._worker_endpoints[current_id]
-            elif training_role == "PSERVER":
-                role = Role.SERVER
-                port = os.environ["PADDLE_PORT"]
-                ip = os.environ["POD_IP"]
-                self._cur_endpoint = ip + ":" + port
-                current_id = self._server_endpoints.index(self._cur_endpoint)
-            elif training_role == "HETER_TRAINER":
-                role = Role.HETER_WORKER
-                cur_ip = os.environ["POD_IP"]
-                cur_port = os.environ["PADDLE_PORT"]
-                curr_endpoint = ":".join([cur_ip, cur_port])
-                current_id = heter_trainer_eplist.index(curr_endpoint)
-            else:
+                    "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ."
+                )
+
+            self._is_heter_parameter_server_mode = True
+            heter_trainers_num = len(heter_trainer_eplist)
+        else:
+            self._is_heter_parameter_server_mode = False
+            heter_trainers_num = 0
+
+        if training_role == "TRAINER":
+            role = Role.WORKER
+            current_id = os.getenv("PADDLE_TRAINER_ID", None)
+            if current_id == None:
                 raise ValueError(
-                    "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER")
-        except ValueError as e:
-            raise ValueError(
-                "Something wrong with PaddleCloud, please check environment")
+                    "Can not find PADDLE_TRAINER_ID, please check your environment."
+                )
+            current_id = int(current_id)
+            if len(self._worker_endpoints) > 0:
+                self._cur_endpoint = self._worker_endpoints[current_id]
+        elif training_role == "PSERVER":
+            role = Role.SERVER
+            port = os.getenv("PADDLE_PORT", None)
+            if port == None:
+                raise ValueError(
+                    "Can not find PADDLE_PORT, please check your environment.")
+            ip = os.getenv("POD_IP", None)
+            if ip == None:
+                raise ValueError(
+                    "Can not find POD_IP, please check your environment.")
+            self._cur_endpoint = ip + ":" + port
+            current_id = self._server_endpoints.index(self._cur_endpoint)
+        elif training_role == "HETER_TRAINER":
+            role = Role.HETER_WORKER
+            cur_port = os.getenv("PADDLE_PORT", None)
+            if cur_port == None:
+                raise ValueError(
+                    "Can not find PADDLE_PORT, please check your environment.")
+            cur_ip = os.getenv("POD_IP", None)
+            if cur_ip == None:
+                raise ValueError(
+                    "Can not find POD_IP, please check your environment.")
+            curr_endpoint = ":".join([cur_ip, cur_port])
+            current_id = heter_trainer_eplist.index(curr_endpoint)
 
         self._trainers_num = trainers_num
         self._role = role
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 015d59b516e94..2e23a915454fa 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -89,14 +89,16 @@ def _parse_args():
         description='''start paddle training using multi-process mode.
 see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
 ''')
+    base_group = parser.add_argument_group("Base Parameters")
 
-    # Optional arguments for the launch helper
-    parser.add_argument(
-        "--ips",
+    base_group.add_argument(
+        "--log_dir",
         type=str,
-        default="127.0.0.1",
-        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
-    parser.add_argument(
+        default="log",
+        help="The path for each process's log.If it's not set, the log will printed to default pipe."
+    )
+
+    base_group.add_argument(
         "--gpus",
         type=str,
         default=None,
@@ -104,22 +106,7 @@ def _parse_args():
         "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
     )
 
-    parser.add_argument(
-        "--servers", type=str, default="", help="User defined servers ip:port")
-    parser.add_argument(
-        "--workers", type=str, default="", help="User defined workers ip:port")
-    parser.add_argument("--worker_num", type=int, help="number of workers")
-
-    parser.add_argument("--server_num", type=int, help="number of servers")
-
-    parser.add_argument(
-        "--log_dir",
-        type=str,
-        default="log",
-        help="The path for each process's log.If it's not set, the log will printed to default pipe."
-    )
-    # positional
-    parser.add_argument(
+    base_group.add_argument(
         "training_script",
         type=str,
         help="The full path to the single GPU training "
@@ -127,8 +114,34 @@ def _parse_args():
         "followed by all the arguments for the "
         "training script")
 
-    # rest from the training program
-    parser.add_argument('training_script_args', nargs=REMAINDER)
+    base_group.add_argument('training_script_args', nargs=REMAINDER)
+
+    # Optional arguments for the launch helper
+    # for collective
+    collective_group = parser.add_argument_group("Collective Parameters")
+    collective_group.add_argument(
+        "--ips",
+        type=str,
+        default="127.0.0.1",
+        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
+
+    ps_group = parser.add_argument_group("Parameter-Server Parameters")
+    # for parameter server
+    ps_group.add_argument(
+        "--servers", type=str, default="", help="User defined servers ip:port")
+    ps_group.add_argument(
+        "--workers", type=str, default="", help="User defined workers ip:port")
+    ps_group.add_argument(
+        "--heter_workers",
+        type=str,
+        default="",
+        help="User defined heter workers ip:port")
+
+    ps_group.add_argument("--worker_num", type=int, help="number of workers")
+    ps_group.add_argument("--server_num", type=int, help="number of servers")
+    ps_group.add_argument(
+        "--heter_worker_num", type=int, help="number of heter_workers")
+
     return parser.parse_args()
 
 
@@ -166,35 +179,6 @@ def get_cluster_from_args(args, gpus):
     return get_cluster(node_ips, node_ip, trainer_endpoints, gpus)
 
 
-def get_gpus(gpus):
-    if gpus is None:
-        gpus_num = fluid.core.get_cuda_device_count()
-        res_gpus = [str(x) for x in range(0, gpus_num)]
-    else:
-        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
-        if cuda_visible_devices is None or cuda_visible_devices == "":
-            res_gpus = [x.strip() for x in gpus.split(',')]
-        else:
-            # change gpus into relative values
-            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7;
-            # therefore gpus=0,1,2,3
-            cuda_visible_devices_list = cuda_visible_devices.split(',')
-            for x in gpus.split(','):
-                assert x in cuda_visible_devices_list, "Can't find "\
-                    "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
-                    % (x, cuda_visible_devices)
-            res_gpus = [
-                cuda_visible_devices_list.index(x.strip())
-                for x in gpus.split(',')
-            ]
-            logger.info("Change selected_gpus into reletive values. --ips:{} "
-                        "will change into relative_ips:{} according to your "
-                        "CUDA_VISIBLE_DEVICES:{}".format(
-                            gpus, res_gpus, cuda_visible_devices_list))
-
-    return res_gpus
-
-
 def launch_collective(args):
     # parse arguments, used for cloud-single-machine and local
     gpus = get_gpus(args.gpus)
@@ -245,209 +229,37 @@ def launch_collective(args):
         shutil.rmtree(gloo_rendezvous_dir)
 
 
-def launch_ps(args):
-    ports = None
-    start_port = 6170
-    if args.server_num:
-        server_num = args.server_num
-        ports = get_ports(server_num, 0)
-        server_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
-    else:
-        assert args.servers != "", "The setting of CPU mode must be either server_num or servers."
-        server_endpoints = args.servers
-    server_endpoints_ips = [
-        x.strip().split(":")[0] for x in server_endpoints.split(",")
-    ]
-    server_endpoints_port = [
-        x.strip().split(":")[1] for x in server_endpoints.split(",")
+def launch_ps(args, distribute_mode):
+    """Launch parameter-server training; `distribute_mode` is DistributeMode.PS or PS_HETER."""
+    cloud_flag = cloud_utils.use_paddlecloud()
+    # for ps-cpu on paddlecloud: run the training script directly, without the local launcher
+    if cloud_flag and distribute_mode == DistributeMode.PS:
+        direct_start(args)
+        return
+    elif cloud_flag and distribute_mode == DistributeMode.PS_HETER:
+        cloud_ps_heter_env_set(args)  # presumably exports the envs read below -- TODO confirm
+        args.workers = os.getenv("PADDLE_TRAINER_ENDPOINTS")
+        args.servers = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST")
+        args.heter_workers = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST")
+
+    ps_launcher = ParameterServerLauncher(args, distribute_mode)
+    ps_launcher.start_ps()
+    return
+
+
+def which_distributed_mode(args):
+    ps_args = [
+        '--worker_num',
+        '--server_num',
+        '--heter_worker_num',
+        '--servers',
+        '--workers',
+        '--heter_workers',
     ]
-    server_num = len(server_endpoints_ips)
-
-    if args.worker_num:
-        worker_num = args.worker_num
-        ports = get_ports(worker_num, server_num)
-        worker_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
-    else:
-        assert args.workers != "", "The setting of CPU mode must be either worker_num or workers."
-        worker_endpoints = args.workers
-    worker_endpoints_ips = [
-        x.strip().split(":")[0] for x in worker_endpoints.split(",")
-    ]
-    worker_num = len(worker_endpoints_ips)
-    node_ips = list(set(server_endpoints_ips + worker_endpoints_ips))
-    worker_endpoints_len = [
-        len(x.strip().split(":")) for x in worker_endpoints.split(",")
-    ]
-    if 1 in worker_endpoints_len:
-        # if no port value in worker_endpoints, will set default port values.
-        worker_endpoints_port = range(start_port + server_num,
-                                      start_port + server_num + worker_num, 1)
-    else:
-        worker_endpoints_port = [
-            x.strip().split(":")[1] for x in worker_endpoints.split(",")
-        ]
-
-    # local train
-    if len(set(node_ips)) == 1:
-        current_node_ip = node_ips[0]
-    else:
-        _, current_node_ip = get_host_name_ip()
-
-    assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
-        % (current_node_ip, node_ips)
-    node_rank = node_ips.index(current_node_ip)
-    logger.debug(
-        "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}".
-        format(node_ips, current_node_ip, node_rank, server_endpoints_port))
-
-    cluster = Cluster(hdfs=None)
-    server_rank = 0
-    worker_rank = 0
-    for node_rank, ip in enumerate(node_ips):
-        pod = Pod()
-        pod.rank = node_rank
-        pod.addr = ip
-        for i in range(len(server_endpoints_ips)):
-            if ip == server_endpoints_ips[i]:
-                server = Trainer()
-                server.endpoint = "%s:%s" % (ip, server_endpoints_port[i])
-                server.rank = server_rank
-                server_rank += 1
-                pod.servers.append(server)
-        for j in range(len(worker_endpoints_ips)):
-            if ip == worker_endpoints_ips[j]:
-                worker = Trainer()
-                worker.endpoint = "%s:%s" % (ip, worker_endpoints_port[i])
-                worker.rank = worker_rank
-                worker_rank += 1
-                pod.workers.append(worker)
-
-        cluster.pods.append(pod)
-
-    pod_rank = node_ips.index(current_node_ip)
-    pod = cluster.pods[pod_rank]
-
-    default_env = os.environ.copy()
-    current_env = copy.copy(default_env)
-
-    gloo_rendezvous_dir = tempfile.mkdtemp()
-    # add gloo env
-    current_env["PADDLE_WITH_GLOO"] = "1"
-    current_env["PADDLE_GLOO_RENDEZVOUS"] = "3"
-    current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
-
-    current_env.pop("http_proxy", None)
-    current_env.pop("https_proxy", None)
-    procs = []
-    cmds = []
-    log_fns = []
-    for idx, cur_server in enumerate(pod.servers):
-        proc_env = {
-            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
-            "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
-            "PADDLE_PORT": cur_server.endpoint.split(":")[1],
-            "TRAINING_ROLE": "PSERVER",
-            "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": cur_server.endpoint.split(":")[0]
-        }
-        current_env.update(proc_env)
-
-        cmd = [sys.executable, "-u", args.training_script
-               ] + args.training_script_args
-        cmds.append(cmd)
-
-        if idx == 0:
-            logger.info(
-                "Local server start {} processes. First process distributed "
-                "environment info (Only For Debug): {}".format(
-                    len(pod.servers),
-                    pretty_print_envs(proc_env, ("Distributed Envs", "Value"))))
-
-        if args.log_dir is not None:
-            os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w")
-            log_fns.append(fn)
-            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
-        else:
-            proc = subprocess.Popen(cmd, env=current_env)
-
-        tp = TrainerProc()
-        tp.proc = proc
-        tp.rank = cur_server.rank
-        tp.local_rank = idx
-        tp.log_fn = fn
-        tp.log_offset = fn.tell() if fn else None
-        tp.cmd = cmd
-
-        procs.append(tp)
-
-    for idx, cur_worker in enumerate(pod.workers):
-        proc_env = {
-            "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
-            "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
-            "PADDLE_TRAINERS_NUM": str(worker_num),
-            "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(cur_worker.rank)
-        }
-        current_env.update(proc_env)
-
-        cmd = [sys.executable, "-u", args.training_script
-               ] + args.training_script_args
-        cmds.append(cmd)
-
-        if idx == 0:
-            logger.info(
-                "Local worker start {} processes. First process distributed "
-                "environment info (Only For Debug): {}".format(
-                    len(pod.workers),
-                    pretty_print_envs(proc_env, ("Distributed Envs", "Value"))))
-
-        if args.log_dir is not None:
-            os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w")
-            log_fns.append(fn)
-            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
-        else:
-            proc = subprocess.Popen(cmd, env=current_env)
-
-        tp = TrainerProc()
-        tp.proc = proc
-        tp.rank = cur_worker.rank
-        tp.local_rank = idx
-        tp.log_fn = fn
-        tp.log_offset = fn.tell() if fn else None
-        tp.cmd = cmd
-
-        procs.append(tp)
-
-    logger.info(
-        "Please check servers and workers logs in {}/workerlog.* and {}/serverlog.*".
-        format(args.log_dir, args.log_dir))
-    # only wait worker to finish here
-    for i, proc in enumerate(procs):
-        if i < len(pod.servers):
-            continue
-        procs[i].proc.wait()
-        if len(log_fns) > 0:
-            log_fns[i].close()
-
-    print("all workers exit, going to finish parameter server", file=sys.stderr)
-    for i in range(len(pod.servers)):
-        if len(log_fns) > 0:
-            log_fns[i].close()
-        procs[i].proc.terminate()
-    print("all parameter server are killed", file=sys.stderr)
-
-    if os.path.exists(gloo_rendezvous_dir):
-        shutil.rmtree(gloo_rendezvous_dir)
+    collective_args = ['--ips']
 
+    ps_heter_args = ["--heter_worker_num", "--heter_workers"]
 
-def launch():
-    args = _parse_args()
-    logger = get_logger()
-    _print_arguments(args)
-    ps_args = ['--worker_num', '--server_num', '--servers', '--workers']
-    collective_args = ['--ips', '--gpus']
     has_ps_args = [
         ps_arg for ps_arg in ps_args if ps_arg in " ".join(sys.argv[1:-1])
     ]
@@ -455,23 +267,46 @@ def launch():
         co_arg for co_arg in collective_args
         if co_arg in " ".join(sys.argv[1:-1])
     ]
+
+    # any PS flag combined with any collective flag is a conflict (each list may hold one item)
+    if len(has_ps_args) > 0 and len(has_collective_args) > 0:
+        raise ValueError(
+            "Only one mode(Collective or Parameter-Server) can be selected at the same time, but more than one configuration was received.")
+
     if fluid.core.is_compiled_with_cuda():
         cuda_device_num = fluid.core.get_cuda_device_count()
     else:
         cuda_device_num = 0
 
-    if len(has_ps_args) > 0 or cuda_device_num == 0:
-        logger.info("Run parameter-sever cpu mode. pserver arguments:{}".format(
-            has_ps_args))
-        launch_ps(args)
+    if len(has_ps_args) > 0:
+        logger.info(
+            "Run parameter-sever mode. pserver arguments:{}, cuda count:{}".
+            format(has_ps_args, cuda_device_num))
+        has_ps_heter_args = list(set(has_ps_args) & set(ps_heter_args))
+        if len(has_ps_heter_args) > 0:
+            return DistributeMode.PS_HETER
+        else:
+            return DistributeMode.PS
     elif len(has_collective_args) > 0:
         logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".
                     format(has_collective_args, cuda_device_num))
-        launch_collective(args)
+        return DistributeMode.COLLECTIVE
     else:
         logger.warning(
             "Not found distinct arguments. Default use gpu collective mode")
+        return DistributeMode.COLLECTIVE
+
+
+def launch():
+    args = _parse_args()
+    logger = get_logger()
+    _print_arguments(args)
+
+    distribute_mode = which_distributed_mode(args)
+    if distribute_mode == DistributeMode.COLLECTIVE:
         launch_collective(args)
+    else:
+        launch_ps(args, distribute_mode)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 7540cd9f4c1f3..35782e0b04c5a 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -21,13 +21,27 @@
 import copy
 import sys
 import subprocess
+import tempfile
+import shutil
 from contextlib import closing
 import socket
+import warnings
 
+import paddle
+import paddle.fluid as fluid
 logger = logging.getLogger("root")
 logger.propagate = False
 
 
+class DistributeMode:
+    """
+    There are various modes for fleetrun; each of them is designed for a different kind of model.
+    """
+    COLLECTIVE = 0  # collective training
+    PS = 1  # parameter-server training
+    PS_HETER = 2  # parameter-server training with heter workers
+
+
 class Cluster(object):
     def __init__(self, hdfs):
         self.job_server = None
@@ -144,14 +158,16 @@ def __init__(self):
         self.trainers = []
         self.servers = []
         self.workers = []
+        self.heter_workers = []
         self.gpus = []
 
     def __str__(self):
         return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \
-            workers:{}".format(self.rank, self.id, self.addr, self.port,
-                               self.gpus, [str(t) for t in self.trainers],
-                               [str(s) for s in self.servers],
-                               [str(w) for w in self.workers])
+            workers:{} heter_workers:{}".format(
+            self.rank, self.id, self.addr, self.port, self.gpus, [
+                str(t) for t in self.trainers
+            ], [str(s) for s in self.servers], [str(w) for w in self.workers],
+            [str(h) for h in self.heter_workers])
 
     def __eq__(self, pod):
         if self.rank != pod.rank or \
@@ -262,7 +278,7 @@ def terminate_local_procs(procs):
                 p.log_fn.close()
             logger.debug("terminate process id:{}".format(p.proc.pid))
 
-    #wait all process terminiated
+    # wait for all processes to terminate
     time.sleep(3)
     for step in range(0, 50):
         alive = False
@@ -406,10 +422,10 @@ def start_local_trainers(cluster,
     else:
         current_env = copy.copy(envs)
 
-    #paddle broadcast ncclUniqueId use socket, and
-    #proxy maybe make trainers unreachable, so delete them.
-    #if we set them to "", grpc will log error message "bad uri"
-    #so just delete them.
+    # paddle broadcasts ncclUniqueId over a socket, and a proxy
+    # may make trainers unreachable, so delete the proxy settings.
+    # if we set them to "", grpc will log the error message "bad uri",
+    # so just delete them.
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)
 
@@ -518,3 +534,524 @@ def watch_local_trainers(procs, nranks):
         raise
 
     return alive
+
+
+def get_gpus(gpus):
+    """Resolve the `gpus` selection ("4,5", or None for all) to device ids.
+
+    Without CUDA_VISIBLE_DEVICES the ids are returned as stripped strings;
+    with it they become integer positions inside CUDA_VISIBLE_DEVICES,
+    e.g. CUDA_VISIBLE_DEVICES=4,5,6,7 and gpus=4,5,6,7 give [0, 1, 2, 3].
+    """
+    if gpus is None:
+        gpus_num = fluid.core.get_cuda_device_count()
+        return [str(x) for x in range(0, gpus_num)]
+
+    # Strip every requested id once so that "4, 5" is validated and looked
+    # up in exactly the same form.
+    selected = [x.strip() for x in gpus.split(',')]
+    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
+    if cuda_visible_devices is None or cuda_visible_devices == "":
+        return selected
+
+    visible = [x.strip() for x in cuda_visible_devices.split(',')]
+    for x in selected:
+        assert x in visible, "Can't find your gpus %s in " \
+            "CUDA_VISIBLE_DEVICES[%s]." % (x, cuda_visible_devices)
+    res_gpus = [visible.index(x) for x in selected]
+    logger.info("Change selected_gpus into relative values. --gpus:{} will "
+                "change into relative_gpus:{} according to your "
+                "CUDA_VISIBLE_DEVICES:{}".format(gpus, res_gpus, visible))
+    return res_gpus
+
+
+def direct_start(args):
+    """Run the training script directly (ps-cpu mode on paddlecloud).
+
+    The child inherits the current environment; blocks until it exits.
+    """
+    launch_cmd = [sys.executable, "-u", args.training_script]
+    subprocess.Popen(launch_cmd + args.training_script_args).wait()
+
+
+def get_custom_endpoints(origin_endpoints, offset=0):
+    """Shift the port of every endpoint in a comma-separated list.
+
+    Each "ip:port" entry of `origin_endpoints` becomes "ip:(port+offset)";
+    the shifted entries are joined with "," again and returned.
+    """
+    assert origin_endpoints != None
+
+    def _shift(ip_port):
+        # fields[0] is the ip and fields[1] the port to be offset
+        fields = ip_port.split(":")
+        return "%s:%s" % (fields[0], int(fields[1]) + offset)
+
+    return ",".join(_shift(item) for item in origin_endpoints.split(","))
+
+
+def cloud_ps_heter_env_set(args):
+    """Derive the heter parameter-server env vars from paddlecloud's env.
+
+    Results go into os.environ; args.heter_worker_num may be overwritten.
+    """
+    heter_endpoints = os.getenv("TRAINER_IP_PORT_LIST", "")
+    assert heter_endpoints != None
+    pserver_endpoints = os.getenv("PSERVER_IP_PORT_LIST", "")
+    assert pserver_endpoints != None
+
+    # hard code for paddlecloud custom-framework
+    port_pool = os.getenv("TRAINER_PORTS", "").split(",")
+    assert len(port_pool) > 3, \
+        "set paddle_ports_num >= 2 in config.ini for paddlecloud job submit"
+
+    # hard code for paddlecloud custom-framework: the trainer count is the
+    # pserver endpoint count; trainer endpoints are pserver ports shifted by 1
+    trainers_num = len(pserver_endpoints.split(","))
+    assert trainers_num != 0
+    environs = {}
+    environs["PADDLE_TRAINERS_NUM"] = trainers_num
+    environs["TRAINERS_NUM"] = trainers_num
+    environs["PADDLE_HETER_TRAINER_IP_PORT_LIST"] = heter_endpoints
+    environs["PADDLE_PSERVERS_IP_PORT_LIST"] = pserver_endpoints
+    environs["PADDLE_TRAINER_ENDPOINTS"] = get_custom_endpoints(
+        pserver_endpoints, 1)
+
+    found_num = len(heter_endpoints.split(","))
+    if args.heter_worker_num != None and found_num != args.heter_worker_num:
+        warnings.warn(
+            "Your fleetrun setting: heter_worker_num is {}, but we find {} device can be used, this setting has been changed.".
+            format(args.heter_worker_num, found_num))
+        args.heter_worker_num = found_num
+
+    for name in environs:
+        os.environ[name] = str(environs[name])
+    logger.info("Set heter parameter server env: {}".format(
+        pretty_print_envs(environs)))
+
+
+class ParameterServerLauncher(object):
+    def __init__(self, args, distribute_mode):
+        """Store the launch arguments and resolve every role's endpoints.
+
+        All counters/endpoint fields start empty and are then filled in by
+        get_role_endpoints(args).
+        """
+        self.args, self.distribute_mode = args, distribute_mode
+        self.server_num = self.worker_num = self.heter_worker_num = 0
+
+        # Per role: the raw "ip:port,..." string plus its split ip/port lists.
+        self.server_endpoints, self.worker_endpoints = "", ""
+        self.heter_worker_endpoints = ""
+        self.server_endpoints_ips, self.server_endpoints_port = [], []
+        self.worker_endpoints_ips, self.worker_endpoints_port = [], []
+        self.heter_worker_endpoints_ips = []
+        self.heter_worker_endpoints_port = []
+
+        # Assume a single-node job until the endpoints prove otherwise.
+        self.is_local = True
+        self.current_node_ip = ""
+
+        # Must run last: it overwrites the defaults set above.
+        self.get_role_endpoints(args)
+
+    def get_role_endpoints(self, args):
+        """Resolve the server/worker/heter-worker endpoints given in `args`.
+
+        Fills the per-role *_num/*_endpoints/*_endpoints_ips/*_endpoints_port
+        attributes, then node_ips, is_local, current_node_ip and node_rank.
+        """
+        # get server envs
+        if args.server_num:
+            self.server_num = args.server_num
+            if args.servers:
+                assert len(
+                    args.servers.split(",")
+                ) == self.server_num, "The server_num and servers doesn't match. Expect servers endpoints num epual to server_num, but received servers enpoint num: {} and server_num {}".format(
+                    len(args.servers.split(",")), self.server_num)
+                self.server_endpoints = args.servers
+            else:
+                ports = get_ports(self.server_num, 0)
+                self.server_endpoints = ",".join(
+                    ["127.0.0.1:" + str(x) for x in ports])
+        else:
+            assert args.servers != "", "The setting of Parameter-Server must has server_num or servers."
+            self.server_endpoints = args.servers
+            self.server_num = len(self.server_endpoints.split(","))
+
+        # get worker envs
+        if args.worker_num:
+            self.worker_num = args.worker_num
+            if args.workers:
+                assert len(
+                    args.workers.split(",")
+                ) == self.worker_num, "The worker_num and workers doesn't match. Expect workers endpoints num epual to worker_num, but received workers enpoint num: {} and worker_num {}".format(
+                    len(args.workers.split(",")), self.worker_num)
+                self.worker_endpoints = args.workers
+            else:
+                ports = get_ports(self.worker_num, self.server_num)
+                self.worker_endpoints = ",".join(
+                    ["127.0.0.1:" + str(x) for x in ports])
+        else:
+            assert args.workers != "", "The setting of Parameter-Server must has worker_num or workers."
+            worker_fields = [
+                x.strip().split(":") for x in args.workers.split(",")
+            ]
+            worker_endpoints_ips = [fields[0] for fields in worker_fields]
+            self.worker_num = len(worker_endpoints_ips)
+            if any(len(fields) == 1 for fields in worker_fields):
+                # if no port value in worker_endpoints, will set default port values.
+                start_port = 6170
+                worker_endpoints_port = range(
+                    start_port + self.server_num,
+                    start_port + self.server_num + self.worker_num, 1)
+                self.worker_endpoints = ",".join(
+                    "%s:%s" % (ip, port) for ip, port in
+                    zip(worker_endpoints_ips, worker_endpoints_port))
+            else:
+                self.worker_endpoints = args.workers
+
+        # get heter worker envs
+        if self.distribute_mode == DistributeMode.PS_HETER:
+            if args.heter_worker_num:
+                self.heter_worker_num = args.heter_worker_num
+                if args.heter_workers:
+                    assert len(
+                        args.heter_workers.split(",")
+                    ) == self.heter_worker_num, "The heter_worker_num and heter_workers doesn't match. Expect heter_workers endpoints num epual to heter_worker_num, but received heter_workers enpoint num: {} and heter_worker_num {}".format(
+                        len(args.heter_workers.split(",")),
+                        self.heter_worker_num)
+                    self.heter_worker_endpoints = args.heter_workers
+                else:
+                    ports = get_ports(self.heter_worker_num,
+                                      self.server_num + self.worker_num)
+                    self.heter_worker_endpoints = ",".join(
+                        ["127.0.0.1:" + str(x) for x in ports])
+            else:
+                assert args.heter_workers != "", "The setting of Parameter-Server heter mode must has heter_worker_num or heter_workers."
+                self.heter_worker_endpoints = args.heter_workers
+                self.heter_worker_num = len(
+                    self.heter_worker_endpoints.split(","))
+
+        # check local or user define
+        self.server_endpoints_ips = [
+            x.strip().split(":")[0] for x in self.server_endpoints.split(",")
+        ]
+        self.worker_endpoints_ips = [
+            x.strip().split(":")[0] for x in self.worker_endpoints.split(",")
+        ]
+        self.server_endpoints_port = [
+            x.strip().split(":")[1] for x in self.server_endpoints.split(",")
+        ]
+        self.worker_endpoints_port = [
+            x.strip().split(":")[1] for x in self.worker_endpoints.split(",")
+        ]
+        # De-duplicate in first-seen order: set iteration order can differ
+        # between processes, so nodes would disagree on pod order and ranks.
+        self.node_ips = []
+        for ip in self.server_endpoints_ips + self.worker_endpoints_ips:
+            if ip not in self.node_ips:
+                self.node_ips.append(ip)
+        if self.distribute_mode == DistributeMode.PS_HETER:
+            self.heter_worker_endpoints_ips = [
+                x.strip().split(":")[0]
+                for x in self.heter_worker_endpoints.split(",")
+            ]
+            self.heter_worker_endpoints_port = [
+                x.strip().split(":")[1]
+                for x in self.heter_worker_endpoints.split(",")
+            ]
+            for ip in self.heter_worker_endpoints_ips:
+                if ip not in self.node_ips:
+                    self.node_ips.append(ip)
+
+        if len(set(self.node_ips)) == 1:
+            self.is_local = True
+            self.current_node_ip = self.node_ips[0]
+        else:
+            self.is_local = False
+            self.current_node_ip = os.getenv("POD_IP", None)
+            if self.current_node_ip is None:
+                _, self.current_node_ip = get_host_name_ip()
+            assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
+                % (self.current_node_ip, self.node_ips)
+        self.node_rank = self.node_ips.index(self.current_node_ip)
+        logger.debug(
+            "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}".
+            format(self.node_ips, self.current_node_ip, self.node_rank))
+
+    def start_ps(self):
+        """Launch this node's server, worker and heter-worker processes.
+
+        Builds one Pod per node ip, starts the processes of the pod at
+        node_rank, then waits for the workers and terminates the rest; a node
+        without workers blocks on its servers and heter workers instead.
+        """
+        cluster = Cluster(hdfs=None)
+        server_rank = 0
+        worker_rank = 0
+        heter_worker_rank = 0
+
+        for node_rank, ip in enumerate(self.node_ips):
+            pod = Pod()
+            pod.rank = node_rank
+            pod.addr = ip
+            for i in range(len(self.server_endpoints_ips)):
+                if ip == self.server_endpoints_ips[i]:
+                    server = Trainer()
+                    server.endpoint = "%s:%s" % (ip,
+                                                 self.server_endpoints_port[i])
+                    server.rank = server_rank
+                    server_rank += 1
+                    pod.servers.append(server)
+            for j in range(len(self.worker_endpoints_ips)):
+                if ip == self.worker_endpoints_ips[j]:
+                    worker = Trainer()
+                    worker.endpoint = "%s:%s" % (ip,
+                                                 self.worker_endpoints_port[j])
+                    worker.rank = worker_rank
+                    worker_rank += 1
+                    pod.workers.append(worker)
+            for k in range(len(self.heter_worker_endpoints_ips)):
+                if ip == self.heter_worker_endpoints_ips[k]:
+                    heter_worker = Trainer()
+                    heter_worker.endpoint = "%s:%s" % (
+                        ip, self.heter_worker_endpoints_port[k])
+                    heter_worker.rank = heter_worker_rank
+                    heter_worker_rank += 1
+                    pod.heter_workers.append(heter_worker)
+
+            cluster.pods.append(pod)
+
+        pod = cluster.pods[self.node_rank]
+        self.gloo_rendezvous_dir = tempfile.mkdtemp()
+
+        # 3. start subprocesses
+        self.procs = {"worker": [], "server": [], "heter_worker": []}
+        self.cmds = {"worker": [], "server": [], "heter_worker": []}
+        self.log_fns = {"worker": [], "server": [], "heter_worker": []}
+
+        self.start_pod_server(self.args, pod)
+        self.start_pod_worker(self.args, pod)
+        # heter workers exist only in PS_HETER mode; their launcher asserts a device
+        if self.distribute_mode == DistributeMode.PS_HETER:
+            self.start_pod_heter_worker(self.args, pod)
+
+        logger.info(
+            "Please check servers, workers and heter_worker logs in {}/workerlog.*, {}/serverlog.* and {}/heterlog.*".
+            format(self.args.log_dir, self.args.log_dir, self.args.log_dir))
+
+        # 4. wait for finish training
+        if len(self.procs["worker"]) > 0:
+            # if node has worker procs
+            # only wait worker to finish here
+            for i, tp in enumerate(self.procs["worker"]):
+                tp.proc.wait()
+                if len(self.log_fns["worker"]) > 0:
+                    self.log_fns["worker"][i].close()
+            logger.info(
+                "all workers exit, going to finish parameter server and heter_worker."
+            )
+            for role, msg in (("heter_worker", "all heter_worker are killed"),
+                              ("server", "all parameter server are killed")):
+                if len(self.procs[role]) > 0:
+                    for i, tp in enumerate(self.procs[role]):
+                        # log_fns stays empty when no log_dir was given
+                        if len(self.log_fns[role]) > 0:
+                            self.log_fns[role][i].close()
+                        tp.proc.terminate()
+                    logger.info(msg)
+        else:
+            # if node has not worker procs
+            # blocking training process
+            for tp in self.procs["server"] + self.procs["heter_worker"]:
+                tp.proc.wait()
+
+        if os.path.exists(self.gloo_rendezvous_dir):
+            shutil.rmtree(self.gloo_rendezvous_dir)
+
+    def start_pod_server(self, args, pod):
+        """Start one PSERVER process per server of `pod`.
+
+        Output goes to <log_dir>/serverlog.<idx> when args.log_dir is set.
+        """
+        current_env = os.environ.copy()
+        current_env.pop("http_proxy", None)
+        current_env.pop("https_proxy", None)
+        for idx, cur_server in enumerate(pod.servers):
+            proc_env = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints,
+                "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints,
+                "PADDLE_HETER_TRAINER_IP_PORT_LIST":
+                self.heter_worker_endpoints,
+                "PADDLE_PORT": cur_server.endpoint.split(":")[1],
+                "TRAINING_ROLE": "PSERVER",
+                "PADDLE_TRAINERS_NUM": str(self.worker_num),
+                "POD_IP": cur_server.endpoint.split(":")[0],
+                "PADDLE_WITH_GLOO": "1",
+                "PADDLE_GLOO_RENDEZVOUS": "2",
+                "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir
+            }
+            current_env.update(proc_env)
+
+            cmd = [sys.executable, "-u", args.training_script
+                   ] + args.training_script_args
+            self.cmds["server"].append(cmd)
+
+            if idx == 0:
+                logger.info(
+                    "Local server start {} processes. First process distributed "
+                    "environment info (Only For Debug): {}".format(
+                        len(pod.servers),
+                        pretty_print_envs(proc_env, ("Distributed Envs", "Value"
+                                                     ))))
+
+            fn = None  # no log file unless args.log_dir is set
+            if args.log_dir is not None:
+                os.system("mkdir -p {}".format(args.log_dir))
+                fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w")
+                self.log_fns["server"].append(fn)
+            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
+
+            tp = TrainerProc()
+            tp.proc = proc
+            tp.rank = cur_server.rank
+            tp.local_rank = idx
+            tp.log_fn = fn
+            tp.log_offset = fn.tell() if fn else None
+            tp.cmd = cmd
+            self.procs["server"].append(tp)
+
+    def start_pod_worker(self, args, pod):
+        """Start one TRAINER process per worker of `pod`.
+
+        Output goes to <log_dir>/workerlog.<idx> when args.log_dir is set.
+        """
+        current_env = os.environ.copy()
+        current_env.pop("http_proxy", None)
+        current_env.pop("https_proxy", None)
+        device_list = []
+        if fluid.core.is_compiled_with_cuda():
+            device_list = get_gpus(args.gpus)
+        elif fluid.core.is_compiled_with_xpu():
+            xpu_num = fluid.core.get_xpu_device_count()
+            device_list = [str(x) for x in range(0, xpu_num)]
+        heter_device_num = len(device_list)
+
+        for idx, cur_worker in enumerate(pod.workers):
+            # a build with neither CUDA nor XPU has no device: avoid modulo by zero
+            device_id = "0" if heter_device_num == 0 else str(
+                device_list[idx % heter_device_num])
+            proc_env = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints,
+                "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints,
+                "PADDLE_TRAINERS_NUM": str(self.worker_num),
+                "PADDLE_HETER_TRAINER_IP_PORT_LIST":
+                self.heter_worker_endpoints,
+                "TRAINING_ROLE": "TRAINER",
+                "PADDLE_TRAINER_ID": str(cur_worker.rank),
+                "PADDLE_WITH_GLOO": "1",
+                "PADDLE_GLOO_RENDEZVOUS": "2",
+                "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
+                "FLAGS_selected_gpus": "0",
+                "FLAGS_selected_xpus": "0",
+                "CUDA_VISIBLE_DEVICES": device_id,
+                "XPU_VISIBLE_DEVICES": device_id,
+            }
+            current_env.update(proc_env)
+
+            cmd = [sys.executable, "-u", args.training_script
+                   ] + args.training_script_args
+            self.cmds["worker"].append(cmd)
+
+            if idx == 0:
+                logger.info(
+                    "Local worker start {} processes. First process distributed "
+                    "environment info (Only For Debug): {}".format(
+                        len(pod.workers),
+                        pretty_print_envs(proc_env, ("Distributed Envs", "Value"
+                                                     ))))
+
+            fn = None  # no log file unless args.log_dir is set
+            if args.log_dir is not None:
+                os.system("mkdir -p {}".format(args.log_dir))
+                fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w")
+                self.log_fns["worker"].append(fn)
+            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
+
+            tp = TrainerProc()
+            tp.proc = proc
+            tp.rank = cur_worker.rank
+            tp.local_rank = idx
+            tp.log_fn = fn
+            tp.log_offset = fn.tell() if fn else None
+            tp.cmd = cmd
+            self.procs["worker"].append(tp)
+
+    def start_pod_heter_worker(self, args, pod):
+        """Start one HETER_TRAINER process per heter worker of `pod`.
+
+        Output goes to <log_dir>/heterlog.<idx> when args.log_dir is set.
+        """
+        # no heter workers on this pod: skip the GPU/XPU assertion below
+        if not pod.heter_workers:
+            return
+        current_env = os.environ.copy()
+        current_env.pop("http_proxy", None)
+        current_env.pop("https_proxy", None)
+        device_list = []
+        if fluid.core.is_compiled_with_cuda():
+            device_list = get_gpus(args.gpus)
+        elif fluid.core.is_compiled_with_xpu():
+            xpu_num = fluid.core.get_xpu_device_count()
+            device_list = [str(x) for x in range(0, xpu_num)]
+        heter_device_num = len(device_list)
+        assert heter_device_num != 0
+
+        for idx, cur_heter_worker in enumerate(pod.heter_workers):
+            device_id = str(device_list[idx % heter_device_num])
+            proc_env = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints,
+                "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints,
+                "PADDLE_HETER_TRAINER_IP_PORT_LIST":
+                self.heter_worker_endpoints,
+                "PADDLE_PORT": cur_heter_worker.endpoint.split(":")[1],
+                "TRAINING_ROLE": "HETER_TRAINER",
+                "PADDLE_TRAINERS_NUM": str(self.worker_num),
+                "POD_IP": cur_heter_worker.endpoint.split(":")[0],
+                "PADDLE_WITH_GLOO": "1",
+                "PADDLE_GLOO_RENDEZVOUS": "2",
+                "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
+                "FLAGS_selected_gpus": "0",
+                "FLAGS_selected_xpus": "0",
+                "CUDA_VISIBLE_DEVICES": device_id,
+                "XPU_VISIBLE_DEVICES": device_id,
+            }
+            current_env.update(proc_env)
+            cmd = [sys.executable, "-u", args.training_script
+                   ] + args.training_script_args
+            self.cmds["heter_worker"].append(cmd)
+
+            if idx == 0:
+                logger.info(
+                    "Local heter_worker start {} processes. First process distributed "
+                    "environment info (Only For Debug): {}".format(
+                        len(pod.heter_workers),
+                        pretty_print_envs(proc_env, ("Distributed Envs", "Value"
+                                                     ))))
+
+            fn = None  # no log file unless args.log_dir is set
+            if args.log_dir is not None:
+                os.system("mkdir -p {}".format(args.log_dir))
+                fn = open("%s/heterlog.%d" % (args.log_dir, idx), "w")
+                self.log_fns["heter_worker"].append(fn)
+            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
+
+            tp = TrainerProc()
+            tp.proc = proc
+            tp.rank = cur_heter_worker.rank
+            tp.local_rank = idx
+            tp.log_fn = fn
+            tp.log_offset = fn.tell() if fn else None
+            tp.cmd = cmd
+            self.procs["heter_worker"].append(tp)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index ad96e1426694f..283589c5f3320 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -19,16 +19,14 @@ class AMPOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(AMPOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
-        self.amp_opt = None
+        self.wrapped_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = [
             "LarsOptimizer",
             "LambOptimizer",
             "RecomputeOptimizer",
-            "LocalSGDOptimizer",
             "GradientMergeOptimizer",
             "GraphExecutionOptimizer",
-            "AdaptiveLocalSGDOptimizer",
         ]
         self.meta_optimizers_black_list = ["DGCOptimizer"]
 
@@ -37,6 +35,24 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
         super(AMPOptimizer, self)._set_basic_info(
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
 
+    def _init_wrapped_opt(self):
+        if self.wrapped_opt is not None:
+            return
+
+        config = self.user_defined_strategy.amp_configs
+
+        custom_white_list = set(config['custom_white_list'])
+        custom_black_list = set(config['custom_black_list'])
+        custom_black_varnames = set(config['custom_black_varnames'])
+        amp_lists = mixed_precision.AutoMixedPrecisionLists(
+            custom_white_list, custom_black_list, custom_black_varnames)
+
+        self.wrapped_opt = mixed_precision.decorate(
+            self.inner_opt, amp_lists, config['init_loss_scaling'],
+            config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'],
+            config['incr_ratio'], config['decr_ratio'],
+            config['use_dynamic_loss_scaling'])
+
     def _can_apply(self):
         if not self.role_maker._is_collective:
             return False
@@ -60,26 +76,31 @@ def _enable_strategy(self, dist_strategy, context):
             "use_dynamic_loss_scaling": True
         }
 
+    def backward(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None,
+                 callbacks=None):
+        # maybe inner_opt of other meta optimizer
+        self._init_wrapped_opt()
+        return self.wrapped_opt.backward(loss, startup_program, parameter_list,
+                                         no_grad_set, callbacks)
+
+    def apply_gradients(self, params_grads):
+        return self.wrapped_opt.apply_gradients(params_grads=params_grads)
+
+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.wrapped_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        if self.amp_opt is None:
-            config = self.user_defined_strategy.amp_configs
-            custom_white_list = set(config['custom_white_list'])
-            custom_black_list = set(config['custom_black_list'])
-            custom_black_varnames = set(config['custom_black_varnames'])
-            amp_lists = mixed_precision.AutoMixedPrecisionLists(
-                custom_white_list, custom_black_list, custom_black_varnames)
-
-            self.amp_opt = mixed_precision.decorate(
-                self.inner_opt, amp_lists, config['init_loss_scaling'],
-                config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'],
-                config['incr_ratio'], config['decr_ratio'],
-                config['use_dynamic_loss_scaling'])
-
+        self._init_wrapped_opt()
         optimize_ops, params_grads = \
-            self.amp_opt.minimize(loss, startup_program,
+            self.wrapped_opt.minimize(loss, startup_program,
                                   parameter_list, no_grad_set)
         return optimize_ops, params_grads
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index 6806a479d30f4..9990021c8506a 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -85,6 +85,13 @@ def backward(self,
         return self.dgc_opt.backward(loss, startup_program, parameter_list,
                                      no_grad_set, callbacks)
 
+    def apply_gradients(self, params_grads):
+        return self.dgc_opt.apply_gradients(params_grads=params_grads)
+
+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.dgc_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
index df9887759e16f..64d54ae3bab03 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -98,6 +98,10 @@ def backward(self,
     def apply_gradients(self, params_grads):
         return self.lamb_opt.apply_gradients(params_grads=params_grads)
 
+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.lamb_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index 609d8b85e714c..32c6be505a546 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -85,6 +85,10 @@ def backward(self,
     def apply_gradients(self, params_grads):
         return self.lars_opt.apply_gradients(params_grads=params_grads)
 
+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.lars_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 9f094978d842a..91030f0762934 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -24,7 +24,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(LocalSGDOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = ['AMPOptimizer']
         self.meta_optimizers_black_list = [
             "GraphExecutionOptimizer",
             "AdaptiveLocalSGDOptimizer",
@@ -195,7 +195,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(AdaptiveLocalSGDOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = ['AMPOptimizer']
         self.meta_optimizers_black_list = [
             "GraphExecutionOptimizer", "LocalSGDOptimizer"
         ]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
index 38ad41f8836b4..83345cb6f623e 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -74,6 +74,8 @@ def _build_trainer_programs(self, compiled_config):
             _startup = worker.delet_extra_optimizes_pass(_startup,
                                                          compiled_config)
 
+            compiled_config.set_origin_ps_main_program(_main)
+            compiled_config.set_origin_ps_startup_program(_startup)
             # for heter program
             if self.role_maker._is_heter_parameter_server_mode:
                 from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
@@ -91,6 +93,8 @@ def _build_trainer_programs(self, compiled_config):
         else:
             _main = worker.append_send_ops_pass(_main, compiled_config)
             _startup = _startup
+            compiled_config.set_origin_ps_main_program(_main)
+            compiled_config.set_origin_ps_startup_program(_startup)
 
         return _main, _startup
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index 59ca7e633099e..ea2b67ac4bd1f 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -18,15 +18,14 @@
 class RecomputeOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(RecomputeOptimizer, self).__init__(optimizer)
-        #self.inner_opt = RO(optimizer)
         self.inner_opt = optimizer
-        self.wrapped_opt = RO(optimizer)
+        self.wrapped_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = [
             "LarsOptimizer",
             "LambOptimizer",
-            "GradientMergeOptimizer",
             "GraphExecutionOptimizer",
+            "DGCOptimizer",
         ]
         self.meta_optimizers_black_list = []
 
@@ -34,8 +33,15 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
         super(RecomputeOptimizer, self)._set_basic_info(
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
-        self.wrapped_opt._set_checkpoints(
-            list(user_defined_strategy.recompute_configs["checkpoints"]))
+
+    def _init_wrapped_opt(self):
+        """Create the wrapped recompute optimizer once, on first use."""
+        if self.wrapped_opt is None:
+            recompute_cfg = self.user_defined_strategy.recompute_configs
+            self.wrapped_opt = RO(self.inner_opt)
+            # checkpoints are read from the strategy's recompute_configs
+            self.wrapped_opt._set_checkpoints(
+                list(recompute_cfg["checkpoints"]))
 
     def _can_apply(self):
         if not self.role_maker._is_collective:
@@ -62,14 +68,24 @@ def backward(self,
                  parameter_list=None,
                  no_grad_set=None,
                  callbacks=None):
+        # maybe inner_opt of other meta optimizer
+        self._init_wrapped_opt()
         return self.wrapped_opt.backward(loss, startup_program, parameter_list,
                                          no_grad_set, callbacks)
 
+    def apply_gradients(self, params_grads):
+        return self.wrapped_opt.apply_gradients(params_grads=params_grads)
+
+    def apply_optimize(self, loss, startup_program, params_grads):
+        return self.wrapped_opt.apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
+        self._init_wrapped_opt()
         optimize_ops, params_grads = \
             self.wrapped_opt.minimize(loss, startup_program,
                                       parameter_list, no_grad_set)
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index 42be7e869d9a7..266c7d0f405bf 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -210,18 +210,23 @@ def get_sparse_attrs():
             warnings.warn("communicator has been initialized, skip")
 
     def _get_executor(self):
-        if self.role_maker._is_heter_worker():
-            if self.role_maker._get_heter_worker_device() == "GPU":
-                gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-                executor = Executor(fluid.CUDAPlace(gpu_id))
-            elif self.role_maker._get_heter_worker_device() == "XPU":
-                xpu_id = int(os.getenv("FLAGS_selected_xpus", "0"))
-                executor = Executor(fluid.XPUPlace(xpu_id))
-            else:
-                raise ValueError("Not Support Device {}".format(
-                    self.role_maker._get_heter_worker_device()))
-        else:
-            executor = fluid.Executor(fluid.CPUPlace())
+        executor = fluid.Executor(fluid.CPUPlace())
+        if self.role_maker._is_heter_parameter_server_mode:
+            heter_worker_device_guard = self.context[
+                "valid_strategy"].a_sync_configs[
+                    "heter_worker_device_guard"].upper()
+            if heter_worker_device_guard not in ["GPU", "XPU", "CPU"]:
+                raise ValueError("Heter Worker Not Support Device {}".format(
+                    heter_worker_device_guard))
+            if self.role_maker._is_heter_worker():
+                if heter_worker_device_guard == "GPU":
+                    executor = Executor(
+                        fluid.CUDAPlace(
+                            int(os.getenv("FLAGS_selected_gpus", "0"))))
+                elif heter_worker_device_guard == "XPU":
+                    executor = Executor(
+                        fluid.XPUPlace(
+                            int(os.getenv("FLAGS_selected_xpus", "0"))))
         return executor
 
     def _init_server(self, *args, **kwargs):
@@ -233,12 +238,14 @@ def _init_server(self, *args, **kwargs):
             model_dirname = None
 
         executor = self._get_executor()
+        if self.role_maker._is_heter_worker() and self.context[
+                "valid_strategy"].a_sync_configs["launch_barrier"]:
+            # for heter trainer wait server ready
+            wait_server_ready(self.role_maker._get_pserver_endpoints())
         executor.run(fluid.default_startup_program())
 
         if self.role_maker._is_heter_worker():
             self._init_worker()
-
-        if self.role_maker._is_heter_worker():
             return
 
         if not model_dirname:
@@ -470,13 +477,13 @@ def _save_distributed_params(self, executor, dirname, context,
 
     def _save_distributed_persistables(self, executor, dirname, main_program):
         dense_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=1)
+            recv_type=1, use_origin_program=True)
 
         sparse_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=2)
+            recv_type=2, use_origin_program=True)
 
         distributed_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=3)
+            recv_type=3, use_origin_program=True)
 
         recv_dense_varnames = self._save_dense_params(executor, dirname,
                                                       dense_ctx, main_program)
@@ -528,7 +535,7 @@ def _ps_inference_save_persistables(self,
             )
 
         if main_program is None:
-            main_program = fluid.default_main_program()
+            main_program = self.compiled_strategy.get_origin_ps_main_program()
 
         if isinstance(main_program, CompiledProgram):
             raise TypeError(
diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py
index 35204affb3fd1..ff3e882229ae8 100644
--- a/python/paddle/distribution.py
+++ b/python/paddle/distribution.py
@@ -28,13 +28,14 @@
 from .fluid import core
 from .fluid.framework import in_dygraph_mode
 from .tensor.math import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub
+from .tensor import arange, gather_nd, concat, multinomial
 import math
 import numpy as np
 import warnings
 
 from .fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
 
-__all__ = ['Distribution', 'Uniform', 'Normal']
+__all__ = ['Distribution', 'Uniform', 'Normal', 'Categorical']
 
 
 class Distribution(object):
@@ -640,3 +641,318 @@ def kl_divergence(self, other):
         t1 = (t1 * t1)
         return elementwise_add(
             0.5 * var_ratio, 0.5 * (t1 - 1. - nn.log(var_ratio)), name=name)
+
+
+class Categorical(Distribution):
+    """
+    Categorical distribution is a discrete probability distribution that 
+    describes the possible results of a random variable that can take on 
+    one of K possible categories, with the probability of each category 
+    separately specified.
+
+    The probability mass function (pmf) is:
+
+    .. math::
+
+        pmf(x; p_1, \\ldots, p_K) = \\prod_{i=1}^{K} p_i^{[x=i]}
+
+    In the above equation:
+
+    * :math:`[x=i]` : it evaluates to 1 if :math:`x==i` , 0 otherwise.
+
+    Args:
+        logits(list|numpy.ndarray|Tensor): The logits input of categorical distribution. The data type is float32 or float64.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle.distribution import Categorical
+
+          x = paddle.rand([6])
+          print(x.numpy())
+          # [0.32564053, 0.99334985, 0.99034804,
+          #  0.09053693, 0.30820143, 0.19095989]
+          y = paddle.rand([6])
+          print(y.numpy())
+          # [0.6365463 , 0.7278677 , 0.90260243, 
+          # 0.5226815 , 0.35837543, 0.13981032]
+
+          cat = Categorical(x)
+          cat2 = Categorical(y)
+
+          cat.sample([2,3])
+          # [[5, 1, 1],
+          # [0, 1, 2]]
+
+          cat.entropy()
+          # [1.71887]
+
+          cat.kl_divergence(cat2)
+          # [0.0278455]
+
+          value = paddle.to_tensor([2,1,3])
+          cat.probs(value)
+          # [0.341613 0.342648 0.03123]
+
+          cat.log_prob(value)
+          # [-1.07408 -1.07105 -3.46638]
+
+    """
+
+    def __init__(self, logits, name=None):
+        """
+        Args:
+            logits(list|numpy.ndarray|Variable): The logits input of categorical distribution. The data type is float32 or float64.
+        """
+        if not in_dygraph_mode():
+            check_type(logits, 'logits', (np.ndarray, tensor.Variable, list),
+                       'Categorical')
+
+        self.name = name if name is not None else 'Categorical'
+        self.dtype = 'float32'
+
+        if self._validate_args(logits):
+            self.logits = logits
+            self.dtype = convert_dtype(logits.dtype)
+        else:
+            if isinstance(logits, np.ndarray) and str(
+                    logits.dtype) in ['float32', 'float64']:
+                self.dtype = logits.dtype
+            self.logits = self._to_tensor(logits)[0]
+            if self.dtype != convert_dtype(self.logits.dtype):
+                self.logits = tensor.cast(self.logits, dtype=self.dtype)
+
+    def sample(self, shape):
+        """Generate samples of the specified shape.
+
+        Args:
+          shape (list): Shape of the generated samples.
+
+        Returns:
+          Tensor: Category indices with shape ``shape + logits.shape[:-1]``.
+
+        Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle.distribution import Categorical
+
+          x = paddle.rand([6])
+          print(x.numpy())
+          # [0.32564053, 0.99334985, 0.99034804,
+          #  0.09053693, 0.30820143, 0.19095989]
+
+          cat = Categorical(x)
+
+          cat.sample([2,3])
+          # [[5, 1, 1],
+          # [0, 1, 2]]
+
+        """
+        name = self.name + '_sample'
+        if not in_dygraph_mode():
+            check_type(shape, 'shape', (list), 'sample')
+
+        num_samples = np.prod(np.array(shape))
+        logits_shape = list(self.logits.shape)
+        batched = len(logits_shape) > 1
+        sample_shape = shape + logits_shape[:-1] if batched else shape
+        logits = self.logits
+        if batched:
+            logits = nn.reshape(logits, [-1, logits_shape[-1]])
+
+        sample_index = multinomial(logits, num_samples, True)
+        if batched:
+            # multinomial returns [num_dist, num_samples]; move samples first
+            sample_index = nn.transpose(sample_index, perm=[1, 0])
+        return nn.reshape(sample_index, sample_shape, name=name)
+
+    def kl_divergence(self, other):
+        """The KL-divergence between two Categorical distributions.
+
+        Args:
+            other (Categorical): instance of Categorical. The data type is float32.
+
+        Returns:
+            Variable: kl-divergence between two Categorical distributions.
+        
+        Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle.distribution import Categorical
+
+          x = paddle.rand([6])
+          print(x.numpy())
+          # [0.32564053, 0.99334985, 0.99034804,
+          #  0.09053693, 0.30820143, 0.19095989]
+          y = paddle.rand([6])
+          print(y.numpy())
+          # [0.6365463 , 0.7278677 , 0.90260243, 
+          # 0.5226815 , 0.35837543, 0.13981032]
+
+          cat = Categorical(x)
+          cat2 = Categorical(y)
+
+          cat.kl_divergence(cat2)
+          # [0.0278455]
+
+        """
+        name = self.name + '_kl_divergence'
+        if not in_dygraph_mode():
+            check_type(other, 'other', Categorical, 'kl_divergence')
+
+        logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True)
+        other_logits = other.logits - nn.reduce_max(
+            other.logits, dim=-1, keep_dim=True)
+        e_logits = ops.exp(logits)
+        other_e_logits = ops.exp(other_logits)
+        z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True)
+        other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True)
+        prob = e_logits / z
+        kl = nn.reduce_sum(
+            prob * (logits - nn.log(z) - other_logits + nn.log(other_z)),
+            dim=-1,
+            keep_dim=True,
+            name=name)
+
+        return kl
+
+    def entropy(self):
+        """Shannon entropy in nats.
+
+        Returns:
+          Variable: Shannon entropy of Categorical distribution. The data type is float32.
+        
+        Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle.distribution import Categorical
+
+          x = paddle.rand([6])
+          print(x.numpy())
+          # [0.32564053, 0.99334985, 0.99034804,
+          #  0.09053693, 0.30820143, 0.19095989]
+
+          cat = Categorical(x)
+
+          cat.entropy()
+          # [1.71887]
+
+        """
+        name = self.name + '_entropy'
+        logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True)
+        e_logits = ops.exp(logits)
+        z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True)
+        prob = e_logits / z
+
+        neg_entropy = nn.reduce_sum(
+            prob * (logits - nn.log(z)), dim=-1, keep_dim=True)
+        entropy = nn.scale(neg_entropy, scale=-1.0, name=name)
+        return entropy
+
+    def probs(self, value):
+        """Probabilities of the given category (``value``).
+
+        If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as
+        category, and the others represents the different distributions.
+        At the same time, if ``value`` is 1-D Tensor, ``value`` will be broadcast to the
+        same number of distributions as ``logits``.
+        If ``value`` is not 1-D Tensor, ``value`` should have the same number distributions
+        with ``logits``. That is, ``value.shape[:-1] = logits.shape[:-1]``.
+
+        Args:
+          value (Tensor): The input tensor represents the selected category index.
+
+        Returns:
+          Tensor: probability according to the category index.
+
+        Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle.distribution import Categorical
+
+          x = paddle.rand([6])
+          print(x.numpy())
+          # [0.32564053, 0.99334985, 0.99034804,
+          #  0.09053693, 0.30820143, 0.19095989]
+
+          cat = Categorical(x)
+
+          value = paddle.to_tensor([2,1,3])
+          cat.probs(value)
+          # [0.341613 0.342648 0.03123]
+
+        """
+        name = self.name + '_probs'
+
+        dist_sum = nn.reduce_sum(self.logits, dim=-1, keep_dim=True)
+        prob = self.logits / dist_sum
+
+        shape = list(prob.shape)
+        value_shape = list(value.shape)
+        if len(shape) == 1:
+            num_value_in_one_dist = np.prod(value_shape)
+            index_value = nn.reshape(value, [num_value_in_one_dist, 1])
+            index = index_value
+        else:
+            num_dist = np.prod(shape[:-1])
+            num_value_in_one_dist = value_shape[-1]
+            prob = nn.reshape(prob, [num_dist, shape[-1]])
+            if len(value_shape) == 1:
+                value = nn.expand(value, [num_dist])
+                value_shape = shape[:-1] + value_shape
+            if shape[:-1] != value_shape[:-1]:
+                raise ValueError(
+                    "shape of value {} must match shape of logits {}".format(
+                        str(value_shape[:-1]), str(shape[:-1])))
+            index_value = nn.reshape(value, [num_dist, -1, 1])
+
+            index_prefix = nn.unsqueeze(
+                arange(
+                    num_dist, dtype=index_value.dtype), axes=-1)
+            index_prefix = nn.expand(index_prefix, [1, num_value_in_one_dist])
+            index_prefix = nn.unsqueeze(index_prefix, axes=-1)
+            if index_value.dtype != index_prefix.dtype:
+                index_prefix = tensor.cast(
+                    index_prefix, dtype=index_value.dtype)
+            index = concat([index_prefix, index_value], axis=-1)
+
+        # value is the category index to search for the corresponding probability.
+        select_prob = gather_nd(prob, index)
+        return nn.reshape(select_prob, value_shape, name=name)
+
+    def log_prob(self, value):
+        """Log probabilities of the given category. Refer to ``probs`` method.
+
+        Args:
+          value (Tensor): The input tensor represents the selected category index.
+
+        Returns:
+          Tensor: Log probability.
+        
+        Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle.distribution import Categorical
+
+          x = paddle.rand([6])
+          print(x.numpy())
+          # [0.32564053, 0.99334985, 0.99034804,
+          #  0.09053693, 0.30820143, 0.19095989]
+
+          cat = Categorical(x)
+
+          value = paddle.to_tensor([2,1,3])
+
+          cat.log_prob(value)
+          # [-1.07408 -1.07105 -3.46638]
+
+        """
+        name = self.name + '_log_prob'
+
+        return nn.log(self.probs(value), name=name)
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 0e7a9dbea2561..505d6fef8fb53 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -26,8 +26,8 @@
 from .dygraph import base as imperative_base
 
 __all__ = [
-    'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue',
-    'GradientClipByNorm', 'GradientClipByGlobalNorm'
+    'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
+    'ClipGradByNorm', 'ClipGradByGlobalNorm'
 ]
 
 
@@ -115,16 +115,9 @@ def error_clip_callback(block, context):
             error_clip._append_clip_op(block, grad_n)
 
 
-class GradientClipBase(object):
-    def __init__(self, need_clip=None):
-        if need_clip is not None and not callable(need_clip):
-            raise TypeError(
-                "The type of need_clip must be funciton, and it can filter out "
-                "parameter that does't need gradient clip. This function must return "
-                "True or False, and True means that clipping is required. Please refer to "
-                "API documention of GradientClipByGlobalNorm / GradientClipByNorm "
-                "/GradientClipByValue.")
-        self._need_clip_func = need_clip
+class ClipGradBase(object):
+    def __init__(self):
+        super(ClipGradBase, self).__init__()
 
     def __str__(self):
         raise NotImplementedError()
@@ -144,7 +137,7 @@ def __call__(self, params_grads):
                 if getattr(p, 'gradient_clip_attr', None) is not None:
                     warnings.warn(
                         "'set_gradient_clip' will be ineffective, because you have "
-                        "set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' "
+                        "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
                         "is redundant and you can remove it.")
                     break
             return self._static_clip(params_grads)
@@ -156,7 +149,7 @@ def _create_operators(self, param, grad):
         raise NotImplementedError()
 
 
-class GradientClipByValue(GradientClipBase):
+class ClipGradByValue(ClipGradBase):
     """
     Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
     
@@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase):
     
     - Any values greater than max are set to ``max``.
 
-    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
-    is not None, then only part of gradients can be selected for gradient clipping.
+    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. 
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
     
     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` 
     (for example: :ref:`api_paddle_optimizer_SGD`).
+
+    Note:
+        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. 
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
     
     Args:
         max (float): The maximum value to clip by.
         min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` 
             automatically. In this case, ``max`` must be greater than 0.
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` 
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, 
-            and gradients of all parameters in the network will be clipped.
 
     Examples:
         .. code-block:: python
@@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase):
             import paddle
 
             x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(10, 10)
+            linear = paddle.nn.Linear(in_features=10, out_features=10, 
+                                      weight_attr=paddle.ParamAttr(need_clip=True), 
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
             out = linear(x)
             loss = paddle.mean(out)
             loss.backward()
 
-            # clip all parameters in network:
-            clip = paddle.nn.GradientClipByValue(min=-1, max=1)
-
-            # clip a part of parameters in network: (e.g. linear_0.w_0)
-            # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-            # def fileter_func(ParamBase):
-            # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-            #   return ParamBase.name == "linear_0.w_0"
-            # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-            #   return ParamBase.name == linear.weight.name
-            # clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func)
-
+            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
             sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
             sdg.step()
     """
 
-    def __init__(self, max, min=None, need_clip=None):
-        super(GradientClipByValue, self).__init__(need_clip)
+    def __init__(self, max, min=None):
+        super(ClipGradByValue, self).__init__()
         if min is None:
             assert (max > 0.0)
             min = -max
@@ -214,7 +199,7 @@ def __init__(self, max, min=None, need_clip=None):
         self.min = float(min)
 
     def __str__(self):
-        return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
+        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)
 
     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):
@@ -222,7 +207,7 @@ def _dygraph_clip(self, params_grads):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.clip(x=g, min=self.min, max=self.max)
@@ -236,8 +221,7 @@ def _static_clip(self, params_grads):
             for p, g in params_grads:
                 if g is None:
                     continue
-                if self._need_clip_func is not None and not self._need_clip_func(
-                        p):
+                if getattr(p, 'need_clip', True) is False:
                     params_and_grads.append((p, g))
                     continue
 
@@ -256,7 +240,7 @@ def _create_operators(self, param, grad):
         return param, new_grad
 
 
-class GradientClipByNorm(GradientClipBase):
+class ClipGradByNorm(ClipGradBase):
     """
     Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
     
@@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase):
     
     - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
     
-    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
-    is not None, then only part of gradients can be selected for gradient clipping.
+    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
     
     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` 
     (for example: :ref:`api_paddle_optimizer_SGD`).
@@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase):
     .. math::
         norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}
 
+    Note:
+        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. 
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+
     Args:
         clip_norm(float): The maximum norm value.
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` 
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, 
-            and gradients of all parameters in the network will be clipped.
 
     Examples:
         .. code-block:: python
@@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase):
             import paddle
 
             x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(10, 10)
+            linear = paddle.nn.Linear(in_features=10, out_features=10, 
+                                      weight_attr=paddle.ParamAttr(need_clip=True), 
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
             out = linear(x)
             loss = paddle.mean(out)
             loss.backward()
 
-            # clip all parameters in network:
-            clip = paddle.nn.GradientClipByNorm(clip_norm=1.0)
-
-            # clip a part of parameters in network: (e.g. linear_0.w_0)
-            # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-            # def fileter_func(ParamBase):
-            # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-            #   return ParamBase.name == "linear_0.w_0"
-            # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-            #   return ParamBase.name == linear.weight.name
-            # clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func)
-
+            clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
             sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
             sdg.step()
     """
 
-    def __init__(self, clip_norm, need_clip=None):
-        super(GradientClipByNorm, self).__init__(need_clip)
+    def __init__(self, clip_norm):
+        super(ClipGradByNorm, self).__init__()
         self.clip_norm = float(clip_norm)
 
     def __str__(self):
@@ -333,7 +309,7 @@ def _dygraph_clip(self, params_grads):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm)
@@ -347,8 +323,7 @@ def _static_clip(self, params_grads):
             for p, g in params_grads:
                 if g is None:
                     continue
-                if self._need_clip_func is not None and not self._need_clip_func(
-                        p):
+                if getattr(p, 'need_clip', True) is False:
                     params_and_grads.append((p, g))
                     continue
 
@@ -367,7 +342,7 @@ def _create_operators(self, param, grad):
         return param, new_grad
 
 
-class GradientClipByGlobalNorm(GradientClipBase):
+class ClipGradByGlobalNorm(ClipGradBase):
     """
     Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in 
     :math:`t\_list` , and limit it to ``clip_norm`` .
@@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase):
     
     - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
     
-    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip``
-    is not None, then only part of gradients can be selected for gradient clipping.
+    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
+    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
     
     Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` 
     (for example: :ref:`api_paddle_optimizer_SGD`).
@@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase):
 
         global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
 
+    Note:
+        ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+
     Args:
         clip_norm (float): The maximum norm value.
-        group_name (str, optional): The group name for this clip. Default value is ``default_group``
-        need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` 
-            (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, 
-            and gradients of all parameters in the network will be clipped.
+        group_name (str, optional): The group name for this clip. Default value is ``default_group``.
 
     Examples:
         .. code-block:: python
@@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase):
             import paddle
 
             x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(10, 10)
+            linear = paddle.nn.Linear(in_features=10, out_features=10, 
+                                      weight_attr=paddle.ParamAttr(need_clip=True), 
+                                      bias_attr=paddle.ParamAttr(need_clip=False))
             out = linear(x)
             loss = paddle.mean(out)
             loss.backward()
 
-            # clip all parameters in network:
-            clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0)
-
-            # clip a part of parameters in network: (e.g. linear_0.w_0)
-            # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool
-            # def fileter_func(ParamBase):
-            # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0)
-            #   return ParamBase.name == "linear_0.w_0"
-            # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter
-            #   return ParamBase.name == linear.weight.name
-            # clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func)
-
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
             sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
             sdg.step()
     """
 
-    def __init__(self, clip_norm, group_name="default_group", need_clip=None):
-        super(GradientClipByGlobalNorm, self).__init__(need_clip)
+    def __init__(self, clip_norm, group_name="default_group"):
+        super(ClipGradByGlobalNorm, self).__init__()
         self.clip_norm = float(clip_norm)
         self.group_name = group_name
 
@@ -443,7 +410,7 @@ def _dygraph_clip(self, params_grads):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 continue
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
@@ -469,7 +436,7 @@ def _dygraph_clip(self, params_grads):
         for p, g in params_grads:
             if g is None:
                 continue
-            if self._need_clip_func is not None and not self._need_clip_func(p):
+            if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
             new_grad = layers.elementwise_mul(x=g, y=clip_var)
@@ -484,8 +451,7 @@ def _static_clip(self, params_grads):
             for p, g in params_grads:
                 if g is None:
                     continue
-                if self._need_clip_func is not None and not self._need_clip_func(
-                        p):
+                if getattr(p, 'need_clip', True) is False:
                     continue
                 merge_grad = g
                 with p.block.program._optimized_guard([p, g]):
@@ -518,8 +484,7 @@ def _static_clip(self, params_grads):
             for p, g in params_grads:
                 if g is None:
                     continue
-                if self._need_clip_func is not None and not self._need_clip_func(
-                        p):
+                if getattr(p, 'need_clip', True) is False:
                     params_and_grads.append((p, g))
                     continue
 
@@ -670,9 +635,9 @@ def network():
                   "This method can reduce the mistakes, please "
                   "refer to documention of 'optimizer'.")
 
-    if not isinstance(clip, GradientClipBase):
+    if not isinstance(clip, ClipGradBase):
         raise TypeError(
-            "'clip' should be an instance of GradientClipBase's derived class")
+            "'clip' should be an instance of ClipGradBase's derived class")
     if program is None:
         program = framework.default_main_program()
 
@@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads):
             clip_attr = getattr(p, 'gradient_clip_attr', None)
             if clip_attr is None:
                 return param_grads
-            if not isinstance(clip_attr, GradientClipBase):
+            if not isinstance(clip_attr, ClipGradBase):
                 raise TypeError(
                     "clip attribute should be an instance of GradientClipBase")
 
@@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
                     op._set_attr('op_role_var', correct_p_g)
 
 
-ClipByValue = GradientClipByValue
-ClipByNorm = GradientClipByNorm
-ClipByGlobalNorm = GradientClipByGlobalNorm
+GradientClipBase = ClipGradBase
+GradientClipByValue = ClipGradByValue
+GradientClipByNorm = ClipGradByNorm
+GradientClipByGlobalNorm = ClipGradByGlobalNorm
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
index ac6493b1c2969..d0543bb90dd14 100644
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
@@ -1525,10 +1525,10 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
             grid = fluid.data(name='grid', shape=[None, 12, 8, 10, 6], dtype='float32')
 
             # without offset
-            output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=False)
+            output = fluid.contrib.bilateral_slice(x, guide, grid, has_offset=False)
             
             # has offset
-            output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=True)
+            output = fluid.contrib.bilateral_slice(x, guide, grid, has_offset=True)
 
     """
     helper = LayerHelper("bilateral_slice", **locals())
@@ -1541,7 +1541,9 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
 
     out = helper.create_variable_for_type_inference(x.dtype)
     inputs = {'X': x, 'Guide': guide, 'Grid': grid}
-
+    if paddle.fluid.in_dygraph_mode():
+        attrs = ('has_offset', has_offset)
+        return getattr(core.ops, "bilateral_slice")(x, grid, guide, *attrs)
     helper.append_op(
         type='bilateral_slice',
         inputs=inputs,
diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index c9112ac849ce0..529c664e7083c 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -16,6 +16,7 @@
 from ... import default_startup_program
 from ... import layers
 from ... import unique_name
+from ... import program_guard
 from . import fp16_utils
 from .fp16_utils import rewrite_program
 from .fp16_utils import update_role_var_grad
@@ -58,21 +59,40 @@ def __init__(self, optimizer, amp_lists, init_loss_scaling,
         self._optimizer = optimizer
         self._amp_lists = amp_lists
         self._param_grads = None
-        self._train_program = default_main_program()
-        self._startup_prog = default_startup_program()
+        self._train_program = None
+
         self._scaled_loss = None
-        self._loss_scaling = layers.create_global_var(
-            name=unique_name.generate("loss_scaling"),
-            shape=[1],
-            value=init_loss_scaling,
-            dtype='float32',
-            persistable=True)
+        self._loss_scaling = None
+        self._init_loss_scaling = init_loss_scaling
         self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
         if self._use_dynamic_loss_scaling:
             self._incr_every_n_steps = incr_every_n_steps
             self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
             self._incr_ratio = incr_ratio
             self._decr_ratio = decr_ratio
+            self._num_good_steps = None
+            self._num_bad_steps = None
+
+    def get_loss_scaling(self):
+        """Return the real-time loss scaling factor.
+        """
+        return self._loss_scaling
+
+    def get_scaled_loss(self):
+        """Return the scaled loss.
+        It's useful when you feed a customized loss into the executor.
+        """
+        return self._scaled_loss
+
+    def _init_amp_var(self):
+        self._loss_scaling = layers.create_global_var(
+            name=unique_name.generate("loss_scaling"),
+            shape=[1],
+            value=self._init_loss_scaling,
+            dtype='float32',
+            persistable=True)
+
+        if self._use_dynamic_loss_scaling:
             self._num_good_steps = layers.create_global_var(
                 name=unique_name.generate("num_good_steps"),
                 shape=[1],
@@ -86,28 +106,16 @@ def __init__(self, optimizer, amp_lists, init_loss_scaling,
                 dtype='int32',
                 persistable=True)
 
-        # Ensure the data type of learning rate vars is float32 (same as the 
+        # Ensure the data type of learning rate vars is float32 (same as the
         # master parameter dtype)
-        if isinstance(optimizer._learning_rate, float):
-            optimizer._learning_rate_map[default_main_program()] = \
-                        layers.create_global_var(
-                        name=unique_name.generate("learning_rate"),
-                        shape=[1],
-                        value=float(optimizer._learning_rate),
-                        dtype='float32',
-                        persistable=True)
-
-    def get_loss_scaling(self):
-        """Return the real-time loss scaling factor.
-        """
-        return self._loss_scaling
-
-    def get_scaled_loss(self):
-        """Return the scaled loss.
-        It's useful when you feed customed loss into executor.
-        """
-
-        return self._scaled_loss
+        if isinstance(self._optimizer._learning_rate, float):
+            self._optimizer._learning_rate_map[default_main_program()] = \
+                    layers.create_global_var(
+                    name=unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=float(self._optimizer._learning_rate),
+                    dtype='float32',
+                    persistable=True)
 
     def backward(self,
                  loss,
@@ -131,16 +139,21 @@ def backward(self,
             A list of (param, grad), which is a tuple of a parameter and its 
             gradient respectively, and the scaled loss.
         """
-        rewrite_program(self._train_program, self._amp_lists)
-        self._scaled_loss = loss * self._loss_scaling
-        self._params_grads = self._optimizer.backward(
-            self._scaled_loss, startup_program, parameter_list, no_grad_set,
-            callbacks)
-        # Change the op_role_var attr for some ops, so that gradients
-        # transferred across GPUs can be FP16.
-        update_role_var_grad(self._train_program, self._params_grads)
-
-        return self._params_grads
+        train_program = loss.block.program
+        self._train_program = train_program
+
+        with program_guard(train_program, startup_program):
+            self._init_amp_var()
+
+            rewrite_program(train_program, self._amp_lists)
+            self._scaled_loss = loss * self._loss_scaling
+            params_grads = self._optimizer.backward(
+                self._scaled_loss, startup_program, parameter_list, no_grad_set,
+                callbacks)
+            # Change the op_role_var attr for some ops, so that gradients
+            # transferred across GPUs can be FP16.
+            update_role_var_grad(train_program, params_grads)
+        return params_grads
 
     def apply_gradients(self, params_grads):
         """
@@ -182,6 +195,12 @@ def apply_gradients(self, params_grads):
 
         return optimize_ops
 
+    def apply_optimize(self, loss, startup_program, params_grads):
+        program = loss.block.program
+        with program_guard(program, startup_program):
+            optimize_ops = self.apply_gradients(params_grads)
+        return optimize_ops
+
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -207,7 +226,8 @@ def minimize(self,
             parameter_list=parameter_list,
             no_grad_set=no_grad_set)
 
-        optimize_ops = self.apply_gradients(scaled_params_grads)
+        optimize_ops = self.apply_optimize(loss, startup_program,
+                                           scaled_params_grads)
 
         return optimize_ops, scaled_params_grads
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
index dadc756c43ecc..45df381b63183 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
@@ -66,6 +66,7 @@ def __init__(self,
         self._fc_ops = ['fc']
         self._relu_ops = ['relu', 'relu6']
         self._matmul_ops = ['matmul']
+        self._gru_ops = ['fusion_gru']
         self._weight_scales = {}
         # Collect the Input and Output sclaes from Fake quant models
         self._var_quant_scales = {}
@@ -449,8 +450,43 @@ def _compute_var_scales(ops, w_name, axis):
                     self._var_quant_scales[weight_var_name] = (use_unsigned_int,
                                                                lod_tensor)
 
+        def _compute_gru_weight_scales(wx_name, wh_name):
+            for op in graph.all_op_nodes():
+                if op.op().type() in self._gru_ops:
+                    wx_var_name = op.input(wx_name)[0]
+                    wh_var_name = op.input(wh_name)[0]
+                    wx = np.array(self._load_param(self._scope, wx_var_name))
+                    wh = np.array(self._load_param(self._scope, wh_var_name))
+                    OC = wh.shape[0]
+                    scale_ur = 1.0 / np.max(np.abs(
+                        np.concatenate(
+                            [
+                                wx[:, :2 * OC], wh.flatten()[:2 * OC * OC]
+                                .reshape(OC, 2 * OC)
+                            ],
+                            axis=0)),
+                                            axis=0)
+                    scale_o = 1.0 / np.max(np.abs(
+                        np.concatenate(
+                            [
+                                wx[:, 2 * OC:], wh.flatten()[2 * OC * OC:]
+                                .reshape(OC, OC)
+                            ],
+                            axis=0)),
+                                           axis=0)
+
+                    gru_weights_scale = np.concatenate(
+                        [scale_ur, scale_o]).astype('float')
+
+                    lod_tensor = self._convert_scale2tensor(gru_weights_scale)
+                    use_unsigned_int = False
+                    self._var_quant_scales[wx_var_name] = (use_unsigned_int,
+                                                           lod_tensor)
+
         _compute_var_scales(self._conv_ops, "Filter", axis=1)
         _compute_var_scales(self._fc_ops, "W", axis=0)
+        _compute_var_scales(self._gru_ops, "WeightH", axis=0)
+        _compute_gru_weight_scales("WeightX", "WeightH")
         return graph
 
     def _find_avg_pooling_ids(self, graph):
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index b5a8d90194331..eba881a2637ae 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -758,6 +758,7 @@ def _insert_channel_quant_op(self, graph, var_node, name, quant_bits,
             attrs={
                 'bit_length': quant_bits,
                 'quant_axis': quant_axis,
+                'is_test': self._is_test,
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
             inputs={'X': var_node},
@@ -1125,7 +1126,7 @@ def apply(self, graph):
                     self._restore_var(input_arg_name, quantized_param_v)
                     self._remove_fake_quant_and_dequant_op(graph, op_node)
 
-# Remove all fake dequant op
+        # Remove all fake dequant op
         ops = graph.all_op_nodes()
         for op_node in ops:
             op_name = op_node.name()
@@ -1331,16 +1332,25 @@ def _is_float(self, v):
 
     def _quant(self, x, scale, num_bits, quant_axis):
         assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.'
+        bnt = (1 << (num_bits - 1)) - 1
+
+        def _clip(x, scale):
+            x[x > scale] = scale
+            x[x < -scale] = -scale
+            return x
+
         if isinstance(scale, list):
             for i, s in enumerate(scale):
                 if quant_axis == 0:
-                    x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
+                    x[i] = _clip(x[i], s)
+                    x[i] = np.round(x[i] / s * bnt)
                 else:
-                    x[:, i] = np.round(x[:, i] / s * (
-                        (1 << (num_bits - 1)) - 1))
-            return x
+                    x[:, i] = _clip(x[:, i], s)
+                    x[:, i] = np.round(x[:, i] / s * bnt)
         else:
-            return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+            x = _clip(x, scale)
+            x = np.round(x / scale * bnt)
+        return x
 
 
 class ConvertToInt8Pass(object):
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index dd4bea06572fb..6c02076eae0de 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -98,18 +98,16 @@ function(download_quant_model install_dir data_file)
     endif()
 endfunction()
 
-function(save_quant_ic_model_test target quant_model_dir fp32_model_save_path int8_model_save_path)
+function(save_quant_ic_model_test target quant_model_dir int8_model_save_path)
     py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
             ARGS --quant_model_path ${quant_model_dir}
-	         --fp32_model_save_path ${fp32_model_save_path}
 	         --int8_model_save_path ${int8_model_save_path}
 		 --debug)
 endfunction()
 
-function(save_quant_nlp_model_test target quant_model_dir fp32_model_save_path int8_model_save_path ops_to_quantize)
+function(save_quant_nlp_model_test target quant_model_dir int8_model_save_path ops_to_quantize)
     py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
             ARGS --quant_model_path ${quant_model_dir}
-	         --fp32_model_save_path ${fp32_model_save_path}
 	         --int8_model_save_path ${int8_model_save_path}
 		 --ops_to_quantize ${ops_to_quantize})
 endfunction()
@@ -227,8 +225,6 @@ if(LINUX AND WITH_MKLDNN)
 	set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev")
 	download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE})
 
-	set(QUANT2_NLP_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add")
-
 	# Quant2 Ernie
 	set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz")
 	set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2")
@@ -236,17 +232,25 @@ if(LINUX AND WITH_MKLDNN)
 	set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz")
 	set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float")
 	download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE})
-	inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE})
+	set(QUANT2_ERNIE_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add")
+	inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE})
+
+	# Quant2 GRU
+	set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz")
+	set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2")
+	download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE})
+	set(QUANT2_GRU_OPS_TO_QUANTIZE "fusion_gru")
 
 	### Save FP32 model or INT8 model from Quant model
         
 	set(QUANT2_INT8_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_int8")
-	set(QUANT2_FP32_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_fp32")
-	save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_FP32_RESNET50_SAVE_PATH} ${QUANT2_INT8_RESNET50_SAVE_PATH})
+	save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_INT8_RESNET50_SAVE_PATH})
 
 	set(QUANT2_INT8_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8")
-	set(QUANT2_FP32_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_fp32")
-	save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_FP32_ERNIE_SAVE_PATH} ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE})
+	save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE})
+
+	set(QUANT2_INT8_GRU_SAVE_PATH "${QUANT_INSTALL_DIR}/GRU_quant2_int8")
+	save_quant_nlp_model_test(save_quant2_model_gru ${QUANT2_GRU_MODEL_DIR}/GRU_quant_acc ${QUANT2_INT8_GRU_SAVE_PATH} ${QUANT2_GRU_OPS_TO_QUANTIZE})
 
 	# Convert Quant2 model to dot and pdf files 
 	set(QUANT2_INT8_ERNIE_DOT_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8_dot_file")
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index df505cf2435e7..eb924e13a7e4f 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -31,6 +31,7 @@
 from paddle.fluid.dygraph.nn import Pool2D
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.log_helper import get_logger
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 paddle.enable_static()
 
@@ -231,10 +232,11 @@ def test_qat_save(self):
             before_save = lenet(test_img)
 
         # save inference quantized model
-        path = "./mnist_infer_model"
+        path = "./qat_infer_model/lenet"
+        save_dir = "./qat_infer_model"
         paddle.jit.save(
             layer=lenet,
-            model_path=path,
+            path=path,
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
@@ -245,12 +247,12 @@ def test_qat_save(self):
         else:
             place = core.CPUPlace()
         exe = fluid.Executor(place)
-        [inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=path,
-                executor=exe,
-                model_filename="__model__",
-                params_filename="__variables__"))
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(
+             dirname=save_dir,
+             executor=exe,
+             model_filename="lenet" + INFER_MODEL_SUFFIX,
+             params_filename="lenet" + INFER_PARAMS_SUFFIX)
         after_save, = exe.run(inference_program,
                               feed={feed_target_names[0]: test_data},
                               fetch_list=fetch_targets)
@@ -339,7 +341,7 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000):
 
         paddle.jit.save(
             layer=lenet,
-            model_path="./dynamic_mnist",
+            path="./dynamic_mnist/model",
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
index 80d388ac0da62..ddf37a0ebf8c2 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
@@ -31,6 +31,7 @@
 from paddle.fluid.dygraph.nn import Pool2D
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.log_helper import get_logger
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 paddle.enable_static()
 
@@ -231,10 +232,11 @@ def test_qat_save(self):
             before_save = lenet(test_img)
 
         # save inference quantized model
-        path = "./mnist_infer_model"
+        path = "./qat_infer_model/mnist"
+        save_dir = "./qat_infer_model"
         paddle.jit.save(
             layer=lenet,
-            model_path=path,
+            path=path,
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
@@ -245,12 +247,12 @@ def test_qat_save(self):
         else:
             place = core.CPUPlace()
         exe = fluid.Executor(place)
-        [inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=path,
-                executor=exe,
-                model_filename="__model__",
-                params_filename="__variables__"))
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(
+             dirname=save_dir,
+             executor=exe,
+             model_filename="mnist" + INFER_MODEL_SUFFIX,
+             params_filename="mnist" + INFER_PARAMS_SUFFIX)
         after_save, = exe.run(inference_program,
                               feed={feed_target_names[0]: test_data},
                               fetch_list=fetch_targets)
@@ -339,7 +341,7 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000):
 
         paddle.jit.save(
             layer=lenet,
-            model_path="./dynamic_mnist",
+            path="./dynamic_mnist/model",
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 9a14c4cdf14a4..ad116c2597064 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -205,8 +205,15 @@ def pre_load(dso_name):
     load_dso(dso_path)
 
 
-def get_glibc_ver():
-    return run_shell_command("ldd --version | awk '/ldd/{print $NF}'")
+def get_libc_ver():
+    ldd_glibc = run_shell_command("ldd --version | awk '/ldd/{print $NF}'")
+    if ldd_glibc is not None:
+        return ("glibc", ldd_glibc)
+
+    ldd_musl = run_shell_command("ldd 2>&1 | awk '/Version/{print $NF}'")
+    if ldd_musl is not None:
+        return ("musl", ldd_musl)
+    return (None, None)
 
 
 def less_than_ver(a, b):
@@ -231,13 +238,14 @@ def to_list(s):
 # For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs.
 # So, here is a tricky way to solve the problem by pre load 'libgomp' before 'core_avx.so'.
 # The final solution is to upgrade glibc to > 2.22 on the target system.
-if platform.system().lower() == 'linux' and less_than_ver(get_glibc_ver(),
-                                                          '2.23'):
-    try:
-        pre_load('libgomp')
-    except Exception as e:
-        # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so
-        sys.stderr.write('Error: Can not preload libgomp.so')
+if platform.system().lower() == 'linux':
+    libc_type, libc_ver = get_libc_ver()
+    if libc_type == 'glibc' and less_than_ver(libc_ver, '2.23'):
+        try:
+            pre_load('libgomp')
+        except Exception as e:
+            # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so
+            sys.stderr.write('Error: Can not preload libgomp.so')
 
 load_noavx = False
 
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index f4ea4d670e600..fb87ea4455d34 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -24,8 +24,8 @@
 import warnings
 from .. import core
 from .base import guard
-from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs
-from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME
+from paddle.fluid.dygraph.jit import _SaveLoadConfig
+from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers
 
 __all__ = [
     'save_dygraph',
@@ -33,35 +33,23 @@
 ]
 
 
-# NOTE(chenweihang): deprecate load_dygraph's argument keep_name_table,
-# ensure compatibility when user still use keep_name_table argument
-def deprecate_keep_name_table(func):
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        def __warn_and_build_configs__(keep_name_table):
-            warnings.warn(
-                "The argument `keep_name_table` has deprecated, please use `SaveLoadConfig.keep_name_table`.",
-                DeprecationWarning)
-            config = SaveLoadConfig()
-            config.keep_name_table = keep_name_table
-            return config
-
-        # deal with arg `keep_name_table`
-        if len(args) > 1 and isinstance(args[1], bool):
-            args = list(args)
-            args[1] = __warn_and_build_configs__(args[1])
-        # deal with kwargs
-        elif 'keep_name_table' in kwargs:
-            kwargs['config'] = __warn_and_build_configs__(kwargs[
-                'keep_name_table'])
-            kwargs.pop('keep_name_table')
-        else:
-            # do nothing
-            pass
+def _parse_load_config(configs):
+    supported_configs = ['model_filename', 'params_filename', 'keep_name_table']
+
+    # input check
+    for key in configs:
+        if key not in supported_configs:
+            raise ValueError(
+                "The additional config (%s) of `paddle.fluid.load_dygraph` is not supported."
+                % (key))
 
-        return func(*args, **kwargs)
+    # construct inner config
+    inner_config = _SaveLoadConfig()
+    inner_config.model_filename = configs.get('model_filename', None)
+    inner_config.params_filename = configs.get('params_filename', None)
+    inner_config.keep_name_table = configs.get('keep_name_table', None)
 
-    return wrapper
+    return inner_config
 
 
 @dygraph_only
@@ -132,12 +120,12 @@ def save_dygraph(state_dict, model_path):
         pickle.dump(model_dict, f, protocol=2)
 
 
+# NOTE(chenweihang): load_dygraph will be deprecated in the future, so we
+# don't support new loading features for it
 # TODO(qingqing01): remove dygraph_only to support loading static model.
 # maybe need to unify the loading interface after 2.0 API is ready.
 # @dygraph_only
-@deprecate_save_load_configs
-@deprecate_keep_name_table
-def load_dygraph(model_path, config=None):
+def load_dygraph(model_path, **configs):
     '''
     :api_attr: imperative
     
@@ -152,10 +140,13 @@ def load_dygraph(model_path, config=None):
     Args:
         model_path(str) : The file prefix store the state_dict. 
             (The path should Not contain suffix '.pdparams') 
-        config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig`
-            object that specifies additional configuration options, these options 
-            are for compatibility with ``jit.save/io.save_inference_model`` formats. 
-            Default None.
+        **configs (dict, optional): Other load configuration options for compatibility. We do not
+            recommend using these configurations; if not necessary, DO NOT use them. Default None.
+            The following options are currently supported:
+            (1) model_filename (string): The inference model file name of the paddle 1.x ``save_inference_model`` 
+            save format. Default file name is :code:`__model__` . 
+            (2) params_filename (string): The persistable variables file name of the paddle 1.x ``save_inference_model`` 
+            save format. No default file name, save variables separately by default.
 
     Returns:
         state_dict(dict) : the dict store the state_dict
@@ -196,8 +187,7 @@ def load_dygraph(model_path, config=None):
     opti_file_path = model_prefix + ".pdopt"
 
     # deal with argument `config`
-    if config is None:
-        config = SaveLoadConfig()
+    config = _parse_load_config(configs)
 
     if os.path.exists(params_file_path) or os.path.exists(opti_file_path):
         # Load state dict by `save_dygraph` save format
@@ -246,7 +236,6 @@ def load_dygraph(model_path, config=None):
                 persistable_var_dict = _construct_params_and_buffers(
                     model_prefix,
                     programs,
-                    config.separate_params,
                     config.params_filename,
                     append_suffix=False)
 
@@ -255,9 +244,9 @@ def load_dygraph(model_path, config=None):
                 for var_name in persistable_var_dict:
                     para_dict[var_name] = persistable_var_dict[var_name].numpy()
 
-                # if __variables.info__ exists, we can recover structured_name
-                var_info_path = os.path.join(model_prefix,
-                                             EXTRA_VAR_INFO_FILENAME)
+                # if *.info exists, we can recover structured_name
+                var_info_filename = str(config.params_filename) + ".info"
+                var_info_path = os.path.join(model_prefix, var_info_filename)
                 if os.path.exists(var_info_path):
                     with open(var_info_path, 'rb') as f:
                         extra_var_info = pickle.load(f)
diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py
index 8a8787da3a543..bfcb43f5f677c 100644
--- a/python/paddle/fluid/dygraph/container.py
+++ b/python/paddle/fluid/dygraph/container.py
@@ -34,27 +34,26 @@ class Sequential(Layer):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
             data = np.random.uniform(-1, 1, [30, 10]).astype('float32')
-            with fluid.dygraph.guard():
-                data = fluid.dygraph.to_variable(data)
-                # create Sequential with iterable Layers
-                model1 = fluid.dygraph.Sequential(
-                    fluid.Linear(10, 1), fluid.Linear(1, 2)
-                )
-                model1[0]  # access the first layer
-                res1 = model1(data)  # sequential execution
-
-                # create Sequential with name Layer pairs
-                model2 = fluid.dygraph.Sequential(
-                    ('l1', fluid.Linear(10, 2)),
-                    ('l2', fluid.Linear(2, 3))
-                )
-                model2['l1']  # access l1 layer
-                model2.add_sublayer('l3', fluid.Linear(3, 3))  # add sublayer
-                res2 = model2(data)  # sequential execution
+            data = paddle.to_tensor(data)
+            # create Sequential with iterable Layers
+            model1 = paddle.nn.Sequential(
+                paddle.nn.Linear(10, 1), paddle.nn.Linear(1, 2)
+            )
+            model1[0]  # access the first layer
+            res1 = model1(data)  # sequential execution
+
+            # create Sequential with name Layer pairs
+            model2 = paddle.nn.Sequential(
+                ('l1', paddle.nn.Linear(10, 2)),
+                ('l2', paddle.nn.Linear(2, 3))
+            )
+            model2['l1']  # access l1 layer
+            model2.add_sublayer('l3', paddle.nn.Linear(3, 3))  # add sublayer
+            res2 = model2(data)  # sequential execution
 
     """
 
@@ -99,15 +98,15 @@ class ParameterList(Layer):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
-            class MyLayer(fluid.Layer):
+            class MyLayer(paddle.nn.Layer):
                 def __init__(self, num_stacked_param):
                     super(MyLayer, self).__init__()
                     # create ParameterList with iterable Parameters
-                    self.params = fluid.dygraph.ParameterList(
-                        [fluid.layers.create_parameter(
+                    self.params = paddle.nn.ParameterList(
+                        [paddle.create_parameter(
                             shape=[2, 2], dtype='float32')] * num_stacked_param)
 
                 def forward(self, x):
@@ -119,27 +118,26 @@ def forward(self, x):
                                     "Y": p},
                             outputs={"Out": tmp},
                             attrs={"x_num_col_dims": 1,
-                                   "y_num_col_dims": 1})
+                                    "y_num_col_dims": 1})
                         x = tmp
                     return x
 
             data_np = np.random.uniform(-1, 1, [5, 2]).astype('float32')
-            with fluid.dygraph.guard():
-                x = fluid.dygraph.to_variable(data_np)
-                num_stacked_param = 4
-                model = MyLayer(num_stacked_param)
-                print(len(model.params))  # 4
-                res = model(x)
-                print(res.shape)  # [5, 2]
-
-                replaced_param = fluid.layers.create_parameter(shape=[2, 3], dtype='float32')
-                model.params[num_stacked_param - 1] = replaced_param  # replace last param
-                res = model(x)
-                print(res.shape)  # [5, 3]
-                model.params.append(fluid.layers.create_parameter(shape=[3, 4], dtype='float32'))  # append param
-                print(len(model.params))  # 5
-                res = model(x)
-                print(res.shape)  # [5, 4]
+            x = paddle.to_tensor(data_np)
+            num_stacked_param = 4
+            model = MyLayer(num_stacked_param)
+            print(len(model.params))  # 4
+            res = model(x)
+            print(res.shape)  # [5, 2]
+
+            replaced_param = paddle.create_parameter(shape=[2, 3], dtype='float32')
+            model.params[num_stacked_param - 1] = replaced_param  # replace last param
+            res = model(x)
+            print(res.shape)  # [5, 3]
+            model.params.append(paddle.create_parameter(shape=[3, 4], dtype='float32'))  # append param
+            print(len(model.params))  # 5
+            res = model(x)
+            print(res.shape)  # [5, 4]
     """
 
     def __init__(self, parameters=None):
@@ -183,14 +181,15 @@ class LayerList(Layer):
 
     Examples:
         .. code-block:: python
-            import paddle.fluid as fluid
+
+            import paddle
             import numpy as np
 
-            class MyLayer(fluid.Layer):
+            class MyLayer(paddle.nn.Layer):
                 def __init__(self):
                     super(MyLayer, self).__init__()
-                    self.linears = fluid.dygraph.LayerList(
-                        [fluid.dygraph.Linear(10, 10) for i in range(10)])
+                    self.linears = paddle.nn.LayerList(
+                        [paddle.nn.Linear(10, 10) for i in range(10)])
 
                 def forward(self, x):
                     # LayerList can act as an iterable, or be indexed using ints
@@ -239,13 +238,13 @@ def append(self, sublayer):
 
         Examples:
             .. code-block:: python
-                import paddle.fluid as fluid
 
-                with fluid.dygraph.guard():
-                    linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)])
-                    another = fluid.dygraph.Linear(10, 10)
-                    linears.append(another)
-                    print(len(linears))  # 11
+                import paddle
+
+                linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)])
+                another = paddle.nn.Linear(10, 10)
+                linears.append(another)
+                print(len(linears))  # 11
         """
         self.add_sublayer(str(len(self)), sublayer)
         return self
@@ -260,13 +259,13 @@ def insert(self, index, sublayer):
 
         Examples:
             .. code-block:: python
-                import paddle.fluid as fluid
 
-                with fluid.dygraph.guard():
-                    linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)])
-                    another = fluid.dygraph.Linear(10, 10)
-                    linears.insert(3, another)
-                    print(linears[3] is another)  # True
+                import paddle
+
+                linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)])
+                another = paddle.nn.Linear(10, 10)
+                linears.insert(3, another)
+                print(linears[3] is another)  # True
         """
         assert isinstance(index, int) and \
                0 <= index < len(self._sub_layers), \
@@ -284,14 +283,14 @@ def extend(self, sublayers):
 
         Examples:
             .. code-block:: python
-                import paddle.fluid as fluid
-
-                with fluid.dygraph.guard():
-                    linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)])
-                    another_list = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(5)])
-                    linears.extend(another_list)
-                    print(len(linears))  # 15
-                    print(another_list[0] is linears[10])  # True
+
+                import paddle
+
+                linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)])
+                another_list = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(5)])
+                linears.extend(another_list)
+                print(len(linears))  # 15
+                print(another_list[0] is linears[10])  # True
         """
         offset = len(self)
         for i, sublayer in enumerate(sublayers):
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index 4a3dacbd1acae..a10adeb14aa7d 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -31,8 +31,10 @@
 
 __all__ = ['TranslatedLayer']
 
-VARIABLE_FILENAME = "__variables__"
-EXTRA_VAR_INFO_FILENAME = "__variables.info__"
+INFER_MODEL_SUFFIX = ".pdmodel"
+INFER_PARAMS_SUFFIX = ".pdiparams"
+INFER_PARAMS_INFO_SUFFIX = ".pdiparams.info"
+
 LOADED_VAR_SUFFIX = "load"
 PARAMETER_NAME_PREFIX = "param"
 BUFFER_NAME_PREFIX = "buffer"
@@ -424,11 +426,8 @@ def _load_persistable_vars_by_program(model_path,
     return load_var_dict
 
 
-def _load_persistable_vars(model_path,
-                           var_info_path,
-                           program_holder,
-                           separate_params=False,
-                           params_filename=None):
+def _load_persistable_vars(model_path, var_info_path, program_holder,
+                           params_filename):
     # 1. load extra var info
     with open(var_info_path, 'rb') as f:
         extra_var_info = pickle.load(f)
@@ -464,33 +463,22 @@ def _load_persistable_vars(model_path,
             new_var = framework._varbase_creator(
                 name=new_name, persistable=True)
 
-        # load separate vars
-        if separate_params is True:
-            framework._dygraph_tracer().trace_op(
-                type='load',
-                inputs={},
-                outputs={'Out': new_var},
-                attrs={'file_path': os.path.join(model_path, name)})
-
         new_var.stop_gradient = extra_var_info[name]['stop_gradient']
         load_var_dict[new_name] = new_var
         load_var_list.append(new_var)
 
     # 3. load all vars
-    if separate_params is False:
-        if params_filename is not None:
-            var_file_path = os.path.join(model_path, params_filename)
-        else:
-            var_file_path = os.path.join(model_path, VARIABLE_FILENAME)
-        if not os.path.exists(var_file_path):
-            if len(extra_var_info) != 0:
-                raise ValueError("The model to be loaded is incomplete.")
-        else:
-            framework._dygraph_tracer().trace_op(
-                type='load_combine',
-                inputs={},
-                outputs={'Out': load_var_list},
-                attrs={'file_path': var_file_path})
+    assert params_filename is not None, "params_filename should not be None."
+    var_file_path = os.path.join(model_path, params_filename)
+    if not os.path.exists(var_file_path):
+        if len(extra_var_info) != 0:
+            raise ValueError("The model to be loaded is incomplete.")
+    else:
+        framework._dygraph_tracer().trace_op(
+            type='load_combine',
+            inputs={},
+            outputs={'Out': load_var_list},
+            attrs={'file_path': var_file_path})
 
     return load_var_dict
 
@@ -532,14 +520,13 @@ def _construct_program_holders(model_path, model_filename=None):
 
 def _construct_params_and_buffers(model_path,
                                   programs,
-                                  separate_params=False,
                                   params_filename=None,
                                   append_suffix=True):
-    var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
+    var_info_filename = str(params_filename) + ".info"
+    var_info_path = os.path.join(model_path, var_info_filename)
     if os.path.exists(var_info_path):
         var_dict = _load_persistable_vars(model_path, var_info_path,
-                                          programs['forward'], separate_params,
-                                          params_filename)
+                                          programs['forward'], params_filename)
     else:
         var_dict = _load_persistable_vars_by_program(
             model_path, programs['forward'], params_filename)
@@ -700,18 +687,16 @@ def _construct(model_path, configs=None):
             raise ValueError("There is no directory named '%s'" % model_path)
         model_filename = None
         params_filename = None
-        separate_params = False
         if configs is not None:
             model_filename = configs.model_filename
             params_filename = configs.params_filename
-            separate_params = configs.separate_params
 
         # 1. load program desc & construct _ProgramHolder
         programs = _construct_program_holders(model_path, model_filename)
 
         # 2. load layer parameters & buffers
-        persistable_vars = _construct_params_and_buffers(
-            model_path, programs, separate_params, params_filename)
+        persistable_vars = _construct_params_and_buffers(model_path, programs,
+                                                         params_filename)
 
         # 3. construct TranslatedLayer object
         translated_layer = TranslatedLayer(programs, persistable_vars)
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 194ebafb08eef..6cdd13fba82ac 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -29,7 +29,7 @@
 from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticFunction, unwrap_decorators
-from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
+from paddle.fluid.dygraph.io import TranslatedLayer, INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX
 from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.executor import Executor, scope_guard
 from paddle.fluid.framework import Block, ParamBase, Program, Variable
@@ -39,7 +39,7 @@
 
 __all__ = [
     'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level',
-    'set_verbosity', 'save', 'load', 'SaveLoadConfig'
+    'set_verbosity', 'save', 'load'
 ]
 
 
@@ -228,73 +228,7 @@ def decorated(python_func):
     return decorated
 
 
-class SaveLoadConfig(object):
-    """
-    The additional configuration options may be used in function 
-    ``paddle.jit.save/load`` and ``paddle.load`` .
-    
-    Examples:
-        1. Using ``SaveLoadConfig`` when saving model
-
-        .. code-block:: python
-
-            import paddle
-            import paddle.nn as nn
-            import paddle.optimizer as opt
-
-            class SimpleNet(nn.Layer):
-                def __init__(self, in_size, out_size):
-                    super(SimpleNet, self).__init__()
-                    self._linear = nn.Linear(in_size, out_size)
-
-                @paddle.jit.to_static
-                def forward(self, x):
-                    y = self._linear(x)
-                    z = self._linear(y)
-                    return z
-
-            # enable dygraph mode
-            paddle.disable_static() 
-
-            # train model
-            net = SimpleNet(8, 8)
-            adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
-            x = paddle.randn([4, 8], 'float32')
-            for i in range(10):
-                out = net(x)
-                loss = paddle.tensor.mean(out)
-                loss.backward()
-                adam.step()
-                adam.clear_grad()
-
-            # use SaveLoadconfig when saving model
-            model_path = "simplenet.example.model"
-            config = paddle.SaveLoadConfig()
-            config.model_filename = "__simplenet__"
-            paddle.jit.save(
-                layer=net,
-                model_path=model_path,
-                config=config)
-
-        2. Using ``SaveLoadConfig`` when loading model
-
-        .. code-block:: python
-
-            import paddle
-
-            # enable dygraph mode
-            paddle.disable_static() 
-
-            # use SaveLoadconfig when loading model
-            model_path = "simplenet.example.model"
-            config = paddle.SaveLoadConfig()
-            config.model_filename = "__simplenet__"
-            infer_net = paddle.jit.load(model_path, config=config)
-            # inference
-            x = paddle.randn([4, 8], 'float32')
-            pred = infer_net(x)
-    """
-
+class _SaveLoadConfig(object):
     def __init__(self):
         self._output_spec = None
         self._model_filename = None
@@ -316,335 +250,105 @@ def __init__(self):
 
     @property
     def output_spec(self):
-        """
-        Selects the output targets of the saved model ( ``paddle.jit.TranslatedLayer`` ).
-        By default, all return variables of original Layer's forward function
-        are kept as the output of the saved TranslatedLayer.
-
-        The ``output_spec`` type should be list[Variable]. If the provided ``output_spec``
-        list is not all output variables, the saved model will be pruned according to the
-        given ``output_spec`` list.
-
-        .. note::
-            The ``output_spec`` is only used when saving model.
-
-        Examples:
-            .. code-block:: python
-
-                import paddle
-                import paddle.nn as nn
-                import paddle.optimizer as opt
-
-                class SimpleNet(nn.Layer):
-                    def __init__(self, in_size, out_size):
-                        super(SimpleNet, self).__init__()
-                        self._linear = nn.Linear(in_size, out_size)
-
-                    @paddle.jit.to_static
-                    def forward(self, x):
-                        y = self._linear(x)
-                        z = self._linear(y)
-                        loss = paddle.tensor.mean(z)
-                        return z, loss
-
-                # enable dygraph mode
-                paddle.disable_static() 
-
-                # train model
-                net = SimpleNet(8, 8)
-                adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
-                x = paddle.randn([4, 8], 'float32')
-                for i in range(10):
-                    out, loss = net(x)
-                    loss.backward()
-                    adam.step()
-                    adam.clear_grad()
-
-                # use SaveLoadconfig.output_spec
-                model_path = "simplenet.example.model.output_spec"
-                config = paddle.SaveLoadConfig()
-                config.output_spec = [out]
-                paddle.jit.save(
-                    layer=net,
-                    model_path=model_path,
-                    config=config)
-
-                infer_net = paddle.jit.load(model_path)
-                x = paddle.randn([4, 8], 'float32')
-                pred = infer_net(x)
-        """
         return self._output_spec
 
     @output_spec.setter
     def output_spec(self, spec):
+        if spec is None:
+            return
         if not isinstance(spec, list):
             raise TypeError(
-                "The SaveLoadConfig.output_spec should be 'list', but received input type is %s."
+                "The config `output_spec` should be 'list', but received input type is %s."
                 % type(input))
             for var in spec:
                 if not isinstance(var, core.VarBase):
                     raise TypeError(
-                        "The element in SaveLoadConfig.output_spec list should be 'Variable', but received element's type is %s."
+                        "The element in config `output_spec` list should be 'Variable', but received element's type is %s."
                         % type(var))
         self._output_spec = spec
 
     @property
     def model_filename(self):
-        """
-        The name of file to save the translated program of target Layer.
-        Default filename is :code:`__model__` .
-
-        Examples:
-            .. code-block:: python
-
-                import paddle
-                import paddle.nn as nn
-                import paddle.optimizer as opt
-
-                class SimpleNet(nn.Layer):
-                    def __init__(self, in_size, out_size):
-                        super(SimpleNet, self).__init__()
-                        self._linear = nn.Linear(in_size, out_size)
-
-                    @paddle.jit.to_static
-                    def forward(self, x):
-                        y = self._linear(x)
-                        z = self._linear(y)
-                        return z
-
-                # enable dygraph mode
-                paddle.disable_static() 
-
-                # train model
-                net = SimpleNet(8, 8)
-                adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
-                x = paddle.randn([4, 8], 'float32')
-                for i in range(10):
-                    out = net(x)
-                    loss = paddle.tensor.mean(out)
-                    loss.backward()
-                    adam.step()
-                    adam.clear_grad()
-
-                # saving with configs.model_filename
-                model_path = "simplenet.example.model.model_filename"
-                config = paddle.SaveLoadConfig()
-                config.model_filename = "__simplenet__"
-                paddle.jit.save(
-                    layer=net,
-                    model_path=model_path,
-                    config=config)
-
-                # loading with configs.model_filename
-                infer_net = paddle.jit.load(model_path, config=config)
-                x = paddle.randn([4, 8], 'float32')
-                pred = infer_net(x)
-        """
         return self._model_filename
 
     @model_filename.setter
     def model_filename(self, filename):
+        if filename is None:
+            return
         if not isinstance(filename, six.string_types):
             raise TypeError(
-                "The SaveLoadConfig.model_filename should be str, but received input's type is %s."
+                "The config `model_filename` should be str, but received input's type is %s."
                 % type(filename))
         if len(filename) == 0:
-            raise ValueError(
-                "The SaveLoadConfig.model_filename is empty string.")
+            raise ValueError("The config `model_filename` is empty string.")
         self._model_filename = filename
 
     @property
     def params_filename(self):
-        """
-        The name of file to save all persistable variables in target Layer. 
-        Default file name is :code:`__variables__` .
-        
-        Examples:
-            .. code-block:: python
-
-                import paddle
-                import paddle.nn as nn
-                import paddle.optimizer as opt
-
-                class SimpleNet(nn.Layer):
-                    def __init__(self, in_size, out_size):
-                        super(SimpleNet, self).__init__()
-                        self._linear = nn.Linear(in_size, out_size)
-
-                    @paddle.jit.to_static
-                    def forward(self, x):
-                        y = self._linear(x)
-                        z = self._linear(y)
-                        return z
-
-                # enable dygraph mode
-                paddle.disable_static() 
-
-                # train model
-                net = SimpleNet(8, 8)
-                adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
-                x = paddle.randn([4, 8], 'float32')
-                for i in range(10):
-                    out = net(x)
-                    loss = paddle.tensor.mean(out)
-                    loss.backward()
-                    adam.step()
-                    adam.clear_grad()
-
-                model_path = "simplenet.example.model.params_filename"
-                config = paddle.SaveLoadConfig()
-                config.params_filename = "__params__"
-
-                # saving with configs.params_filename
-                paddle.jit.save(
-                    layer=net,
-                    model_path=model_path,
-                    config=config)
-
-                # loading with configs.params_filename
-                infer_net = paddle.jit.load(model_path, config=config)
-                x = paddle.randn([4, 8], 'float32')
-                pred = infer_net(x)
-        """
         return self._params_filename
 
     @params_filename.setter
     def params_filename(self, filename):
+        if filename is None:
+            return
         if not isinstance(filename, six.string_types):
             raise TypeError(
-                "The SaveLoadConfig.params_filename should be str, but received input's type is %s."
+                "The config `params_filename` should be str, but received input's type is %s."
                 % type(filename))
         if len(filename) == 0:
-            raise ValueError(
-                "The SaveLoadConfig.params_filename is empty string.")
+            raise ValueError("The config `params_filename` is empty string.")
         self._params_filename = filename
 
-    # NOTE: [why not use params_filename=None control params saved separately]
-    # The new save interface does not recommend parameters to be saved separately. 
-    # Here, the concept should be separated as clearly as possible. 
-    # Setting params_filename=None only means that the saved file name is set 
-    # and without any other meaning. New separate_params control for file saved
-    # separately can makes the concept clearer.
-    @property
-    def separate_params(self):
-        """
-        Configure whether to save the Layer parameters as separete files.
-        (In order to be compatible with the behavior of ``paddle.static.save_inference_model`` )
-
-        If True, each parameter will be saved to a file separately, the file name is the parameter name,
-        and the SaveLoadConfig.params_filename configuration will not take effect. Default False.
-
-        .. note::
-            Only used for ``paddle.jit.save`` .
-
-        Examples:
-            .. code-block:: python
-
-                import paddle
-                import paddle.nn as nn
-                import paddle.optimizer as opt
-
-                class SimpleNet(nn.Layer):
-                    def __init__(self, in_size, out_size):
-                        super(SimpleNet, self).__init__()
-                        self._linear = nn.Linear(in_size, out_size)
-
-                    @paddle.jit.to_static
-                    def forward(self, x):
-                        y = self._linear(x)
-                        z = self._linear(y)
-                        return z
-
-                # enable dygraph mode
-                paddle.disable_static() 
-
-                # train model
-                net = SimpleNet(8, 8)
-                adam = opt.Adam(learning_rate=0.1, parameters=net.parameters())
-                x = paddle.randn([4, 8], 'float32')
-                for i in range(10):
-                    out = net(x)
-                    loss = paddle.tensor.mean(out)
-                    loss.backward()
-                    adam.step()
-                    adam.clear_grad()
-
-                model_path = "simplenet.example.model.separate_params"
-                config = paddle.SaveLoadConfig()
-                config.separate_params = True
-
-                # saving with configs.separate_params
-                paddle.jit.save(
-                    layer=net,
-                    model_path=model_path,
-                    config=config)
-                # [result] the saved model directory contains:
-                # linear_0.b_0  linear_0.w_0  __model__  __variables.info__
-
-                # loading with configs.params_filename
-                infer_net = paddle.jit.load(model_path, config=config)
-                x = paddle.randn([4, 8], 'float32')
-                pred = infer_net(x)
-        """
-        return self._separate_params
-
-    @separate_params.setter
-    def separate_params(self, value):
-        if not isinstance(value, bool):
-            raise TypeError(
-                "The SaveLoadConfig.separate_params should be bool value, but received input's type is %s."
-                % type(value))
-        self._separate_params = value
-
     @property
     def keep_name_table(self):
-        """
-        Configures whether keep ``structured_name -> parameter_name`` dict in loaded state dict.
-        This dict is the debugging information saved when call ``paddle.save`` . 
-        It is generally only used for debugging and does not affect the actual training or inference. 
-        By default, it will not be retained in ``paddle.load`` result. Default: False.
-        
-        .. note::
-            Only used for ``paddle.load`` .
-
-        Examples:
-            .. code-block:: python
-
-                import paddle
-            
-                paddle.disable_static()
-
-                linear = paddle.nn.Linear(5, 1)
-
-                state_dict = linear.state_dict()
-                paddle.save(state_dict, "paddle_dy.pdparams")
-
-                config = paddle.SaveLoadConfig()
-                config.keep_name_table = True
-                para_state_dict = paddle.load("paddle_dy.pdparams", config)
-
-                print(para_state_dict)
-                # the name_table is 'StructuredToParameterName@@'
-                # {'bias': array([0.], dtype=float32), 
-                #  'StructuredToParameterName@@': 
-                #     {'bias': u'linear_0.b_0', 'weight': u'linear_0.w_0'}, 
-                #  'weight': array([[ 0.04230034],
-                #     [-0.1222527 ],
-                #     [ 0.7392676 ],
-                #     [-0.8136974 ],
-                #     [ 0.01211023]], dtype=float32)}
-        """
         return self._keep_name_table
 
     @keep_name_table.setter
     def keep_name_table(self, value):
+        if value is None:
+            return
         if not isinstance(value, bool):
             raise TypeError(
-                "The SaveLoadConfig.keep_name_table should be bool value, but received input's type is %s."
+                "The config `keep_name_table` should be bool value, but received input's type is %s."
                 % type(value))
         self._keep_name_table = value
 
 
+def _parse_save_configs(configs):
+    supported_configs = ['output_spec']
+
+    # input check
+    for key in configs:
+        if key not in supported_configs:
+            raise ValueError(
+                "The additional config (%s) of `paddle.jit.save` is not supported."
+                % (key))
+
+    # construct inner config
+    inner_config = _SaveLoadConfig()
+    inner_config.output_spec = configs.get('output_spec', None)
+
+    return inner_config
+
+
+def _parse_load_config(configs):
+    supported_configs = ['model_filename', 'params_filename']
+
+    # input check
+    for key in configs:
+        if key not in supported_configs:
+            raise ValueError(
+                "The additional config (%s) of `paddle.jit.load` is not supported."
+                % (key))
+
+    # construct inner config
+    inner_config = _SaveLoadConfig()
+    inner_config.model_filename = configs.get('model_filename', None)
+    inner_config.params_filename = configs.get('params_filename', None)
+
+    return inner_config
+
+
 def _get_input_var_names(inputs, input_spec):
     name_none_error = "The %s's name is None. " \
         "When using jit.save, please set InputSepc's name in " \
@@ -712,47 +416,88 @@ def _get_output_vars(outputs, output_spec):
     return result_list
 
 
-# NOTE(chenweihang): change jit.save/load argument `configs` to `config`
-def deprecate_save_load_configs(func):
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        if 'configs' in kwargs:
-            kwargs['config'] = kwargs['configs']
-            kwargs.pop('configs')
-        return func(*args, **kwargs)
+# NOTE(chenweihang): [ Handling of use cases of API paddle.jit.load ]
+# `paddle.jit.load` may be used to load saved results of:
+# 1. Expected cases:
+#   - paddle.jit.save
+#   - paddle.static.save_inference_model
+#   - paddle.fluid.io.save_inference_model
+# 2. Error cases:
+#   - paddle.save: no .pdmodel for prefix
+#   - paddle.static.save: no .pdiparams but .pdparams exists
+#   - paddle.fluid.io.save_params/save_persistables: no __model__
+# TODO(chenweihang): polish error message in above error cases
+def _build_load_path_and_config(path, config):
+    # NOTE(chenweihang): If both [prefix save format] and [directory save format] exist,
+    # raise error, avoid confusing behavior
+    prefix_format_path = path + INFER_MODEL_SUFFIX
+    prefix_format_exist = os.path.exists(prefix_format_path)
+    directory_format_exist = os.path.isdir(path)
+    if prefix_format_exist and directory_format_exist:
+        raise ValueError(
+            "The %s.pdmodel and %s directory exist at the same time, "
+            "don't know which one to load, please make sure that the specified target "
+            "of ``path`` is unique." % (path, path))
+    elif not prefix_format_exist and not directory_format_exist:
+        raise ValueError("The ``path`` (%s) to load model not exists." % path)
+    else:
+        if prefix_format_exist:
+            file_prefix = os.path.basename(path)
+            model_path = os.path.dirname(path)
+            if config.model_filename is not None:
+                warnings.warn(
+                    "When loading the result saved with the "
+                    "specified file prefix, the ``model_filename`` config does "
+                    "not take effect.")
+            config.model_filename = file_prefix + INFER_MODEL_SUFFIX
+            if config.params_filename is not None:
+                warnings.warn(
+                    "When loading the result saved with the "
+                    "specified file prefix, the ``params_filename`` config does "
+                    "not take effect.")
+            config.params_filename = file_prefix + INFER_PARAMS_SUFFIX
+        else:
+            # Compatible with the old save_inference_model format
+            model_path = path
 
-    return wrapper
+    return model_path, config
 
 
-@deprecate_save_load_configs
 @switch_to_static_graph
-def save(layer, model_path, input_spec=None, config=None):
+def save(layer, path, input_spec=None, **configs):
     """
-    Saves input declarative Layer as :ref:`api_imperative_TranslatedLayer` 
+    Saves input Layer as ``paddle.jit.TranslatedLayer``
     format model, which can be used for inference or fine-tuning after loading.
 
     It will save the translated program and all related persistable 
-    variables of input declarative Layer to given ``model_path``.
+    variables of input Layer to given ``path``.
     
-    The default saved translated program file name is ``__model__``,
-    and the default saved persistable variables file name is ``__variables__``,
-    and it also saved some additional variable description information to file 
-    ``__variables.info__``, these additional information is used in fine-tuning.
+    ``path`` is the prefix of the saved objects: the translated program is saved to a file
+    with suffix ``.pdmodel`` and the persistable variables to a file with suffix ``.pdiparams``.
+    Some additional variable description information is also saved to a file with suffix
+    ``.pdiparams.info``; this additional information is used in fine-tuning.
 
     The saved model can be loaded by follow APIs:
-      - :ref:`api_imperative_jit_load`
-      - :ref:`api_fluid_io_load_inference_model` (need pass ``params_filename='__variables__'``)
+      - ``paddle.jit.load`` 
+      - ``paddle.static.load_inference_model`` 
       - Other C++ inference APIs
 
     Args:
-        layer (Layer): the Layer to be saved. The Layer should be decorated by `@declarative`.
-        model_path (str): the directory to save the model.
-        input_spec (list[Variable], optional): Describes the input of the saved model. 
+        layer (Layer): the Layer to be saved. The Layer should be decorated by `@paddle.jit.to_static`.
+        path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``.
+        input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model. 
             It is the example inputs that will be passed to saved TranslatedLayer's forward
             function. If None, all input variables of the original Layer's forward function
             would be the inputs of the saved model. Default None.
-        config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object
-            that specifies additional configuration options. Default None.
+        **configs (dict, optional): other save configuration options for compatibility. We do not 
+            recommend using these configurations, they may be removed in the future. If not necessary, 
+            DO NOT use them. Default None.
+            The following options are currently supported:
+            (1) output_spec (list[Tensor]): Selects the output targets of the saved model.
+            By default, all return variables of original Layer's forward function are kept as the 
+            output of the saved model. If the provided ``output_spec`` list is not all output variables, 
+            the saved model will be pruned according to the given ``output_spec`` list. 
+
     Returns:
         None
 
@@ -804,10 +549,6 @@ def train(layer, loader, loss_fn, opt):
                         print("Epoch {} batch {}: loss = {}".format(
                             epoch_id, batch_id, np.mean(loss.numpy())))
 
-            # enable dygraph mode
-            place = paddle.CPUPlace()
-            paddle.disable_static(place) 
-
             # 1. train & save model.
 
             # create network
@@ -818,7 +559,6 @@ def train(layer, loader, loss_fn, opt):
             # create data loader
             dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
             loader = paddle.io.DataLoader(dataset,
-                places=place,
                 batch_size=BATCH_SIZE,
                 shuffle=True,
                 drop_last=True,
@@ -828,11 +568,11 @@ def train(layer, loader, loss_fn, opt):
             train(layer, loader, loss_fn, adam)
 
             # save
-            model_path = "linear.example.model"
-            paddle.jit.save(layer, model_path)
+            path = "example_model/linear"
+            paddle.jit.save(layer, path)
     """
 
-    # 1. input check
+    # 1. input build & check
     prog_translator = ProgramTranslator()
     if not prog_translator.enable_to_static:
         raise RuntimeError(
@@ -843,9 +583,17 @@ def train(layer, loader, loss_fn, opt):
             "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s."
             % type(layer))
 
-    configs = config
-    if configs is None:
-        configs = SaveLoadConfig()
+    # path check
+    file_prefix = os.path.basename(path)
+    if file_prefix == "":
+        raise ValueError(
+            "The input path MUST be format of dirname/file_prefix "
+            "[dirname\\file_prefix in Windows system], but received "
+            "file_prefix is empty string.")
+
+    dirname = os.path.dirname(path)
+    if dirname and not os.path.exists(dirname):
+        os.makedirs(dirname)
 
     # avoid change user given input_spec
     inner_input_spec = None
@@ -866,6 +614,9 @@ def train(layer, loader, loss_fn, opt):
                     "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s."
                     % type(var))
 
+    # parse configs
+    configs = _parse_save_configs(configs)
+
     # 2. get program from Layer
     # TODO(chenweihang): add support for other method, not only forward
     if isinstance(layer.forward, StaticFunction):
@@ -927,9 +678,12 @@ def train(layer, loader, loss_fn, opt):
     # 5. save inference model
     from paddle.fluid.io import save_inference_model
 
-    # VARIABLE_FILENAME keep nameing style consistent with '__model__'
-    if configs.params_filename is None:
-        configs.params_filename = VARIABLE_FILENAME
+    # construct new save_inference_model arguments
+    model_path = dirname
+    # NOTE(chenweihang): the file prefix already determines the model and params
+    # file names, so setting model_filename & params_filename is not supported
+    model_filename = file_prefix + INFER_MODEL_SUFFIX
+    params_filename = file_prefix + INFER_PARAMS_SUFFIX
 
     with scope_guard(scope):
         save_inference_model(
@@ -938,9 +692,8 @@ def train(layer, loader, loss_fn, opt):
             target_vars=output_vars,
             executor=Executor(_current_expected_place()),
             main_program=concrete_program.main_program.clone(),
-            model_filename=configs.model_filename,
-            params_filename=None
-            if configs.separate_params else configs.params_filename,
+            model_filename=model_filename,
+            params_filename=params_filename,
             export_for_deployment=configs._export_for_deployment,
             program_only=configs._program_only)
 
@@ -958,23 +711,23 @@ def train(layer, loader, loss_fn, opt):
         # Due to compatibility issues, we cannot change the original storage structure, 
         # but we can save these information in `jit.save` without changing the original 
         # storage to improve user experience. So we save extra information into
-        # file `__variables.info__`
-        extra_var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
+        # file `***.pdiparams.info`
+        extra_var_info_path = path + INFER_PARAMS_INFO_SUFFIX
         with open(extra_var_info_path, 'wb') as f:
             pickle.dump(extra_var_info, f, protocol=2)
 
 
-@deprecate_save_load_configs
 @dygraph_only
-def load(model_path, config=None):
+def load(path, **configs):
     """
     :api_attr: imperative
 
-    Load model saved by :ref:`api_imperative_jit_save` or :ref:`api_fluid_io_save_inference_model`
-    as :ref:`api_imperative_TranslatedLayer`, then performing inference or fine-tune training.
+    Load model saved by ``paddle.jit.save`` or ``paddle.static.save_inference_model`` or 
+    paddle 1.x API ``paddle.fluid.io.save_inference_model`` as ``paddle.jit.TranslatedLayer``, 
+    then performing inference or fine-tune training.
 
     .. note::
-        For some historical reasons, if you load model saved by :ref:`api_fluid_io_save_inference_model`,
+        If you load model saved by ``paddle.static.save_inference_model`` ,
         there will be the following limitations when using it in fine-tuning:
         1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable.
         2. All saved model's feed targets need to be passed into TranslatedLayer's forward function.
@@ -982,15 +735,23 @@ def load(model_path, config=None):
         4. The parameter's ``trainable`` information is lost and can not be recovered.
 
     Args:
-        model_path (str): The directory path where the model is saved.
-        config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies 
-            additional configuration options. Default None.
+        path (str): The path prefix to load model. The format is ``dirname/file_prefix`` or ``file_prefix``.
+        **configs (dict, optional): other load configuration options for compatibility. We do not 
+            recommend using these configurations, they may be removed in the future. If not necessary, 
+            DO NOT use them. Default None.
+            The following options are currently supported:
+            (1) model_filename (string): The inference model file name of the paddle 1.x 
+            ``save_inference_model`` save format. Default file name is :code:`__model__` . 
+            (2) params_filename (string): The persistable variables file name of the paddle 1.x 
+            ``save_inference_model`` save format. No default file name, save variables separately 
+            by default.
+
 
     Returns:
         TranslatedLayer: A Layer object can run saved translated model.
 
     Examples:
-        1. Load model saved by :ref:`api_imperative_jit_save` then performing inference and fine-tune training.
+        1. Load model saved by ``paddle.jit.save`` then performing inference and fine-tune training.
 
         .. code-block:: python
 
@@ -1039,10 +800,6 @@ def train(layer, loader, loss_fn, opt):
                         print("Epoch {} batch {}: loss = {}".format(
                             epoch_id, batch_id, np.mean(loss.numpy())))
 
-            # enable dygraph mode
-            place = paddle.CPUPlace()
-            paddle.disable_static(place) 
-
             # 1. train & save model.
 
             # create network
@@ -1053,7 +810,6 @@ def train(layer, loader, loss_fn, opt):
             # create data loader
             dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
             loader = paddle.io.DataLoader(dataset,
-                places=place,
                 batch_size=BATCH_SIZE,
                 shuffle=True,
                 drop_last=True,
@@ -1063,13 +819,13 @@ def train(layer, loader, loss_fn, opt):
             train(layer, loader, loss_fn, adam)
 
             # save
-            model_path = "linear.example.model"
-            paddle.jit.save(layer, model_path)
+            path = "example_model/linear"
+            paddle.jit.save(layer, path)
 
             # 2. load model
 
             # load
-            loaded_layer = paddle.jit.load(model_path)
+            loaded_layer = paddle.jit.load(path)
 
             # inference
             loaded_layer.eval()
@@ -1082,15 +838,17 @@ def train(layer, loader, loss_fn, opt):
             train(loaded_layer, loader, loss_fn, adam)
 
 
-        2. Load model saved by :ref:`api_fluid_io_save_inference_model` then performing and fine-tune training.
+        2. Load model saved by ``paddle.fluid.io.save_inference_model`` then performing inference and fine-tune training.
 
         .. code-block:: python
 
             import numpy as np
             import paddle
             import paddle.fluid as fluid
+            import paddle.static as static
             import paddle.nn as nn
             import paddle.optimizer as opt
+            import paddle.nn.functional as F
 
             BATCH_SIZE = 16
             BATCH_NUM = 4
@@ -1112,18 +870,18 @@ def __getitem__(self, idx):
                 def __len__(self):
                     return self.num_samples
 
-            image = fluid.data(name='image', shape=[None, 784], dtype='float32')
-            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            pred = fluid.layers.fc(input=image, size=10, act='softmax')
-            loss = fluid.layers.cross_entropy(input=pred, label=label)
-            avg_loss = fluid.layers.mean(loss)
+            image = static.data(name='image', shape=[None, 784], dtype='float32')
+            label = static.data(name='label', shape=[None, 1], dtype='int64')
+            pred = static.nn.fc(input=image, size=10, act='softmax')
+            loss = F.cross_entropy(input=pred, label=label)
+            avg_loss = paddle.mean(loss)
 
-            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            optimizer = paddle.optimizer.SGD(learning_rate=0.001)
             optimizer.minimize(avg_loss)
 
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
+            place = paddle.CPUPlace()
+            exe = static.Executor(place)
+            exe.run(static.default_startup_program())
 
             # create data loader
             dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
@@ -1138,7 +896,7 @@ def __len__(self):
             # 1. train and save inference model
             for data in loader():
                 exe.run(
-                    fluid.default_main_program(),
+                    static.default_main_program(),
                     feed=data, 
                     fetch_list=[avg_loss])
 
@@ -1179,6 +937,10 @@ def __len__(self):
                     print("Epoch {} batch {}: loss = {}".format(
                         epoch_id, batch_id, np.mean(loss.numpy())))
     """
+    # 1. construct correct config
+    config = _parse_load_config(configs)
+    model_path, config = _build_load_path_and_config(path, config)
+
     return TranslatedLayer._construct(model_path, config)
 
 
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 88e24e7e1ea99..3ae6d384be7e3 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -62,10 +62,6 @@ def remove(self):
 
 class Layer(core.Layer):
     """
-    :alias_main: paddle.nn.Layer
-	:alias: paddle.nn.Layer
-	:old_api: paddle.fluid.dygraph.layers.Layer
-
     Dynamic graph Layer based on OOD, includes the parameters of the layer, the structure of the forward graph and so on.
 
     Parameters:
@@ -74,16 +70,16 @@ class Layer(core.Layer):
             can be "my_layer_0.w_n", where "w" is the parameter
             base name and "n" is an unique suffix auto-generated.
             If None, prefix name will be snake cased class name. Default: None.
-        dtype(str or core.VarDesc.VarType, optional): data type of this parameter.
+        dtype(str, optional): data type of this parameter.
                 If set str, it can be "bool",  "float16", "float32", "float64",
                 "int8", "int16", "int32", "int64", "uint8" or "uint16".
-                Default: ``core.VarDesc.VarType.FP32``
+                Default: "float32"
     
     Returns:
         None
     """
 
-    def __init__(self, name_scope=None, dtype=core.VarDesc.VarType.FP32):
+    def __init__(self, name_scope=None, dtype="float32"):
         self.training = True
         if name_scope is None:
             name_scope = _convert_camel_to_snake(self.__class__.__name__)
@@ -110,6 +106,30 @@ def train(self):
 
         Returns:
             None
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class MyLayer(paddle.nn.Layer):
+                    def __init__(self):
+                        super(MyLayer, self).__init__()
+                        self._linear = paddle.nn.Linear(1, 1)
+                        self._dropout = paddle.nn.Dropout(p=0.5)
+
+                    def forward(self, input):
+                        temp = self._linear(input)
+                        temp = self._dropout(temp)
+                        return temp
+
+                x = paddle.randn([10, 1], 'float32')
+                mylayer = MyLayer()
+                mylayer.eval()  # set mylayer._dropout to eval mode
+                out = mylayer(x)
+                mylayer.train()  # set mylayer._dropout to train mode
+                out = mylayer(x)
+
         """
         # global setting
         framework._dygraph_tracer().train_mode()
@@ -125,6 +145,29 @@ def eval(self):
 
         Returns:
             None
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class MyLayer(paddle.nn.Layer):
+                    def __init__(self):
+                        super(MyLayer, self).__init__()
+                        self._linear = paddle.nn.Linear(1, 1)
+                        self._dropout = paddle.nn.Dropout(p=0.5)
+
+                    def forward(self, input):
+                        temp = self._linear(input)
+                        temp = self._dropout(temp)
+                        return temp
+
+                x = paddle.randn([10, 1], 'float32')
+                mylayer = MyLayer()
+                mylayer.eval()  # set mylayer._dropout to eval mode
+                out = mylayer(x)
+                print(out)
+
         """
         # global setting
         framework._dygraph_tracer().eval_mode()
@@ -149,15 +192,13 @@ def apply(self, fn):
 
               import paddle
               import paddle.nn as nn
-              
-              paddle.disable_static()
-              
+
               net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
 
               def init_weights(layer):
                   if type(layer) == nn.Linear:
                       print('before init weight:', layer.weight.numpy())
-                      new_weight = paddle.fill_constant(layer.weight.shape, layer.weight.dtype, value=0.9)
+                      new_weight = paddle.full(shape=layer.weight.shape, dtype=layer.weight.dtype, fill_value=0.9)
                       layer.weight.set_value(new_weight)
                       print('after init weight:', layer.weight.numpy())
 
@@ -177,6 +218,23 @@ def full_name(self):
 
         Returns:
             str: full name of this layer.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class LinearNet(paddle.nn.Layer):
+                    def __init__(self):
+                        super(LinearNet, self).__init__(name_scope = "demo_linear_net")
+                        self._linear = paddle.nn.Linear(1, 1)
+
+                    def forward(self, x):
+                        return self._linear(x)
+
+                linear_net = LinearNet()
+                print(linear_net.full_name())   # demo_linear_net_0
+
         """
         return self._full_name
 
@@ -197,34 +255,33 @@ def register_forward_post_hook(self, hook):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              import numpy as np
+                import paddle
+                import numpy as np
+
+                # the forward_post_hook change the output of the layer: output = output * 2
+                def forward_post_hook(layer, input, output):
+                    # user can use layer, input and output for information statistics tasks
 
-              # the forward_post_hook change the output of the layer: output = output * 2 
-              def forward_post_hook(layer, input, output):
-                  # user can use layer, input and output for information statistis tasks
+                    # change the output
+                    return output * 2
 
-                  # change the output 
-                  return output * 2
+                linear = paddle.nn.Linear(13, 5)
 
-              with fluid.dygraph.guard():
-                  linear = fluid.Linear(13, 5, dtype="float32")
+                # register the hook
+                forward_post_hook_handle = linear.register_forward_post_hook(forward_post_hook)
 
-                  # register the hook
-                  forward_post_hook_handle = linear.register_forward_post_hook(forward_post_hook)
-                  
-                  value1 = np.arange(26).reshape(2, 13).astype("float32")
-                  in1 = fluid.dygraph.to_variable(value1)
-                  
-                  out0 = linear(in1)
-                  
-                  # remove the hook
-                  forward_post_hook_handle.remove()
+                value1 = np.arange(26).reshape(2, 13).astype("float32")
+                in1 = paddle.to_tensor(value1)
 
-                  out1 = linear(in1)
+                out0 = linear(in1)
 
-                  # hook change the linear's output to output * 2, so out0 is equal to out1 * 2.
-                  assert (out0.numpy() == (out1.numpy()) * 2).any()
+                # remove the hook
+                forward_post_hook_handle.remove()
+
+                out1 = linear(in1)
+
+                # hook change the linear's output to output * 2, so out0 is equal to out1 * 2.
+                assert (out0.numpy() == (out1.numpy()) * 2).any()
         """
         hook_remove_helper = HookRemoveHelper(self._forward_post_hooks)
         self._forward_post_hooks[hook_remove_helper._hook_id] = hook
@@ -249,36 +306,35 @@ def register_forward_pre_hook(self, hook):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              import numpy as np
+                import paddle
+                import numpy as np
 
-              # the forward_post_hook change the input of the layer: input = input * 2
-              def forward_pre_hook(layer, input):
-                  # user can use layer and input for information statistis tasks
+                # the forward_pre_hook change the input of the layer: input = input * 2
+                def forward_pre_hook(layer, input):
+                    # user can use layer and input for information statistics tasks
 
-                  # change the input
-                  input_return = (input[0] * 2)
-                  return input_return
+                    # change the input
+                    input_return = (input[0] * 2)
+                    return input_return
 
-              with fluid.dygraph.guard():
-                  linear = fluid.Linear(13, 5, dtype="float32")
+                linear = paddle.nn.Linear(13, 5)
 
-                  # register the hook
-                  forward_pre_hook_handle = linear.register_forward_pre_hook(forward_pre_hook)
+                # register the hook
+                forward_pre_hook_handle = linear.register_forward_pre_hook(forward_pre_hook)
 
-                  value0 = np.arange(26).reshape(2, 13).astype("float32")
-                  in0 = fluid.dygraph.to_variable(value0)
-                  out0 = linear(in0)
+                value0 = np.arange(26).reshape(2, 13).astype("float32")
+                in0 = paddle.to_tensor(value0)
+                out0 = linear(in0)
 
-                  # remove the hook
-                  forward_pre_hook_handle.remove()
+                # remove the hook
+                forward_pre_hook_handle.remove()
 
-                  value1 = value0 * 2
-                  in1 = fluid.dygraph.to_variable(value1)
-                  out1 = linear(in1)
+                value1 = value0 * 2
+                in1 = paddle.to_tensor(value1)
+                out1 = linear(in1)
 
-                  # hook change the linear's input to input * 2, so out0 is equal to out1.
-                  assert (out0.numpy() == out1.numpy()).any()
+                # hook change the linear's input to input * 2, so out0 is equal to out1.
+                assert (out0.numpy() == out1.numpy()).any()
         """
         hook_remove_helper = HookRemoveHelper(self._forward_pre_hooks)
         self._forward_pre_hooks[hook_remove_helper._hook_id] = hook
@@ -294,17 +350,37 @@ def create_parameter(self,
         
         Parameters:
             shape(list): Shape of the parameter.
-            attr(ParamAttr, optional): Parameter attribute of weight. Please refer to :ref:`api_fluid_ParamAttr`. Default: None.
-            dtype(str or core.VarDesc.VarType or str, optional): Data type of this parameter.
+            attr(ParamAttr, optional): Parameter attribute of weight. Please refer to :ref:`api_paddle_ParamAttr`. Default: None.
+            dtype(str, optional): Data type of this parameter.
                 If set str, it can be "bool",  "float16", "float32", "float64",
                 "int8", "int16", "int32", "int64", "uint8" or "uint16". Default: "float32".
             is_bias(bool, optional): if this is a bias parameter. Default: False.
             default_initializer(Initializer, optional): the default initializer for this parameter.
-                If set None, default initializer will be set to :ref:`api_fluid_initializer_XavierInitializer` and :ref:`api_fluid_initializer_ConstantInitializer`
+                If set None, default initializer will be set to paddle.nn.initializer.Xavier and paddle.nn.initializer.Constant
                 for non-bias and bias parameter, respectively. Default: None.
 
         Returns:
-            :ref:`api_guide_Variable_en` : created parameter.
+            Tensor, created parameter.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class MyLayer(paddle.nn.Layer):
+                    def __init__(self):
+                        super(MyLayer, self).__init__()
+                        self._linear = paddle.nn.Linear(1, 1)
+                        w_tmp = self.create_parameter([1,1])
+                        self.add_parameter("w_tmp", w_tmp)
+
+                    def forward(self, input):
+                        return self._linear(input)
+
+                mylayer = MyLayer()
+                for name, param in mylayer.named_parameters():
+                    print(name, param)      # will print w_tmp,_linear.weight,_linear.bias
+
         """
         temp_attr = copy.deepcopy(attr)
         if isinstance(temp_attr, six.string_types) and temp_attr == "":
@@ -313,24 +389,40 @@ def create_parameter(self,
                                              default_initializer)
 
     # TODO: Add more parameter list when we need them
-    def create_variable(self,
-                        name=None,
-                        persistable=None,
-                        dtype=None,
-                        type=core.VarDesc.VarType.LOD_TENSOR):
+    def create_variable(self, name=None, persistable=None, dtype=None):
         """Create Variable for this layer.
 
         Parameters:
             name(str, optional): name of the variable. Please refer to :ref:`api_guide_Name` . Default: None
             persistable(bool, optional): if set this variable persistable. Default: False
-            dtype(str or core.VarDesc.VarType, optional): data type of this parameter.
+            dtype(str, optional): data type of this parameter.
                 If set str, it can be "bool",  "float16", "float32", "float64",
                 "int8", "int16", "int32", "int64", "uint8" or "uint16".
-                If set None, it will be ``core.VarDesc.VarType.FP32``. Default: None
-            type(core.VarDesc.VarType, optional): type of the variable. No need to set this parameter. Default: ``core.VarDesc.VarType.LOD_TENSOR``
+                If set None, it will be "float32". Default: None
 
         Returns:
-            :ref:`api_guide_Variable_en` : created Variable.
+            Tensor, created Variable.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class MyLinear(paddle.nn.Layer):
+                    def __init__(self,
+                                in_features,
+                                out_features):
+                        super(MyLinear, self).__init__()
+                        self.linear = paddle.nn.Linear( 10, 10)
+                            
+                        self.back_var = self.create_variable(name = "linear_tmp_0", dtype=self._dtype)
+                    
+                    def forward(self, input):
+                        out = self.linear(input)
+                        paddle.assign( out, self.back_var)
+                        
+                        return out
+
         """
         if name is not None:
             var_name = ".".join([self._full_name, name])
@@ -339,7 +431,10 @@ def create_variable(self,
                 [self._full_name, "_generated_var"]))
 
         return self._helper.main_program.current_block().create_var(
-            name=var_name, persistable=persistable, dtype=dtype, type=type)
+            name=var_name,
+            persistable=persistable,
+            dtype=dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR)
 
     def parameters(self, include_sublayers=True):
         """Returns a list of all Parameters from current layer and its sub-layers.
@@ -348,7 +443,16 @@ def parameters(self, include_sublayers=True):
             include_sublayers(bool, optional): Whether include the parameters of sublayers. If True, also include the parameters from sublayers. Default: True
 
         Returns:
-            list of :ref:`api_guide_Variable_en` : a list of Parameters.
+            list of Tensor : a list of Parameters.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                linear = paddle.nn.Linear(1, 1)
+                print(linear.parameters())  # print linear_0.w_0 and linear_0.b_0
+
         """
         ret = [
             param
@@ -366,16 +470,15 @@ def children(self):
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
 
-                with fluid.dygraph.guard():
-                    fc1 = fluid.Linear(10, 3)
-                    fc2 = fluid.Linear(3, 10, bias_attr=False)
-                    model = fluid.dygraph.Sequential(fc1, fc2)
-                    
-                    layer_list = list(model.children())
+                linear1 = paddle.nn.Linear(10, 3)
+                linear2 = paddle.nn.Linear(3, 10, bias_attr=False)
+                model = paddle.nn.Sequential(linear1, linear2)
+
+                layer_list = list(model.children())
 
-                    print(layer_list)
+                print(layer_list)   # [<paddle.nn.layer.common.Linear object at 0x7f7b8113f830>, <paddle.nn.layer.common.Linear object at 0x7f7b8113f950>]
 
         """
         for _, layer in self.named_children():
@@ -391,14 +494,15 @@ def named_children(self):
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
 
-                with fluid.dygraph.guard():
-                    fc1 = fluid.Linear(10, 3)
-                    fc2 = fluid.Linear(3, 10, bias_attr=False)
-                    model = fluid.dygraph.Sequential(fc1, fc2)
-                    for prefix, layer in model.named_children():
-                        print(prefix, layer)
+                linear1 = paddle.nn.Linear(10, 3)
+                linear2 = paddle.nn.Linear(3, 10, bias_attr=False)
+                model = paddle.nn.Sequential(linear1, linear2)
+                for prefix, layer in model.named_children():
+                    print(prefix, layer)
+                    # ('0', <paddle.nn.layer.common.Linear object at 0x7fb61ed85830>)
+                    # ('1', <paddle.nn.layer.common.Linear object at 0x7fb61ed85950>)
 
         """
         memo = set()
@@ -415,6 +519,26 @@ def sublayers(self, include_sublayers=True):
 
         Returns:
             list of Layer : a list of sub layers.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class MyLayer(paddle.nn.Layer):
+                    def __init__(self):
+                        super(MyLayer, self).__init__()
+                        self._linear = paddle.nn.Linear(1, 1)
+                        self._dropout = paddle.nn.Dropout(p=0.5)
+
+                    def forward(self, input):
+                        temp = self._linear(input)
+                        temp = self._dropout(temp)
+                        return temp
+
+                mylayer = MyLayer()
+                print(mylayer.sublayers())  # [<paddle.nn.layer.common.Linear object at 0x7f44b58977d0>, <paddle.nn.layer.common.Dropout object at 0x7f44b58978f0>]
+
         """
         ret = [
             layer
@@ -438,14 +562,13 @@ def named_parameters(self, prefix='', include_sublayers=True):
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
 
-                with fluid.dygraph.guard():
-                    fc1 = fluid.Linear(10, 3)
-                    fc2 = fluid.Linear(3, 10, bias_attr=False)
-                    model = fluid.dygraph.Sequential(fc1, fc2)
-                    for name, param in model.named_parameters():
-                        print(name, param)
+                fc1 = paddle.nn.Linear(10, 3)
+                fc2 = paddle.nn.Linear(3, 10, bias_attr=False)
+                model = paddle.nn.Sequential(fc1, fc2)
+                for name, param in model.named_parameters():
+                    print(name, param)
 
         """
         params_set = set()
@@ -483,14 +606,13 @@ def named_sublayers(self,
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
 
-                with fluid.dygraph.guard():
-                    fc1 = fluid.Linear(10, 3)
-                    fc2 = fluid.Linear(3, 10, bias_attr=False)
-                    model = fluid.dygraph.Sequential(fc1, fc2)
-                    for prefix, layer in model.named_sublayers():
-                        print(prefix, layer)
+                fc1 = paddle.nn.Linear(10, 3)
+                fc2 = paddle.nn.Linear(3, 10, bias_attr=False)
+                model = paddle.nn.Sequential(fc1, fc2)
+                for prefix, layer in model.named_sublayers():
+                    print(prefix, layer)
 
         """
         if layers_set is None:
@@ -510,11 +632,11 @@ def named_sublayers(self,
                         layers_set=layers_set):
                     yield p, l
 
-    def register_buffer(self, name, variable, persistable=True):
+    def register_buffer(self, name, tensor, persistable=True):
         """
-        Registers a variable as buffer into the layer.
+        Registers a tensor as buffer into the layer.
 
-        `buffer` is a non-parameteric variable and will not be updated by optimizer,
+        `buffer` is a non-trainable tensor and will not be updated by optimizer,
         but is necessary for evaluation and inference. For example, the mean and variance in BatchNorm layers.
         The registered buffer is persistable by default, and will be saved into
         `state_dict` alongside parameters. If set persistable=False, it registers
@@ -525,7 +647,7 @@ def register_buffer(self, name, variable, persistable=True):
         Parameters:
             name (string): name of the buffer. The buffer can be accessed
                 from this layer using the given name
-            variable (Variable): the variable to be registered as buffer.
+            tensor (Tensor): the tensor to be registered as buffer.
             persistable (bool): whether the buffer is part of this layer's
                 state_dict.
 
@@ -536,16 +658,15 @@ def register_buffer(self, name, variable, persistable=True):
             .. code-block:: python
 
                 import numpy as np
-                import paddle.fluid as fluid
+                import paddle
 
-                with fluid.dygraph.guard():
-                    linear = fluid.Linear(10, 3)
-                    value = np.array([0]).astype("float32")
-                    buffer = fluid.dygraph.to_variable(value)
-                    linear.register_buffer("buf_name", buffer, persistable=True)
-                    
-                    # get the buffer by attribute.
-                    print(linear.buf_name)
+                linear = paddle.nn.Linear(10, 3)
+                value = np.array([0]).astype("float32")
+                buffer = paddle.to_tensor(value)
+                linear.register_buffer("buf_name", buffer, persistable=True)
+
+                # get the buffer by attribute.
+                print(linear.buf_name)
 
         """
 
@@ -565,12 +686,12 @@ def register_buffer(self, name, variable, persistable=True):
             raise KeyError("The name of buffer can not be empty.")
         elif hasattr(self, name) and name not in self._buffers:
             raise KeyError("attribute '{}' already exists.".format(name))
-        elif variable is not None and not type(variable) == core.VarBase:
+        elif tensor is not None and not type(tensor) == core.VarBase:
             raise TypeError(
                 "The registered buffer should be a core.VarBase, but received {}.".
-                format(type(variable).__name__))
+                format(type(tensor).__name__))
         else:
-            self._buffers[name] = variable
+            self._buffers[name] = tensor
             if persistable:
                 self._non_persistable_buffer_names_set.discard(name)
             else:
@@ -584,7 +705,21 @@ def buffers(self, include_sublayers=True):
             include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True
 
         Returns:
-            list of :ref:`api_guide_Variable_en` : a list of buffers.
+            list of Tensor : a list of buffers.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+
+                linear = paddle.nn.Linear(10, 3)
+                value = np.array([0]).astype("float32")
+                buffer = paddle.to_tensor(value)
+                linear.register_buffer("buf_name", buffer, persistable=True)
+
+                print(linear.buffers())     # == print([linear.buf_name])
+
         """
         ret = [
             buffer
@@ -595,7 +730,7 @@ def buffers(self, include_sublayers=True):
 
     def named_buffers(self, prefix='', include_sublayers=True):
         """
-        Returns an iterator over all buffers in the Layer, yielding tuple of name and Variable.
+        Returns an iterator over all buffers in the Layer, yielding tuple of name and Tensor.
 
         Parameters:
             prefix(str, optional): Prefix to prepend to all buffer names. Default: ''.
@@ -603,31 +738,30 @@ def named_buffers(self, prefix='', include_sublayers=True):
                 If True, also include the named buffers from sublayers. Default: True.
 
         Yields:
-            (string, Variable): Tuple of name and Variable
+            (string, Tensor): Tuple of name and tensor
 
         Examples:
             .. code-block:: python
 
                 import numpy as np
-                import paddle.fluid as fluid
+                import paddle
 
-                with fluid.dygraph.guard():
-                    fc1 = fluid.Linear(10, 3)
-                    buffer1 = fluid.dygraph.to_variable(np.array([0]).astype("float32"))
-                    # register a variable as buffer by specific `persistable`
-                    fc1.register_buffer("buf_name_1", buffer1, persistable=True)
+                fc1 = paddle.nn.Linear(10, 3)
+                buffer1 = paddle.to_tensor(np.array([0]).astype("float32"))
+                # register a tensor as buffer by specific `persistable`
+                fc1.register_buffer("buf_name_1", buffer1, persistable=True)
 
-                    fc2 = fluid.Linear(3, 10)
-                    buffer2 = fluid.dygraph.to_variable(np.array([1]).astype("float32"))
-                    # register a buffer by assigning an attribute with Variable.
-                    # The `persistable` can only be False by this way.
-                    fc2.buf_name_2 = buffer2
+                fc2 = paddle.nn.Linear(3, 10)
+                buffer2 = paddle.to_tensor(np.array([1]).astype("float32"))
+                # register a buffer by assigning an attribute with Tensor.
+                # The `persistable` can only be False by this way.
+                fc2.buf_name_2 = buffer2
 
-                    model = fluid.dygraph.Sequential(fc1, fc2)
+                model = paddle.nn.Sequential(fc1, fc2)
 
-                    # get all named buffers
-                    for name, buffer in model.named_buffers():
-                        print(name, buffer)
+                # get all named buffers
+                for name, buffer in model.named_buffers():
+                    print(name, buffer)
 
         """
         buffers_set = set()
@@ -654,19 +788,18 @@ def clear_gradients(self):
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
                 import numpy as np
 
-                with fluid.dygraph.guard():
-                    value = np.arange(26).reshape(2, 13).astype("float32")
-                    a = fluid.dygraph.to_variable(value)
-                    linear = fluid.Linear(13, 5, dtype="float32")
-                    adam = fluid.optimizer.Adam(learning_rate=0.01, 
-                                                parameter_list=linear.parameters())
-                    out = linear(a)
-                    out.backward()
-                    adam.minimize(out)
-                    linear.clear_gradients()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5)
+                adam = paddle.optimizer.Adam(learning_rate=0.01,
+                                            parameters=linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                linear.clear_gradients()
 
         """
         for p in self.parameters():
@@ -726,6 +859,32 @@ def add_sublayer(self, name, sublayer):
             sublayer(Layer): an instance of Layer.
         Returns:
             Layer: the sublayer passed in.
+        
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class MySequential(paddle.nn.Layer):
+                    def __init__(self, *layers):
+                        super(MySequential, self).__init__()
+                        if len(layers) > 0 and isinstance(layers[0], tuple):
+                            for name, layer in layers:
+                                self.add_sublayer(name, layer)
+                        else:
+                            for idx, layer in enumerate(layers):
+                                self.add_sublayer(str(idx), layer)
+
+                    def forward(self, input):
+                        for layer in self._sub_layers.values():
+                            input = layer(input)
+                        return input
+
+                fc1 = paddle.nn.Linear(10, 3)
+                fc2 = paddle.nn.Linear(3, 10, bias_attr=False)
+                model = MySequential(fc1, fc2)
+                for prefix, layer in model.named_sublayers():
+                    print(prefix, layer)
         """
         assert isinstance(sublayer, core.Layer)
 
@@ -742,6 +901,25 @@ def add_parameter(self, name, parameter):
             parameter(Parameter): an instance of Parameter.
         Returns:
             Parameter: the parameter passed in.
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class MyLayer(paddle.nn.Layer):
+                    def __init__(self):
+                        super(MyLayer, self).__init__()
+                        self._linear = paddle.nn.Linear(1, 1)
+                        w_tmp = self.create_parameter([1,1])
+                        self.add_parameter("w_tmp", w_tmp)
+
+                    def forward(self, input):
+                        return self._linear(input)
+
+                mylayer = MyLayer()
+                for name, param in mylayer.named_parameters():
+                    print(name, param)      # will print w_tmp,_linear.weight,_linear.bias
+
         """
         if '_parameters' not in self.__dict__:
             raise RuntimeError(
@@ -871,24 +1049,24 @@ def __dir__(self):
         Return a list. Get all parameters, buffers(non-parameter variables), sublayers, method and attr of Layer.
 
         Examples:
-            import paddle.fluid as fluid
-            import numpy as np
-
-            fluid.dygraph.enable_dygraph()
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
 
-            class Mylayer(fluid.dygraph.Layer):
-                def __init__(self):
-                    super(Mylayer, self).__init__()
-                    self.linear1 = fluid.dygraph.Linear(10, 10)
-                    self.linear2 = fluid.dygraph.Linear(5, 5)
-                    self.conv2d = fluid.dygraph.Conv2D(3, 2, 3)
-                    self.embedding = fluid.dygraph.Embedding(size=[128, 16])
-                    self.h_0 = fluid.dygraph.to_variable(np.zeros([10, 10]).astype('float32'))
+                class Mylayer(paddle.nn.Layer):
+                    def __init__(self):
+                        super(Mylayer, self).__init__()
+                        self.linear1 = paddle.nn.Linear(10, 10)
+                        self.linear2 = paddle.nn.Linear(5, 5)
+                        self.conv2d = paddle.nn.Conv2d(3, 2, 3)
+                        self.embedding = paddle.nn.Embedding(128, 16)
+                        self.h_0 = paddle.to_tensor(np.zeros([10, 10]).astype('float32'))
 
-            mylayer = Mylayer()
-            print(dir(mylayer))
-            # only parts are shown, because of list have too much content
-            # ['__call__', '__class__',  ... , 'conv2d', 'embedding', 'h_0', 'linear1', 'linear2', ... , 'sublayers', 'train']
+                mylayer = Mylayer()
+                print(dir(mylayer))
+                # only parts are shown, because of list have too much content
+                # ['__call__', '__class__',  ... , 'conv2d', 'embedding', 'h_0', 'linear1', 'linear2', ... , 'sublayers', 'train']
 
         """
         method = dir(self.__class__)
@@ -918,12 +1095,12 @@ def state_dict(self,
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                with fluid.dygraph.guard():
-                    emb = fluid.dygraph.Embedding([10, 10])
+                import paddle
 
-                    state_dict = emb.state_dict()
-                    fluid.save_dygraph( state_dict, "paddle_dy")
+                emb = paddle.nn.Embedding(10, 10)
+
+                state_dict = emb.state_dict()
+                paddle.save(state_dict, "paddle_dy.pdparams")
 
         '''
 
@@ -967,16 +1144,12 @@ def set_state_dict(self,
             .. code-block:: python
 
                 import paddle
-                
-                paddle.disable_static()
-                
+
                 emb = paddle.nn.Embedding(10, 10)
 
                 state_dict = emb.state_dict()
                 paddle.save(state_dict, "paddle_dy.pdparams")
-                
                 para_state_dict = paddle.load("paddle_dy.pdparams")
-
                 emb.set_state_dict(para_state_dict)
 
         '''
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 05269028acc40..1a488844dec21 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -895,9 +895,6 @@ def forward(self, input):
 
 class Linear(layers.Layer):
     """
-    :alias_main: paddle.nn.Linear
-	:alias: paddle.nn.Linear,paddle.nn.layer.Linear,paddle.nn.layer.common.Linear
-	:old_api: paddle.fluid.dygraph.Linear
     
     Fully-connected linear transformation layer:
 
diff --git a/python/paddle/fluid/dygraph/static_runner.py b/python/paddle/fluid/dygraph/static_runner.py
index d482077cd4f2a..e8738da07e993 100644
--- a/python/paddle/fluid/dygraph/static_runner.py
+++ b/python/paddle/fluid/dygraph/static_runner.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from paddle.fluid.dygraph.jit import SaveLoadConfig
+from paddle.fluid.dygraph.jit import _SaveLoadConfig
 from paddle.fluid.dygraph.io import TranslatedLayer
 
 
@@ -31,7 +31,7 @@ class StaticModelRunner(object):
     """
 
     def __new__(cls, model_dir, model_filename=None, params_filename=None):
-        configs = SaveLoadConfig()
+        configs = _SaveLoadConfig()
         if model_filename is not None:
             configs.model_filename = model_filename
         if params_filename is not None:
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 3dc30767e5aa4..f5660c3fc91a1 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -54,11 +54,11 @@ def global_scope():
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
+          import paddle
           import numpy
 
-          fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
-          numpy.array(fluid.global_scope().find_var("data").get_tensor())
+          paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace())
+          numpy.array(paddle.static.global_scope().find_var("data").get_tensor())
     """
     return g_scope
 
@@ -94,12 +94,13 @@ def scope_guard(scope):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy
+            paddle.enable_static()
 
-            new_scope = fluid.Scope()
-            with fluid.scope_guard(new_scope):
-                 fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace())
+            new_scope = paddle.static.Scope()
+            with paddle.static.scope_guard(new_scope):
+                 paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace())
             numpy.array(new_scope.find_var("data").get_tensor())
     """
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 61ffb60b1105d..52c1e5d5e16c1 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -380,31 +380,35 @@ def cuda_places(device_ids=None):
         For multi-card tasks, please use `FLAGS_selected_gpus` environment variable to set the visible GPU device.
         The next version will fix the problem with `CUDA_VISIBLE_DEVICES` environment variable.
 
-    This function creates a list of :code:`fluid.CUDAPlace` objects.
+    This function creates a list of :code:`paddle.CUDAPlace` objects.
 
     If :code:`device_ids` is None, environment variable of
     :code:`FLAGS_selected_gpus` would be checked first. For example, if
     :code:`FLAGS_selected_gpus=0,1,2`, the returned list would
-    be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
+    be [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)].
     If :code:`FLAGS_selected_gpus` is not set, all visible
     gpu places would be returned according to the :code:`CUDA_VISIBLE_DEVICES` environment variable.
 
     If :code:`device_ids` is not None, it should be the device
     ids of GPUs. For example, if :code:`device_ids=[0,1,2]`,
     the returned list would be 
-    [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
+    [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)].
     
     Parameters:
         device_ids (list or tuple of int, optional): list of GPU device ids.
 
     Returns:
-        list of fluid.CUDAPlace: Created GPU place list.
+        list of paddle.CUDAPlace: Created GPU place list.
 
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            cuda_places = fluid.cuda_places()
+            import paddle
+            import paddle.static as static
+            
+            paddle.enable_static()
+
+            cuda_places = static.cuda_places()
 
     """
     assert core.is_compiled_with_cuda(), \
@@ -418,7 +422,7 @@ def cuda_places(device_ids=None):
 
 def cpu_places(device_count=None):
     """
-    This function creates a list of :code:`fluid.CPUPlace` objects, and returns the created list.
+    This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
     
     If :code:`device_count` is None, the device count would
     be determined by environment variable :code:`CPU_NUM`. 
@@ -431,13 +435,17 @@ def cpu_places(device_count=None):
         device_count (int, optional): device number. Default: None.
 
     Returns:
-        list of fluid.CPUPlace: Created list of CPU places.
+        list of paddle.CPUPlace: Created list of CPU places.
 
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            cpu_places = fluid.cpu_places()
+            import paddle
+            import paddle.static as static
+            
+            paddle.enable_static()
+
+            cpu_places = static.cpu_places()
     """
 
     if device_count is None:
@@ -5115,6 +5123,8 @@ class Parameter(Variable):
             be applied on the parameter. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this parameter.
+        need_clip (bool): Whether the gradient of this parameter needs to be clipped
+            in optimizer. Default is True.
     """
 
     def __init__(self,
@@ -5154,6 +5164,8 @@ def __init__(self,
 
         self.do_model_average = kwargs.get('do_model_average', None)
 
+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False
 
     def __str__(self):
@@ -5186,7 +5198,7 @@ def to_string(self, throw_on_error, with_details=False):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "do_model_average")
+                               "do_model_average", "need_clip")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          cpt.to_text(getattr(self, attr_name)))
@@ -5218,6 +5230,8 @@ class ParamBase(core.VarBase):
             be applied on the ParamBase. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this ParamBase.
+        need_clip (bool): Whether the gradient of this parameter needs to be clipped
+            in optimizer. Default is True.
     """
 
     @dygraph_only
@@ -5257,6 +5271,8 @@ def __init__(self, shape, dtype, **kwargs):
 
         self.do_model_average = kwargs.get('do_model_average', None)
 
+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False
         # self.block = default_main_program().global_block()
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index e348c67ae0461..90847382c86e1 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -133,6 +133,8 @@ def __init__(self, main_program, startup_program, strategy, role_maker):
 
         self.origin_main_program = main_program
         self.origin_startup_program = startup_program
+        self.origin_ps_main_program = main_program
+        self.origin_ps_startup_program = startup_program
 
         self.strategy = strategy
         self.role_maker = role_maker
@@ -153,6 +155,11 @@ def __init__(self, main_program, startup_program, strategy, role_maker):
 
         self._build_var_distributed()
 
+        # for heter-ps save variables
+        self.origin_merged_variables_pairs = list(self.merged_variables_pairs)
+        self.origin_merged_dense_pairs = list(self.merged_dense_pairs)
+        self.origin_merged_sparse_pairs = list(self.merged_sparse_pairs)
+
     def get_distributed_mode(self):
         trainer = self.strategy.get_trainer_runtime_config()
         return trainer.mode
@@ -214,6 +221,18 @@ def get_origin_main_program(self):
     def get_origin_startup_program(self):
         return self.origin_startup_program
 
+    def set_origin_ps_main_program(self, program):
+        self.origin_ps_main_program = program
+
+    def set_origin_ps_startup_program(self, program):
+        self.origin_ps_startup_program = program
+
+    def get_origin_ps_main_program(self):
+        return self.origin_ps_main_program
+
+    def get_origin_ps_startup_program(self):
+        return self.origin_ps_startup_program
+
     def get_sparse_varname_on_ps(self, is_distributed, endpoint=None):
         if not endpoint:
             endpoint = self.get_ps_endpoint()
@@ -378,7 +397,9 @@ def get_communicator_send_context(self):
             send_ctx[name] = ctx
         return send_ctx
 
-    def get_communicator_recv_context(self, recv_type=1):
+    def get_communicator_recv_context(self,
+                                      recv_type=1,
+                                      use_origin_program=False):
         # recv_type
         # 1 : DENSE 2. SPARSE 3. DISTRIBUTED 4. ALL
         distibuted_varnames = get_sparse_tablenames(self.origin_main_program,
@@ -392,7 +413,8 @@ def get_communicator_recv_context(self, recv_type=1):
         sparse_recv_ctx = {}
         distributed_recv_ctx = {}
 
-        for merged in self.merged_variables_pairs:
+        variables_pairs = self.merged_variables_pairs if not use_origin_program else self.origin_merged_variables_pairs
+        for merged in variables_pairs:
             params = merged[0]
             if params.merged_var.name in sparse_varnames:
                 continue
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 7a92adf0a89dc..67c572d4988ce 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -729,31 +729,32 @@ class BilinearInitializer(Initializer):
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
             import math
+
+            import paddle
+            import paddle.nn as nn
+            from paddle.regularizer import L2Decay
+
             factor = 2
             C = 2
             B = 8
             H = W = 32
-            w_attr = fluid.param_attr.ParamAttr(
-                learning_rate=0., 
-                regularizer=fluid.regularizer.L2Decay(0.),
-                initializer=fluid.initializer.Bilinear())
-            x = fluid.data(name="data", shape=[B, 3, H, W], 
-                                  dtype="float32")
-            conv_up = fluid.layers.conv2d_transpose(
-                input=x,
-                num_filters=C,
-                output_size=None,
-                filter_size=2 * factor - factor % 2,
-                padding=int(math.ceil((factor - 1) / 2.)),
-                stride=factor,
-                groups=C,
-                param_attr=w_attr,
-                bias_attr=False)
-
-    Where, `num_filters=C` and `groups=C` means this is channel-wise transposed
-    convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`,
+            w_attr = paddle.ParamAttr(learning_rate=0.,
+                                      regularizer=L2Decay(0.),
+                                      initializer=nn.initializer.Bilinear())
+            data = paddle.rand([B, 3, H, W], dtype='float32')
+            conv_up = nn.ConvTranspose2d(3,
+                                         out_channels=C,
+                                         kernel_size=2 * factor - factor % 2,
+                                         padding=int(
+                                             math.ceil((factor - 1) / 2.)),
+                                         stride=factor,
+                                         weight_attr=w_attr,
+                                         bias_attr=False)
+            x = conv_up(data)
+
+    Where, `out_channels=C` and `groups=C` means this is channel-wise transposed
+    convolution. The filter shape will be (C, 1, K, K) where K is `kernel_size`,
     This initializer will set a (K, K) interpolation kernel for every channel
     of the filter identically. The resulting shape of the output feature map
     will be (B, C, factor * H, factor * W). Note that the learning rate and the
diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py
index 529588c0846b5..0e3ee46fa46d1 100644
--- a/python/paddle/fluid/input.py
+++ b/python/paddle/fluid/input.py
@@ -220,24 +220,96 @@ def embedding(input,
     Returns:
         Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` .
 
-    Examples:
+    Static Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            paddle.enable_static()
+            
+            x = paddle.static.data(name="x", shape = [2, 4], dtype=np.int64)
+            embedding = paddle.nn.Embedding(10, 3,
+                        weight_attr=paddle.nn.initializer.Constant(value=1.0))
+            sgd = paddle.optimizer.SGD(parameters=[embedding.weight], learning_rate=0.01)
+            output = embedding(x)
+            m_output=paddle.mean(output)
+            
+            sgd.minimize(m_output)
+            
+            place = paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            exe.run(paddle.static.default_startup_program())
+            
+            x = np.array([[7, 2, 4, 5],[4, 3, 2, 9]], dtype=np.int64)
+            
+            # x is a numpy ndarray.
+            # x.data = [[7, 2, 4, 5], [4, 3, 2, 9]]
+            # x.shape = [2, 4]
+            
+            out, = exe.run(paddle.static.default_main_program(), feed={'x':x}, fetch_list=[output])
+            
+            # out is a numpy ndarray.
+            # out.data = [[[1., 1., 1.],
+            #              [1., 1., 1.],
+            #              [1., 1., 1.],
+            #              [1., 1., 1.]],
+            #
+            #             [[1., 1., 1.],
+            #              [1., 1., 1.],
+            #              [1., 1., 1.],
+            #              [1., 1., 1.]]]
+            # out.shape = [2, 4, 3]
+
+
+    Dygraph Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          import numpy as np
-          data = fluid.data(name='x', shape=[None, 10], dtype='int64')
-
-          # example 1
-          emb_1 = fluid.embedding(input=data, size=[128, 64])
-
-          # example 2: load custom or pre-trained word vectors
-          weight_data = np.random.random(size=(128, 100))  # word vectors with numpy format
-          w_param_attrs = fluid.ParamAttr(
-              name="emb_weight",
-              learning_rate=0.5,
-              initializer=fluid.initializer.NumpyArrayInitializer(weight_data),
-              trainable=True)
-          emb_2 = fluid.embedding(input=data, size=(128, 100), param_attr=w_param_attrs, dtype='float32')   
+            import paddle
+            import numpy as np
+            
+            paddle.disable_static()
+            
+            x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64)
+            
+            # x is a Tensor.
+            # x.data = [[3], [4], [5]]
+            # x.shape = [3, 1]
+            x = paddle.to_tensor(x_data, stop_gradient=False)
+            
+            # embedding weight shape = [10, 3]
+            embedding = paddle.nn.Embedding(10, 3, sparse=True)
+            
+            # embedding weight data = [10, 3]
+            w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32)
+            
+            # embedding.weight.shape = [10, 3]
+            # embedding.weight.data =
+            #                        [[2., 2., 2.],
+            #                         [2., 2., 2.],
+            #                         [2., 2., 2.],
+            #                         [2., 2., 2.],
+            #                         [2., 2., 2.],
+            #                         [2., 2., 2.],
+            #                         [2., 2., 2.],
+            #                         [2., 2., 2.],
+            #                         [2., 2., 2.],
+            #                         [2., 2., 2.]]
+            embedding.weight.set_value(w0)
+            
+            adam = paddle.optimizer.Adam(
+                parameters=[embedding.weight], learning_rate=0.01)
+            adam.clear_grad()
+            
+            # out is Tensor
+            # out.shape: [3, 1, 3]
+            # out.layout: NCHW
+            # out.dtype: float
+            # out.data: [2 2 2 2 2 2 2 2 2]
+            out = embedding(x)
+            
+            out.backward()
+            adam.step()
+
     """
 
     helper = LayerHelper('embedding', **locals())
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index fe5b683bdeaa3..bb55aeb70d1f2 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -1346,7 +1346,7 @@ def save_inference_model(dirname,
         append_fetch_ops(main_program, fetch_var_names)
 
         main_program.desc._set_version()
-        paddle.fluid.core.save_op_compatible_info(main_program.desc)
+        paddle.fluid.core.save_op_version_info(main_program.desc)
         with open(model_basename, "wb") as f:
             f.write(main_program.desc.serialize_to_string())
     else:
@@ -1720,7 +1720,7 @@ def get_tensor(var):
     main_program = program.clone()
     program.desc.flush()
     main_program.desc._set_version()
-    paddle.fluid.core.save_op_compatible_info(program.desc)
+    paddle.fluid.core.save_op_version_info(program.desc)
 
     with open(model_path + ".pdmodel", "wb") as f:
         f.write(program.desc.serialize_to_string())
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 411ac6e51b1c8..0c77917c78190 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -2297,11 +2297,6 @@ def copy_var_to_parent_block(var, layer_helper):
 
 def cond(pred, true_fn=None, false_fn=None, name=None):
     """
-    :api_attr: Static Graph
-	:alias_main: paddle.nn.cond
-	:alias: paddle.nn.cond,paddle.nn.control_flow.cond
-	:old_api: paddle.fluid.layers.cond
-    
     This API returns ``true_fn()`` if the predicate ``pred`` is true else
     ``false_fn()`` . Users could also set ``true_fn`` or ``false_fn`` to
     ``None`` if do nothing and this API will treat the callable simply returns
@@ -2323,17 +2318,18 @@ def cond(pred, true_fn=None, false_fn=None, name=None):
         semantics. For example:
 
         .. code-block:: python
-        
-            import paddle.fluid as fluid
-            a = fluid.data(name='a', shape=[-1, 1], dtype='float32')
-            b = fluid.data(name='b', shape=[-1, 1], dtype='float32')
+
+            import paddle
+
+            a = paddle.zeros((1, 1))
+            b = paddle.zeros((1, 1))
             c = a * b
-            out = fluid.layers.cond(a < b, lambda: a + c, lambda: b * b)
+            out = paddle.nn.cond(a < b, lambda: a + c, lambda: b * b)
 
         No matter whether ``a < b`` , ``c = a * b`` will run.
 
     Args:
-        pred(Variable): A boolean tensor whose numel should be 1. The boolean
+        pred(Tensor): A boolean tensor whose numel should be 1. The boolean
             value determines whether to return the result of ``true_fn`` or
             ``false_fn`` .
         true_fn(callable, optional): A callable to be performed if ``pred`` is
@@ -2345,7 +2341,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None):
              refer to :ref:`api_guide_Name` .
 
     Returns:
-        Variable|list(Variable)|tuple(Variable): returns ``true_fn()`` if the
+        Tensor|list(Tensor)|tuple(Tensor): returns ``true_fn()`` if the
         predicate ``pred`` is true else ``false_fn()`` .
 
     Raises:
@@ -2356,10 +2352,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            from paddle.fluid.executor import Executor
-            from paddle.fluid.framework import Program, program_guard
+            import paddle
 
             #
             # pseudocode:
@@ -2369,32 +2362,28 @@ def cond(pred, true_fn=None, false_fn=None, name=None):
             #     return 3, 2
             #
 
+
             def true_func():
-                return layers.fill_constant(
-                    shape=[1, 2], dtype='int32', value=1), layers.fill_constant(
-                        shape=[2, 3], dtype='bool', value=True)
+                return paddle.fill_constant(shape=[1, 2], dtype='int32',
+                                            value=1), paddle.fill_constant(shape=[2, 3],
+                                                                           dtype='bool',
+                                                                           value=True)
+
 
             def false_func():
-                return layers.fill_constant(
-                    shape=[3, 4], dtype='float32', value=3), layers.fill_constant(
-                        shape=[4, 5], dtype='int64', value=2)
-
-            main_program = Program()
-            startup_program = Program()
-            with program_guard(main_program, startup_program):
-                x = layers.fill_constant(shape=[1], dtype='float32', value=0.1)
-                y = layers.fill_constant(shape=[1], dtype='float32', value=0.23)
-                pred = layers.less_than(x, y)            
-                out = layers.cond(pred, true_func, false_func)
-                # out is a tuple containing 2 tensors
-
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            ret = exe.run(main_program, fetch_list=out)
+                return paddle.fill_constant(shape=[3, 4], dtype='float32',
+                                            value=3), paddle.fill_constant(shape=[4, 5],
+                                                                           dtype='int64',
+                                                                           value=2)
+
+            x = paddle.fill_constant(shape=[1], dtype='float32', value=0.1)
+            y = paddle.fill_constant(shape=[1], dtype='float32', value=0.23)
+            pred = paddle.less_than(x=x, y=y, name=None)
+            ret = paddle.nn.cond(pred, true_func, false_func)
+            # ret is a tuple containing 2 tensors
             # ret[0] = [[1 1]]
             # ret[1] = [[ True  True  True]
-            #           [ True  True  True]]
+            #           [ True  True  True]]            
 
     """
     if in_dygraph_mode():
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index 3610efdd505bd..2b1449a94e6e5 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -1681,11 +1681,6 @@ def kldiv_loss(x, target, reduction='mean', name=None):
 
 def npair_loss(anchor, positive, labels, l2_reg=0.002):
     '''
-    :alias_main: paddle.nn.functional.npair_loss
-	:alias: paddle.nn.functional.npair_loss,paddle.nn.functional.loss.npair_loss
-	:old_api: paddle.fluid.layers.npair_loss
-
-  **Npair Loss Layer**
 
   Read `Improved Deep Metric Learning with Multi class N pair Loss Objective\
        <http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/\
@@ -1696,29 +1691,31 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
   takes the similarity matrix of anchor and positive as logits.
 
   Args:
-    anchor(Variable): embedding vector for the anchor image. shape=[batch_size, embedding_dims], 
+    anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], 
                       the data type is float32 or float64.
-    positive(Variable): embedding vector for the positive image. shape=[batch_size, embedding_dims], 
+    positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], 
                       the data type is float32 or float64.
-    labels(Variable): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64.
+    labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64.
     l2_reg(float32): L2 regularization term on embedding vector, default: 0.002.
 
   Returns:
-    A Variable holding Tensor representing the npair loss, the data type is the same as 
+    A Tensor representing the npair loss, the data type is the same as 
     anchor, the shape is [1].
 
   Examples:
     .. code-block:: python
 
-       import paddle.fluid as fluid
-       anchor = fluid.data(
-                     name = 'anchor', shape = [18, 6], dtype = 'float32')
-       positive = fluid.data(
-                     name = 'positive', shape = [18, 6], dtype = 'float32')
-       labels = fluid.data(
-                     name = 'labels', shape = [18], dtype = 'float32')
+        import paddle
+        
+        DATATYPE = "float32"
+
+        anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE)
+        positive = paddle.rand(shape=(18, 6), dtype=DATATYPE)
+        labels = paddle.rand(shape=(18,), dtype=DATATYPE)
+        
+        npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002)
+        print(npair_loss.numpy())
 
-       npair_loss = fluid.layers.npair_loss(anchor, positive, labels, l2_reg = 0.002)
   '''
     check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'],
                              'npair_loss')
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bec47d9227e1a..a6402a2852c2a 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9592,10 +9592,6 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
 @templatedoc()
 def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
     """
-    :alias_main: paddle.nn.functional.hard_sigmoid
-	:alias: paddle.nn.functional.hard_sigmoid,paddle.nn.functional.activation.hard_sigmoid
-	:old_api: paddle.fluid.layers.hard_sigmoid
-
     ${comment}
     Parameters:
         x (${x_type}): ${x_comment}
@@ -9613,9 +9609,15 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
         .. code-block:: python
 
             import paddle.fluid as fluid
+            import paddle
+            paddle.enable_static()
+
             data = fluid.layers.fill_constant(shape=[3, 2], value=0.5, dtype='float32') # [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]
             result = fluid.layers.hard_sigmoid(data) # [[0.6, 0.6], [0.6, 0.6], [0.6, 0.6]]
     """
+    if in_dygraph_mode():
+        return core.ops.hard_sigmoid(x, 'slope', slope, 'offset', offset)
+
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                              'hard_sigmoid')
 
@@ -9802,10 +9804,6 @@ def prelu(x, mode, param_attr=None, name=None):
 @templatedoc()
 def brelu(x, t_min=0.0, t_max=24.0, name=None):
     """
-    :alias_main: paddle.nn.functional.brelu
-	:alias: paddle.nn.functional.brelu,paddle.nn.functional.activation.brelu
-	:old_api: paddle.fluid.layers.brelu
-
     ${comment}
     Args:
         x(${x_type}): ${x_comment}
@@ -9821,7 +9819,9 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
     .. code-block:: python
 
             import paddle.fluid as fluid
+            import paddle
             import numpy as np
+            paddle.enable_static()
 
             input_brelu = np.array([[-1,6],[1,15.6]])
             with fluid.dygraph.guard():
@@ -9831,6 +9831,9 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
                 #[[ 1.  6.]
                 #[ 1. 10.]]
     """
+    if in_dygraph_mode():
+        return core.ops.brelu(x, 't_min', t_min, 't_max', t_max)
+
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'brelu')
 
     helper = LayerHelper('brelu', **locals())
@@ -10238,9 +10241,9 @@ def unstack(x, axis=0, num=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            x = fluid.data(name='x', shape=[2, 3, 5], dtype='float32')  # create a tensor with shape=[2, 3, 5]
-            y = fluid.layers.unstack(x, axis=1)  # unstack with second axis, which results 3 tensors with shape=[2, 5]
+            import paddle
+            x = paddle.ones(name='x', shape=[2, 3, 5], dtype='float32')  # create a tensor with shape=[2, 3, 5]
+            y = paddle.unstack(x, axis=1)  # unstack with second axis, which results 3 tensors with shape=[2, 5]
 
     """
     helper = LayerHelper('unstack', **locals())
@@ -11014,7 +11017,7 @@ def slice(input, axes, starts, ends):
     return out
 
 
-@templatedoc()
+@deprecated(since='2.0.0', update_to="paddle.strided_slice")
 def strided_slice(input, axes, starts, ends, strides):
     """
     :alias_main: paddle.strided_slice
@@ -11092,7 +11095,9 @@ def strided_slice(input, axes, starts, ends, strides):
         .. code-block:: python
 
             import paddle.fluid as fluid
+            import paddle
 
+            paddle.enable_static()
             input = fluid.data(
                 name="input", shape=[3, 4, 5, 6], dtype='float32')
 
@@ -12415,12 +12420,17 @@ def clip_by_norm(x, max_norm, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            input = fluid.data(
-                name='data', shape=[None, 1], dtype='float32')
-            reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0)
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            input = paddle.to_tensor(data=np.array([[0.1, 0.2], [0.3, 0.4]]), dtype="float32")
+            reward = paddle.nn.clip_by_norm(x=input, max_norm=1.0)
     """
 
+    if in_dygraph_mode():
+        return core.ops.clip_by_norm(x, 'max_norm', max_norm)
+
     helper = LayerHelper("clip_by_norm", **locals())
     check_variable_and_dtype(x, 'X', ['float32'], 'clip_by_norm')
     check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
@@ -12559,13 +12569,10 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
     return out
 
 
+@deprecated(since="2.0.0", update_to="paddle.nn.functional.maxout")
 @templatedoc()
 def maxout(x, groups, name=None, axis=1):
     """
-    :alias_main: paddle.nn.functional.maxout
-	:alias: paddle.nn.functional.maxout,paddle.nn.functional.activation.maxout
-	:old_api: paddle.fluid.layers.maxout
-
     ${comment}
 
     Args:
@@ -12587,31 +12594,16 @@ def maxout(x, groups, name=None, axis=1):
         .. code-block:: python
 
             import paddle.fluid as fluid
+            import paddle
+            paddle.enable_static()
+
             input = fluid.data(
                 name='data',
                 shape=[None, 256, 32, 32],
                 dtype='float32')
             out = fluid.layers.maxout(input, groups=2)
     """
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout')
-
-    helper = LayerHelper("maxout", **locals())
-    if axis not in [1, -1, 3]:
-        raise ValueError(
-            "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received "
-            "Attr(axis): %s." % str(axis))
-    if axis == -1:
-        axis = 3
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    helper.append_op(
-        type="maxout",
-        inputs={"X": x},
-        attrs={"groups": groups,
-               "axis": axis},
-        outputs={"Out": out})
-    return out
+    return paddle.nn.functional.maxout(**locals())
 
 
 def space_to_depth(x, blocksize, name=None):
@@ -13182,12 +13174,10 @@ def add_position_encoding(input, alpha, beta, name=None):
     Examples:
         .. code-block:: python
 
-          import numpy as np
           import paddle
           import paddle.nn.functional as F
 
-          tensor = np.random.randn(16, 32, 64) 
-          tensor = paddle.to_tensor(tensor)
+          tensor = paddle.randn([16, 32, 64])
           position_tensor = F.add_position_encoding(
                 input=tensor, alpha=1.0, beta=1.0)
 
@@ -13258,10 +13248,11 @@ def bilinear_tensor_product(x,
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          layer1 = fluid.data("t1", shape=[-1, 5], dtype="float32")
-          layer2 = fluid.data("t2", shape=[-1, 4], dtype="float32")
-          tensor = fluid.layers.bilinear_tensor_product(x=layer1, y=layer2, size=1000)
+            import paddle
+            paddle.enable_static()
+            layer1 = paddle.static.data("t1", shape=[-1, 5], dtype="float32")
+            layer2 = paddle.static.data("t2", shape=[-1, 4], dtype="float32")
+            tensor = paddle.static.nn.bilinear_tensor_product(x=layer1, y=layer2, size=1000)
     """
     helper = LayerHelper('bilinear_tensor_product', **locals())
     dtype = helper.input_dtype('x')
@@ -13421,7 +13412,7 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
     ${comment}
 
     Args:
-        x(Variable): ${x_comment}
+        x(Tensor): ${x_comment}
         seg_num(int): ${seg_num_comment}
         shift_ratio(float): ${shift_ratio_comment}
         name(str, optional): For detailed information, please refer
@@ -13429,7 +13420,7 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
                              None by default.
 
     Returns:
-        out(Variable): The temporal shifting result is a tensor variable with the
+        out(Tensor): The temporal shifting result is a tensor with the
         same shape and same data type as the input.
 
     Raises:
@@ -13438,9 +13429,11 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            input = fluid.data(name='input', shape=[None,4,2,2], dtype='float32')
-            out = fluid.layers.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
+            import paddle
+            import paddle.nn.functional as F
+
+            input = paddle.randn([6, 4, 2, 2])
+            out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
     """
     helper = LayerHelper("temporal_shift", **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift')
@@ -13452,6 +13445,10 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
     if not isinstance(seg_num, int):
         raise TypeError("seg_num must be int type.")
 
+    if in_dygraph_mode():
+        return core.ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio',
+                                       shift_ratio)
+
     helper.append_op(
         type="temporal_shift",
         inputs={"X": x},
@@ -13540,15 +13537,15 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
     """
     :api_attr: Static Graph
 
-    This OP is used to register customized Python OP to Paddle Fluid. The design
-    principe of py_func is that LodTensor and numpy array can be converted to each
+    This OP is used to register customized Python OP to Paddle. The design
+    principle of py_func is that Tensor and numpy array can be converted to each
     other easily. So you can use Python and numpy API to register a python OP.
 
     The forward  function of the registered OP is ``func`` and the backward function
     of that is  ``backward_func``. Paddle will call ``func`` at forward runtime and
     call ``backward_func`` at backward runtime(if ``backward_func`` is not  None).
-    ``x`` is the input of ``func``, whose type must be LoDTensor; ``out`` is
-    the output of ``func``, whose type can be either LoDTensor or numpy array.
+    ``x`` is the input of ``func``, whose type must be Tensor; ``out`` is
+    the output of ``func``, whose type can be either Tensor or numpy array.
 
     The input of the backward function ``backward_func`` is ``x``, ``out`` and
     the gradient of ``out``. If some variables of ``out`` have no gradient, the
@@ -13566,14 +13563,14 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
         func (callable): The forward function of the registered OP. When the network
             is running, the forward output ``out`` will be calculated according to this
             function and the forward input ``x``. In ``func`` , it's suggested that we
-            actively convert LoDTensor into a numpy array, so that we can use Python and
+            actively convert Tensor into a numpy array, so that we can use Python and
             numpy API arbitrarily. If not, some operations of numpy may not be compatible.
         x (Variable|tuple(Variale)|list[Variale]): The input of the forward function ``func``.
-            It can be Variable|tuple(Variale)|list[Variale], where Variable is LoDTensor or
+            It can be Variable|tuple(Variable)|list[Variable], where Variable is Tensor or
             Tenosor. In addition, Multiple Variable should be passed in the form of tuple(Variale)
             or list[Variale].
         out (Variable|tuple(Variale)|list[Variale]): The output of the forward function ``func``,
-            it can be Variable|tuple(Variale)|list[Variale], where Variable can be either LoDTensor
+            it can be Variable|tuple(Variable)|list[Variable], where Variable can be either Tensor
             or numpy array. Since Paddle cannot automatically infer the shape and type of ``out``,
             you must create ``out`` in advance.
         backward_func (callable, optional): The backward function of the registered OP.
@@ -13594,16 +13591,18 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
         .. code-block:: python
 
             # example 1:
-            import paddle.fluid as fluid
+            import paddle
             import six
 
-            # Creates a forward function, LodTensor can be input directly without
+            paddle.enable_static()
+
+            # Creates a forward function, Tensor can be input directly without
             # being converted into numpy array.
             def tanh(x):
                 return np.tanh(x)
 
             # Skip x in backward function and return the gradient of x
-            # LodTensor must be actively converted to numpy array, otherwise,
+            # Tensor must be actively converted to numpy array, otherwise,
             # operations such as +/- can't be used.
             def tanh_grad(y, dy):
                 return np.array(dy) * (1 - np.square(np.array(y)))
@@ -13613,36 +13612,38 @@ def debug_func(x):
                 print(x)
 
             def create_tmp_var(name, dtype, shape):
-                return fluid.default_main_program().current_block().create_var(
+                return paddle.static.default_main_program().current_block().create_var(
                     name=name, dtype=dtype, shape=shape)
 
             def simple_net(img, label):
                 hidden = img
                 for idx in six.moves.range(4):
-                    hidden = fluid.layers.fc(hidden, size=200)
+                    hidden = paddle.static.nn.fc(hidden, size=200)
                     new_hidden = create_tmp_var(name='hidden_{}'.format(idx),
                         dtype=hidden.dtype, shape=hidden.shape)
 
                     # User-defined forward and backward
-                    hidden = fluid.layers.py_func(func=tanh, x=hidden,
+                    hidden = paddle.static.nn.py_func(func=tanh, x=hidden,
                         out=new_hidden, backward_func=tanh_grad,
                         skip_vars_in_backward_input=hidden)
 
-                    # User-defined debug functions that print out the input LodTensor
-                    fluid.layers.py_func(func=debug_func, x=hidden, out=None)
+                    # User-defined debug functions that print out the input Tensor
+                    paddle.static.nn.py_func(func=debug_func, x=hidden, out=None)
 
-                prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-                loss = fluid.layers.cross_entropy(input=prediction, label=label)
-                return fluid.layers.mean(loss)
+                prediction = paddle.static.nn.fc(hidden, size=10, act='softmax')
+                loss = paddle.static.nn.cross_entropy(input=prediction, label=label)
+                return paddle.mean(loss)
 
             # example 2:
-            # This example shows how to turn LoDTensor into numpy array and
+            # This example shows how to turn Tensor into numpy array and
             # use numpy API to register an Python OP
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
+            paddle.enable_static()
+
             def element_wise_add(x, y):
-                # LodTensor must be actively converted to numpy array, otherwise,
+                # Tensor must be actively converted to numpy array, otherwise,
                 # numpy.shape can't be used.
                 x = np.array(x)
                 y = np.array(y)
@@ -13658,24 +13659,24 @@ def element_wise_add(x, y):
                 return result
 
             def create_tmp_var(name, dtype, shape):
-                return fluid.default_main_program().current_block().create_var(
+                return paddle.static.default_main_program().current_block().create_var(
                             name=name, dtype=dtype, shape=shape)
 
             def py_func_demo():
-                start_program = fluid.default_startup_program()
-                main_program = fluid.default_main_program()
+                start_program = paddle.static.default_startup_program()
+                main_program = paddle.static.default_main_program()
 
                 # Input of the forward function
-                x = fluid.data(name='x', shape=[2,3], dtype='int32')
-                y = fluid.data(name='y', shape=[2,3], dtype='int32')
+                x = paddle.static.data(name='x', shape=[2,3], dtype='int32')
+                y = paddle.static.data(name='y', shape=[2,3], dtype='int32')
 
                 # Output of the forward function, name/dtype/shape must be specified
                 output = create_tmp_var('output','int32', [3,1])
 
                 # Multiple Variable should be passed in the form of tuple(Variale) or list[Variale]
-                fluid.layers.py_func(func=element_wise_add, x=[x,y], out=output)
+                paddle.static.nn.py_func(func=element_wise_add, x=[x,y], out=output)
 
-                exe=fluid.Executor(fluid.CPUPlace())
+                exe=paddle.static.Executor(paddle.CPUPlace())
                 exe.run(start_program)
 
                 # Feed numpy array to main_program
@@ -14863,10 +14864,6 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
 @templatedoc()
 def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
     """
-    :alias_main: paddle.nn.functional.hard_swish
-	:alias: paddle.nn.functional.hard_swish,paddle.nn.functional.activation.hard_swish
-	:old_api: paddle.fluid.layers.hard_swish
-
     This operator implements the hard_swish activation function.
     Hard_swish is proposed in MobileNetV3, and performs better in computational stability and efficiency compared to swish function.
     For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf
@@ -14897,7 +14894,9 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
     .. code-block:: python
 
         import paddle.fluid as fluid
+        import paddle
         import numpy as np
+        paddle.enable_static()
 
         DATATYPE='float32'
 
@@ -14912,6 +14911,10 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
         out, = exe.run(feed={'x':x_data}, fetch_list=[y.name])
         print(out)  # [[0.66666667, 1.66666667,3., 4.]]
     """
+    if in_dygraph_mode():
+        return core.ops.hard_swish(x, 'threshold', threshold, 'scale', scale,
+                                   'offset', offset)
+
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                              'hard_swish')
 
diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index 8ac46ad2648fd..57c2489194337 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -2443,23 +2443,17 @@ def lstm(input,
     input_shape = list(input.shape)
     input_size = input_shape[-1]
     weight_size = 0
+    num_dirrection = 2 if is_bidirec == True else 1
+
     for i in range(num_layers):
         if i == 0:
-            input_weight_size = (input_size * hidden_size) * 4
+            input_weight_size = (input_size * hidden_size) * 4 * num_dirrection
         else:
-            if is_bidirec:
-                input_weight_size = (hidden_size * 2 * hidden_size) * 4
-            else:
-                input_weight_size = (hidden_size * hidden_size) * 4
+            input_weight_size = (hidden_size * hidden_size) * 4 * num_dirrection
+        hidden_weight_size = (hidden_size * hidden_size) * 4 * num_dirrection
 
-        hidden_weight_size = (hidden_size * hidden_size) * 4
-
-        if is_bidirec:
-            weight_size += (input_weight_size + hidden_weight_size) * 2
-            weight_size += hidden_size * 8 * 2
-        else:
-            weight_size += input_weight_size + hidden_weight_size
-            weight_size += hidden_size * 8
+        weight_size += input_weight_size + hidden_weight_size
+        weight_size += hidden_size * 8 * num_dirrection
 
     weight = helper.create_parameter(
         attr=helper.param_attr,
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 2fba578ec077f..c633f7022d75e 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -103,9 +103,9 @@ def create_parameter(shape,
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            W = layers.create_parameter(shape=[784, 200], dtype='float32')
+            import paddle
+            paddle.enable_static()
+            W = paddle.static.create_parameter(shape=[784, 200], dtype='float32')
     """
     check_type(shape, 'shape', (list, tuple, numpy.ndarray), 'create_parameter')
     for item in shape:
@@ -161,9 +161,9 @@ def create_global_var(shape,
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            var = layers.create_global_var(shape=[2,3], value=1.0, dtype='float32',
+            import paddle
+            paddle.enable_static()
+            var = paddle.static.create_global_var(shape=[2,3], value=1.0, dtype='float32',
                                            persistable=True, force_cpu=True, name='new_var')
     """
     check_type(shape, 'shape', (list, tuple, numpy.ndarray),
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 761f6409fed76..367be181f4725 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -731,9 +731,6 @@ def _process_distribute_lookuptable(self, param_grads):
                     outputs={"ParamOut": param_and_grad[0]})
         return new_param_grads, (table_param, table_grad), sgd_op
 
-    def _append_dgc_ops(self, param_and_grad):
-        pass
-
     def backward(self,
                  loss,
                  startup_program=None,
@@ -801,9 +798,6 @@ def backward(self,
             with program_guard(program, startup_program):
                 params_grads = append_backward(loss, parameter_list,
                                                act_no_grad_set, callbacks)
-                # Note: since we can't use all_reduce_op now,
-                # dgc_op should be the last op of one grad.
-                self._append_dgc_ops(params_grads)
         return params_grads
 
     def apply_gradients(self, params_grads):
@@ -1569,6 +1563,11 @@ def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
 
     @imperative_base.no_grad
     def apply_gradients(self, params_grads):
+        # Note: since we can't use all_reduce_op now,
+        # dgc_op should be the last op of one grad.
+        # Maybe need a grad allreduce pass.
+        self._append_dgc_ops(params_grads)
+
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
         params_grads, table_param_and_grad, table_optimize_op = \
             self._process_distribute_lookuptable(params_grads)
@@ -4784,10 +4783,6 @@ def mlp(input_x, input_y, hid_dim=128, label_dim=2):
 
             params_grads = append_backward(
                 loss, parameter_list, no_grad_set, checkpoints=checkpoint_vars)
-            # Note: since we can't use all_reduce_op now,
-            #  dgc_op should be the last op of one grad.
-            if hasattr(self._optimizer, "_append_dgc_ops"):
-                self._optimizer._append_dgc_ops(params_grads)
         return params_grads
 
     def apply_optimize(self, loss, startup_program, params_grads):
@@ -4884,29 +4879,35 @@ class LookaheadOptimizer(object):
             import paddle
             import paddle.fluid as fluid
             import numpy as np
+            import numpy.random as random
 
-	    x = fluid.layers.data(name='x', shape=[2], dtype='float32')
-	    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-	    y = fluid.layers.fc(input=[x], size=2, act="softmax")
-	    loss = fluid.layers.cross_entropy(input=y, label=label)
-	    loss = fluid.layers.mean(x=loss)
-	    sgd = fluid.optimizer.SGD(learning_rate=0.01)
-	    optimizer = fluid.optimizer.LookaheadOptimizer(sgd,
-                                            alpha=0.5,
-                                            k=5)
-	    optimizer.minimize(loss)
-	    main_program = fluid.default_main_program()
-	    place = fluid.CPUPlace()
-	    exe = fluid.Executor(place)
-	    exe.run(fluid.default_startup_program())
-
-	    feeder = fluid.DataFeeder(feed_list=[x, label], place=place)
+            paddle.enable_static()
+        
+            x = fluid.layers.data(name='x', shape=[2], dtype='float32')
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+            y = fluid.layers.fc(input=[x], size=2, act="softmax")
+            loss = fluid.layers.cross_entropy(input=y, label=label)
+            loss = fluid.layers.mean(x=loss)
+            sgd = fluid.optimizer.SGD(learning_rate=0.01)
+            optimizer = fluid.optimizer.LookaheadOptimizer(sgd,
+                                                alpha=0.5,
+                                                k=5)
+            optimizer.minimize(loss)
+            main_program = fluid.default_main_program()
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
 
-	    step = 0
-            while(step < 10):
-                step += 1
-		exe.run(fluid.default_main_program(),
-            	feed=feeder.feed(batch_data))
+            def train_reader(limit=5):
+                for i in range(limit):
+                    yield random.random([2]).astype('float32'), random.random([1]).astype('int64')
+            
+            feeder = fluid.DataFeeder(feed_list=[x, label], place=place)
+            reader = paddle.batch(paddle.reader.shuffle(train_reader, buf_size=50000),batch_size=1)
+            
+            for batch_data in reader():
+                exe.run(fluid.default_main_program(),
+                feed=feeder.feed(batch_data))
 
     """
 
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index a76faf1059068..bf04239370693 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -36,8 +36,8 @@ class ParamAttr(object):
     
     Note:
         ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. 
-        It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient. 
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , 
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` , 
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
 
     Parameters:
@@ -57,19 +57,20 @@ class ParamAttr(object):
         trainable (bool): Whether this parameter is trainable. Default True.
         do_model_average (bool): Whether this parameter should do model average
                 when model average is enabled. Default False.
+        need_clip (bool): Whether the parameter's gradient needs to be clipped in the optimizer. Default is True.
 
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
+            paddle.enable_static()
 
-            w_param_attrs = fluid.ParamAttr(name="fc_weight",
-                                            learning_rate=0.5,
-                                            regularizer=fluid.regularizer.L2Decay(1.0),
-                                            trainable=True)
-            print(w_param_attrs.name) # "fc_weight"
-            x = fluid.data(name='X', shape=[None, 1], dtype='float32')
-            y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
+            weight_attr = paddle.ParamAttr(name="weight",
+                                           learning_rate=0.5,
+                                           regularizer=paddle.regularizer.L2Decay(1.0),
+                                           trainable=True)
+            print(weight_attr.name) # "weight"
+            paddle.nn.Linear(3, 4, weight_attr=weight_attr)
     """
 
     def __init__(self,
@@ -78,7 +79,8 @@ def __init__(self,
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=True):
+                 do_model_average=True,
+                 need_clip=True):
 
         if sys.version_info.major == 2:
             check_type(name, "name", (str, type(None), unicode), "ParamAttr")
@@ -87,6 +89,7 @@ def __init__(self,
         check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
         check_type(trainable, "trainable", (bool), "ParamAttr")
         check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
+        check_type(need_clip, "need_clip", (bool), "ParamAttr")
         check_type(initializer, "initializer", (Initializer, type(None)),
                    "ParamAttr")
         check_type(regularizer, "regularizer",
@@ -101,6 +104,7 @@ def __init__(self,
         self.regularizer = regularizer
         self.trainable = trainable
         self.do_model_average = do_model_average
+        self.need_clip = need_clip
 
     def _set_default_initializer(self, initializer):
         """
@@ -197,7 +201,8 @@ def _to_kwargs(self, with_initializer=False):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'do_model_average': self.do_model_average
+            'do_model_average': self.do_model_average,
+            'need_clip': self.need_clip
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -206,7 +211,7 @@ def _to_kwargs(self, with_initializer=False):
 
 class WeightNormParamAttr(ParamAttr):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     Note:
         Please use 'paddle.nn.utils.weight_norm' in dygraph mode.
@@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
     <https://arxiv.org/pdf/1602.07868.pdf>`_.
       
     Note:
-        ``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0. 
-        It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient. 
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , 
+        ``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` , 
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
         
 
@@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
         trainable(bool, optional): Whether this parameter is trainable. Default True.
         do_model_average(bool, optional): Whether this parameter should do model average.
             Default False.
+        need_clip (bool, optional): Whether the parameter's gradient needs to be clipped in the optimizer. Default is True.
 
     Examples:
         .. code-block:: python
@@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
                                                 learning_rate=1.0,
                                                 regularizer=paddle.regularizer.L2Decay(0.1),
                                                 trainable=True,
-                                                do_model_average=False))
+                                                do_model_average=False,
+                                                need_clip=True))
 
     """
     # List to record the parameters reparameterized by weight normalization.
@@ -283,12 +290,14 @@ def __init__(self,
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=False):
+                 do_model_average=False,
+                 need_clip=True):
         super(WeightNormParamAttr, self).__init__(
             name=name,
             initializer=initializer,
             learning_rate=learning_rate,
             regularizer=regularizer,
             trainable=trainable,
-            do_model_average=do_model_average)
+            do_model_average=do_model_average,
+            need_clip=need_clip)
         self.dim = dim
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 6cc00a7fd3734..35dcd45223419 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -215,7 +215,7 @@ class DataLoader(object):
             None.
 
     Returns:
-        DataLoader: an iterable object for data iterating
+        DataLoader: an iterable object for data iterating; each element of the generated data is a Tensor.
 
     Examples:
         
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
index fefaecd3b8979..7fc66e8e84961 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -169,6 +169,10 @@ def do_pyreader_training(self, fleet):
             except fluid.core.EOFException:
                 self.reader.reset()
 
+        if fleet.is_first_worker():
+            model_path = tempfile.mkdtemp()
+            fleet.save_persistables(executor=exe, dirname=model_path)
+            shutil.rmtree(model_path)
         fleet.stop_worker()
 
     def do_dataset_training(self, fleet):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
index ba0adaf32e15d..63edd35f59bd4 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py
@@ -28,11 +28,12 @@ class PredictorTools(object):
     Paddle-Inference predictor
     '''
 
-    def __init__(self, model_path, params_file, feeds_var):
+    def __init__(self, model_path, model_file, params_file, feeds_var):
         '''
         __init__
         '''
         self.model_path = model_path
+        self.model_file = model_file
         self.params_file = params_file
 
         self.feeds_var = feeds_var
@@ -43,7 +44,7 @@ def _load_model_and_set_config(self):
         '''
         if os.path.exists(os.path.join(self.model_path, self.params_file)):
             config = AnalysisConfig(
-                os.path.join(self.model_path, "__model__"),
+                os.path.join(self.model_path, self.model_file),
                 os.path.join(self.model_path, self.params_file))
         else:
             config = AnalysisConfig(os.path.join(self.model_path))
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py
new file mode 100644
index 0000000000000..6612450b7cff8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py
@@ -0,0 +1,493 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import reduce
+import paddle
+
+
+class EmbeddingLayer(object):
+    """
+    Embedding Layer class
+    """
+
+    def __init__(self, dict_size, emb_dim, name="emb", padding_idx=None):
+        """
+        initialize
+        """
+        self.dict_size = dict_size
+        self.emb_dim = emb_dim
+        self.name = name
+        self.padding_idx = padding_idx
+
+    def ops(self):
+        """
+        operation
+        """
+        # TODO(huihuangzheng): The original code set is_sparse=True, but it
+        # causes a crash in dy2stat. Set it to True after fixing it.
+        emb = paddle.fluid.dygraph.Embedding(
+            size=[self.dict_size, self.emb_dim],
+            is_sparse=True,
+            padding_idx=self.padding_idx,
+            param_attr=paddle.ParamAttr(
+                name=self.name, initializer=paddle.nn.initializer.Xavier()))
+
+        return emb
+
+
+class FCLayer(object):
+    """
+    Fully Connected Layer class
+    """
+
+    def __init__(self, fc_dim, act, name="fc"):
+        """
+        initialize
+        """
+        self.fc_dim = fc_dim
+        self.act = act
+        self.name = name
+
+    def ops(self):
+        """
+        operation
+        """
+        fc = FC(size=self.fc_dim,
+                param_attr=paddle.ParamAttr(name="%s.w" % self.name),
+                bias_attr=paddle.ParamAttr(name="%s.b" % self.name),
+                act=self.act)
+        return fc
+
+
+class ConcatLayer(object):
+    """
+    Connection Layer class
+    """
+
+    def __init__(self, axis):
+        """
+        initialize
+        """
+        self.axis = axis
+
+    def ops(self, inputs):
+        """
+        operation
+        """
+        concat = paddle.concat(x=inputs, axis=self.axis)
+        return concat
+
+
+class ReduceMeanLayer(object):
+    """
+    Reduce Mean Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, input):
+        """
+        operation
+        """
+        mean = paddle.reduce_mean(input)
+        return mean
+
+
+class CosSimLayer(object):
+    """
+    Cosine Similarity Calculate Layer
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, x, y):
+        """
+        operation
+        """
+        sim = paddle.nn.functional.cosine_similarity(x, y)
+        return sim
+
+
+class ElementwiseMaxLayer(object):
+    """
+    Elementwise Max Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, x, y):
+        """
+        operation
+        """
+        max = paddle.maximum(x=x, y=y)
+        return max
+
+
+class ElementwiseAddLayer(object):
+    """
+    Elementwise Add Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, x, y):
+        """
+        operation
+        """
+        add = paddle.add(x=x, y=y)
+        return add
+
+
+class ElementwiseSubLayer(object):
+    """
+    Elementwise Sub Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, x, y):
+        """
+        operation
+        """
+        sub = paddle.elementwise_sub(x, y)
+        return sub
+
+
+class ConstantLayer(object):
+    """
+    Generate A Constant Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, input, shape, dtype, value):
+        """
+        operation
+        """
+        shape = list(shape)
+        input_shape = paddle.shape(input)
+        shape[0] = input_shape[0]
+        constant = paddle.fill_constant(shape, dtype, value)
+        return constant
+
+
+class SoftsignLayer(object):
+    """
+    Softsign Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, input):
+        """
+        operation
+        """
+        softsign = paddle.nn.functional.softsign(input)
+        return softsign
+
+
+class FC(paddle.nn.Layer):
+    r"""
+    This interface is used to construct a callable object of the ``FC`` class.
+    For more details, refer to code examples.
+    It creates a fully connected layer in the network. It can take
+    one or multiple ``Tensor`` as its inputs. It creates a Variable called weights for each input tensor,
+    which represents a fully connected weight matrix from each input unit to
+    each output unit. The fully connected layer multiplies each input tensor
+    with its corresponding weight to produce an output Tensor with shape [N, `size`],
+    where N is batch size. If multiple input tensors are given, the results of
+    multiple output tensors with shape [N, `size`] will be summed up. If ``bias_attr``
+    is not None, a bias variable will be created and added to the output.
+    Finally, if ``act`` is not None, it will be applied to the output as well.
+    When the input is single ``Tensor`` :
+    .. math::
+        Out = Act({XW + b})
+    When the input are multiple ``Tensor`` :
+    .. math::
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+    In the above equation:
+    * :math:`N`: Number of the input. N equals to len(input) if input is list of ``Tensor`` .
+    * :math:`X_i`: The i-th input ``Tensor`` .
+    * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output ``Tensor`` .
+    See below for an example.
+    .. code-block:: text
+        Given:
+            data_1.data = [[[0.1, 0.2]]]
+            data_1.shape = (1, 1, 2) # 1 is batch_size
+            data_2.data = [[[0.1, 0.2, 0.3]]]
+            data_2.shape = (1, 1, 3) # 1 is batch_size
+            fc = FC(size=2, num_flatten_dims=2)
+            out = fc(input=[data_1, data_2])
+        Then:
+            out.data = [[[0.182996 -0.474117]]]
+            out.shape = (1, 1, 2)
+    Parameters:
+        
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multi-dimension tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flatten to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
+        param_attr (ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
+            weights(Parameter) of this layer. Default: None.
+        bias_attr (ParamAttr or list of ParamAttr, optional): The attribute for the bias
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        act (str, optional): Activation to be applied to the output of this layer. Default: None.
+        is_test(bool, optional): A flag indicating whether execution is in test phase. Default: False.
+        dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32".
+    Attribute:
+        **weight** (list of Parameter): the learnable weights of this layer.
+        **bias** (Parameter or None): the learnable bias of this layer.
+    Returns:
+        None
+    
+    """
+
+    def __init__(self,
+                 size,
+                 num_flatten_dims=1,
+                 param_attr=None,
+                 bias_attr=None,
+                 act=None,
+                 is_test=False,
+                 dtype="float32"):
+        super(FC, self).__init__(dtype)
+
+        self._size = size
+        self._num_flatten_dims = num_flatten_dims
+        self._dtype = dtype
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._act = act
+        self.__w = list()
+
+    def _build_once(self, input):
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            input_shape = inp.shape
+
+            param_shape = [
+                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
+                       1)
+            ] + [self._size]
+            self.__w.append(
+                self.add_parameter(
+                    '_w%d' % i,
+                    self.create_parameter(
+                        attr=param,
+                        shape=param_shape,
+                        dtype=self._dtype,
+                        is_bias=False)))
+            i += 1
+
+        size = list([self._size])
+        self._b = self.create_parameter(
+            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
+
+    # TODO(songyouwei): We should remove _w property
+    @property
+    def _w(self, i=0):
+        return self.__w[i]
+
+    @_w.setter
+    def _w(self, value, i=0):  # overwrite weight i via set_value
+        assert isinstance(self.__w[i], paddle.fluid.framework.Variable)
+        self.__w[i].set_value(value)
+
+    @property
+    def weight(self):
+        if len(self.__w) > 1:
+            return self.__w
+        else:
+            return self.__w[0]
+
+    @weight.setter
+    def weight(self, value):
+        if len(self.__w) == 1:
+            self.__w[0] = value
+
+    @property
+    def bias(self):
+        return self._b
+
+    @bias.setter
+    def bias(self, value):
+        self._b = value
+
+    def forward(self, input):
+        mul_results = list()
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            tmp = self._helper.create_variable_for_type_inference(self._dtype)
+            self._helper.append_op(
+                type="mul",
+                inputs={"X": inp,
+                        "Y": self.__w[i]},
+                outputs={"Out": tmp},
+                attrs={
+                    "x_num_col_dims": self._num_flatten_dims,
+                    "y_num_col_dims": 1
+                })
+            i += 1
+            mul_results.append(tmp)
+
+        if len(mul_results) == 1:
+            pre_bias = mul_results[0]
+        else:
+            pre_bias = self._helper.create_variable_for_type_inference(
+                self._dtype)
+            self._helper.append_op(
+                type="sum",
+                inputs={"X": mul_results},
+                outputs={"Out": pre_bias},
+                attrs={"use_mkldnn": False})
+
+        if self._b is not None:
+            pre_activation = self._helper.create_variable_for_type_inference(
+                dtype=self._dtype)
+            self._helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [pre_bias],
+                        'Y': [self._b]},
+                outputs={'Out': [pre_activation]},
+                attrs={'axis': self._num_flatten_dims})
+        else:
+            pre_activation = pre_bias
+        # Currently, we don't support inplace in dygraph mode
+        return self._helper.append_activation(pre_activation, act=self._act)
+
+
+class HingeLoss(object):
+    """
+    Hinge Loss Calculate class
+    """
+
+    def __init__(self, conf_dict):
+        """
+        initialize
+        """
+        self.margin = conf_dict["loss"]["margin"]
+
+    def compute(self, pos, neg):
+        """
+        compute loss
+        """
+        elementwise_max = ElementwiseMaxLayer()
+        elementwise_add = ElementwiseAddLayer()
+        elementwise_sub = ElementwiseSubLayer()
+        constant = ConstantLayer()
+        reduce_mean = ReduceMeanLayer()
+        loss = reduce_mean.ops(
+            elementwise_max.ops(
+                constant.ops(neg, neg.shape, "float32", 0.0),
+                elementwise_add.ops(
+                    elementwise_sub.ops(neg, pos),
+                    constant.ops(neg, neg.shape, "float32", self.margin))))
+        return loss
+
+
+class BOW(paddle.nn.Layer):
+    """
+    BOW
+    """
+
+    def __init__(self, conf_dict):
+        """
+        initialize
+        """
+        super(BOW, self).__init__()
+        self.dict_size = conf_dict["dict_size"]
+        self.task_mode = conf_dict["task_mode"]
+        self.emb_dim = conf_dict["net"]["emb_dim"]
+        self.bow_dim = conf_dict["net"]["bow_dim"]
+        self.seq_len = conf_dict["seq_len"]
+        self.emb_layer = EmbeddingLayer(self.dict_size, self.emb_dim,
+                                        "emb").ops()
+        self.bow_layer = paddle.nn.Linear(
+            in_features=self.bow_dim, out_features=self.bow_dim)
+        self.bow_layer_po = FCLayer(self.bow_dim, None, "fc").ops()
+        self.softmax_layer = FCLayer(2, "softmax", "cos_sim").ops()
+
+    @paddle.jit.to_static
+    def forward(self, left, right):
+        """
+        Forward network
+        """
+
+        # embedding layer
+        left_emb = self.emb_layer(left)
+        right_emb = self.emb_layer(right)
+        left_emb = paddle.reshape(
+            left_emb, shape=[-1, self.seq_len, self.bow_dim])
+        right_emb = paddle.reshape(
+            right_emb, shape=[-1, self.seq_len, self.bow_dim])
+
+        bow_left = paddle.reduce_sum(left_emb, dim=1)
+        bow_right = paddle.reduce_sum(right_emb, dim=1)
+        softsign_layer = SoftsignLayer()
+        left_soft = softsign_layer.ops(bow_left)
+        right_soft = softsign_layer.ops(bow_right)
+
+        # matching layer
+        if self.task_mode == "pairwise":
+            left_bow = self.bow_layer(left_soft)
+            right_bow = self.bow_layer(right_soft)
+            cos_sim_layer = CosSimLayer()
+            pred = cos_sim_layer.ops(left_bow, right_bow)
+            return left_bow, pred
+        else:
+            concat_layer = ConcatLayer(1)
+            concat = concat_layer.ops([left_soft, right_soft])
+            concat_fc = self.bow_layer_po(concat)
+            pred = self.softmax_layer(concat_fc)
+            return left_soft, pred
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
index f105dd5e94744..6c26189a4adb3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import time
 import unittest
-
 import numpy as np
+
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 from bert_dygraph_model import PretrainModelLayer
 from bert_utils import get_bert_config, get_feed_data_reader
@@ -31,7 +33,10 @@
 SEED = 2020
 STEP_NUM = 10
 PRINT_STEP = 2
-MODEL_SAVE_PATH = "./bert.inference.model"
+MODEL_SAVE_DIR = "./inference"
+MODEL_SAVE_PREFIX = "./inference/bert"
+MODEL_FILENAME = "bert" + INFER_MODEL_SUFFIX
+PARAMS_FILENAME = "bert" + INFER_PARAMS_SUFFIX
 DY_STATE_DICT_SAVE_PATH = "./bert.dygraph"
 
 
@@ -85,7 +90,7 @@ def train(bert_config, data_reader, to_static):
             step_idx += 1
             if step_idx == STEP_NUM:
                 if to_static:
-                    fluid.dygraph.jit.save(bert, MODEL_SAVE_PATH)
+                    fluid.dygraph.jit.save(bert, MODEL_SAVE_PREFIX)
                 else:
                     fluid.dygraph.save_dygraph(bert.state_dict(),
                                                DY_STATE_DICT_SAVE_PATH)
@@ -104,11 +109,15 @@ def train_static(bert_config, data_reader):
 
 
 def predict_static(data):
+    paddle.enable_static()
     exe = fluid.Executor(place)
     # load inference model
     [inference_program, feed_target_names,
      fetch_targets] = fluid.io.load_inference_model(
-         MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME)
+         MODEL_SAVE_DIR,
+         executor=exe,
+         model_filename=MODEL_FILENAME,
+         params_filename=PARAMS_FILENAME)
     pred_res = exe.run(inference_program,
                        feed=dict(zip(feed_target_names, data)),
                        fetch_list=fetch_targets)
@@ -143,7 +152,7 @@ def predict_dygraph(bert_config, data):
 
 def predict_dygraph_jit(data):
     with fluid.dygraph.guard(place):
-        bert = fluid.dygraph.jit.load(MODEL_SAVE_PATH)
+        bert = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX)
         bert.eval()
 
         src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data
@@ -155,7 +164,8 @@ def predict_dygraph_jit(data):
 
 
 def predict_analysis_inference(data):
-    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, data)
+    output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME,
+                            data)
     out = output()
     return out
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
index af7e73c41464d..f54f70e4b854b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -21,7 +21,7 @@
 from paddle.fluid import ParamAttr
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.dygraph import ProgramTranslator
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 from predictor_utils import PredictorTools
 
@@ -422,7 +422,10 @@ class Args(object):
     prop_boundary_ratio = 0.5
     num_sample = 2
     num_sample_perbin = 2
-    infer_dir = './bmn_infer_model'
+    model_save_dir = "./inference"
+    model_save_prefix = "./inference/bmn"
+    model_filename = "bmn" + INFER_MODEL_SUFFIX
+    params_filename = "bmn" + INFER_PARAMS_SUFFIX
     dy_param_path = './bmn_dy_param'
 
 
@@ -620,7 +623,7 @@ def train_bmn(args, place, to_static):
 
                 if batch_id == args.train_batch_num:
                     if to_static:
-                        fluid.dygraph.jit.save(bmn, args.infer_dir)
+                        fluid.dygraph.jit.save(bmn, args.model_save_prefix)
                     else:
                         fluid.dygraph.save_dygraph(bmn.state_dict(),
                                                    args.dy_param_path)
@@ -735,13 +738,15 @@ def predict_dygraph(self, data):
             return pred_res
 
     def predict_static(self, data):
+        paddle.enable_static()
         exe = fluid.Executor(self.place)
         # load inference model
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(
-             self.args.infer_dir,
+             self.args.model_save_dir,
              executor=exe,
-             params_filename=VARIABLE_FILENAME)
+             model_filename=self.args.model_filename,
+             params_filename=self.args.params_filename)
         pred_res = exe.run(inference_program,
                            feed={feed_target_names[0]: data},
                            fetch_list=fetch_targets)
@@ -750,7 +755,7 @@ def predict_static(self, data):
 
     def predict_dygraph_jit(self, data):
         with fluid.dygraph.guard(self.place):
-            bmn = fluid.dygraph.jit.load(self.args.infer_dir)
+            bmn = fluid.dygraph.jit.load(self.args.model_save_prefix)
             bmn.eval()
 
             x = to_variable(data)
@@ -760,7 +765,9 @@ def predict_dygraph_jit(self, data):
             return pred_res
 
     def predict_analysis_inference(self, data):
-        output = PredictorTools(self.args.infer_dir, VARIABLE_FILENAME, [data])
+        output = PredictorTools(self.args.model_save_dir,
+                                self.args.model_filename,
+                                self.args.params_filename, [data])
         out = output()
         return out
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
index 4d735b565ddbc..c9bc8cc647df3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
@@ -26,9 +26,7 @@
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.dygraph import Embedding, Linear, GRUUnit
 from paddle.fluid.dygraph import declarative, ProgramTranslator
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME
-
-from predictor_utils import PredictorTools
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 SEED = 2020
 
@@ -395,7 +393,10 @@ class Args(object):
     base_learning_rate = 0.01
     bigru_num = 2
     print_steps = 1
-    model_save_dir = "./lac_model"
+    model_save_dir = "./inference"
+    model_save_prefix = "./inference/lac"
+    model_filename = "lac" + INFER_MODEL_SUFFIX
+    params_filename = "lac" + INFER_PARAMS_SUFFIX
     dy_param_path = "./lac_dy_param"
 
 
@@ -498,13 +499,11 @@ def do_train(args, to_static):
                 step += 1
         # save inference model
         if to_static:
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            configs.output_spec = [crf_decode]
             fluid.dygraph.jit.save(
                 layer=model,
-                model_path=args.model_save_dir,
+                path=args.model_save_prefix,
                 input_spec=[words, length],
-                configs=configs)
+                output_spec=[crf_decode])
         else:
             fluid.dygraph.save_dygraph(model.state_dict(), args.dy_param_path)
 
@@ -539,7 +538,6 @@ def verify_predict(self):
             dy_pre = self.predict_dygraph(batch)
             st_pre = self.predict_static(batch)
             dy_jit_pre = self.predict_dygraph_jit(batch)
-            predictor_pre = self.predict_analysis_inference(batch)
             self.assertTrue(
                 np.allclose(dy_pre, st_pre),
                 msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
@@ -547,10 +545,6 @@ def verify_predict(self):
                 np.allclose(dy_jit_pre, st_pre),
                 msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre,
                                                                st_pre))
-            self.assertTrue(
-                np.allclose(predictor_pre, st_pre),
-                msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
-                                                                  st_pre))
 
     def predict_dygraph(self, batch):
         words, targets, length = batch
@@ -573,13 +567,15 @@ def predict_static(self, batch):
         LAC model contains h_0 created in `__init__` that is necessary for inferring.
         Load inference model to test it's ok for prediction.
         """
+        paddle.enable_static()
         exe = fluid.Executor(self.place)
         # load inference model
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(
              self.args.model_save_dir,
              executor=exe,
-             params_filename=VARIABLE_FILENAME)
+             model_filename=self.args.model_filename,
+             params_filename=self.args.params_filename)
 
         words, targets, length = batch
         pred_res = exe.run(
@@ -592,21 +588,13 @@ def predict_static(self, batch):
     def predict_dygraph_jit(self, batch):
         words, targets, length = batch
         with fluid.dygraph.guard(self.place):
-            model = fluid.dygraph.jit.load(self.args.model_save_dir)
+            model = fluid.dygraph.jit.load(self.args.model_save_prefix)
             model.eval()
 
             pred_res = model(to_variable(words), to_variable(length))
 
             return pred_res.numpy()
 
-    def predict_analysis_inference(self, batch):
-        words, targets, length = batch
-
-        output = PredictorTools(self.args.model_save_dir, VARIABLE_FILENAME,
-                                [words, length])
-        out = output()
-        return out
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
index bd600d2f2dbd6..8a21c4cfd0eca 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
@@ -25,7 +25,7 @@
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D
 from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 
 from predictor_utils import PredictorTools
@@ -218,34 +218,39 @@ def train(self, to_static=False):
     def check_jit_save_load(self, model, inputs, input_spec, to_static, gt_out):
         if to_static:
             infer_model_path = "./test_mnist_inference_model_by_jit_save"
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            configs.output_spec = [gt_out]
+            model_save_dir = "./inference"
+            model_save_prefix = "./inference/mnist"
+            model_filename = "mnist" + INFER_MODEL_SUFFIX
+            params_filename = "mnist" + INFER_PARAMS_SUFFIX
             fluid.dygraph.jit.save(
                 layer=model,
-                model_path=infer_model_path,
+                path=model_save_prefix,
                 input_spec=input_spec,
-                configs=configs)
+                output_spec=[gt_out])
             # load in static mode
             static_infer_out = self.jit_load_and_run_inference_static(
-                infer_model_path, inputs)
+                model_save_dir, model_filename, params_filename, inputs)
             self.assertTrue(np.allclose(gt_out.numpy(), static_infer_out))
             # load in dygraph mode
             dygraph_infer_out = self.jit_load_and_run_inference_dygraph(
-                infer_model_path, inputs)
+                model_save_prefix, inputs)
             self.assertTrue(np.allclose(gt_out.numpy(), dygraph_infer_out))
             # load in Paddle-Inference
             predictor_infer_out = self.predictor_load_and_run_inference_analysis(
-                infer_model_path, inputs)
+                model_save_dir, model_filename, params_filename, inputs)
             self.assertTrue(np.allclose(gt_out.numpy(), predictor_infer_out))
 
     @switch_to_static_graph
-    def jit_load_and_run_inference_static(self, model_path, inputs):
+    def jit_load_and_run_inference_static(self, model_path, model_filename,
+                                          params_filename, inputs):
+        paddle.enable_static()
         exe = fluid.Executor(self.place)
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(
              dirname=model_path,
              executor=exe,
-             params_filename=VARIABLE_FILENAME)
+             model_filename=model_filename,
+             params_filename=params_filename)
         assert len(inputs) == len(feed_target_names)
         results = exe.run(inference_program,
                           feed=dict(zip(feed_target_names, inputs)),
@@ -258,8 +263,10 @@ def jit_load_and_run_inference_dygraph(self, model_path, inputs):
         pred = infer_net(inputs[0])
         return pred.numpy()
 
-    def predictor_load_and_run_inference_analysis(self, model_path, inputs):
-        output = PredictorTools(model_path, VARIABLE_FILENAME, inputs)
+    def predictor_load_and_run_inference_analysis(
+            self, model_path, model_filename, params_filename, inputs):
+        output = PredictorTools(model_path, model_filename, params_filename,
+                                inputs)
         out = output()
         return out
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
index a377075062b26..a086bf1455a81 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
@@ -20,7 +20,7 @@
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph import declarative, ProgramTranslator
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 import unittest
 
@@ -439,7 +439,10 @@ class Args(object):
     train_step = 10
     place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
     ) else fluid.CPUPlace()
-    model_save_path = model + ".inference.model"
+    model_save_dir = "./inference"
+    model_save_prefix = "./inference/" + model
+    model_filename = model + INFER_MODEL_SUFFIX
+    params_filename = model + INFER_PARAMS_SUFFIX
     dy_state_dict_save_path = model + ".dygraph"
 
 
@@ -504,7 +507,7 @@ def train_mobilenet(args, to_static):
                 t_last = time.time()
                 if batch_id > args.train_step:
                     if to_static:
-                        fluid.dygraph.jit.save(net, args.model_save_path)
+                        fluid.dygraph.jit.save(net, args.model_save_prefix)
                     else:
                         fluid.dygraph.save_dygraph(net.state_dict(),
                                                    args.dy_state_dict_save_path)
@@ -514,11 +517,15 @@ def train_mobilenet(args, to_static):
 
 
 def predict_static(args, data):
+    paddle.enable_static()
     exe = fluid.Executor(args.place)
     # load inference model
     [inference_program, feed_target_names,
      fetch_targets] = fluid.io.load_inference_model(
-         args.model_save_path, executor=exe, params_filename=VARIABLE_FILENAME)
+         args.model_save_dir,
+         executor=exe,
+         model_filename=args.model_filename,
+         params_filename=args.params_filename)
 
     pred_res = exe.run(inference_program,
                        feed={feed_target_names[0]: data},
@@ -545,7 +552,7 @@ def predict_dygraph(args, data):
 
 def predict_dygraph_jit(args, data):
     with fluid.dygraph.guard(args.place):
-        model = fluid.dygraph.jit.load(args.model_save_path)
+        model = fluid.dygraph.jit.load(args.model_save_prefix)
         model.eval()
 
         pred_res = model(data)
@@ -554,7 +561,8 @@ def predict_dygraph_jit(args, data):
 
 
 def predict_analysis_inference(args, data):
-    output = PredictorTools(args.model_save_path, VARIABLE_FILENAME, [data])
+    output = PredictorTools(args.model_save_dir, args.model_filename,
+                            args.params_filename, [data])
     out = output()
     return out
 
@@ -565,7 +573,9 @@ def setUp(self):
 
     def train(self, model_name, to_static):
         self.args.model = model_name
-        self.args.model_save_path = model_name + ".inference.model"
+        self.args.model_save_prefix = "./inference/" + model_name
+        self.args.model_filename = model_name + INFER_MODEL_SUFFIX
+        self.args.params_filename = model_name + INFER_PARAMS_SUFFIX
         self.args.dy_state_dict_save_path = model_name + ".dygraph"
         out = train_mobilenet(self.args, to_static)
         return out
@@ -579,7 +589,9 @@ def assert_same_loss(self, model_name):
 
     def assert_same_predict(self, model_name):
         self.args.model = model_name
-        self.args.model_save_path = model_name + ".inference.model"
+        self.args.model_save_prefix = "./inference/" + model_name
+        self.args.model_filename = model_name + INFER_MODEL_SUFFIX
+        self.args.params_filename = model_name + INFER_PARAMS_SUFFIX
         self.args.dy_state_dict_save_path = model_name + ".dygraph"
         local_random = np.random.RandomState(SEED)
         image = local_random.random_sample([1, 3, 224, 224]).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
index 203c8ddb3488c..095940d79eac6 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
@@ -24,7 +24,7 @@
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import declarative, ProgramTranslator
 from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 from predictor_utils import PredictorTools
 
@@ -38,7 +38,11 @@
 epoch_num = 1
 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
     else fluid.CPUPlace()
-MODEL_SAVE_PATH = "./resnet.inference.model"
+
+MODEL_SAVE_DIR = "./inference"
+MODEL_SAVE_PREFIX = "./inference/resnet"
+MODEL_FILENAME = "resnet" + INFER_MODEL_SUFFIX
+PARAMS_FILENAME = "resnet" + INFER_PARAMS_SUFFIX
 DY_STATE_DICT_SAVE_PATH = "./resnet.dygraph"
 program_translator = ProgramTranslator()
 
@@ -261,7 +265,7 @@ def train(to_static):
                             total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time))
                 if batch_id == 10:
                     if to_static:
-                        fluid.dygraph.jit.save(resnet, MODEL_SAVE_PATH)
+                        fluid.dygraph.jit.save(resnet, MODEL_SAVE_PREFIX)
                     else:
                         fluid.dygraph.save_dygraph(resnet.state_dict(),
                                                    DY_STATE_DICT_SAVE_PATH)
@@ -287,10 +291,14 @@ def predict_dygraph(data):
 
 
 def predict_static(data):
+    paddle.enable_static()
     exe = fluid.Executor(place)
     [inference_program, feed_target_names,
      fetch_targets] = fluid.io.load_inference_model(
-         MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME)
+         MODEL_SAVE_DIR,
+         executor=exe,
+         model_filename=MODEL_FILENAME,
+         params_filename=PARAMS_FILENAME)
 
     pred_res = exe.run(inference_program,
                        feed={feed_target_names[0]: data},
@@ -301,7 +309,7 @@ def predict_static(data):
 
 def predict_dygraph_jit(data):
     with fluid.dygraph.guard(place):
-        resnet = fluid.dygraph.jit.load(MODEL_SAVE_PATH)
+        resnet = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX)
         resnet.eval()
 
         pred_res = resnet(data)
@@ -310,7 +318,8 @@ def predict_dygraph_jit(data):
 
 
 def predict_analysis_inference(data):
-    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data])
+    output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME,
+                            [data])
     out = output()
     return out
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py
new file mode 100644
index 0000000000000..a8cfeb90bd814
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py
@@ -0,0 +1,367 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import math
+import time
+import unittest
+
+import numpy as np
+
+import paddle
+
+from predictor_utils import PredictorTools
+
+SEED = 2020  # seeds the numpy and paddle RNGs at the top of train()
+IMAGENET1000 = 1281167  # not referenced anywhere else in this file
+base_lr = 0.001
+momentum_rate = 0.9
+l2_decay = 1e-4  # coefficient given to L2Decay in optimizer_setting()
+# NOTE: Reduce batch_size from 8 to 2 to avoid unittest timeout.
+batch_size = 2
+epoch_num = 1
+place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \
+    else paddle.CPUPlace()
+
+MODEL_SAVE_DIR = "./inference"  # directory read by load_inference_model / PredictorTools
+MODEL_SAVE_PREFIX = "./inference/resnet_v2"  # path prefix passed to paddle.jit.save / load
+MODEL_FILENAME = "resnet_v2" + paddle.fluid.dygraph.io.INFER_MODEL_SUFFIX
+PARAMS_FILENAME = "resnet_v2" + paddle.fluid.dygraph.io.INFER_PARAMS_SUFFIX
+DY_STATE_DICT_SAVE_PATH = "./resnet_v2.dygraph"  # target of save_dygraph / load_dygraph
+program_translator = paddle.jit.ProgramTranslator()
+
+if paddle.is_compiled_with_cuda():  # the cuDNN flag is only set on CUDA builds
+    paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True})
+
+
+def optimizer_setting(parameter_list=None):
+    """Return the Momentum optimizer (L2 weight decay) for `parameter_list`."""
+    l2_regularizer = paddle.regularizer.L2Decay(l2_decay)
+    return paddle.optimizer.Momentum(
+        learning_rate=base_lr,
+        momentum=momentum_rate,
+        weight_decay=l2_regularizer,
+        parameters=parameter_list)
+
+
+class ConvBNLayer(paddle.nn.Layer):
+    """Conv2d without bias followed by BatchNorm (optional activation `act`)."""
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        # The padding is derived from the kernel size.
+        conv_padding = (filter_size - 1) // 2
+        self._conv = paddle.nn.Conv2d(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=conv_padding,
+            groups=groups,
+            bias_attr=False)
+        self._batch_norm = paddle.nn.BatchNorm(num_filters, act=act)
+
+    def forward(self, inputs):
+        """Apply the convolution, then batch normalization, to `inputs`."""
+        return self._batch_norm(self._conv(inputs))
+
+
+class BottleneckBlock(paddle.nn.Layer):
+    """Residual block: three ConvBNLayers plus a shortcut branch, added together."""
+
+    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+        super(BottleneckBlock, self).__init__()
+        # Main branch: 1x1 conv, 3x3 conv (carries `stride`), 1x1 conv (4x filters).
+        self.conv0 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=1,
+            act='relu')
+        self.conv1 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act='relu')
+        self.conv2 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters * 4,
+            filter_size=1,
+            act=None)
+        # shortcut=False: the input is projected by a strided 1x1 ConvBNLayer.
+        if not shortcut:
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=num_filters * 4,
+                filter_size=1,
+                stride=stride)
+        self.shortcut = shortcut
+        self._num_channels_out = num_filters * 4
+
+    def forward(self, inputs):
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+        conv2 = self.conv2(conv1)
+        # shortcut=True adds the unmodified input to the main branch.
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+
+        y = paddle.add(x=short, y=conv2)
+        # The 'relu' activation is appended to the sum through a LayerHelper.
+        layer_helper = paddle.fluid.layer_helper.LayerHelper(
+            self.full_name(), act='relu')
+        return layer_helper.append_activation(y)
+
+
+class ResNet(paddle.nn.Layer):
+    """ResNet with 50, 101 or 152 layers built from BottleneckBlock sublayers."""
+
+    def __init__(self, layers=50, class_dim=102):
+        super(ResNet, self).__init__()
+        self.layers = layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+        # depth[k]: number of BottleneckBlocks created for stage k in the loop below.
+        if layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        num_channels = [64, 256, 512, 1024]
+        num_filters = [64, 128, 256, 512]
+
+        self.conv = ConvBNLayer(
+            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
+        self.pool2d_max = paddle.nn.Pool2D(
+            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+        # The first block of a stage gets shortcut=False (and stride 2 unless block == 0).
+        self.bottleneck_block_list = []
+        for block in range(len(depth)):
+            shortcut = False
+            for i in range(depth[block]):
+                bottleneck_block = self.add_sublayer(
+                    'bb_%d_%d' % (block, i),
+                    BottleneckBlock(
+                        num_channels=num_channels[block]
+                        if i == 0 else num_filters[block] * 4,
+                        num_filters=num_filters[block],
+                        stride=2 if i == 0 and block != 0 else 1,
+                        shortcut=shortcut))
+                self.bottleneck_block_list.append(bottleneck_block)
+                shortcut = True
+
+        self.pool2d_avg = paddle.nn.Pool2D(
+            pool_size=7, pool_type='avg', global_pooling=True)
+        self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1
+
+        stdv = 1.0 / math.sqrt(2048 * 1.0)
+
+        self.out = paddle.nn.Linear(
+            in_features=self.pool2d_avg_output,
+            out_features=class_dim,
+            weight_attr=paddle.ParamAttr(
+                initializer=paddle.nn.initializer.Uniform(-stdv, stdv)))
+
+    @paddle.jit.to_static
+    def forward(self, inputs):
+        """Return the softmax output of the final Linear layer for `inputs`."""
+        y = self.conv(inputs)
+        y = self.pool2d_max(y)
+        for bottleneck_block in self.bottleneck_block_list:
+            y = bottleneck_block(y)
+        y = self.pool2d_avg(y)
+        y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output])
+        pred = self.out(y)
+        pred = paddle.nn.functional.softmax(pred)
+        return pred
+
+
+def reader_decorator(reader):
+    """Wrap `reader` to yield (float32 [3, 224, 224] image, int64 [1] label)."""
+    def __reader__():
+        for sample in reader():
+            image = np.array(sample[0]).astype('float32').reshape(3, 224, 224)
+            target = np.array(sample[1]).astype('int64').reshape(1)
+            yield image, target
+    return __reader__
+
+
+def train(to_static):
+    """
+    Train ResNet on the flowers dataset through batch_id 10 and return the accumulated loss.
+    With to_static the model is saved by paddle.jit.save, otherwise by save_dygraph.
+    """
+    paddle.disable_static(place)
+    np.random.seed(SEED)
+    paddle.manual_seed(SEED)
+    paddle.framework.random._manual_program_seed(SEED)
+
+    train_reader = paddle.batch(
+        reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
+        batch_size=batch_size,
+        drop_last=True)
+    data_loader = paddle.io.DataLoader.from_generator(capacity=5, iterable=True)
+    data_loader.set_sample_list_generator(train_reader)
+
+    resnet = ResNet()
+    optimizer = optimizer_setting(parameter_list=resnet.parameters())
+
+    for epoch in range(epoch_num):
+        total_loss = 0.0
+        total_acc1 = 0.0
+        total_acc5 = 0.0
+        total_sample = 0
+
+        for batch_id, data in enumerate(data_loader()):
+            start_time = time.time()
+            img, label = data
+
+            pred = resnet(img)
+            loss = paddle.nn.functional.cross_entropy(input=pred, label=label)
+            avg_loss = paddle.mean(x=loss)
+            acc_top1 = paddle.metric.accuracy(input=pred, label=label, k=1)
+            acc_top5 = paddle.metric.accuracy(input=pred, label=label, k=5)
+
+            avg_loss.backward()
+            optimizer.minimize(avg_loss)
+            resnet.clear_gradients()
+
+            total_loss += avg_loss
+            total_acc1 += acc_top1
+            total_acc5 += acc_top5
+            total_sample += 1
+            end_time = time.time()
+            if batch_id % 2 == 0:
+                print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \
+                    ( epoch, batch_id, total_loss.numpy() / total_sample, \
+                        total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time))
+            if batch_id == 10:
+                if to_static:
+                    paddle.jit.save(resnet, MODEL_SAVE_PREFIX)
+                else:
+                    paddle.fluid.dygraph.save_dygraph(resnet.state_dict(),
+                                                      DY_STATE_DICT_SAVE_PATH)
+                # reset the dataloader before breaking so it does not throw an abort signal
+                data_loader._reset()
+                break
+    paddle.enable_static()
+
+    return total_loss.numpy()
+
+
+def predict_dygraph(data):
+    """Predict on `data` in dygraph mode with the weights saved by train(False)."""
+    # Disable dygraph-to-static conversion before building the network.
+    program_translator.enable(False)
+    paddle.disable_static(place)
+    resnet = ResNet()
+    state_dict, _ = paddle.fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH)
+    resnet.set_dict(state_dict)
+    resnet.eval()
+
+    input_tensor = paddle.to_tensor(
+        data=data, dtype=None, place=None, stop_gradient=True)
+    result = resnet(input_tensor).numpy()
+    # Switch back to static mode before returning.
+    paddle.enable_static()
+    return result
+
+
+def predict_static(data):
+    """Load the jit-saved inference model in static mode and run it on `data`."""
+    paddle.enable_static()  # do not rely on an earlier helper having restored static mode
+    exe = paddle.static.Executor(place)
+    [inference_program, feed_target_names,
+     fetch_targets] = paddle.static.load_inference_model(
+         MODEL_SAVE_DIR,
+         executor=exe,
+         model_filename=MODEL_FILENAME,
+         params_filename=PARAMS_FILENAME)
+    pred_res = exe.run(inference_program,
+                       feed={feed_target_names[0]: data},
+                       fetch_list=fetch_targets)
+    return pred_res[0]
+
+
+def predict_dygraph_jit(data):
+    """Predict on `data` with the model reloaded by paddle.jit.load in dygraph mode."""
+    paddle.disable_static(place)
+    loaded_net = paddle.jit.load(MODEL_SAVE_PREFIX)
+    loaded_net.eval()
+    prediction = loaded_net(data).numpy()
+
+    # Switch back to static mode before returning.
+    paddle.enable_static()
+    return prediction
+
+
+def predict_analysis_inference(data):
+    """Run `data` through PredictorTools on the saved model and return its output."""
+    predictor = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME,
+                               [data])
+    return predictor()
+
+
+class TestResnet(unittest.TestCase):
+    """Compares dygraph and dygraph-to-static ResNet training and prediction."""
+
+    def train(self, to_static):
+        program_translator.enable(to_static)  # toggle conversion, then train
+        return train(to_static)
+
+    def verify_predict(self):
+        image = np.random.random([1, 3, 224, 224]).astype('float32')
+        dy_pre = predict_dygraph(image)
+        st_pre = predict_static(image)
+        dy_jit_pre = predict_dygraph_jit(image)
+        predictor_pre = predict_analysis_inference(image)
+        self.assertTrue(
+            np.allclose(dy_pre, st_pre),
+            msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
+        self.assertTrue(
+            np.allclose(dy_jit_pre, st_pre),
+            msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
+        self.assertTrue(
+            np.allclose(predictor_pre, st_pre),
+            msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre, st_pre))
+
+    def test_resnet(self):
+        static_loss = self.train(to_static=True)
+        dygraph_loss = self.train(to_static=False)
+        self.assertTrue(
+            np.allclose(static_loss, dygraph_loss),
+            msg="static_loss: {} \n dygraph_loss: {}".format(static_loss, dygraph_loss))
+        self.verify_predict()
+
+    def test_in_static_mode_mkldnn(self):
+        paddle.fluid.set_flags({'FLAGS_use_mkldnn': True})
+        try:
+            self.train(to_static=True)  # enable conversion explicitly; independent of test order
+        finally:
+            paddle.fluid.set_flags({'FLAGS_use_mkldnn': False})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
index cf7708c675aa9..b431d5ae048a9 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
@@ -16,14 +16,14 @@
 
 import os
 import unittest
-
 import numpy as np
-import paddle.fluid as fluid
 
+import paddle
+import paddle.fluid as fluid
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 from paddle.fluid.dygraph.jit import declarative
 from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
-from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX
 
 SEED = 2020
 
@@ -66,14 +66,13 @@ def test_save_inference_model(self):
                 adam.minimize(loss)
                 layer.clear_gradients()
             # test for saving model in dygraph.guard
-            infer_model_dir = "./test_dy2stat_save_inference_model_in_guard"
-            configs = fluid.dygraph.jit.SaveLoadConfig()
-            configs.output_spec = [pred]
+            infer_model_prefix = "./test_dy2stat_inference_in_guard/model"
+            infer_model_dir = "./test_dy2stat_inference_in_guard"
             fluid.dygraph.jit.save(
                 layer=layer,
-                model_path=infer_model_dir,
+                path=infer_model_prefix,
                 input_spec=[x],
-                configs=configs)
+                output_spec=[pred])
             # Check the correctness of the inference
             dygraph_out, _ = layer(x)
         self.check_save_inference_model(layer, [x_data], dygraph_out.numpy())
@@ -91,30 +90,30 @@ def check_save_inference_model(self,
 
         expected_persistable_vars = set([p.name for p in model.parameters()])
 
-        infer_model_dir = "./test_dy2stat_save_inference_model"
-        configs = fluid.dygraph.jit.SaveLoadConfig()
-        if fetch is not None:
-            configs.output_spec = fetch
-        configs.separate_params = True
+        infer_model_prefix = "./test_dy2stat_inference/model"
+        infer_model_dir = "./test_dy2stat_inference"
+        model_filename = "model" + INFER_MODEL_SUFFIX
+        params_filename = "model" + INFER_PARAMS_SUFFIX
         fluid.dygraph.jit.save(
             layer=model,
-            model_path=infer_model_dir,
+            path=infer_model_prefix,
             input_spec=feed if feed else None,
-            configs=configs)
-        saved_var_names = set([
-            filename for filename in os.listdir(infer_model_dir)
-            if filename != '__model__' and filename != EXTRA_VAR_INFO_FILENAME
-        ])
-        self.assertEqual(saved_var_names, expected_persistable_vars)
+            output_spec=fetch if fetch else None)
         # Check the correctness of the inference
-        infer_out = self.load_and_run_inference(infer_model_dir, inputs)
+        infer_out = self.load_and_run_inference(infer_model_dir, model_filename,
+                                                params_filename, inputs)
         self.assertTrue(np.allclose(gt_out, infer_out))
 
-    def load_and_run_inference(self, model_path, inputs):
+    def load_and_run_inference(self, model_path, model_filename,
+                               params_filename, inputs):
+        paddle.enable_static()
         exe = fluid.Executor(place)
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(
-             dirname=model_path, executor=exe)
+             dirname=model_path,
+             executor=exe,
+             model_filename=model_filename,
+             params_filename=params_filename)
         results = exe.run(inference_program,
                           feed=dict(zip(feed_target_names, inputs)),
                           fetch_list=fetch_targets)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
index 8f11a58588463..15cff501838a1 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
@@ -24,7 +24,7 @@
 from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D
 from paddle.fluid.dygraph import declarative
 from paddle.fluid.dygraph import ProgramTranslator
-from paddle.fluid.dygraph.io import VARIABLE_FILENAME
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 from predictor_utils import PredictorTools
 
@@ -35,7 +35,10 @@
 EPOCH_NUM = 1
 PRINT_STEP = 2
 STEP_NUM = 10
-MODEL_SAVE_PATH = "./se_resnet.inference.model"
+MODEL_SAVE_DIR = "./inference"
+MODEL_SAVE_PREFIX = "./inference/se_resnet"
+MODEL_FILENAME = "se_resnet" + INFER_MODEL_SUFFIX
+PARAMS_FILENAME = "se_resnet" + INFER_PARAMS_SUFFIX
 DY_STATE_DICT_SAVE_PATH = "./se_resnet.dygraph"
 
 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
@@ -383,10 +386,10 @@ def train(train_reader, to_static):
                 step_idx += 1
                 if step_idx == STEP_NUM:
                     if to_static:
-                        configs = fluid.dygraph.jit.SaveLoadConfig()
-                        configs.output_spec = [pred]
-                        fluid.dygraph.jit.save(se_resnext, MODEL_SAVE_PATH,
-                                               [img], configs)
+                        fluid.dygraph.jit.save(
+                            se_resnext,
+                            MODEL_SAVE_PREFIX, [img],
+                            output_spec=[pred])
                     else:
                         fluid.dygraph.save_dygraph(se_resnext.state_dict(),
                                                    DY_STATE_DICT_SAVE_PATH)
@@ -414,10 +417,14 @@ def predict_dygraph(data):
 
 
 def predict_static(data):
+    paddle.enable_static()
     exe = fluid.Executor(place)
     [inference_program, feed_target_names,
      fetch_targets] = fluid.io.load_inference_model(
-         MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME)
+         MODEL_SAVE_DIR,
+         executor=exe,
+         model_filename=MODEL_FILENAME,
+         params_filename=PARAMS_FILENAME)
 
     pred_res = exe.run(inference_program,
                        feed={feed_target_names[0]: data},
@@ -428,7 +435,7 @@ def predict_static(data):
 
 def predict_dygraph_jit(data):
     with fluid.dygraph.guard(place):
-        se_resnext = fluid.dygraph.jit.load(MODEL_SAVE_PATH)
+        se_resnext = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX)
         se_resnext.eval()
 
         pred_res = se_resnext(data)
@@ -437,7 +444,8 @@ def predict_dygraph_jit(data):
 
 
 def predict_analysis_inference(data):
-    output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data])
+    output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME,
+                            [data])
     out = output()
     return out
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py
new file mode 100644
index 0000000000000..284087e61ec64
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py
@@ -0,0 +1,168 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import numpy as np
+import paddle
+import random
+import unittest
+
+from simnet_dygraph_model_v2 import BOW, HingeLoss
+
+SEED = 102
+random.seed(SEED)
+
+
+def create_conf_dict():
+    """Build the pairwise-task config (net sizes and hinge-loss margin)."""
+    net_conf = {"emb_dim": 128, "bow_dim": 128, "hidden_dim": 128}
+    loss_conf = {"margin": 0.1}
+    conf = {"task_mode": "pairwise", "net": net_conf, "loss": loss_conf}
+    return conf
+
+
+def parse_args():
+    """Return the default hyper-parameters as an argparse namespace.
+
+    An empty argv list is parsed, so the process command line is ignored.
+    """
+    # (flag, default, help) for every integer option
+    int_options = [
+        ("--batch_size", 32, "Total examples' number in batch for training."),
+        ("--seq_len", 32, "The length of each sentence."),
+        ("--epoch", 1, "The number of training epoch."),
+        ("--fake_sample_size", 128, "The number of samples of fake data."),
+    ]
+    option_parser = argparse.ArgumentParser()
+    for flag, default_value, help_text in int_options:
+        option_parser.add_argument(
+            flag, type=int, default=default_value, help=help_text)
+    args = option_parser.parse_args([])
+    return args
+
+
+args = parse_args()
+
+
+def fake_vocabulary():
+    """Build the toy vocabulary: "<unk>" -> 0 and 'a'..'z' -> 1..26."""
+    word_to_id = {"<unk>": 0}
+    first_letter = ord('a')
+    for offset in range(26):
+        word_to_id[chr(first_letter + offset)] = offset + 1
+    return word_to_id
+
+
+vocab = fake_vocabulary()
+
+
+class FakeReaderProcessor(object):
+    """Fake samples of [query, copy of query, 26 - query] token ids."""
+
+    def __init__(self, args, vocab):
+        self.vocab = vocab
+        self.seq_len = args.seq_len
+        self.sample_size = args.fake_sample_size
+        self.data_samples = []
+        for _ in range(self.sample_size):
+            query = [random.randint(0, 26) for _ in range(self.seq_len)]
+            triple = [query, list(query), [26 - token for token in query]]
+            self.data_samples.append(np.array(triple).astype(np.int64))
+
+    def get_reader(self, mode, epoch=0):
+        def reader_with_pairwise():
+            if mode == "train":
+                for index in range(self.sample_size):
+                    yield self.data_samples[index]
+
+        return reader_with_pairwise
+
+
+simnet_process = FakeReaderProcessor(args, vocab)
+
+
+def train(conf_dict, to_static):
+    """
+    Train BOW with HingeLoss on the fake reader; return one mean loss per batch.
+    Writes 'dict_size'/'seq_len' into conf_dict and re-enables static mode.
+    """
+    program_translator = paddle.jit.ProgramTranslator()
+    program_translator.enable(to_static)
+    # Get device: CUDA device 0 when Paddle is compiled with CUDA, else CPU
+    if paddle.is_compiled_with_cuda():
+        place = paddle.CUDAPlace(0)
+    else:
+        place = paddle.CPUPlace()
+
+    paddle.disable_static(place)
+    paddle.manual_seed(SEED)
+    paddle.framework.random._manual_program_seed(SEED)
+
+    conf_dict['dict_size'] = len(vocab)
+    conf_dict['seq_len'] = args.seq_len
+
+    net = BOW(conf_dict)
+    loss = HingeLoss(conf_dict)
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=0.001,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-08,
+        parameters=net.parameters())
+
+    metric = paddle.metric.Auc(name="auc")
+
+    global_step = 0
+    losses = []
+
+    train_loader = paddle.io.DataLoader.from_generator(
+        capacity=16, return_list=True, iterable=True, use_double_buffer=True)
+    get_train_examples = simnet_process.get_reader("train", epoch=args.epoch)
+    train_loader.set_sample_list_generator(
+        paddle.batch(
+            get_train_examples, batch_size=args.batch_size), place)
+
+    for left, pos_right, neg_right in train_loader():
+        left = paddle.reshape(left, shape=[-1, 1])
+        pos_right = paddle.reshape(pos_right, shape=[-1, 1])
+        neg_right = paddle.reshape(neg_right, shape=[-1, 1])
+        net.train()
+        global_step += 1
+        left_feat, pos_score = net(left, pos_right)
+        pred = pos_score
+        _, neg_score = net(left, neg_right)
+        avg_cost = loss.compute(pos_score, neg_score)
+        losses.append(np.mean(avg_cost.numpy()))
+        avg_cost.backward()
+        optimizer.minimize(avg_cost)
+        net.clear_gradients()
+    paddle.enable_static()
+    return losses
+
+
+class TestSimnet(unittest.TestCase):
+    def test_dygraph_static_same_loss(self):
+        """Per-step losses must agree between dygraph and to_static runs."""
+        if paddle.is_compiled_with_cuda():
+            paddle.fluid.set_flags({"FLAGS_cudnn_deterministic": True})
+        shared_conf = create_conf_dict()
+        eager_losses = train(shared_conf, to_static=False)
+        translated_losses = train(shared_conf, to_static=True)
+        self.assertEqual(len(eager_losses), len(translated_losses))
+        for eager, translated in zip(eager_losses, translated_losses):
+            self.assertAlmostEqual(eager, translated)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
index 4fc8d27d30cb8..6721e7a51d2bc 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
@@ -32,6 +32,7 @@
 
 
 def train_static(args, batch_generator):
+    paddle.enable_static()
     paddle.manual_seed(SEED)
     paddle.framework.random._manual_program_seed(SEED)
     train_prog = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py
index 8ebb99fda660e..e264a300d8c18 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py
@@ -277,7 +277,8 @@ def load_dygraph(model_path, keep_name_table=False):
     To load python2 saved models in python3.
     """
     try:
-        para_dict, opti_dict = fluid.load_dygraph(model_path, keep_name_table)
+        para_dict, opti_dict = fluid.load_dygraph(
+            model_path, keep_name_table=keep_name_table)
         return para_dict, opti_dict
     except UnicodeDecodeError:
         warnings.warn(
@@ -287,7 +288,7 @@ def load_dygraph(model_path, keep_name_table=False):
         if six.PY3:
             load_bak = pickle.load
             pickle.load = partial(load_bak, encoding="latin1")
-            para_dict, opti_dict = fluid.load_dygraph(model_path,
-                                                      keep_name_table)
+            para_dict, opti_dict = fluid.load_dygraph(
+                model_path, keep_name_table=keep_name_table)
             pickle.load = load_bak
             return para_dict, opti_dict
diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
new file mode 100755
index 0000000000000..e7cdd49a32c26
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+from paddle import fluid
+import os
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestFleetMetaOptimizer(unittest.TestCase):
+    """Shared net/optimizer/strategy builders for fleet meta-optimizer tests."""
+
+    def setUp(self):
+        os.environ["PADDLE_TRAINER_ID"] = "1"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
+
+    def net(self, main_prog, startup_prog):
+        """Init fleet and build a small fc net; return (avg_cost, strategy)."""
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.unique_name.guard():
+                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+                fleet.init(role)
+                input_x = paddle.fluid.layers.data(
+                    name="x", shape=[32], dtype='float32')
+                input_y = paddle.fluid.layers.data(
+                    name="y", shape=[1], dtype='int64')
+
+                fc_1 = paddle.fluid.layers.fc(
+                    input=input_x, size=64, act='tanh')
+                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
+                prediction = paddle.fluid.layers.fc(
+                    input=[fc_2], size=2, act='softmax')
+                cost = paddle.fluid.layers.cross_entropy(
+                    input=prediction, label=input_y)
+                avg_cost = paddle.fluid.layers.mean(x=cost)
+                strategy = paddle.distributed.fleet.DistributedStrategy()
+        return avg_cost, strategy
+
+    def optimizer(self, loss, strategy, train_prog, startup_prog,
+                  name='momentum'):
+        """Fleet-wrap a 'momentum' or 'adam' optimizer and minimize `loss`."""
+        with fluid.program_guard(train_prog, startup_prog):
+            with fluid.unique_name.guard():
+                if name == 'momentum':
+                    optimizer = paddle.fluid.optimizer.Momentum(
+                        learning_rate=0.01, momentum=0.9)
+                elif name == 'adam':
+                    optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
+                else:
+                    raise NotImplementedError()
+                optimizer = fleet.distributed_optimizer(
+                    optimizer, strategy=strategy)
+                optimizer.minimize(loss)
+
+    def set_strategy(self, strategy, name):
+        """Set strategy.<name> and its *_configs; unknown names raise."""
+        if name == 'amp':
+            strategy.amp = True
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "decr_every_n_nan_or_inf": 2,
+                "incr_every_n_steps": 1000,
+                "incr_ratio": 2.0,
+                "use_dynamic_loss_scaling": True,
+                "decr_ratio": 0.5,
+                "custom_white_list": ['softmax'],
+                "custom_black_list": ['tanh'],
+            }
+        elif name == 'dgc':
+            strategy.dgc = True
+            strategy.dgc_configs = {
+                "rampup_begin_step": 128,
+                "rampup_step": 100,
+                "sparsity": [0.996, 0.999]
+            }
+        elif name == 'recompute':
+            strategy.recompute = True
+            strategy.recompute_configs = {
+                "checkpoints": ["fc_0.tmp_2", "fc_1.tmp_2"]
+            }
+        elif name == 'lars':
+            strategy.lars = True
+            strategy.lars_configs = {
+                "lars_coeff": 0.001,
+                "lars_weight_decay": 0.0005,
+                "epsilon": 0,
+                "exclude_from_weight_decay": ["batch_norm", ".b"],
+            }
+        elif name == 'lamb':
+            strategy.lamb = True
+            strategy.lamb_configs = {
+                'lamb_weight_decay': 0.01,
+                'exclude_from_weight_decay': [],
+            }
+        elif name == 'localsgd':
+            strategy.localsgd = True
+            strategy.localsgd_configs = {
+                'k_steps': 1,
+                'begin_step': 1,
+            }
+        elif name == 'adaptive_localsgd':
+            strategy.adaptive_localsgd = True
+            strategy.adaptive_localsgd_configs = {
+                'init_k_steps': 1,
+                'begin_step': 1,
+            }
+        else:
+            raise NotImplementedError()
diff --git a/python/paddle/fluid/tests/unittests/fleet_ps_training.py b/python/paddle/fluid/tests/unittests/fleet_ps_training.py
index a9e9675a61160..65fa1ef935ef1 100644
--- a/python/paddle/fluid/tests/unittests/fleet_ps_training.py
+++ b/python/paddle/fluid/tests/unittests/fleet_ps_training.py
@@ -20,8 +20,12 @@
 
 input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
 input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+input_y = fluid.layers.cast(input_y, dtype="float32")
+
+with fluid.device_guard("gpu"):
+    input_y = fluid.layers.cast(input_y, dtype="int64")
+    cost = mlp(input_x, input_y)
 
-cost = mlp(input_x, input_y)
 optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)
 
 role = role_maker.PaddleCloudRoleMaker()
diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
index 266e42c06199d..29204a000592a 100644
--- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
+++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
@@ -157,7 +157,7 @@ def _test_download(self, fs):
 
         assert fs.need_upload_download()
 
-        self.assertTrue(fs.is_exist(dst_file))
+        self.assertFalse(fs.is_exist(dst_file))
         fs.delete(dst_file)
         fs.delete(src_file)
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py
index 0ac33383fb26b..6f0b4f9076ec4 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py
@@ -19,7 +19,7 @@
 import struct
 
 import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
 from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
 
 
@@ -35,6 +35,8 @@ def conv2d_residual_naive(out, residual):
     return out
 
 
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
 class TestConv2dBf16Op(TestConv2dOp):
     def setUp(self):
         self.op_type = "conv2d"
@@ -42,9 +44,9 @@ def setUp(self):
         self.exhaustive_search = False
         self.use_cuda = False
         self.use_mkldnn = True
+        self._cpu_only = True
         self.weight_type = np.float32
         self.input_type = np.float32
-        self.use_mkldnn = True
         self.mkldnn_data_type = "bfloat16"
         self.force_fp32_output = False
         self.init_group()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
new file mode 100644
index 0000000000000..83b636650ab41
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
@@ -0,0 +1,113 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import struct
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru
+from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestFusionGRUBF16MKLDNNOp(OpTest):
+    def set_confs(self):
+        self.mkldnn_data_type = False
+
+    def setUp(self):
+        self.op_type = "fusion_gru"
+        self.lod = [[2, 4, 3]]
+        self.M = 3
+        self.D = 5
+        self.is_reverse = False
+        self.with_h0 = False
+        self.use_mkldnn = True
+        self._cpu_only = True
+        self.with_bias = True
+        self.act_state = 'tanh'
+        self.act_gate = 'sigmoid'
+        self.origin_mode = False
+        self.force_fp32_output = False
+        # hook for subclasses to override the defaults set above
+        self.set_confs()
+
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
+
+        # fp32 X input for reference implementation and
+        # corresponding bf16 data as input to GRU oneDNN bf16 kernel
+        x_fp32 = np.random.rand(T, self.M).astype('float32')
+        x_bf16 = convert_float_to_uint16(x_fp32)
+
+        wx_fp32 = np.random.rand(self.M, 3 * self.D).astype('float32')
+        wh_fp32 = np.random.rand(self.D, 3 * self.D).astype('float32')
+
+        # bias is fp32 despite other inputs being in bf16
+        bias = np.random.rand(
+            1, 3 * self.D).astype('float32') if self.with_bias else np.zeros(
+                (1, 3 * self.D), dtype='float32')
+
+        h0_fp32 = np.random.rand(
+            N, self.D).astype('float32') if self.with_h0 else np.zeros(
+                (N, self.D), dtype='float32')
+
+        _, _, _, hidden = fusion_gru(
+            x_fp32, self.lod, h0_fp32, wx_fp32, wh_fp32, bias, self.is_reverse,
+            self.origin_mode, ACTIVATION[self.act_state],
+            ACTIVATION[self.act_gate])
+
+        hidden_bf16 = convert_float_to_uint16(hidden)
+
+        self.inputs = {
+            'X': (x_bf16, self.lod),
+            'WeightX': wx_fp32,
+            'WeightH': wh_fp32
+        }
+
+        if self.with_bias:
+            self.inputs['Bias'] = bias
+
+        if self.with_h0:
+            # initial hidden state is fed to the kernel as bf16, like X
+            self.inputs['H0'] = convert_float_to_uint16(h0_fp32)
+
+        self.outputs = {'Hidden': (hidden_bf16, self.lod)}
+
+        self.attrs = {
+            'activation': self.act_state,
+            'gate_activation': self.act_gate,
+            'is_reverse': self.is_reverse,
+            'origin_mode': self.origin_mode,
+            'force_fp32_output': self.force_fp32_output,
+            'use_mkldnn': self.use_mkldnn
+        }
+
+
+class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUBF16MKLDNNOp):
+    def set_confs(self):
+        self.origin_mode = False
+
+
+class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp):
+    def set_confs(self):
+        self.with_bias = False
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
index ff4531f0e250e..89343c9fae459 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
@@ -45,9 +45,10 @@ def setUp(self):
 
         # Input data
         x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1
-        scale_data = 63
-        shift_data = 64
-        x_u8 = (x_f32 * scale_data + shift_data).astype(np.uint8)
+        scale_data = 63.0
+        shift_data = 64.0
+        x_u8 = np.rint(x_f32 * scale_data + shift_data).astype(np.uint8)
+        # np.rint rounds to nearest; a plain astype(np.uint8) would truncate
 
         # WeightX/WeightH data
         wx = np.random.rand(self.IC, 3 * self.OC).astype('float32') * 2 - 1
@@ -58,22 +59,23 @@ def setUp(self):
         # WeightX data shape in PP: [IC, 3 * OC]
         # WeightH data shape in PP: [OC, 2 * OC] + [OC, OC]
         # Scales shape in oneDNN:   [3, OC]
-        scale_ur = 63 / np.max(np.abs(
+        s8_max = 127.0
+        scale_ur = s8_max / np.max(np.abs(
             np.concatenate(
                 [
                     wx[:, :2 * self.OC], wh.flatten()[:2 * self.OC * self.OC]
                     .reshape(self.OC, 2 * self.OC)
                 ],
                 axis=0)),
-                               axis=0)
-        scale_o = 63 / np.max(np.abs(
+                                   axis=0)
+        scale_o = s8_max / np.max(np.abs(
             np.concatenate(
                 [
                     wx[:, 2 * self.OC:], wh.flatten()[2 * self.OC * self.OC:]
                     .reshape(self.OC, self.OC)
                 ],
                 axis=0)),
-                              axis=0)
+                                  axis=0)
 
         scale_weights = np.concatenate([scale_ur, scale_o]).astype('float')
 
@@ -102,7 +104,9 @@ def setUp(self):
             self.outputs = {'Hidden': (hidden_f32, self.lod)}
         else:
             self.error_margin = 1
-            hidden_u8 = (hidden_f32 * scale_data + shift_data).astype(np.uint8)
+            hidden_u8 = np.rint(hidden_f32 * scale_data + shift_data).astype(
+                np.uint8)
+            # round to nearest as for x_u8 instead of truncating on the cast
             self.outputs = {'Hidden': (hidden_u8, self.lod)}
 
         self.attrs = {
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
index ee917b059b87c..467bac67051dd 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
@@ -61,6 +61,37 @@ def init_data_type(self):
 create_test_mkldnn_class(TestCase5)
 
 
+class TestAvgPoolAdaptive(TestPool2D_Op):
+    def init_adaptive(self):
+        self.adaptive = True
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_test_case(self):
+        self.ksize = [1, 1]
+        self.strides = [1, 1]
+
+    def init_data_type(self):
+        self.dtype = np.float32
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestAvgPoolAdaptive2(TestAvgPoolAdaptive):
+    def init_test_case(self):
+        self.ksize = [2, 3]
+        self.strides = [1, 1]
+
+    def init_shape(self):
+        self.shape = [2, 3, 6, 6]
+
+
 class TestAsymPad(TestPool2D_Op):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -160,4 +191,6 @@ def init_shape(self):
 
 
 if __name__ == '__main__':
+    from paddle import enable_static
+    enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 68a5fa5e8f367..4fed0c8552b44 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -25,10 +25,11 @@
 import paddle.nn.functional as F
 from paddle.fluid import compiler, Program, program_guard
 
+paddle.enable_static()
+
 
 class TestSqrtOpError(unittest.TestCase):
     def test_errors(self):
-        paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input type of sqrt op must be Variable or numpy.ndarray.
             in1 = 1
@@ -45,7 +46,6 @@ def test_errors(self):
 
 class TestActivation(OpTest):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "exp"
         self.init_dtype()
         self.init_kernel_type()
@@ -74,7 +74,6 @@ def init_kernel_type(self):
 
 class TestParameter(object):
     def test_out_name(self):
-        paddle.enable_static()
         with fluid.program_guard(fluid.Program()):
             np_x = np.array([0.1])
             data = fluid.layers.data(name="X", shape=[1])
@@ -96,7 +95,6 @@ def test_dygraph(self):
 
 class TestSigmoid(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "sigmoid"
         self.init_dtype()
 
@@ -118,7 +116,6 @@ def test_check_grad(self):
 
 class TestLogSigmoid(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "logsigmoid"
         self.init_dtype()
 
@@ -192,7 +189,6 @@ def test_errors(self):
 
 class TestTanh(TestActivation, TestParameter):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "tanh"
         self.init_dtype()
         np.random.seed(1024)
@@ -273,7 +269,6 @@ def test_errors(self):
 
 class TestAtan(TestActivation, TestParameter):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "atan"
         self.init_dtype()
 
@@ -311,7 +306,6 @@ def test_dygraph(self):
 
 class TestSinh(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "sinh"
         self.init_dtype()
 
@@ -371,7 +365,6 @@ def test_backward(self):
 
 class TestSinhOpError(unittest.TestCase):
     def test_errors(self):
-        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.sinh, 1)
@@ -385,7 +378,6 @@ def test_errors(self):
 
 class TestCosh(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "cosh"
         self.init_dtype()
 
@@ -445,7 +437,6 @@ def test_backward(self):
 
 class TestCoshOpError(unittest.TestCase):
     def test_errors(self):
-        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.cosh, 1)
@@ -464,7 +455,6 @@ def ref_tanhshrink(x):
 
 class TestTanhshrink(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "tanh_shrink"
         self.init_dtype()
 
@@ -544,7 +534,6 @@ def ref_hardshrink(x, threshold):
 
 class TestHardShrink(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "hard_shrink"
         self.init_dtype()
 
@@ -575,7 +564,6 @@ def set_attrs(self):
 class TestHardShrinkAPI(unittest.TestCase):
     # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
     def setUp(self):
-        paddle.enable_static()
         np.random.seed(1024)
         self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
         self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
@@ -704,7 +692,6 @@ def ref_softshrink(x, threshold=0.5):
 
 class TestSoftshrink(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "softshrink"
         self.init_dtype()
 
@@ -784,7 +771,6 @@ def test_errors(self):
 
 class TestSqrt(TestActivation, TestParameter):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "sqrt"
         self.init_dtype()
 
@@ -803,7 +789,6 @@ def test_check_grad(self):
 
 class TestRsqrt(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "rsqrt"
         self.init_dtype()
 
@@ -822,7 +807,6 @@ def test_check_grad(self):
 
 class TestAbs(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "abs"
         self.init_dtype()
 
@@ -846,7 +830,6 @@ def test_check_grad(self):
 
 class TestCeil(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "ceil"
         self.init_dtype()
 
@@ -864,7 +847,6 @@ def test_check_grad(self):
 
 class TestFloor(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "floor"
         self.init_dtype()
 
@@ -884,7 +866,6 @@ def test_check_grad(self):
 
 class TestCos(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "cos"
         self.init_dtype()
 
@@ -903,7 +884,6 @@ def test_check_grad(self):
 
 class TestAcos(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "acos"
         self.init_dtype()
 
@@ -922,7 +902,6 @@ def test_check_grad(self):
 
 class TestSin(TestActivation, TestParameter):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "sin"
         self.init_dtype()
 
@@ -941,7 +920,6 @@ def test_check_grad(self):
 
 class TestAsin(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "asin"
         self.init_dtype()
 
@@ -960,7 +938,6 @@ def test_check_grad(self):
 
 class TestRound(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "round"
         self.init_dtype()
 
@@ -977,7 +954,6 @@ def test_check_grad(self):
 
 class TestRelu(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "relu"
         self.init_dtype()
 
@@ -1052,7 +1028,6 @@ def get_alpha(self):
         return 0.02
 
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "leaky_relu"
         self.init_dtype()
         alpha = self.get_alpha()
@@ -1162,7 +1137,6 @@ def gelu(x, approximate):
 
 class TestGeluApproximate(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "gelu"
         self.init_dtype()
         approximate = True
@@ -1182,7 +1156,6 @@ def test_check_grad(self):
 
 class TestGelu(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "gelu"
         self.init_dtype()
         approximate = False
@@ -1254,7 +1227,6 @@ def test_errors(self):
 
 class TestBRelu(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "brelu"
         self.init_dtype()
 
@@ -1279,9 +1251,35 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
-class TestBReluOpError(unittest.TestCase):
+class TestBreluAPI(unittest.TestCase):
+    # test paddle.fluid.layers.brelu
+    def setUp(self):
+        np.random.seed(1024)
+        self.t_min = 0.
+        self.t_max = 24.
+        self.x_np = np.random.uniform(-1, 30, [10, 12]).astype('float32')
+        self.out_ref = np.copy(self.x_np)
+        self.out_ref[self.out_ref < self.t_min] = self.t_min
+        self.out_ref[self.out_ref > self.t_max] = self.t_max
+        self.out_ref = self.out_ref.astype('float32')
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_fluid_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.static.data('X', [10, 12])
+            out = paddle.fluid.layers.brelu(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+            self.assertTrue(np.allclose(self.out_ref, res[0]))
+
+            paddle.disable_static(self.place)
+            x = paddle.to_tensor(self.x_np)
+            out = paddle.fluid.layers.brelu(x)
+            self.assertTrue(np.allclose(self.out_ref, out.numpy()))
+            paddle.enable_static()
+
     def test_errors(self):
-        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.brelu, 1)
@@ -1303,7 +1301,6 @@ def ref_relu6(x, threshold=6.0):
 
 class TestRelu6(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "relu6"
         self.init_dtype()
 
@@ -1378,9 +1375,13 @@ def test_errors(self):
             F.relu6(x_fp16)
 
 
+def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
+    return (x * np.minimum(np.maximum(x + offset, 0.), threshold) /
+            scale).astype(x.dtype)
+
+
 class TestHardSwish(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = 'hard_swish'
         self.init_dtype()
 
@@ -1392,9 +1393,9 @@ def setUp(self):
         #the same with TestAbs
         x[np.abs(x + offset) < 0.005] = 0.02
         x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02
-        out = x * np.minimum(np.maximum(x + offset, 0), threshold) / scale
+        out = ref_hardswish(x, threshold, scale, offset)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.inputs = {'X': x}
         self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset}
         self.outputs = {'Out': out}
 
@@ -1404,23 +1405,65 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
-class TestHardSwishOpError(unittest.TestCase):
-    def test_errors(self):
+class TestHardswishAPI(unittest.TestCase):
+    # test paddle.nn.Hardswish, paddle.nn.functional.hardswish
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.hardswish(x)
+            m = paddle.nn.Hardswish()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_hardswish(self.x_np)
+        for r in res:
+            self.assertTrue(np.allclose(out_ref, r))
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.hardswish(x)
+        m = paddle.nn.Hardswish()
+        out2 = m(x)
+        out_ref = ref_hardswish(self.x_np)
+        for r in [out1, out2]:
+            self.assertTrue(np.allclose(out_ref, r.numpy()))
         paddle.enable_static()
-        with program_guard(Program()):
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.hard_swish(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_hardswish(self.x_np)
+        self.assertTrue(np.allclose(out_ref, res[0]))
+
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out = paddle.fluid.layers.hard_swish(x)
+        self.assertTrue(np.allclose(out_ref, out.numpy()))
+        paddle.enable_static()
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.hard_swish, 1)
+            self.assertRaises(TypeError, F.hardswish, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.hard_swish, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.hardswish, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.hard_swish(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.hardswish(x_fp16)
 
 
 class TestSoftRelu(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "soft_relu"
         self.init_dtype()
 
@@ -1447,7 +1490,6 @@ def test_check_grad(self):
 
 class TestSoftReluOpError(unittest.TestCase):
     def test_errors(self):
-        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.soft_relu, 1)
@@ -1466,7 +1508,6 @@ def elu(x, alpha):
 
 class TestELU(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "elu"
         self.init_dtype()
 
@@ -1540,7 +1581,6 @@ def test_errors(self):
 
 class TestReciprocal(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "reciprocal"
         self.init_dtype()
 
@@ -1559,7 +1599,6 @@ def test_check_grad(self):
 
 class TestLog(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "log"
         self.init_dtype()
 
@@ -1587,7 +1626,6 @@ def test_error(self):
 
 class TestLog1p(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "log1p"
         self.init_dtype()
 
@@ -1633,7 +1671,6 @@ def test_api(self):
 
 class TestSquare(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "square"
         self.init_dtype()
 
@@ -1652,7 +1689,6 @@ def test_check_grad(self):
 
 class TestPow(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "pow"
         self.init_dtype()
 
@@ -1672,7 +1708,6 @@ def test_check_grad(self):
 
 class TestPow_factor_tensor(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "pow"
         self.init_dtype()
 
@@ -1750,7 +1785,6 @@ def test_error(self):
 
 class TestSTanh(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "stanh"
         self.init_dtype()
 
@@ -1772,7 +1806,6 @@ def test_check_grad(self):
 
 class TestSTanhOpError(unittest.TestCase):
     def test_errors(self):
-        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.stanh, 1)
@@ -1793,7 +1826,6 @@ def ref_softplus(x, beta=1, threshold=20):
 
 class TestSoftplus(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "softplus"
         self.init_dtype()
 
@@ -1877,7 +1909,6 @@ def ref_softsign(x):
 
 class TestSoftsign(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "softsign"
         self.init_dtype()
 
@@ -1948,23 +1979,24 @@ def test_errors(self):
             F.softsign(x_fp16)
 
 
+def ref_thresholded_relu(x, threshold=1.0):
+    out = (x > threshold) * x
+    return out
+
+
 class TestThresholdedRelu(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "thresholded_relu"
         self.init_dtype()
 
-        threshold = 0.25
-        self.delta = 0.005
-        np.random.seed(1024)
-        X = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-
-        # Same reason as TestAbs
-        X[np.abs(X - threshold) < self.delta] = threshold + 0.2
-        out = (X > threshold) * X
+        threshold = 15
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
-        self.attrs = {'threshold': threshold}
+        np.random.seed(1024)
+        x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype)
+        x[np.abs(x) < 0.005] = 0.02
+        out = ref_thresholded_relu(x, threshold)
+        self.inputs = {'X': x}
+        self.attrs = {"threshold": threshold}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -1973,98 +2005,238 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
-class TestThresholdedReluOpError(unittest.TestCase):
+class TestThresholdedReluAPI(unittest.TestCase):
+    # test paddle.nn.ThresholdedReLU, paddle.nn.functional.thresholded_relu
+    def setUp(self):
+        self.threshold = 15
+        np.random.seed(1024)
+        self.x_np = np.random.uniform(-20, 20, [10, 12]).astype(np.float64)
+        self.x_np[np.abs(self.x_np) < 0.005] = 0.02
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.thresholded_relu(x, self.threshold)
+            thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold)
+            out2 = thresholded_relu(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_thresholded_relu(self.x_np, self.threshold)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.thresholded_relu(x, self.threshold)
+        thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold)
+        out2 = thresholded_relu(x)
+        out_ref = ref_thresholded_relu(self.x_np, self.threshold)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        paddle.enable_static()
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.thresholded_relu(x, self.threshold)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_thresholded_relu(self.x_np, self.threshold)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
+
     def test_errors(self):
         paddle.enable_static()
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1)
+            self.assertRaises(TypeError, F.thresholded_relu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.thresholded_relu, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.thresholded_relu, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.thresholded_relu(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.thresholded_relu(x_fp16)
+
+
+def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5):
+    return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype)
 
 
 class TestHardSigmoid(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "hard_sigmoid"
-        self.init_dtype()
-
-        np.random.seed(1024)
-        X = np.random.uniform(-5, 5, [10, 12]).astype("float32")
-        slope = 0.2
-        offset = 0.5
-        lower_threshold = -offset / slope
-        upper_threshold = (1 - offset) / slope
+        self.dtype = 'float64'
+        self.slope = 0.166666666666667
+        self.offset = 0.5
+        self.set_attrs()
 
-        self.delta = 0.005
+        x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype)
+        lower_threshold = -self.offset / self.slope
+        upper_threshold = (1. - self.offset) / self.slope
 
         # Same reason as TestAbs
-        X[(X - lower_threshold) < self.delta] = lower_threshold - 0.02
-        X[(X - upper_threshold) < self.delta] = upper_threshold + 0.02
+        delta = 0.005
+        x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02
+        x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02
 
-        temp = X * slope + offset
-        out = np.maximum(0.0, np.minimum(1.0, temp))
+        out = ref_hardsigmoid(x, self.slope, self.offset)
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
+        self.attrs = {'slope': self.slope, 'offset': self.offset}
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out')
+    def set_attrs(self):
+        pass
 
 
-class TestHardSigmoidOpError(unittest.TestCase):
-    def test_errors(self):
+class TestHardSigmoidFP32(TestHardSigmoid):
+    def set_attrs(self):
+        self.dtype = 'float32'
+
+
+class TestHardSigmoidSlopeOffset(TestHardSigmoid):
+    def set_attrs(self):
+        self.slope = 0.2
+        self.offset = 0.4
+
+
+class TestHardsigmoidAPI(unittest.TestCase):
+    # test paddle.nn.Hardsigmoid, paddle.nn.functional.hardsigmoid
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.hardsigmoid(x)
+            m = paddle.nn.Hardsigmoid()
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_hardsigmoid(self.x_np)
+        for r in res:
+            self.assertTrue(np.allclose(out_ref, r))
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.hardsigmoid(x)
+        m = paddle.nn.Hardsigmoid()
+        out2 = m(x)
+        out_ref = ref_hardsigmoid(self.x_np)
+        for r in [out1, out2]:
+            self.assertTrue(np.allclose(out_ref, r.numpy()))
         paddle.enable_static()
-        with program_guard(Program()):
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.hard_sigmoid(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_hardsigmoid(self.x_np, 0.2, 0.5)
+        self.assertTrue(np.allclose(out_ref, res[0]))
+
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out = paddle.fluid.layers.hard_sigmoid(x)
+        self.assertTrue(np.allclose(out_ref, out.numpy()))
+        paddle.enable_static()
+
+    def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1)
+            self.assertRaises(TypeError, F.hardsigmoid, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.hard_sigmoid, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.hardsigmoid, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.hard_sigmoid(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.hardsigmoid(x_fp16)
+
+
+def ref_swish(x):
+    out = x * expit(x)
+    return out
 
 
 class TestSwish(TestActivation):
     def setUp(self):
-        paddle.enable_static()
         self.op_type = "swish"
         self.init_dtype()
 
         np.random.seed(1024)
-        X = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
-        beta = 2.3
-        out = X * expit(beta * X)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
-        self.attrs = {'beta': beta}
+        x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
+        out = ref_swish(x)
+        self.inputs = {'X': x}
+        self.attrs = {'beta': 1.0}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(['X'], 'Out', max_relative_error=0.008)
+        self.check_grad(['X'], 'Out')
+
+
+class TestSwishAPI(unittest.TestCase):
+    # test paddle.nn.Swish, paddle.nn.functional.swish
+    def setUp(self):
+        np.random.seed(1024)
+        self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64)
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.swish(x)
+            swish = paddle.nn.Swish()
+            out2 = swish(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = ref_swish(self.x_np)
+        for r in res:
+            self.assertEqual(np.allclose(out_ref, r), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.swish(x)
+        swish = paddle.nn.Swish()
+        out2 = swish(x)
+        out_ref = ref_swish(self.x_np)
+        for r in [out1, out2]:
+            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+        paddle.enable_static()
 
+    def test_fluid_api(self):
+        paddle.enable_static()
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.swish(x)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = ref_swish(self.x_np)
+        self.assertEqual(np.allclose(out_ref, res[0]), True)
 
-class TestSwishOpError(unittest.TestCase):
     def test_errors(self):
         paddle.enable_static()
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.swish, 1)
+            self.assertRaises(TypeError, F.swish, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.swish, x_int32)
+            x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, F.swish, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
-            fluid.layers.swish(x_fp16)
+            x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16')
+            F.swish(x_fp16)
 
 
 #------------------ Test Error Activation----------------------
diff --git a/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py
index 51e447dba725c..c0d622d7ea187 100644
--- a/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py
@@ -178,16 +178,25 @@ def initTestCase(self):
         self.data_type = 'float32'
 
 
-class TestBilateralSliceApi(TestBilateralSliceOp):
+class TestBilateralSliceApi(unittest.TestCase):
     def test_api(self):
         x = paddle.fluid.data(
             name='x', shape=[None, 3, 25, 15], dtype='float32')
         guide = paddle.fluid.data(
             name='guide', shape=[None, 25, 15], dtype='float32')
         grid = paddle.fluid.data(
-            name='grid', shape=[None, 12, 8, 5, 3], dtype='float32')
-        paddle.fluid.contrib.layers.bilateral_slice(x, guide, grid,
-                                                    self.has_offset)
+            name='grid', shape=[None, None, 8, 5, 3], dtype='float32')
+        paddle.fluid.contrib.layers.bilateral_slice(x, guide, grid, False)
+
+        if not paddle.fluid.is_compiled_with_cuda():
+            return
+
+        with paddle.fluid.dygraph.guard():
+            x1 = paddle.rand([3, 1, 50, 30])
+            guide1 = paddle.rand([3, 50, 30])
+            grid1 = paddle.rand([3, 2, 2, 5, 3])
+
+            paddle.fluid.contrib.bilateral_slice(x1, guide1, grid1, False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_bmm_op.py b/python/paddle/fluid/tests/unittests/test_bmm_op.py
index cb1b3ded53472..a1c8266842087 100644
--- a/python/paddle/fluid/tests/unittests/test_bmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bmm_op.py
@@ -79,8 +79,10 @@ def test_api_error(self):
         y_data = np.arange(16, dtype='float32').reshape((2, 4, 2))
         y_data_wrong1 = np.arange(16, dtype='float32').reshape((2, 2, 4))
         y_data_wrong2 = np.arange(16, dtype='float32').reshape((2, 2, 2, 2))
+        y_data_wrong3 = np.arange(24, dtype='float32').reshape((3, 4, 2))
         self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong1)
         self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong2)
+        self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong3)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
index 9dd617f90b65d..7bdfa3d2dfd74 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
@@ -34,6 +34,7 @@ class InplaceTestBase(unittest.TestCase):
     def initParameter(self):
         self.use_cuda = True
         self.fuse_all_optimizer_ops = False
+        self.fuse_all_reduce_ops = False
 
     def setUp(self):
         paddle.enable_static()
@@ -93,6 +94,7 @@ def check_single_card_fetch_var(self):
                 build_strategy.memory_optimize = memory_optimize
                 build_strategy.enable_inplace = enable_inplace
                 build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops
+                build_strategy.fuse_all_reduce_ops = self.fuse_all_reduce_ops
                 compiled_prog = fluid.CompiledProgram(prog).with_data_parallel(
                     loss_name=loss.name,
                     build_strategy=build_strategy,
@@ -146,6 +148,7 @@ def check_multi_card_fetch_var(self):
                 build_strategy.memory_optimize = memory_optimize
                 build_strategy.enable_inplace = enable_inplace
                 build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops
+                build_strategy.fuse_all_reduce_ops = self.fuse_all_reduce_ops
                 compiled_program = fluid.CompiledProgram(
                     prog).with_data_parallel(
                         loss_name=loss.name,
@@ -175,6 +178,7 @@ class CUDAInplaceTest(InplaceTestBase):
     def initParameter(self):
         self.use_cuda = True
         self.fuse_all_optimizer_ops = False
+        self.fuse_all_reduce_ops = False
 
     def test_multi_card_fetch_var(self):
         self.check_multi_card_fetch_var()
@@ -187,6 +191,7 @@ class CPUInplaceTest(InplaceTestBase):
     def initParameter(self):
         self.use_cuda = False
         self.fuse_all_optimizer_ops = False
+        self.fuse_all_reduce_ops = False
 
     def test_multi_card_fetch_var(self):
         self.check_multi_card_fetch_var()
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py
index 0b14cab4a7846..e9e62bee00680 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py
@@ -20,6 +20,7 @@ class CUDAInplaceTestWithFuseOptimizationOps(InplaceTestBase):
     def initParameter(self):
         self.use_cuda = True
         self.fuse_all_optimizer_ops = True
+        self.fuse_all_reduce_ops = False
 
     def test_multi_card_fetch_var(self):
         self.check_multi_card_fetch_var()
@@ -32,6 +33,7 @@ class CPUInplaceTestWithFuseOptimizationOps(InplaceTestBase):
     def initParameter(self):
         self.use_cuda = False
         self.fuse_all_optimizer_ops = True
+        self.fuse_all_reduce_ops = False
 
     def test_multi_card_fetch_var(self):
         self.check_multi_card_fetch_var()
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py
index e685d7b5f53b0..eed637b1d5da1 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from op_test import OpTest
@@ -260,6 +261,7 @@ def init_group(self):
 class TestModulatedDeformableConvInvalidInput(unittest.TestCase):
     def test_error(self):
         def test_invalid_input():
+            paddle.enable_static()
             input = [1, 3, 32, 32]
             offset = fluid.data(
                 name='offset', shape=[None, 3, 32, 32], dtype='float32')
@@ -271,6 +273,7 @@ def test_invalid_input():
         self.assertRaises(TypeError, test_invalid_input)
 
         def test_invalid_offset():
+            paddle.enable_static()
             input = fluid.data(
                 name='input', shape=[None, 3, 32, 32], dtype='int32')
             offset = fluid.data(
@@ -283,5 +286,36 @@ def test_invalid_offset():
         self.assertRaises(TypeError, test_invalid_offset)
 
 
+class TestDeformConv2dAPI(unittest.TestCase):  # output-shape checks for paddle.static.nn.deform_conv2d
+    def test_api(self):
+        def test_deform_conv2d_v1():  # mask argument is None
+            paddle.enable_static()
+            input = paddle.static.data(
+                name='input_v1', shape=[None, 3, 32, 32], dtype='float32')
+            offset = paddle.static.data(
+                name='offset_v1', shape=[None, 4, 32, 32], dtype='float32')
+            out = paddle.static.nn.deform_conv2d(
+                input, offset, None, num_filters=4, filter_size=1)
+
+            self.assertEqual(out.shape, (-1, 4, 32, 32))  # not a bare assert: survives `python -O`
+
+        test_deform_conv2d_v1()
+
+        def test_deform_conv2d_v2():  # mask argument is a data tensor
+            paddle.enable_static()
+            input = paddle.static.data(
+                name='input_v2', shape=[None, 3, 32, 32], dtype='float32')
+            offset = paddle.static.data(
+                name='offset_v2', shape=[None, 4, 32, 32], dtype='float32')
+            mask = paddle.static.data(
+                name='mask_v2', shape=[None, 2, 32, 32], dtype='float32')
+            out = paddle.static.nn.deform_conv2d(
+                input, offset, mask, num_filters=4, filter_size=1)
+
+            self.assertEqual(out.shape, (-1, 4, 32, 32))  # not a bare assert: survives `python -O`
+
+        test_deform_conv2d_v2()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
index 49b93e0dfaaac..d615f7cb7044e 100644
--- a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
@@ -16,12 +16,14 @@
 
 import unittest
 
+import paddle
 import paddle.fluid.framework as framework
 import paddle.fluid.optimizer as optimizer
 import paddle.fluid.regularizer as regularizer
 import paddle.fluid.clip as clip
 import paddle.compat as cpt
 from paddle.fluid.backward import append_backward
+paddle.enable_static()
 
 
 class TestDGCMomentumOptimizer(unittest.TestCase):
@@ -86,13 +88,17 @@ def check_dgc_momentum_optimizer(self,
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         # params_grads = append_backward(mean_out)
-        params_grads = dgc_momentum_optimizer.backward(mean_out)
+        params_grads = dgc_momentum_optimizer.backward(
+            mean_out, startup_program=init_program)
+
+        with framework.program_guard(program, init_program):
+            opts = dgc_momentum_optimizer.apply_gradients(params_grads)
+
         accumulator_count = 1 if name == "momentum" else 2
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(
             len(dgc_momentum_optimizer.get_accumulators()), accumulator_count)
-        with framework.program_guard(program, init_program):
-            opts = dgc_momentum_optimizer.apply_gradients(params_grads)
+
         self.assertEqual(len(opts), 2)
         sgd_op = opts[-1]
         self.assertEqual([op.type for op in opts], ["scale", name])
@@ -108,8 +114,11 @@ def check_dgc_momentum_optimizer(self,
         self.assertTrue(mul_x.name in velocity_acc)
 
         # Check init_program
+        # Without DGC applied, init ops cover: lr, dgc(count, nranks, begin_step), (u,)
+        # With DGC applied, init ops cover: lr, dgc(count, nranks, begin_step), (u, v, k, encode, gather)
+        init_ops_count = 5 if name == "momentum" else 9
         init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 1)
+        self.assertEqual(len(init_ops), init_ops_count)
         self.assertEqual(init_ops[0].type, "fill_constant")
         self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
 
diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py
index 2f35b45aa670c..28232e9ba4dc0 100644
--- a/python/paddle/fluid/tests/unittests/test_directory_migration.py
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
@@ -43,15 +43,14 @@ def test_new_directory(self):
             'paddle.distributed.prepare_context', 'paddle.DataParallel',
             'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static',
             'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer',
-            'paddle.jit.save', 'paddle.jit.load', 'paddle.SaveLoadConfig',
-            'paddle.NoamDecay', 'paddle.PiecewiseDecay',
-            'paddle.NaturalExpDecay', 'paddle.ExponentialDecay',
-            'paddle.InverseTimeDecay', 'paddle.PolynomialDecay',
-            'paddle.CosineDecay', 'paddle.static.Executor',
-            'paddle.static.global_scope', 'paddle.static.scope_guard',
-            'paddle.static.append_backward', 'paddle.static.gradients',
-            'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram',
-            'paddle.static.ExecutionStrategy',
+            'paddle.jit.save', 'paddle.jit.load', 'paddle.NoamDecay',
+            'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay',
+            'paddle.ExponentialDecay', 'paddle.InverseTimeDecay',
+            'paddle.PolynomialDecay', 'paddle.CosineDecay',
+            'paddle.static.Executor', 'paddle.static.global_scope',
+            'paddle.static.scope_guard', 'paddle.static.append_backward',
+            'paddle.static.gradients', 'paddle.static.BuildStrategy',
+            'paddle.static.CompiledProgram', 'paddle.static.ExecutionStrategy',
             'paddle.static.default_main_program',
             'paddle.static.default_startup_program', 'paddle.static.Program',
             'paddle.static.name_scope', 'paddle.static.program_guard',
@@ -64,12 +63,11 @@ def test_new_directory(self):
             'paddle.static.nn.conv3d', 'paddle.static.nn.conv3d_transpose',
             'paddle.static.nn.create_parameter',
             'paddle.static.nn.crf_decoding', 'paddle.static.nn.data_norm',
-            'paddle.static.nn.deformable_conv', 'paddle.static.nn.group_norm',
-            'paddle.static.nn.hsigmoid', 'paddle.static.nn.instance_norm',
-            'paddle.static.nn.layer_norm', 'paddle.static.nn.multi_box_head',
-            'paddle.static.nn.nce', 'paddle.static.nn.prelu',
-            'paddle.static.nn.row_conv', 'paddle.static.nn.spectral_norm',
-            'paddle.static.nn.embedding'
+            'paddle.static.nn.deform_conv2d', 'paddle.static.nn.group_norm',
+            'paddle.static.nn.instance_norm', 'paddle.static.nn.layer_norm',
+            'paddle.static.nn.multi_box_head', 'paddle.static.nn.nce',
+            'paddle.static.nn.prelu', 'paddle.static.nn.row_conv',
+            'paddle.static.nn.spectral_norm', 'paddle.static.nn.embedding'
         ]
 
         import_file = 'run_import_modules.py'
@@ -104,9 +102,7 @@ def test_old_directory(self):
             'paddle.imperative.TracedLayer', 'paddle.imperative.declarative',
             'paddle.imperative.ProgramTranslator',
             'paddle.imperative.TranslatedLayer', 'paddle.imperative.jit.save',
-            'paddle.imperative.jit.load',
-            'paddle.imperative.jit.SaveLoadConfig',
-            'paddle.imperative.NoamDecay'
+            'paddle.imperative.jit.load', 'paddle.imperative.NoamDecay',
             'paddle.imperative.PiecewiseDecay',
             'paddle.imperative.NaturalExpDecay',
             'paddle.imperative.ExponentialDecay',
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
index 6c5a1d6e36c25..071b68bf9e856 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
@@ -288,7 +288,7 @@ def _run_cluster(self, model, envs):
         print("tr end communicate")
 
         tr0_ret = tr0.returncode
-        tr1_ret = tr0.returncode
+        tr1_ret = tr1.returncode
 
         # close trainer file
         tr0_pipe.close()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
index 7f4e5d99e0208..eed8d5f1a496e 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
@@ -50,6 +50,10 @@ def build_role(self):
     def build_strategy(self):
         self.strategy = paddle.distributed.fleet.DistributedStrategy()
         self.strategy.a_sync = True
+        self.strategy.a_sync_configs = {
+            "launch_barrier": False,
+            "heter_worker_device_guard": "gpu"
+        }
         return self.strategy
 
     def build_input(self):
diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py
index 40611fed65260..d5790811df94f 100644
--- a/python/paddle/fluid/tests/unittests/test_distribution.py
+++ b/python/paddle/fluid/tests/unittests/test_distribution.py
@@ -65,41 +65,6 @@ def entropy(self):
         return np.log(self.high - self.low)
 
 
-class NormalNumpy(DistributionNumpy):
-    def __init__(self, loc, scale):
-        self.loc = np.array(loc)
-        self.scale = np.array(scale)
-        if str(self.loc.dtype) not in ['float32', 'float64']:
-            self.loc = self.loc.astype('float32')
-            self.scale = self.scale.astype('float32')
-
-    def sample(self, shape):
-        shape = tuple(shape) + (self.loc + self.scale).shape
-        return self.loc + (np.random.randn(*shape) * self.scale)
-
-    def log_prob(self, value):
-        var = self.scale * self.scale
-        log_scale = np.log(self.scale)
-        return -((value - self.loc) * (value - self.loc)) / (
-            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
-
-    def probs(self, value):
-        var = self.scale * self.scale
-        return np.exp(-1. * ((value - self.loc) * (value - self.loc)) /
-                      (2. * var)) / (math.sqrt(2 * math.pi) * self.scale)
-
-    def entropy(self):
-        return 0.5 + 0.5 * np.log(
-            np.array(2. * math.pi).astype(self.loc.dtype)) + np.log(self.scale)
-
-    def kl_divergence(self, other):
-        var_ratio = (self.scale / other.scale)
-        var_ratio = var_ratio * var_ratio
-        t1 = ((self.loc - other.loc) / other.scale)
-        t1 = (t1 * t1)
-        return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio))
-
-
 class UniformTest(unittest.TestCase):
     def setUp(self, use_gpu=False, batch_size=5, dims=6):
         self.use_gpu = use_gpu
@@ -336,6 +301,41 @@ def init_static_data(self, batch_size, dims):
                 name='values', shape=[dims], dtype='float32')
 
 
+class NormalNumpy(DistributionNumpy):  # NumPy-only normal distribution parameterised by loc and scale
+    def __init__(self, loc, scale):
+        self.loc = np.array(loc)
+        self.scale = np.array(scale)
+        if str(self.loc.dtype) not in ['float32', 'float64']:  # only loc's dtype is inspected
+            self.loc = self.loc.astype('float32')
+            self.scale = self.scale.astype('float32')
+
+    def sample(self, shape):  # loc + randn * scale; `shape` is prepended to the broadcast (loc, scale) shape
+        shape = tuple(shape) + (self.loc + self.scale).shape
+        return self.loc + (np.random.randn(*shape) * self.scale)
+
+    def log_prob(self, value):  # -(value - loc)^2 / (2 * scale^2) - log(scale) - log(sqrt(2 * pi))
+        var = self.scale * self.scale
+        log_scale = np.log(self.scale)
+        return -((value - self.loc) * (value - self.loc)) / (
+            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
+
+    def probs(self, value):  # exp(-(value - loc)^2 / (2 * scale^2)) / (sqrt(2 * pi) * scale)
+        var = self.scale * self.scale
+        return np.exp(-1. * ((value - self.loc) * (value - self.loc)) /
+                      (2. * var)) / (math.sqrt(2 * math.pi) * self.scale)
+
+    def entropy(self):  # 0.5 + 0.5 * log(2 * pi) + log(scale)
+        return 0.5 + 0.5 * np.log(
+            np.array(2. * math.pi).astype(self.loc.dtype)) + np.log(self.scale)
+
+    def kl_divergence(self, other):  # 0.5 * (r + t - 1 - log(r)), with r and t as computed below
+        var_ratio = (self.scale / other.scale)
+        var_ratio = var_ratio * var_ratio  # r = (scale / other.scale)^2
+        t1 = ((self.loc - other.loc) / other.scale)
+        t1 = (t1 * t1)  # t = ((loc - other.loc) / other.scale)^2
+        return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio))
+
+
 class NormalTest(unittest.TestCase):
     def setUp(self, use_gpu=False, batch_size=2, dims=3):
         self.use_gpu = use_gpu
@@ -559,26 +559,6 @@ def init_static_data(self, batch_size, dims):
 
 
 class NormalTest6(NormalTest):
-    def init_data(self, batch_size=2, dims=3):
-        # loc and scale are Tensor with dtype 'VarType.FP32'.
-        self.loc_np = np.random.randn(batch_size, dims).astype('float32')
-        self.scale_np = np.random.randn(batch_size, dims).astype('float32')
-        while not np.all(self.scale_np > 0):
-            self.scale_np = np.random.randn(batch_size, dims).astype('float32')
-        self.values_np = np.random.randn(batch_size, dims).astype('float32')
-        self.loc = paddle.to_tensor(self.loc_np)
-        self.scale = paddle.to_tensor(self.scale_np)
-        self.values = paddle.to_tensor(self.values_np)
-        # used to construct another Normal object to calculate kl_divergence
-        self.other_loc_np = np.random.randn(batch_size, dims).astype('float32')
-        self.other_scale_np = np.random.randn(batch_size,
-                                              dims).astype('float32')
-        while not np.all(self.scale_np > 0):
-            self.other_scale_np = np.random.randn(batch_size,
-                                                  dims).astype('float32')
-        self.other_loc = paddle.to_tensor(self.other_loc_np)
-        self.other_scale = paddle.to_tensor(self.other_scale_np)
-
     def init_numpy_data(self, batch_size, dims):
         # loc and scale are Tensor with dtype 'VarType.FP32'.
         self.loc_np = np.random.randn(batch_size, dims).astype('float32')
@@ -693,6 +673,294 @@ def init_static_data(self, batch_size, dims):
                 name='other_scale', shape=[dims], dtype='float64')
 
 
+class CategoricalNumpy(DistributionNumpy):  # NumPy-only entropy / KL for a categorical given logits
+    def __init__(self, logits):
+        self.logits = np.array(logits).astype('float32')  # always stored as float32, whatever the input dtype
+
+    def entropy(self):  # -sum(p * log(p)) over the last axis, p = softmax(logits)
+        logits = self.logits - np.max(self.logits, axis=-1, keepdims=True)  # shift by the max before exp
+        e_logits = np.exp(logits)
+        z = np.sum(e_logits, axis=-1, keepdims=True)
+        prob = e_logits / z
+        return -1. * np.sum(prob * (logits - np.log(z)), axis=-1, keepdims=True)
+
+    def kl_divergence(self, other):  # sum(p * (log(p) - log(q))) over the last axis; p from self, q from other
+        logits = self.logits - np.max(self.logits, axis=-1, keepdims=True)
+        other_logits = other.logits - np.max(
+            other.logits, axis=-1, keepdims=True)
+        e_logits = np.exp(logits)
+        other_e_logits = np.exp(other_logits)
+        z = np.sum(e_logits, axis=-1, keepdims=True)
+        other_z = np.sum(other_e_logits, axis=-1, keepdims=True)
+        prob = e_logits / z
+        return np.sum(prob * (logits - np.log(z) - other_logits \
+            + np.log(other_z)), axis=-1, keepdims=True)
+
+
+class CategoricalTest(unittest.TestCase):  # runs Categorical in dygraph and static mode against NumPy values
+    def setUp(self, use_gpu=False, batch_size=3, dims=5):
+        self.use_gpu = use_gpu
+        if not use_gpu:
+            self.place = fluid.CPUPlace()
+            self.gpu_id = -1
+        else:
+            self.place = fluid.CUDAPlace(0)
+            self.gpu_id = 0
+
+        self.batch_size = batch_size
+        self.dims = dims
+        self.init_numpy_data(batch_size, dims)
+
+        paddle.disable_static(self.place)  # dygraph inputs are built first
+        self.init_dynamic_data(batch_size, dims)
+
+        paddle.enable_static()  # then the static-graph inputs
+        self.test_program = fluid.Program()
+        self.executor = fluid.Executor(self.place)
+        self.init_static_data(batch_size, dims)
+
+    def init_numpy_data(self, batch_size, dims):
+        # input logits is a 2-D Tensor
+        # value used in the probs and log_prob methods is a 1-D Tensor
+        self.logits_np = np.random.rand(batch_size, dims).astype('float32')
+        self.other_logits_np = np.random.rand(batch_size,
+                                              dims).astype('float32')
+        self.value_np = np.array([2, 1, 3]).astype('int64')
+
+        self.logits_shape = [batch_size, dims]
+        # dist_shape = logits_shape[:-1]; it represents the number of
+        #  different distributions.
+        self.dist_shape = [batch_size]
+        # sample_shape represents the number of samples
+        self.sample_shape = [2, 4]
+        # value used in the probs and log_prob methods
+        # If value is 1-D and logits is 2-D or higher dimension, value will be
+        #  broadcast to have the same number of distributions as logits.
+        # If value is 2-D or higher dimension, it should have the same number
+        #  of distributions as logits: ``value_shape[:-1] == logits_shape[:-1]``
+        self.value_shape = [3]
+
+    def init_dynamic_data(self, batch_size, dims):
+        self.logits = paddle.to_tensor(self.logits_np)
+        self.other_logits = paddle.to_tensor(self.other_logits_np)
+        self.value = paddle.to_tensor(self.value_np)
+
+    def init_static_data(self, batch_size, dims):
+        with fluid.program_guard(self.test_program):
+            self.logits_static = fluid.data(
+                name='logits', shape=self.logits_shape, dtype='float32')
+            self.other_logits_static = fluid.data(
+                name='other_logits', shape=self.logits_shape, dtype='float32')
+            self.value_static = fluid.data(
+                name='value', shape=self.value_shape, dtype='int64')
+
+    def get_numpy_selected_probs(self, probability):  # out[i][j] = probability[i][value_np[j]]
+        np_probs = np.zeros(self.dist_shape + self.value_shape)
+        for i in range(self.batch_size):
+            for j in range(self.value_shape[0]):  # follows value_shape instead of a hard-coded 3
+                np_probs[i][j] = probability[i][self.value_np[j]]
+        return np_probs
+
+    def compare_with_numpy(self, fetch_list, tolerance=1e-6):
+        sample, entropy, kl, probs, log_prob = fetch_list
+        log_tolerance = 1e-4  # looser bound used for entropy and kl only
+
+        np.testing.assert_equal(sample.shape,
+                                self.sample_shape + self.dist_shape)
+
+        np_categorical = CategoricalNumpy(self.logits_np)
+        np_other_categorical = CategoricalNumpy(self.other_logits_np)
+        np_entropy = np_categorical.entropy()
+        np_kl = np_categorical.kl_divergence(np_other_categorical)
+
+        np.testing.assert_allclose(
+            entropy, np_entropy, rtol=log_tolerance, atol=log_tolerance)
+        np.testing.assert_allclose(
+            kl, np_kl, rtol=log_tolerance, atol=log_tolerance)
+
+        sum_dist = np.sum(self.logits_np, axis=-1, keepdims=True)
+        probability = self.logits_np / sum_dist  # logits divided by their sum (no exp involved)
+        np_probs = self.get_numpy_selected_probs(probability)
+        np_log_prob = np.log(np_probs)
+
+        np.testing.assert_allclose(
+            probs, np_probs, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(
+            log_prob, np_log_prob, rtol=tolerance, atol=tolerance)
+
+    def test_categorical_distribution_dygraph(self, tolerance=1e-6):
+        paddle.disable_static(self.place)
+        categorical = Categorical(self.logits)
+        other_categorical = Categorical(self.other_logits)
+
+        sample = categorical.sample(self.sample_shape).numpy()
+        entropy = categorical.entropy().numpy()
+        kl = categorical.kl_divergence(other_categorical).numpy()
+        probs = categorical.probs(self.value).numpy()
+        log_prob = categorical.log_prob(self.value).numpy()
+
+        fetch_list = [sample, entropy, kl, probs, log_prob]
+        self.compare_with_numpy(fetch_list, tolerance)  # forward the argument instead of ignoring it
+
+    def test_categorical_distribution_static(self, tolerance=1e-6):
+        paddle.enable_static()
+        with fluid.program_guard(self.test_program):
+            categorical = Categorical(self.logits_static)
+            other_categorical = Categorical(self.other_logits_static)
+
+            sample = categorical.sample(self.sample_shape)
+            entropy = categorical.entropy()
+            kl = categorical.kl_divergence(other_categorical)
+            probs = categorical.probs(self.value_static)
+            log_prob = categorical.log_prob(self.value_static)
+
+            fetch_list = [sample, entropy, kl, probs, log_prob]
+
+        feed_vars = {
+            'logits': self.logits_np,
+            'other_logits': self.other_logits_np,
+            'value': self.value_np
+        }
+
+        self.executor.run(fluid.default_startup_program())
+        fetch_list = self.executor.run(program=self.test_program,
+                                       feed=feed_vars,
+                                       fetch_list=fetch_list)
+
+        self.compare_with_numpy(fetch_list, tolerance)  # forward the argument instead of ignoring it
+
+
+class CategoricalTest2(CategoricalTest):
+    def init_numpy_data(self, batch_size, dims):
+        # input logits is a 2-D Tensor with dtype float64
+        # value used in the probs and log_prob methods is a 1-D Tensor
+        self.logits_np = np.random.rand(batch_size, dims).astype('float64')
+        self.other_logits_np = np.random.rand(batch_size,
+                                              dims).astype('float64')
+        self.value_np = np.array([2, 1, 3]).astype('int64')
+
+        self.logits_shape = [batch_size, dims]
+        self.dist_shape = [batch_size]
+        self.sample_shape = [2, 4]
+        self.value_shape = [3]
+
+    def init_static_data(self, batch_size, dims):  # same placeholders as the base class, but float64 logits
+        with fluid.program_guard(self.test_program):
+            self.logits_static = fluid.data(
+                name='logits', shape=self.logits_shape, dtype='float64')
+            self.other_logits_static = fluid.data(
+                name='other_logits', shape=self.logits_shape, dtype='float64')
+            self.value_static = fluid.data(
+                name='value', shape=self.value_shape, dtype='int64')
+
+
+class CategoricalTest3(CategoricalTest):
+    def init_dynamic_data(self, batch_size, dims):
+        # input logits is a 2-D numpy.ndarray with dtype float32
+        # value used in the probs and log_prob methods is a 1-D Tensor
+        self.logits = self.logits_np
+        self.other_logits = self.other_logits_np
+        self.value = paddle.to_tensor(self.value_np)
+
+    def init_static_data(self, batch_size, dims):  # logits stay raw ndarrays; only `value` is a placeholder
+        with fluid.program_guard(self.test_program):
+            self.logits_static = self.logits_np
+            self.other_logits_static = self.other_logits_np
+            self.value_static = fluid.data(
+                name='value', shape=self.value_shape, dtype='int64')
+
+
+class CategoricalTest4(CategoricalTest):
+    def init_numpy_data(self, batch_size, dims):
+        # input logits is a 2-D numpy.ndarray with dtype float64
+        # value used in the probs and log_prob methods is a 1-D Tensor
+        self.logits_np = np.random.rand(batch_size, dims).astype('float64')
+        self.other_logits_np = np.random.rand(batch_size,
+                                              dims).astype('float64')
+        self.value_np = np.array([2, 1, 3]).astype('int64')
+
+        self.logits_shape = [batch_size, dims]
+        self.dist_shape = [batch_size]
+        self.sample_shape = [2, 4]
+        self.value_shape = [3]
+
+    def init_dynamic_data(self, batch_size, dims):  # logits are passed as raw ndarrays
+        self.logits = self.logits_np
+        self.other_logits = self.other_logits_np
+        self.value = paddle.to_tensor(self.value_np)
+
+    def init_static_data(self, batch_size, dims):  # logits stay raw ndarrays; only `value` is a placeholder
+        with fluid.program_guard(self.test_program):
+            self.logits_static = self.logits_np
+            self.other_logits_static = self.other_logits_np
+            self.value_static = fluid.data(
+                name='value', shape=self.value_shape, dtype='int64')
+
+
+# test shape of logits and value used in probs and log_prob method
+class CategoricalTest5(CategoricalTest):
+    def init_numpy_data(self, batch_size, dims):
+        # input logits is a 1-D Tensor
+        # value used in the probs and log_prob methods is a 1-D Tensor
+        self.logits_np = np.random.rand(dims).astype('float32')
+        self.other_logits_np = np.random.rand(dims).astype('float32')
+        self.value_np = np.array([2, 1, 3]).astype('int64')
+
+        self.logits_shape = [dims]
+        self.dist_shape = []
+        self.sample_shape = [2, 4]
+        self.value_shape = [3]
+
+    def get_numpy_selected_probs(self, probability):  # out[i] = probability[value_np[i]]
+        np_probs = np.zeros(self.value_shape)
+        for i in range(3):
+            np_probs[i] = probability[self.value_np[i]]
+        return np_probs
+
+
+class CategoricalTest6(CategoricalTest):
+    def init_numpy_data(self, batch_size, dims):
+        # input logits is a 2-D Tensor
+        # value used in the probs and log_prob methods has the same number of batches as the input
+        self.logits_np = np.random.rand(3, 5).astype('float32')
+        self.other_logits_np = np.random.rand(3, 5).astype('float32')
+        self.value_np = np.array([[2, 1], [0, 3], [2, 3]]).astype('int64')
+
+        self.logits_shape = [3, 5]
+        self.dist_shape = [3]
+        self.sample_shape = [2, 4]
+        self.value_shape = [3, 2]
+
+    def get_numpy_selected_probs(self, probability):  # out[i][j] = probability[i][value_np[i][j]]
+        np_probs = np.zeros(self.value_shape)
+        for i in range(3):
+            for j in range(2):
+                np_probs[i][j] = probability[i][self.value_np[i][j]]
+        return np_probs
+
+
+class CategoricalTest7(CategoricalTest):
+    def init_numpy_data(self, batch_size, dims):
+        # input logits is a 3-D Tensor
+        # value used in the probs and log_prob methods is a 1-D Tensor applied to every distribution
+        self.logits_np = np.random.rand(3, 2, 5).astype('float32')
+        self.other_logits_np = np.random.rand(3, 2, 5).astype('float32')
+        self.value_np = np.array([2, 1, 3]).astype('int64')
+
+        self.logits_shape = [3, 2, 5]
+        self.dist_shape = [3, 2]
+        self.sample_shape = [2, 4]
+        self.value_shape = [3]
+
+    def get_numpy_selected_probs(self, probability):  # out[i][j][k] = probability[i][j][value_np[k]]
+        np_probs = np.zeros(self.dist_shape + self.value_shape)
+        for i in range(3):
+            for j in range(2):
+                for k in range(3):
+                    np_probs[i][j][k] = probability[i][j][self.value_np[k]]
+        return np_probs
+
+
 class DistributionTestError(unittest.TestCase):
     def test_distribution_error(self):
         distribution = Distribution()
@@ -711,6 +979,7 @@ def test_distribution_error(self):
         self.assertRaises(NotImplementedError, distribution.probs, value_tensor)
 
     def test_normal_error(self):
+        paddle.enable_static()
         normal = Normal(0.0, 1.0)
 
         value = [1.0, 2.0]
@@ -734,6 +1003,7 @@ def test_normal_error(self):
         self.assertRaises(TypeError, normal.kl_divergence, normal_other)
 
     def test_uniform_error(self):
+        paddle.enable_static()
         uniform = Uniform(0.0, 1.0)
 
         value = [1.0, 2.0]
@@ -752,6 +1022,39 @@ def test_uniform_error(self):
         # type of seed must be int
         self.assertRaises(TypeError, uniform.sample, [2, 3], seed)
 
+    def test_categorical_error(self):  # invalid arguments to Categorical methods must raise
+        paddle.enable_static()
+
+        categorical = Categorical([0.4, 0.6])
+
+        value = [1, 0]
+        # value is a plain list, not a Variable: log_prob is expected to raise AttributeError
+        self.assertRaises(AttributeError, categorical.log_prob, value)
+
+        value = [1, 0]
+        # value is a plain list, not a Variable: probs is expected to raise AttributeError
+        self.assertRaises(AttributeError, categorical.probs, value)
+
+        shape = 1.0
+        # shape is a float, not a list: sample is expected to raise TypeError
+        self.assertRaises(TypeError, categorical.sample, shape)
+
+        categorical_other = Uniform(1.0, 2.0)
+        # other is a Uniform, not a Categorical: kl_divergence is expected to raise TypeError
+        self.assertRaises(TypeError, categorical.kl_divergence,
+                          categorical_other)
+
+        def test_shape_not_match_error():
+            # value has shape [2, 3] while logits has shape [3, 5], which breaks
+            # the requirement value_shape[:-1] == logits_shape[:-1]
+            paddle.disable_static()  # NOTE(review): dygraph mode is not switched back inside this method
+            logits = paddle.rand([3, 5])
+            cat = Categorical(logits)
+            value = paddle.to_tensor([[2, 1, 3], [3, 2, 1]], dtype='int64')
+            cat.log_prob(value)
+
+        self.assertRaises(ValueError, test_shape_not_match_error)
+
 
 class DistributionTestName(unittest.TestCase):
     def get_prefix(self, string):
@@ -812,6 +1115,35 @@ def test_uniform_name(self):
         p = uniform1.probs(value_tensor)
         self.assertEqual(self.get_prefix(p.name), name + '_probs')
 
+    def test_categorical_name(self):  # checks Categorical.name and the name prefix of each method's output
+        name = 'test_categorical'
+        categorical1 = Categorical([0.4, 0.6], name=name)
+        self.assertEqual(categorical1.name, name)
+
+        categorical2 = Categorical([0.5, 0.5])  # no name passed
+        self.assertEqual(categorical2.name, 'Categorical')  # expected default name
+
+        paddle.enable_static()  # NOTE(review): called after both Categorical objects are constructed
+
+        sample = categorical1.sample([2])
+        self.assertEqual(self.get_prefix(sample.name), name + '_sample')
+
+        entropy = categorical1.entropy()
+        self.assertEqual(self.get_prefix(entropy.name), name + '_entropy')
+
+        kl = categorical1.kl_divergence(categorical2)
+        self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence')
+
+        value_npdata = np.array([0], dtype="int64")
+        value_tensor = layers.create_tensor(dtype="int64")
+        layers.assign(value_npdata, value_tensor)  # copy the numpy value into the created tensor
+
+        p = categorical1.probs(value_tensor)
+        self.assertEqual(self.get_prefix(p.name), name + '_probs')
+
+        lp = categorical1.log_prob(value_tensor)
+        self.assertEqual(self.get_prefix(lp.name), name + '_log_prob')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
index ec30cb70c5790..1272d82dfdd1d 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -149,9 +149,9 @@ def run_program(num_flatten_dims):
                     append_batch_size=False,
                     dtype="float32")
 
-                out = fluid.layers.fc(input=x,
-                                      size=1,
-                                      num_flatten_dims=num_flatten_dims)
+                out = paddle.static.nn.fc(x=x,
+                                          size=1,
+                                          num_flatten_dims=num_flatten_dims)
 
             place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
             ) else fluid.CUDAPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
index 362428631e68c..6bc1a310d0aea 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -12,57 +12,97 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.distributed.fleet as fleet
-import paddle.distributed.fleet.base.role_maker as role_maker
 import unittest
 import paddle
+import paddle.fluid as fluid
+import paddle.distributed.fleet as fleet
+from paddle.distributed.fleet.meta_optimizers import AMPOptimizer
 import os
+from fleet_meta_optimizer_base import TestFleetMetaOptimizer
 
 paddle.enable_static()
 
 
-class TestFleetAMPOptimizer(unittest.TestCase):
-    def setUp(self):
-        os.environ["PADDLE_TRAINER_ID"] = "0"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
+    def test_amp_optimizer_backward(self):
+        """ backward() alone: cast ops are added, the unscale check is not """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+
+        momentum = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        amp_opt = AMPOptimizer(momentum)
+        amp_opt.user_defined_strategy = dist_strategy
+        amp_opt.backward(loss, startup_program)
+
+        op_types = [op.type for op in loss.block.ops]
+        self.assertIn('cast', op_types)
+        self.assertNotIn('check_finite_and_unscale', op_types)
+
+    def test_amp_optimizer_backward_gradients(self):
+        """ backward() + apply_gradients(): cast and unscale ops present """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+
+        momentum = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        amp_opt = AMPOptimizer(momentum)
+        amp_opt.user_defined_strategy = dist_strategy
+        grads = amp_opt.backward(loss, startup_program)
+        with fluid.program_guard(main_program, startup_program):
+            amp_opt.apply_gradients(grads)
+
+        op_types = [op.type for op in loss.block.ops]
+        self.assertIn('cast', op_types)
+        self.assertIn('check_finite_and_unscale', op_types)
+
+    def test_amp_optimizer_backward_optimize(self):
+        """ backward() + apply_optimize(): cast and unscale ops present """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+
+        momentum = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        amp_opt = AMPOptimizer(momentum)
+        amp_opt.user_defined_strategy = dist_strategy
+        grads = amp_opt.backward(loss, startup_program)
+        amp_opt.apply_optimize(loss, startup_program, grads)
+
+        op_types = [op.type for op in loss.block.ops]
+        self.assertIn('cast', op_types)
+        self.assertIn('check_finite_and_unscale', op_types)
 
     def test_amp_optimizer(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.distributed.fleet.DistributedStrategy()
-        strategy.amp = True
-        strategy.amp_configs = {
-            "init_loss_scaling": 32768,
-            "decr_every_n_nan_or_inf": 2,
-            "incr_every_n_steps": 1000,
-            "incr_ratio": 2.0,
-            "use_dynamic_loss_scaling": True,
-            "decr_ratio": 0.5,
-            "custom_white_list": ['softmax'],
-            "custom_black_list": ['tanh'],
-        }
-
-        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        """ test amp """
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'amp')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('cast', ops)
+        self.assertIn('check_finite_and_unscale', ops)
+
+    def test_amp_recompute_optimizer(self):
+        """ test amp + recompute """
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'amp')
+        self.set_strategy(strategy, 'recompute')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
 
         strategy = fleet._final_strategy()
 
         ops = [op.type for op in avg_cost.block.ops]
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
         self.assertIn('cast', ops)
         self.assertIn('check_finite_and_unscale', ops)
 
+        # recompute
+        self.assertIn('subprog', ''.join(outs))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 4945c158025b7..3d4b2e218f725 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -18,6 +18,7 @@
 import paddle.distributed.fleet.base.role_maker as role_maker
 import os
 import paddle.fluid as fluid
+import paddle.nn as nn
 import numpy as np
 
 
@@ -170,6 +171,44 @@ def test_dygraph_method(self):
         final_strategy = fleet._final_strategy()
 
 
+class LinearNet(nn.Layer):  # two stacked nn.Linear layers: 10 -> 10 -> 1
+    def __init__(self):
+        super(LinearNet, self).__init__()
+        self._linear1 = nn.Linear(10, 10)
+        self._linear2 = nn.Linear(10, 1)
+
+    def forward(self, x):  # applies _linear1 first, then _linear2
+        return self._linear2(self._linear1(x))
+
+
+class TestFleetDygraphSingle(unittest.TestCase):
+    """ Runs the collective fleet dygraph API with one local trainer. """
+    def setUp(self):
+        endpoint = "127.0.0.1:36213"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = endpoint
+        os.environ["PADDLE_CURRENT_ENDPOINTS"] = endpoint
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def test_dygraph_single(self):
+        paddle.disable_static()
+        fleet.init(is_collective=True)
+
+        net = LinearNet()
+        criterion = nn.MSELoss()
+        base_opt = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=net.parameters())
+        dist_opt = fleet.distributed_optimizer(base_opt)
+        dist_net = fleet.distributed_model(net)
+        for _ in range(2):
+            features = paddle.randn([10, 10], 'float32')
+            predictions = dist_net(features)
+            targets = paddle.randn([10, 1], 'float32')
+            criterion(predictions, targets).backward()
+            dist_opt.step()
+            dist_opt.clear_grad()
+
+
 class TestFleetBaseSingleRunCollective(unittest.TestCase):
     def setUp(self):
         os.environ.pop("PADDLE_TRAINER_ENDPOINTS")
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
index 55d4ff7726aac..0faafd76a799d 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
@@ -17,65 +17,82 @@
 from paddle import fluid
 import os
 import paddle.distributed.fleet as fleet
+from fleet_meta_optimizer_base import TestFleetMetaOptimizer
+from paddle.distributed.fleet.meta_optimizers import DGCOptimizer
 import paddle.distributed.fleet.base.role_maker as role_maker
 
+paddle.enable_static()
 
-class TestFleetDGCOptimizer(unittest.TestCase):
-    def setUp(self):
-        os.environ["PADDLE_TRAINER_ID"] = "1"
-        os.environ[
-            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
-
-    def net(self, main_prog, startup_prog):
-        with fluid.program_guard(main_prog, startup_prog):
-            with fluid.unique_name.guard():
-                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-                fleet.init(role)
-                input_x = paddle.fluid.layers.data(
-                    name="x", shape=[32], dtype='float32')
-                input_y = paddle.fluid.layers.data(
-                    name="y", shape=[1], dtype='int64')
-
-                fc_1 = paddle.fluid.layers.fc(input=input_x,
-                                              size=64,
-                                              act='tanh')
-                fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
-                prediction = paddle.fluid.layers.fc(input=[fc_2],
-                                                    size=2,
-                                                    act='softmax')
-                cost = paddle.fluid.layers.cross_entropy(
-                    input=prediction, label=input_y)
-                avg_cost = paddle.fluid.layers.mean(x=cost)
-
-                strategy = paddle.distributed.fleet.DistributedStrategy()
-                strategy.dgc = True
-                strategy.dgc_configs = {
-                    "rampup_begin_step": 128,
-                    "rampup_step": 100,
-                    "sparsity": [0.996, 0.999]
-                }
-        return avg_cost, strategy
+
+class TestFleetDGCOptimizer(TestFleetMetaOptimizer):
+    def test_dgc_optimizer_backward(self):
+        """ backward() alone must not insert a 'dgc' op """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+
+        self.set_strategy(dist_strategy, 'dgc')
+        momentum = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        dgc_opt = DGCOptimizer(momentum)
+        cloud_role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        dgc_opt._set_basic_info(loss, cloud_role, momentum, dist_strategy)
+        dgc_opt.backward(loss, startup_program)
+
+        op_types = [op.type for op in loss.block.ops]
+        self.assertNotIn('dgc', op_types)
+
+    def test_dgc_optimizer_gradients(self):
+        """ backward() + apply_gradients(): dgc and dgc_momentum present """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+
+        self.set_strategy(dist_strategy, 'dgc')
+        momentum = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        dgc_opt = DGCOptimizer(momentum)
+        cloud_role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        dgc_opt._set_basic_info(loss, cloud_role, momentum, dist_strategy)
+        grads = dgc_opt.backward(loss, startup_program)
+        with fluid.program_guard(main_program, startup_program):
+            dgc_opt.apply_gradients(grads)
+
+        op_types = [op.type for op in loss.block.ops]
+        self.assertIn('dgc', op_types)
+        self.assertIn('dgc_momentum', op_types)
+
+    def test_dgc_optimizer_optimize(self):
+        """ backward() + apply_optimize(): dgc and dgc_momentum present """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+
+        self.set_strategy(dist_strategy, 'dgc')
+        momentum = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        dgc_opt = DGCOptimizer(momentum)
+        cloud_role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        dgc_opt._set_basic_info(loss, cloud_role, momentum, dist_strategy)
+        grads = dgc_opt.backward(loss, startup_program)
+        dgc_opt.apply_optimize(loss, startup_program, grads)
+
+        op_types = [op.type for op in loss.block.ops]
+        self.assertIn('dgc', op_types)
+        self.assertIn('dgc_momentum', op_types)
 
     def test_dgc_optimizer(self):
-        startup_prog = fluid.Program()
-        train_prog = fluid.Program()
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        self.set_strategy(strategy, 'dgc')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
 
         ops = [op.type for op in avg_cost.block.ops]
         self.assertIn('dgc', ops)
         self.assertIn('dgc_momentum', ops)
 
     def test_dgc_not_apply_with_adam(self):
-        startup_prog = fluid.Program()
-        train_prog = fluid.Program()
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        self.set_strategy(strategy, 'dgc')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam')
 
         ops = [op.type for op in avg_cost.block.ops]
         self.assertNotIn('dgc', ops)
@@ -85,18 +102,32 @@ def test_dgc_not_apply_with_one_worker(self):
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
 
-        startup_prog = fluid.Program()
-        train_prog = fluid.Program()
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        self.set_strategy(strategy, 'dgc')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
 
         ops = [op.type for op in avg_cost.block.ops]
         self.assertNotIn('dgc', ops)
         self.assertNotIn('dgc_momentum', ops)
 
+    def test_dgc_recompute_optimizer(self):
+        """ dgc + recompute: ops from both strategies are expected """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+        for strategy_name in ('dgc', 'recompute'):
+            self.set_strategy(dist_strategy, strategy_name)
+        self.optimizer(loss, dist_strategy, main_program, startup_program)
+
+        block_ops = loss.block.ops
+        op_types = [op.type for op in block_ops]
+        mul_outs = [o.output('Out')[0] for o in block_ops if o.type == 'mul']
+        self.assertIn('dgc', op_types)
+        self.assertIn('dgc_momentum', op_types)
+
+        # recompute
+        self.assertIn('subprog', ''.join(mul_outs))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
index e717962ead2e2..4cd8dc3d945e1 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
@@ -28,13 +28,27 @@ function test_launch_ps(){
     fi
 }
 
+function test_launch_ps_heter(){  # launch with 2 servers, 2 workers, 2 heter workers
+    fleetrun --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py 2> ut.elog
+    if grep -q "server are killed" ut.elog; then  # marker looked up in captured stderr
+        echo "test heter pserver launch succeed"
+    else
+        echo "test heter pserver launch failed"
+        exit -1
+    fi
+}
+
 if [[ ${WITH_GPU} == "OFF" ]]; then
+    echo "in cpu test mode"
     test_launch_ps
     exit 0
 fi
 
+echo "No.1 unittest"
 test_launch_ps
+test_launch_ps_heter
 # use default values
+echo "No.2 unittest"
 fleetrun multi_process.py fleetrun
 
 # use paddlecloud
@@ -48,6 +62,7 @@ export PADDLE_TRAINER_ID=0
 export PADDLE_PORT=35789
 export TRAINER_PORTS_NUM=2
 
+echo "No.3 unittest"
 distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
 CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
 
@@ -83,7 +98,7 @@ fi
 unset PADDLE_PORT
 export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171
 
-echo ""
+echo "No.4 unittest"
 echo "paddle.distributed.launch async poll process test"
 if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
     echo "train abort as planned"
@@ -112,5 +127,6 @@ rm -rf $file_0_0 $file_0_1
 
 distributed_args="--gpus=0,1 --log_dir=testlog"
 export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
+echo "No.5 unittest"
 CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} find_ports.py
 str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
index f5347b0c665e2..bafb2419123b0 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
@@ -16,71 +16,87 @@
 import paddle
 import os
 
+import paddle
+import paddle.fluid as fluid
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
+from fleet_meta_optimizer_base import TestFleetMetaOptimizer
 
+paddle.enable_static()
 
-class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
-    def setUp(self):
-        os.environ["PADDLE_TRAINER_ID"] = "1"
-        os.environ[
-            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
 
+class TestFleetLocalSGDMetaOptimizer(TestFleetMetaOptimizer):
     def test_localsgd_optimizer(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.distributed.fleet.DistributedStrategy()
-        strategy.localsgd = True
-        strategy.auto = True
-        config = strategy.localsgd_configs
-        config['k_steps'] = 1
-        config['begin_step'] = 1
-        strategy.localsgd_configs = config
-
-        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
-
-
-class TestFleetAdaptiveLocalSGDMetaOptimizer(unittest.TestCase):
-    def setUp(self):
-        os.environ["PADDLE_TRAINER_ID"] = "1"
-        os.environ[
-            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
-
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'localsgd')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        outs = [
+            ''.join(op.output('Out')) for op in avg_cost.block.ops
+            if op.type == 'conditional_block'
+        ]
+
+        self.assertIn('conditional_block', ops)
+        self.assertIn('@SNAPSHOT', ''.join(outs))
+
+    def test_localsgd_amp_optimizer(self):
+        """ localsgd + amp: ops from both strategies are expected """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+        for strategy_name in ('localsgd', 'amp'):
+            self.set_strategy(dist_strategy, strategy_name)
+        self.optimizer(loss, dist_strategy, main_program, startup_program)
+
+        block_ops = loss.block.ops
+        op_types = [op.type for op in block_ops]
+        cond_outputs = ''.join(''.join(op.output('Out')) for op in block_ops
+                               if op.type == 'conditional_block')
+
+        # localsgd
+        self.assertIn('conditional_block', op_types)
+        self.assertIn('@SNAPSHOT', cond_outputs)
+        # amp
+        self.assertIn('cast', op_types)
+        self.assertIn('check_finite_and_unscale', op_types)
+
+
+class TestFleetAdaptiveLocalSGDMetaOptimizer(TestFleetMetaOptimizer):
     def test_adaptive_localsgd_optimizer(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.distributed.fleet.DistributedStrategy()
-        strategy.adaptive_localsgd = True
-        config = strategy.adaptive_localsgd_configs
-        config['init_k_steps'] = 1
-        config['begin_step'] = 1
-        strategy.adaptive_localsgd_configs = config
-
-        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'adaptive_localsgd')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        outs = [
+            ''.join(op.output('Out')) for op in avg_cost.block.ops
+            if op.type == 'conditional_block'
+        ]
+
+        self.assertIn('conditional_block', ops)
+        self.assertIn('@SNAPSHOT', ''.join(outs))
+
+    def test_localsgd_amp_optimizer(self):
+        """ adaptive_localsgd + amp: ops from both strategies are expected """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+        for strategy_name in ('adaptive_localsgd', 'amp'):
+            self.set_strategy(dist_strategy, strategy_name)
+        self.optimizer(loss, dist_strategy, main_program, startup_program)
+
+        block_ops = loss.block.ops
+        op_types = [op.type for op in block_ops]
+        cond_outputs = ''.join(''.join(op.output('Out')) for op in block_ops
+                               if op.type == 'conditional_block')
+
+        # adaptive localsgd
+        self.assertIn('conditional_block', op_types)
+        self.assertIn('@SNAPSHOT', cond_outputs)
+        # amp
+        self.assertIn('cast', op_types)
+        self.assertIn('check_finite_and_unscale', op_types)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
index a42010a4eaa50..42b60cd3fad5a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
@@ -14,40 +14,110 @@
 
 import unittest
 import paddle
+import paddle.fluid as fluid
 import os
+from fleet_meta_optimizer_base import TestFleetMetaOptimizer
+from paddle.distributed.fleet.meta_optimizers import RecomputeOptimizer
 
+paddle.enable_static()
 
-class TestFleetRecomputeMetaOptimizer(unittest.TestCase):
-    def setUp(self):
-        os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
-        os.environ["PADDLE_TRAINERS_NUM"] = "2"
-        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-                       "127.0.0.1:36001,127.0.0.2:36001"
+
+class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer):
+    def test_recompute_optimizer_backward(self):
+        """ backward() alone: a 'mul' output name must contain 'subprog' """
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+
+        self.set_strategy(strategy, 'recompute')
+        opt = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        opt = RecomputeOptimizer(opt)
+        opt.user_defined_strategy = strategy
+        params_grads = opt.backward(avg_cost, startup_prog)
+
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
+        self.assertIn('subprog', ''.join(outs))
+
+    def test_recompute_optimizer_backward_gradients(self):
+        """ backward() + apply_gradients(): 'subprog' mul output expected """
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+
+        self.set_strategy(strategy, 'recompute')
+        opt = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        opt = RecomputeOptimizer(opt)
+        opt.user_defined_strategy = strategy
+        params_grads = opt.backward(avg_cost, startup_prog)
+        with fluid.program_guard(train_prog, startup_prog):
+            opt.apply_gradients(params_grads)
+
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
+        self.assertIn('subprog', ''.join(outs))
+
+    def test_recompute_optimizer_backward_optimize(self):
+        """ backward() + apply_optimize(): 'subprog' mul output expected """
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+
+        self.set_strategy(strategy, 'recompute')
+        opt = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        opt = RecomputeOptimizer(opt)
+        opt.user_defined_strategy = strategy
+        params_grads = opt.backward(avg_cost, startup_prog)
+        opt.apply_optimize(avg_cost, startup_prog, params_grads)
+
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
+        self.assertIn('subprog', ''.join(outs))
 
     def test_recompute_optimizer(self):
-        import paddle.distributed.fleet as fleet
-        import paddle.distributed.fleet.base.role_maker as role_maker
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.distributed.fleet.DistributedStrategy()
-        strategy.recompute = True
-        strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]}
-
-        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'recompute')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        outs = [
+            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
+        ]
+
+        self.assertIn('subprog', ''.join(outs))
+
+    def test_recompute_lars_optimizer(self):
+        """ recompute + lars: 'subprog' mul output and lars_momentum op """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+        for strategy_name in ('recompute', 'lars'):
+            self.set_strategy(dist_strategy, strategy_name)
+        self.optimizer(loss, dist_strategy, main_program, startup_program)
+
+        block_ops = loss.block.ops
+        op_types = [op.type for op in block_ops]
+        mul_outs = [o.output('Out')[0] for o in block_ops if o.type == 'mul']
+
+        self.assertIn('subprog', ''.join(mul_outs))
+        self.assertIn('lars_momentum', op_types)
+
+    def test_recompute_lamb_optimizer(self):
+        """ recompute + lamb on 'adam': 'subprog' mul output and lamb op """
+        main_program, startup_program = fluid.Program(), fluid.Program()
+        loss, dist_strategy = self.net(main_program, startup_program)
+        for strategy_name in ('recompute', 'lamb'):
+            self.set_strategy(dist_strategy, strategy_name)
+        self.optimizer(loss, dist_strategy, main_program, startup_program, 'adam')
+
+        block_ops = loss.block.ops
+        op_types = [op.type for op in block_ops]
+        mul_outs = [o.output('Out')[0] for o in block_ops if o.type == 'mul']
+
+        self.assertIn('subprog', ''.join(mul_outs))
+        self.assertIn('lamb', op_types)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_init.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_init.py
new file mode 100644
index 0000000000000..9f8ee1b46e827
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_init.py
@@ -0,0 +1,149 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cloud role maker."""
+
+from __future__ import print_function
+import os
+import platform
+import shutil
+import tempfile
+import unittest
+import paddle
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestPSCloudRoleMakerCase1(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker in parameter server mode.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+
+    def test_paddle_trainers_num(self):
+        # PADDLE_TRAINERS_NUM is not set by setUp; expect ValueError
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase2(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker in parameter server mode.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+
+    def test_training_role(self):
+        # TRAINING_ROLE is not set by setUp; expect ValueError
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase3(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker in parameter server mode.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'TRAINER'
+
+    def test_trainer_id(self):
+        # PADDLE_TRAINER_ID is not set by setUp; expect ValueError
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase4(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker in parameter server mode.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'PSERVER'
+
+    def test_ps_port(self):
+        # PADDLE_PORT is not set by setUp; expect ValueError
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase5(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker in parameter server mode.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'PSERVER'
+        os.environ["PADDLE_PORT"] = str(4001)
+
+    def test_ps_ip(self):
+        # POD_IP is not set by setUp; expect ValueError
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase6(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker in parameter server mode.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ[
+            "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:4003,127.0.0.1:4004"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'HETER_TRAINER'
+
+    def test_heter_port(self):
+        # PADDLE_PORT is not set by setUp; expect ValueError. NOTE(review): Case5 may leak it via os.environ - confirm
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase7(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker in parameter server mode.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ[
+            "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:4003,127.0.0.1:4004"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'HETER_TRAINER'
+        os.environ["PADDLE_PORT"] = str(4003)
+
+    def test_heter_ip(self):
+        # POD_IP is not set by setUp; expect ValueError
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index cc54e680c7525..f258e830b5fe5 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -185,12 +185,7 @@ def func(params_grads):
     # invoke 'set_gradient_clip' in a wrong order
     def test_wrong_API_order(self):
         def backward_func(cost):
-            # no clip gradient
-            def fileter_func(param):
-                return param.name == "fc.w_0"
-
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=5.0, need_clip=fileter_func)
+            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
             fluid.clip.set_gradient_clip(clip)
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
                                                 grad_clip=clip)
@@ -205,11 +200,7 @@ def fileter_func(param):
 
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "x"
-
-        clip = fluid.clip.GradientClipByGlobalNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
             name="x", shape=[2, 3], dtype="float32")
         y = fluid.default_main_program().global_block().create_parameter(
@@ -228,11 +219,6 @@ def fileter_func(param):
 
     # raise typeError
     def test_tpyeError(self):
-        # the type of need_clip must be an funciton
-        with self.assertRaises(TypeError):
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=self.clip_norm, need_clip="test")
-
         # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
         with self.assertRaises(TypeError):
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
@@ -264,26 +250,22 @@ def test_gradient_clip(self):
 
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)
 
         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByNorm: grad should not be clipped when filtered out!")
+            "ClipGradByNorm: grad should not be clipped when filtered out!")
 
 
 class TestGradientClipByValue(TestGradientClip):
@@ -312,26 +294,22 @@ def test_gradient_clip(self):
 
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByValue(
-            self.max, self.min, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByValue(self.max, self.min)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)
 
         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByValue: grad should not be clipped when filtered out!")
+            "ClipGradByValue: grad should not be clipped when filtered out!")
 
 
 class TestDygraphGradientClip(unittest.TestCase):
@@ -355,13 +333,9 @@ def check_clip_result(self, loss, optimizer):
 
 class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of x (ParamBase)
-        def fileter_func(param):
-            return param.name == "x"
-
         self.clip_norm = 0.8
         self.clip1 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+            clip_norm=self.clip_norm)
         self.clip2 = fluid.clip.GradientClipByGlobalNorm(
             clip_norm=self.clip_norm)
 
@@ -401,13 +375,8 @@ def check_clip_result(self, loss, optimizer):
 
 class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.clip_norm = 0.8
-        self.clip = fluid.clip.GradientClipByNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
 
     def check_clip_result(self, loss, optimizer):
         # if grad is None
@@ -435,14 +404,9 @@ def check_clip_result(self, loss, optimizer):
 
 class TestDygraphGradientClipByValue(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.max = 0.2
         self.min = 0.1
-        self.clip = fluid.clip.GradientClipByValue(
-            max=self.max, min=self.min, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
 
     def check_clip_result(self, loss, optimizer):
         # if grad is None
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
index ea94a8ba69a78..9ad0309a70e31 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
@@ -127,5 +127,15 @@ def load_tests(loader, standard_tests, pattern):
     return suite
 
 
+class TestGridSampleAPI(unittest.TestCase):
+    def test_errors(self):
+        with self.assertRaises(ValueError):
+            x = paddle.randn([1, 1, 3, 3])
+            F.grid_sample(x, 1.0)
+        with self.assertRaises(ValueError):
+            x = paddle.randn([1, 1, 3, 3])
+            F.grid_sample(1.0, x)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid.py b/python/paddle/fluid/tests/unittests/test_hsigmoid.py
deleted file mode 100644
index 80937640c2d2f..0000000000000
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle import fluid, nn
-import paddle.fluid.dygraph as dg
-import paddle.nn.functional as F
-import paddle.fluid.initializer as I
-import numpy as np
-import unittest
-
-
-class HSigmoidTestCase(unittest.TestCase):
-    def __init__(self,
-                 methodName="runTest",
-                 batch_size=4,
-                 feature_size=6,
-                 num_classes=8,
-                 labels=None,
-                 path_code=None,
-                 path_table=None,
-                 is_sparse=False,
-                 dtype="float32"):
-        super(HSigmoidTestCase, self).__init__()
-        self.batch_size = batch_size
-        self.feature_size = feature_size
-        self.num_classes = num_classes
-        self.dtype = dtype
-        self.is_sparse = is_sparse
-
-        self.labels = labels
-        self.path_code = path_code
-        self.path_table = path_table
-        self.is_custom = path_code is not None and path_table is not None
-
-    def setUp(self):
-        input_shape = (self.batch_size, self.feature_size)
-        self.input = np.random.uniform(
-            -1, 1, size=input_shape).astype(self.dtype)
-        if self.labels is None:
-            self.labels = np.random.randint(
-                0, self.num_classes, size=(self.batch_size, 1)).astype(np.int64)
-        C = self.num_classes if self.is_custom else self.num_classes - 1
-        self.weight_shape = (C, self.feature_size)
-        self.weight = np.random.randn(*self.weight_shape).astype(self.dtype)
-        self.bias_shape = (C, 1)
-        self.bias = np.random.randn(*self.bias_shape).astype(self.dtype)
-
-    def fluid_layer(self, place):
-        main = fluid.Program()
-        start = fluid.Program()
-        with fluid.unique_name.guard():
-            with fluid.program_guard(main, start):
-                x = fluid.data(
-                    "input", [-1, self.feature_size], dtype=self.dtype)
-                label = fluid.data("labels", [-1, 1], dtype="int64")
-                if self.is_custom:
-                    path_table = fluid.data(
-                        "path_table", [-1, -1], dtype="int64")
-                    path_code = fluid.data("path_code", [-1, -1], dtype="int64")
-                else:
-                    path_table = path_code = None
-                y = fluid.layers.hsigmoid(
-                    x,
-                    label,
-                    self.num_classes,
-                    param_attr=I.NumpyArrayInitializer(self.weight),
-                    bias_attr=I.NumpyArrayInitializer(self.bias),
-                    path_table=path_table,
-                    path_code=path_code,
-                    is_custom=self.is_custom,
-                    is_sparse=self.is_sparse, )
-        exe = fluid.Executor(place)
-        exe.run(start)
-        feed_dict = {"input": self.input, "labels": self.labels}
-        if self.is_custom:
-            feed_dict["path_code"] = self.path_code
-            feed_dict["path_table"] = self.path_table
-        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
-        return y_np
-
-    def functional(self, place):
-        main = fluid.Program()
-        start = fluid.Program()
-        with fluid.unique_name.guard():
-            with fluid.program_guard(main, start):
-                x = fluid.data(
-                    "input", [-1, self.feature_size], dtype=self.dtype)
-                label = fluid.data("labels", [-1, 1], dtype="int64")
-                if self.is_custom:
-                    path_table = fluid.data(
-                        "path_table", [-1, -1], dtype="int64")
-                    path_code = fluid.data("path_code", [-1, -1], dtype="int64")
-                else:
-                    path_table = path_code = None
-                w = fluid.data("weight", self.weight_shape, dtype=self.dtype)
-                b = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.hsigmoid(
-                    x,
-                    label,
-                    w,
-                    b,
-                    self.num_classes,
-                    is_sparse=self.is_sparse,
-                    path_table=path_table,
-                    path_code=path_code)
-
-        exe = fluid.Executor(place)
-        exe.run(start)
-        feed_dict = {
-            "input": self.input,
-            "labels": self.labels,
-            "weight": self.weight,
-            "bias": self.bias
-        }
-        if self.is_custom:
-            feed_dict["path_code"] = self.path_code
-            feed_dict["path_table"] = self.path_table
-        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
-        return y_np
-
-    def nn_layer(self, place):
-        with dg.guard(place):
-            x_var = dg.to_variable(self.input)
-            label_var = dg.to_variable(self.labels)
-            if self.is_custom:
-                path_code_var = dg.to_variable(self.path_code)
-                path_table_var = dg.to_variable(self.path_table)
-            else:
-                path_code_var = path_table_var = None
-            hierarchical_softmax = nn.HSigmoid(
-                self.feature_size,
-                self.num_classes,
-                is_custom=self.is_custom,
-                is_sparse=self.is_sparse,
-                param_attr=I.NumpyArrayInitializer(self.weight),
-                bias_attr=I.NumpyArrayInitializer(self.bias),
-                dtype=self.dtype)
-            y_var = hierarchical_softmax(
-                x_var,
-                label_var,
-                path_table=path_table_var,
-                path_code=path_code_var)
-            y_np = y_var.numpy()
-        return y_np
-
-    def _test_equivalence(self, place):
-        result1 = self.fluid_layer(place)
-        result2 = self.functional(place)
-        result3 = self.nn_layer(place)
-        np.testing.assert_array_almost_equal(result1, result2)
-        np.testing.assert_array_almost_equal(result2, result3)
-
-    def runTest(self):
-        place = fluid.CPUPlace()
-        self._test_equivalence(place)
-
-
-class HSigmoidTestErrorCase(HSigmoidTestCase):
-    def runTest(self):
-        place = fluid.CPUPlace()
-        with dg.guard(place):
-            with self.assertRaises(ValueError):
-                self.nn_layer()
-
-    def nn_layer(self):
-        x_var = dg.to_variable(self.input)
-        label_var = dg.to_variable(self.labels)
-        if self.is_custom:
-            path_code_var = dg.to_variable(self.path_code)
-            path_table_var = dg.to_variable(self.path_table)
-        else:
-            path_code_var = path_table_var = None
-        hierarchical_softmax = nn.HSigmoid(
-            self.feature_size,
-            self.num_classes,
-            is_custom=self.is_custom,
-            param_attr=I.NumpyArrayInitializer(self.weight),
-            bias_attr=I.NumpyArrayInitializer(self.bias),
-            dtype=self.dtype)
-        y_var = hierarchical_softmax(
-            x_var,
-            label_var,
-            path_table=path_table_var,
-            path_code=path_code_var)
-        y_np = y_var.numpy()
-        return y_np
-
-
-def load_tests(loader, standard_tests, pattern):
-    suite = unittest.TestSuite()
-    suite.addTest(HSigmoidTestCase(methodName="runTest"))
-    suite.addTest(
-        HSigmoidTestCase(
-            methodName="runTest",
-            batch_size=4,
-            feature_size=6,
-            num_classes=8,
-            labels=np.array([0, 1, 4, 5]).astype(np.int64),
-            path_table=np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (
-                0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64),
-            path_code=np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
-                1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64)))
-    suite.addTest(HSigmoidTestErrorCase(methodName="runTest", num_classes=1))
-    return suite
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index 5c9867e681524..3f8eed08adf68 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -19,10 +19,13 @@
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
+import paddle.nn.functional as F
 from paddle.fluid import Program, program_guard
+import paddle.fluid.initializer as I
 import math
 from op_test import OpTest, skip_check_grad_ci
 
+paddle.enable_static()
 np.random.seed(100)
 
 
@@ -56,7 +59,6 @@ def cal_index(self, bit):
     def get_length(self):
         length = 0
         for ele in self.ptable_[self.index_]:  # find the first -1 to stop trace
-
             if ele >= 0:
                 length = length + 1
             else:
@@ -388,8 +390,192 @@ def test_check_grad(self):
         self.check_grad(['X', 'W'], ['Out'], no_grad_set=set('Label'))
 
 
-class TestHSigmoidOpError(unittest.TestCase):
+class TestHSigmoidLossAPI(unittest.TestCase):
+    # test paddle.nn.functional.hsigmoid_loss, paddle.nn.HSigmoidLoss
+    def setUp(self):
+        self.dtype = 'float32'
+        self.batch_size = 4
+        self.feature_size = 6
+        self.num_classes = 8
+        self.is_custom = False
+        self.place = paddle.CPUPlace()
+
+        paddle.set_default_dtype(self.dtype)
+
+        self.x_np = np.random.uniform(
+            -1, 1, [self.batch_size, self.feature_size]).astype(self.dtype)
+        self.labels_np = np.random.randint(
+            self.num_classes, size=(self.batch_size, 1), dtype='int64')
+        self.weight_np = np.random.uniform(
+            -1, 1, [self.num_classes - 1, self.feature_size]).astype(self.dtype)
+        self.bias_np = np.random.uniform(-1, 1, (
+            self.num_classes - 1, )).astype(self.dtype)
+        self.path_table_np = None
+        self.path_code_np = None
+        _, self.out_np = hsigmoid(self.x_np, self.weight_np, self.labels_np,
+                                  self.bias_np, self.num_classes)
+        self.set_attrs()
+
+        if self.is_custom:
+            _, self.out_np = hsigmoidWithCustomTree(
+                self.x_np, self.weight_np, self.path_table_np,
+                self.path_code_np, self.labels_np,
+                self.bias_np.reshape(-1, 1), self.num_classes)
+
+    def set_attrs(self):
+        pass
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        labels = paddle.to_tensor(self.labels_np)
+        weight = paddle.to_tensor(self.weight_np)
+        bias = paddle.to_tensor(self.bias_np)
+        path_table = None
+        path_code = None
+        if self.is_custom:
+            path_table = paddle.to_tensor(self.path_table_np)
+            path_code = paddle.to_tensor(self.path_code_np)
+        out1 = F.hsigmoid_loss(x, labels, self.num_classes, weight, bias,
+                               path_table, path_code)
+
+        weight_attr = I.NumpyArrayInitializer(self.weight_np)
+        bias_attr = I.NumpyArrayInitializer(self.bias_np)
+        m = paddle.nn.HSigmoidLoss(self.feature_size, self.num_classes,
+                                   weight_attr, bias_attr, self.is_custom)
+        out2 = m(x, labels, path_table, path_code)
+
+        for out in [out1, out2]:
+            self.assertTrue(np.allclose(self.out_np, out.numpy()))
+        paddle.enable_static()
+
+    def test_static_api(self):
+        train_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+        with paddle.static.program_guard(train_program, startup_program):
+            x = paddle.static.data('x', [-1, self.feature_size])
+            labels = paddle.static.data('labels', [-1, 1], 'int64')
+            weight = paddle.static.data('weight', [-1, self.feature_size])
+            bias = paddle.static.data('bias', [-1, ])
+            path_table = None
+            path_code = None
+            if self.is_custom:
+                path_table = paddle.static.data('path_table', [-1, -1], 'int64')
+                path_code = paddle.static.data('path_code', [-1, -1], 'int64')
+            out1 = F.hsigmoid_loss(x, labels, self.num_classes, weight, bias,
+                                   path_table, path_code)
+
+            weight_attr = paddle.framework.ParamAttr(
+                initializer=I.NumpyArrayInitializer(self.weight_np))
+            bias_attr = paddle.framework.ParamAttr(
+                initializer=I.NumpyArrayInitializer(self.bias_np))
+            m = paddle.nn.HSigmoidLoss(self.feature_size, self.num_classes,
+                                       weight_attr, bias_attr, self.is_custom)
+            out2 = m(x, labels, path_table, path_code)
+
+            exe = paddle.static.Executor(self.place)
+            exe.run(startup_program)
+            feed_dict = {
+                'x': self.x_np,
+                'labels': self.labels_np,
+                'weight': self.weight_np,
+                'bias': self.bias_np
+            }
+            if self.is_custom:
+                feed_dict["path_code"] = self.path_code_np
+                feed_dict["path_table"] = self.path_table_np
+            ret1, ret2 = exe.run(train_program,
+                                 feed=feed_dict,
+                                 fetch_list=[out1, out2])
+
+            for ret in [ret1, ret2]:
+                self.assertTrue(np.allclose(self.out_np, ret))
+
+    def test_fluid_api(self):
+        train_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(train_program, startup_program):
+            x = fluid.data('x', [-1, self.feature_size])
+            labels = fluid.data('labels', [-1, 1], 'int64')
+            path_table = None
+            path_code = None
+            if self.is_custom:
+                path_table = fluid.data('path_table', [-1, -1], 'int64')
+                path_code = fluid.data('path_code', [-1, -1], 'int64')
+            weight_attr = I.NumpyArrayInitializer(self.weight_np)
+            bias_attr = I.NumpyArrayInitializer(self.bias_np)
+            out = fluid.layers.hsigmoid(x, labels, self.num_classes,
+                                        weight_attr, bias_attr, 'out',
+                                        path_table, path_code, self.is_custom)
+
+            exe = fluid.Executor(self.place)
+            exe.run(startup_program)
+            feed_dict = {'x': self.x_np, 'labels': self.labels_np}
+            if self.is_custom:
+                feed_dict["path_code"] = self.path_code_np
+                feed_dict["path_table"] = self.path_table_np
+            ret, = exe.run(train_program, feed=feed_dict, fetch_list=[out])
+
+            self.assertTrue(np.allclose(ret, self.out_np))
+
     def test_errors(self):
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            # test paddle.nn.HSigmoidLoss
+            self.assertRaises(ValueError, paddle.nn.HSigmoidLoss, 6, 1)
+
+            # test paddle.nn.functional.hsigmoid_loss
+            x = paddle.static.data('x', [4, 6])
+            label = paddle.static.data('label', [4, 1], 'int64')
+            weight = paddle.static.data('weight', [7, 6])
+            bias = paddle.static.data('bias', [7])
+
+            x_int32 = paddle.static.data('x_int32', [4, 6], 'int32')
+            self.assertRaises(TypeError, F.hsigmoid_loss, x_int32, label, 8,
+                              weight)
+
+            label_float32 = paddle.static.data('label_float32', [4, 1],
+                                               'float32')
+            self.assertRaises(TypeError, F.hsigmoid_loss, x, label_float32, 8,
+                              weight)
+
+            weight_int32 = paddle.static.data('weight_int32', [7, 6], 'int32')
+            self.assertRaises(TypeError, F.hsigmoid_loss, x, label, 8,
+                              weight_int32)
+
+            bias_int32 = paddle.static.data('bias_int32', [7], 'int32')
+            self.assertRaises(
+                TypeError,
+                F.hsigmoid_loss,
+                x,
+                label,
+                8,
+                weight,
+                bias=bias_int32)
+
+            path_table_int32 = paddle.static.data('path_table_int32', [7],
+                                                  'int32')
+            self.assertRaises(
+                TypeError,
+                F.hsigmoid_loss,
+                x,
+                label,
+                8,
+                weight,
+                path_table=path_table_int32)
+
+            path_code_int32 = paddle.static.data('path_code_int32', [7],
+                                                 'int32')
+            self.assertRaises(
+                TypeError,
+                F.hsigmoid_loss,
+                x,
+                label,
+                8,
+                weight,
+                path_code=path_code_int32)
+
+        # test paddle.fluid.layers.hsigmoid
         with program_guard(Program()):
             label = fluid.data('label', [4, 1], 'int64')
             # The input type must be Variable.
@@ -410,5 +596,17 @@ def test_errors(self):
                               label_int32, 2)
 
 
+class TestHSigmoidLossAPICustom(TestHSigmoidLossAPI):
+    def set_attrs(self):
+        self.is_custom = True
+        self.path_table_np = np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (
+            0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64)
+        self.path_code_np = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
+            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64)
+
+    def test_errors(self):
+        pass
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
index fdf7adbfb45f0..71381ecfde738 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -196,6 +196,84 @@ def test_nan_inf(self):
                     np.array_equal(param.numpy(), params_init[param.name]))
 
 
+class TestResnet2(unittest.TestCase):
+    """Runs one ResNet step with and without AMP and compares the losses."""
+    def train_resnet(self, enable_amp=True):
+        """Train on one batch in dygraph mode; return (loss, params, grads)."""
+        seed = 90
+
+        batch_size = train_parameters["batch_size"]
+        batch_num = 1
+
+        paddle.disable_static()
+
+        paddle.manual_seed(seed)
+        paddle.framework.random._manual_program_seed(seed)
+
+        resnet = ResNet(use_cudnn=True)
+        optimizer = optimizer_setting(
+            train_parameters, parameter_list=resnet.parameters())
+        np.random.seed(seed)
+        train_reader = paddle.batch(
+            paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
+
+        dy_param_init_value = {}
+        for param in resnet.parameters():
+            dy_param_init_value[param.name] = param.numpy()
+
+        scaler = paddle.amp.GradScaler(
+            enable=enable_amp, init_loss_scaling=2.**10)
+
+        for batch_id, data in enumerate(train_reader()):
+            if batch_id >= batch_num:
+                break
+            dy_x_data = np.array(
+                [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
+            labels = np.array([x[1] for x in data]).astype('int64')
+            if len(labels) != batch_size:
+                continue
+            y_data = labels.reshape(-1, 1)
+            img = paddle.to_tensor(dy_x_data)
+            label = paddle.to_tensor(y_data)
+            label.stop_gradient = True
+
+            with paddle.amp.auto_cast(enable=enable_amp):
+                out = resnet(img)
+
+            loss = paddle.nn.functional.cross_entropy(input=out, label=label)
+            avg_loss = paddle.mean(x=loss)
+
+            dy_out = avg_loss.numpy()
+
+            scaled_loss = scaler.scale(avg_loss)
+            scaled_loss.backward()
+
+            scaler.minimize(optimizer, scaled_loss)
+
+            dy_grad_value = {}
+            for param in resnet.parameters():
+                if param.trainable:
+                    np_array = np.array(param._grad_ivar().value().get_tensor())
+                    dy_grad_value[param.name + fluid.core.grad_var_suffix(
+                    )] = np_array
+
+            resnet.clear_gradients()
+
+            dy_param_value = {}
+            for param in resnet.parameters():
+                dy_param_value[param.name] = param.numpy()
+
+        # Re-enable static mode after the loop so a skipped batch cannot skip it.
+        paddle.enable_static()
+        return dy_out, dy_param_value, dy_grad_value
+
+    def test_resnet(self):
+        out_fp32 = self.train_resnet(enable_amp=False)
+        out_amp = self.train_resnet(enable_amp=True)
+        print(out_fp32[0], out_amp[0])
+        self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2))
+
+
 class TestResnet(unittest.TestCase):
     def train_resnet(self, enable_amp=True):
         seed = 90
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
index bee53fd10f5fe..45709a358635c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
@@ -917,11 +917,6 @@ def test_load_compatible_with_keep_name_table(self):
             state_dict = emb.state_dict()
             fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy'))
 
-            para_state_dict, opti_state_dict = fluid.load_dygraph(
-                os.path.join('saved_dy', 'emb_dy'), True)
-            self.assertTrue(para_state_dict != None)
-            self.assertTrue(opti_state_dict == None)
-
             para_state_dict, opti_state_dict = fluid.load_dygraph(
                 os.path.join('saved_dy', 'emb_dy'), keep_name_table=True)
             self.assertTrue(para_state_dict != None)
diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py
new file mode 100644
index 0000000000000..6ad19658fd203
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py
@@ -0,0 +1,108 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+
+import paddle
+import paddle.nn as nn
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+import paddle.nn.initializer as initializer
+from paddle.fluid.core import VarDesc
+
+DELTA = 0.00001
+
+
+def check_cast_op(op):  # True only for a `cast` op converting FP32 -> FP16
+    fp32, fp16 = VarDesc.VarType.FP32, VarDesc.VarType.FP16
+    return (op.type == 'cast' and op.attr('in_dtype') == fp32 and
+            op.attr('out_dtype') == fp16)
+
+
+class TestConstantInitializer(unittest.TestCase):
+    """Exercises paddle.nn.initializer.Constant in static and dygraph modes."""
+    def static_test_constant_initializer_common(self,
+                                                init_inst,
+                                                dtype="float32",
+                                                value_target=0.0):
+        """Create a static-graph parameter and verify its initializer ops."""
+        paddle.enable_static()
+        try:
+            program = framework.Program()
+            block = program.global_block()
+            for _ in range(2):
+                block.create_parameter(
+                    dtype=dtype,
+                    shape=[5, 10],
+                    lod_level=0,
+                    name="param",
+                    initializer=init_inst)
+            num_ops = 2 if dtype == "float16" else 1
+            self.assertEqual(len(block.ops), num_ops)
+            init_op = block.ops[0]
+            self.assertEqual(init_op.type, 'fill_constant')
+            self.assertAlmostEqual(
+                init_op.attr('value'), value_target, delta=DELTA)
+        finally:  # leave static mode even when an assertion above fails
+            paddle.disable_static()
+        return block
+
+    def test_constant_initializer_default_value_static(self, dtype="float32"):
+        """Test the constant initializer with default value in static graph"""
+        block = self.static_test_constant_initializer_common(
+            init_inst=initializer.Constant(), dtype=dtype, value_target=0.0)
+        return block
+
+    def test_constant_initializer_default_value_dygraph(self, dtype="float32"):
+        """Test constant initializer with default value in dygraph"""
+        with fluid.dygraph.guard():
+            linear = nn.Linear(2, 4, weight_attr=nn.initializer.Constant())
+            mat_target = np.ones((2, 4), dtype=dtype) * 0.0
+            mat_linear = linear.weight.numpy()
+            mismatch = np.sum(
+                (mat_target - mat_linear) * (mat_target - mat_linear))
+            self.assertAlmostEqual(mismatch, 0.0, delta=DELTA)
+
+    def test_constant_initializer_static(self, dtype="float32"):
+        """Test constant initializer with supplied value in static graph"""
+        block = self.static_test_constant_initializer_common(
+            init_inst=initializer.Constant(2.3), dtype=dtype, value_target=2.3)
+        return block
+
+    def test_constant_initializer_dygraph(self, dtype="float32"):
+        """Test constant initializer with supplied value in dygraph"""
+        with fluid.dygraph.guard():
+            linear = nn.Linear(
+                2, 4, weight_attr=nn.initializer.Constant(value=2.0))
+            mat_target = np.ones((2, 4), dtype=dtype) * 2.0
+            mat_linear = linear.weight.numpy()
+            mismatch = np.sum(
+                (mat_target - mat_linear) * (mat_target - mat_linear))
+            self.assertAlmostEqual(mismatch, 0.0, delta=DELTA)
+
+    def test_constant_initializer_fp16(self):
+        """Test constant initializer with float16"""
+        block = self.test_constant_initializer_default_value_static("float16")
+        self.assertTrue(check_cast_op(block.ops[1]))
+        block = self.test_constant_initializer_static("float16")
+        self.assertTrue(check_cast_op(block.ops[1]))
+        self.test_constant_initializer_default_value_dygraph("float16")
+        self.test_constant_initializer_dygraph("float16")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index 9940424618504..71ec1271a041e 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -23,7 +23,7 @@
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
 from paddle.fluid.dygraph import declarative, ProgramTranslator
-from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX
 
 BATCH_SIZE = 32
 BATCH_NUM = 10
@@ -127,8 +127,8 @@ class MultiLoadingLinearNet(fluid.dygraph.Layer):
     def __init__(self, size, model_path):
         super(MultiLoadingLinearNet, self).__init__()
         self._linear = Linear(size, size)
-        self._load_linear1 = fluid.dygraph.jit.load(model_path)
-        self._load_linear2 = fluid.dygraph.jit.load(model_path)
+        self._load_linear1 = paddle.jit.load(model_path)
+        self._load_linear2 = paddle.jit.load(model_path)
 
     @declarative
     def forward(self, x):
@@ -218,23 +218,20 @@ def train_with_label(layer, input_size=784, label_size=1):
 
 class TestJitSaveLoad(unittest.TestCase):
     def setUp(self):
-        self.model_path = "model.test_jit_save_load"
+        self.model_path = "test_jit_save_load/model"
         # enable dygraph mode
         fluid.enable_dygraph()
         # config seed
         paddle.manual_seed(SEED)
         paddle.framework.random._manual_program_seed(SEED)
 
-    def train_and_save_model(self, model_path=None, configs=None):
+    def train_and_save_model(self, model_path=None):
         layer = LinearNet(784, 1)
         example_inputs, layer, _ = train(layer)
         final_model_path = model_path if model_path else self.model_path
         orig_input_types = [type(x) for x in example_inputs]
-        fluid.dygraph.jit.save(
-            layer=layer,
-            model_path=final_model_path,
-            input_spec=example_inputs,
-            configs=configs)
+        paddle.jit.save(
+            layer=layer, path=final_model_path, input_spec=example_inputs)
         new_input_types = [type(x) for x in example_inputs]
         self.assertEqual(orig_input_types, new_input_types)
         return layer
@@ -243,13 +240,10 @@ def test_save_load(self):
         # train and save model
         train_layer = self.train_and_save_model()
         # load model
-        program_translator = ProgramTranslator()
-        program_translator.enable(False)
-        loaded_layer = fluid.dygraph.jit.load(self.model_path)
+        loaded_layer = paddle.jit.load(self.model_path)
         self.load_and_inference(train_layer, loaded_layer)
         self.load_dygraph_state_dict(train_layer)
         self.load_and_finetune(train_layer, loaded_layer)
-        program_translator.enable(True)
 
     def load_and_inference(self, train_layer, infer_layer):
         train_layer.eval()
@@ -274,7 +268,7 @@ def load_dygraph_state_dict(self, train_layer):
         # construct new model
         new_layer = LinearNet(784, 1)
         orig_state_dict = new_layer.state_dict()
-        load_state_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
+        load_state_dict = paddle.load(self.model_path)
         for structured_name in orig_state_dict:
             self.assertTrue(structured_name in load_state_dict)
         new_layer.set_state_dict(load_state_dict)
@@ -286,20 +280,24 @@ def load_dygraph_state_dict(self, train_layer):
             np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
 
     def test_load_dygraph_no_path(self):
-        model_path = "model.test_jit_save_load.no_path"
-        new_layer = LinearNet(784, 1)
+        model_path = "test_jit_save_load.no_path/model_path"
         with self.assertRaises(ValueError):
             model_dict, _ = fluid.dygraph.load_dygraph(model_path)
 
     def test_jit_load_model_incomplete(self):
-        model_path = "model.test_jit_save_load.remove_variables"
-        self.train_and_save_model(model_path=model_path)
-        # remove `__variables__`	
-        var_path = os.path.join(model_path, VARIABLE_FILENAME)
+        model_path = "test_jit_save_load.remove_variables/model"
+        self.train_and_save_model(model_path)
+        # remove the saved `.pdiparams` file
+        var_path = model_path + INFER_PARAMS_SUFFIX
         os.remove(var_path)
         with self.assertRaises(ValueError):
             paddle.jit.load(model_path)
 
+    def test_jit_load_no_path(self):
+        """Expect ValueError from paddle.jit.load on the `no_path` prefix."""
+        with self.assertRaises(ValueError):
+            paddle.jit.load("test_jit_save_load.no_path/model_path")
+
 
 class TestSaveLoadWithInputSpec(unittest.TestCase):
     def setUp(self):
@@ -313,8 +311,7 @@ def test_with_input_spec(self):
             net.forward, input_spec=[InputSpec(
                 [None, 8], name='x')])
 
-        model_path = "model.input_spec.output_spec"
-        configs = fluid.dygraph.jit.SaveLoadConfig()
+        model_path = "input_spec.output_spec/model"
         # check inputs and outputs
         self.assertTrue(len(net.forward.inputs) == 1)
         input_x = net.forward.inputs[0]
@@ -322,11 +319,11 @@ def test_with_input_spec(self):
         self.assertTrue(input_x.name == 'x')
 
         # 1. prune loss
-        configs.output_spec = net.forward.outputs[:1]
-        fluid.dygraph.jit.save(net, model_path, configs=configs)
+        output_spec = net.forward.outputs[:1]
+        paddle.jit.save(net, model_path, output_spec=output_spec)
 
         # 2. load to infer
-        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        infer_layer = paddle.jit.load(model_path)
         x = fluid.dygraph.to_variable(
             np.random.random((4, 8)).astype('float32'))
         pred = infer_layer(x)
@@ -334,8 +331,7 @@ def test_with_input_spec(self):
     def test_multi_in_out(self):
         net = LinearNetMultiInput(8, 8)
 
-        model_path = "model.multi_inout.output_spec1"
-        configs = fluid.dygraph.jit.SaveLoadConfig()
+        model_path = "multi_inout.output_spec1/model"
         # 1. check inputs and outputs
         self.assertTrue(len(net.forward.inputs) == 2)
         input_x = net.forward.inputs[0]
@@ -344,11 +340,11 @@ def test_multi_in_out(self):
         self.assertTrue(input_y.shape == (-1, 8))
 
         # 2. prune loss
-        configs.output_spec = net.forward.outputs[:2]
-        fluid.dygraph.jit.save(net, model_path, configs=configs)
+        output_spec = net.forward.outputs[:2]
+        paddle.jit.save(net, model_path, output_spec=output_spec)
 
         # 3. load to infer
-        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        infer_layer = paddle.jit.load(model_path)
         x = fluid.dygraph.to_variable(
             np.random.random((4, 8)).astype('float32'))
         y = fluid.dygraph.to_variable(
@@ -357,11 +353,11 @@ def test_multi_in_out(self):
         pred_x, pred_y = infer_layer(x, y)
 
         # 1. prune y and loss
-        model_path = "model.multi_inout.output_spec2"
-        configs.output_spec = net.forward.outputs[:1]
-        fluid.dygraph.jit.save(net, model_path, [input_x], configs)
+        model_path = "multi_inout.output_spec2/model"
+        output_spec = net.forward.outputs[:1]
+        paddle.jit.save(net, model_path, [input_x], output_spec=output_spec)
         # 2. load again
-        infer_layer2 = fluid.dygraph.jit.load(model_path, configs=configs)
+        infer_layer2 = paddle.jit.load(model_path)
         # 3. predict
         pred_xx = infer_layer2(x)
 
@@ -377,44 +373,6 @@ def setUp(self):
         paddle.manual_seed(SEED)
         paddle.framework.random._manual_program_seed(SEED)
 
-    def basic_save_load(self, layer, model_path, configs):
-        # 1. train & save
-        example_inputs, train_layer, _ = train(layer)
-        fluid.dygraph.jit.save(
-            layer=train_layer,
-            model_path=model_path,
-            input_spec=example_inputs,
-            configs=configs)
-        # 2. load 
-        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
-        train_layer.eval()
-        # 3. inference & compare
-        x = fluid.dygraph.to_variable(
-            np.random.random((1, 784)).astype('float32'))
-        self.assertTrue(
-            np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
-
-    def test_model_filename(self):
-        layer = LinearNet(784, 1)
-        model_path = "model.save_load_config.output_spec"
-        configs = fluid.dygraph.jit.SaveLoadConfig()
-        configs.model_filename = "__simplenet__"
-        self.basic_save_load(layer, model_path, configs)
-
-    def test_params_filename(self):
-        layer = LinearNet(784, 1)
-        model_path = "model.save_load_config.params_filename"
-        configs = fluid.dygraph.jit.SaveLoadConfig()
-        configs.params_filename = "__params__"
-        self.basic_save_load(layer, model_path, configs)
-
-    def test_separate_params(self):
-        layer = LinearNet(784, 1)
-        model_path = "model.save_load_config.separate_params"
-        configs = fluid.dygraph.jit.SaveLoadConfig()
-        configs.separate_params = True
-        self.basic_save_load(layer, model_path, configs)
-
     def test_output_spec(self):
         train_layer = LinearNetReturnLoss(8, 8)
         adam = fluid.optimizer.AdamOptimizer(
@@ -427,27 +385,47 @@ def test_output_spec(self):
             adam.minimize(loss)
             train_layer.clear_gradients()
 
-        model_path = "model.save_load_config.output_spec"
-        configs = fluid.dygraph.jit.SaveLoadConfig()
-        configs.output_spec = [out]
-        fluid.dygraph.jit.save(
+        model_path = "save_load_config.output_spec"
+        output_spec = [out]
+        paddle.jit.save(
             layer=train_layer,
-            model_path=model_path,
+            path=model_path,
             input_spec=[x],
-            configs=configs)
+            output_spec=output_spec)
 
         train_layer.eval()
-        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        infer_layer = paddle.jit.load(model_path)
         x = fluid.dygraph.to_variable(
             np.random.random((4, 8)).astype('float32'))
         self.assertTrue(
             np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
 
+    def test_save_no_support_config_error(self):
+        """Expect ValueError when jit.save is given `model_filename`."""
+        net = LinearNet(784, 1)
+        with self.assertRaises(ValueError):
+            paddle.jit.save(net, "no_support_config_test", model_filename="")
+
+    def test_load_empty_model_filename_error(self):
+        """Expect ValueError from jit.load called with model_filename=''."""
+        with self.assertRaises(ValueError):
+            paddle.jit.load("error_model_filename_test", model_filename="")
+
+    def test_load_empty_params_filename_error(self):
+        """Expect ValueError from jit.load called with params_filename=''."""
+        with self.assertRaises(ValueError):
+            paddle.jit.load("error_params_filename_test", params_filename="")
+
+    def test_load_with_no_support_config(self):
+        """Expect ValueError when jit.load is given `separate_params`."""
+        with self.assertRaises(ValueError):
+            paddle.jit.load("no_support_config_test", separate_params=True)
+
 
 class TestJitMultipleLoading(unittest.TestCase):
     def setUp(self):
         self.linear_size = 4
-        self.model_path = "model.jit_multi_load"
+        self.model_path = "jit_multi_load/model"
         # enable dygraph mode
         fluid.enable_dygraph()
         # config seed
@@ -459,8 +437,8 @@ def setUp(self):
     def train_and_save_orig_model(self):
         layer = LinearNet(self.linear_size, self.linear_size)
         example_inputs, layer, _ = train(layer, self.linear_size, 1)
-        fluid.dygraph.jit.save(
-            layer=layer, model_path=self.model_path, input_spec=example_inputs)
+        paddle.jit.save(
+            layer=layer, path=self.model_path, input_spec=example_inputs)
 
     def test_load_model_retransform_inference(self):
         multi_loaded_layer = MultiLoadingLinearNet(self.linear_size,
@@ -475,7 +453,7 @@ def test_load_model_retransform_inference(self):
 class TestJitPruneModelAndLoad(unittest.TestCase):
     def setUp(self):
         self.linear_size = 4
-        self.model_path = "model.jit_prune_model_and_load"
+        self.model_path = "jit_prune_model_and_load/model"
         # enable dygraph mode
         fluid.enable_dygraph()
         # config seed
@@ -494,13 +472,12 @@ def train_and_save(self):
             adam.minimize(loss)
             train_layer.clear_gradients()
 
-        configs = fluid.dygraph.jit.SaveLoadConfig()
-        configs.output_spec = [hidden]
-        fluid.dygraph.jit.save(
+        output_spec = [hidden]
+        paddle.jit.save(
             layer=train_layer,
-            model_path=self.model_path,
+            path=self.model_path,
             input_spec=[x],
-            configs=configs)
+            output_spec=output_spec)
 
         return train_layer
 
@@ -508,7 +485,7 @@ def test_load_pruned_model(self):
         train_layer = self.train_and_save()
         train_layer.eval()
 
-        infer_layer = fluid.dygraph.jit.load(self.model_path)
+        infer_layer = paddle.jit.load(self.model_path)
 
         x = fluid.dygraph.to_variable(
             np.random.random((4, 8)).astype('float32'))
@@ -519,7 +496,7 @@ def test_load_var_not_in_extra_var_info(self):
         self.train_and_save()
 
         # chage extra var info
-        var_info_path = os.path.join(self.model_path, EXTRA_VAR_INFO_FILENAME)
+        var_info_path = self.model_path + INFER_PARAMS_INFO_SUFFIX
         with open(var_info_path, 'rb') as f:
             extra_var_info = pickle.load(f)
             extra_var_info.clear()
@@ -527,7 +504,7 @@ def test_load_var_not_in_extra_var_info(self):
             pickle.dump(extra_var_info, f, protocol=2)
 
         with self.assertRaises(RuntimeError):
-            fluid.dygraph.jit.load(self.model_path)
+            paddle.jit.load(self.model_path)
 
 
 class TestJitSaveMultiCases(unittest.TestCase):
@@ -561,7 +538,7 @@ def test_no_prune_to_static_after_train(self):
 
         train(layer)
 
-        model_path = "test_no_prune_to_static_after_train"
+        model_path = "test_no_prune_to_static_after_train/model"
         paddle.jit.save(layer, model_path)
 
         self.verify_inference_correctness(layer, model_path)
@@ -569,7 +546,7 @@ def test_no_prune_to_static_after_train(self):
     def test_no_prune_to_static_no_train(self):
         layer = LinearNetWithInputSpec(784, 1)
 
-        model_path = "test_no_prune_to_static_no_train"
+        model_path = "test_no_prune_to_static_no_train/model"
         paddle.jit.save(layer, model_path)
 
         self.verify_inference_correctness(layer, model_path)
@@ -579,7 +556,7 @@ def test_no_prune_no_to_static_after_train(self):
 
         train(layer)
 
-        model_path = "test_no_prune_no_to_static_after_train"
+        model_path = "test_no_prune_no_to_static_after_train/model"
         paddle.jit.save(
             layer,
             model_path,
@@ -593,16 +570,15 @@ def test_no_prune_no_to_static_after_train_with_examples(self):
 
         example_inputs, _, _ = train(layer)
 
-        model_path = "test_no_prune_no_to_static_after_train_with_examples"
-        fluid.dygraph.jit.save(
-            layer=layer, model_path=model_path, input_spec=example_inputs)
+        model_path = "test_no_prune_no_to_static_after_train_with_examples/model"
+        paddle.jit.save(layer=layer, path=model_path, input_spec=example_inputs)
 
         self.verify_inference_correctness(layer, model_path)
 
     def test_no_prune_no_to_static_no_train(self):
         layer = LinearNetNotDeclarative(784, 1)
 
-        model_path = "test_no_prune_no_to_static_no_train"
+        model_path = "test_no_prune_no_to_static_no_train/model"
         paddle.jit.save(
             layer,
             model_path,
@@ -616,9 +592,7 @@ def test_prune_to_static_after_train(self):
 
         out = train_with_label(layer)
 
-        model_path = "test_prune_to_static_after_train"
-        configs = paddle.SaveLoadConfig()
-        configs.output_spec = [out]
+        model_path = "test_prune_to_static_after_train/model"
         paddle.jit.save(
             layer,
             model_path,
@@ -626,18 +600,17 @@ def test_prune_to_static_after_train(self):
                 InputSpec(
                     shape=[None, 784], dtype='float32', name="image")
             ],
-            configs=configs)
+            output_spec=[out])
 
         self.verify_inference_correctness(layer, model_path, True)
 
     def test_prune_to_static_no_train(self):
         layer = LinerNetWithLabel(784, 1)
 
-        model_path = "test_prune_to_static_no_train"
-        configs = paddle.SaveLoadConfig()
+        model_path = "test_prune_to_static_no_train/model"
         # TODO: no train, cannot get output_spec var here
         # now only can use index
-        configs.output_spec = layer.forward.outputs[:1]
+        output_spec = layer.forward.outputs[:1]
         paddle.jit.save(
             layer,
             model_path,
@@ -645,7 +618,7 @@ def test_prune_to_static_no_train(self):
                 InputSpec(
                     shape=[None, 784], dtype='float32', name="image")
             ],
-            configs=configs)
+            output_spec=output_spec)
 
         self.verify_inference_correctness(layer, model_path, True)
 
@@ -654,7 +627,7 @@ def test_no_prune_input_spec_name_warning(self):
 
         train(layer)
 
-        model_path = "test_no_prune_input_spec_name_warning"
+        model_path = "test_no_prune_input_spec_name_warning/model"
         paddle.jit.save(
             layer,
             model_path,
@@ -675,18 +648,16 @@ def test_not_prune_output_spec_name_warning(self):
 
         train(layer)
 
-        model_path = "test_not_prune_output_spec_name_warning"
-        configs = paddle.SaveLoadConfig()
+        model_path = "test_not_prune_output_spec_name_warning/model"
         out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
-        configs.output_spec = [out]
-        paddle.jit.save(layer, model_path, configs=configs)
+        paddle.jit.save(layer, model_path, output_spec=[out])
 
         self.verify_inference_correctness(layer, model_path)
 
     def test_prune_input_spec_name_error(self):
         layer = LinerNetWithLabel(784, 1)
 
-        model_path = "test_prune_input_spec_name_error"
+        model_path = "test_prune_input_spec_name_error/model"
         with self.assertRaises(ValueError):
             paddle.jit.save(
                 layer,
@@ -707,10 +678,8 @@ def test_prune_output_spec_name_error(self):
 
         train_with_label(layer)
 
-        model_path = "test_prune_to_static_after_train"
-        configs = paddle.SaveLoadConfig()
+        model_path = "test_prune_to_static_after_train/model"
         out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
-        configs.output_spec = [out]
         with self.assertRaises(ValueError):
             paddle.jit.save(
                 layer,
@@ -719,12 +688,12 @@ def test_prune_output_spec_name_error(self):
                     InputSpec(
                         shape=[None, 784], dtype='float32', name="image")
                 ],
-                configs=configs)
+                output_spec=[out])
 
 
 class TestJitSaveLoadEmptyLayer(unittest.TestCase):
     def setUp(self):
-        self.model_path = "model.jit_save_load_empty_layer"
+        self.model_path = "jit_save_load_empty_layer/model"
         # enable dygraph mode
         paddle.disable_static()
 
@@ -740,7 +709,7 @@ def test_save_load_empty_layer(self):
 
 class TestJitSaveLoadNoParamLayer(unittest.TestCase):
     def setUp(self):
-        self.model_path = "model.jit_save_load_no_param_layer"
+        self.model_path = "jit_save_load_no_param_layer/model"
         # enable dygraph mode
         paddle.disable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
index 3a3b7071e04dc..aaba571e1a6b9 100644
--- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
@@ -115,5 +115,20 @@ def test_kl_loss_static_api(self):
         pred_loss = paddle.nn.functional.kl_div(input, label)
 
 
+class TestKLDivLossTypePromotion(unittest.TestCase):
+    def test_kl_div_promotion(self):
+        """Smoke test: mixed float32/float64 kl_div calls; nothing is asserted."""
+        with paddle.fluid.dygraph.guard():
+            x1 = paddle.rand([5, 20], dtype='float32')
+            target1 = paddle.rand([5, 20], dtype='float64')
+            # float32 input with float64 target, through the KLDivLoss layer
+            kldiv_criterion = paddle.nn.KLDivLoss()
+            pred_loss1 = kldiv_criterion(x1, target1)
+            # float64 input with float32 target, through the functional API
+            x2 = paddle.rand([5, 20], dtype='float64')
+            target2 = paddle.rand([5, 20], dtype='float32')
+            pred_loss2 = paddle.nn.functional.kl_div(x2, target2)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 26073f49bdd3d..e0ec676f1b14c 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1657,21 +1657,6 @@ def test_eye_op(self):
         with self.assertRaises(TypeError):
             layers.eye(num_rows=3, batch_shape=[-1])
 
-    def test_hard_swish(self):
-        with self.static_graph():
-            t = layers.data(name='t', shape=[3, 3], dtype='float32')
-            ret = layers.hard_swish(t)
-            static_ret = self.get_static_graph_result(
-                feed={'t': np.ones(
-                    [3, 3], dtype='float32')}, fetch_list=[ret])[0]
-
-        with self.dynamic_graph():
-            t = np.ones([3, 3], dtype='float32')
-            dy_ret = layers.hard_swish(base.to_variable(t))
-            dy_ret_rlt = dy_ret.numpy()
-
-        self.assertTrue(np.allclose(static_ret, dy_ret_rlt))
-
     def test_while_loop(self):
         with self.static_graph():
             i = layers.fill_constant(shape=[1], dtype='int64', value=0)
@@ -2563,13 +2548,6 @@ def make_l2_normalize(self):
             output = layers.l2_normalize(x, axis=1)
             return output
 
-    def make_maxout(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            data = self._get_data(name='x', shape=[8, 6, 6], dtype="float32")
-            output = layers.maxout(x=data, groups=2)
-            return (output)
-
     def make_crop(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
@@ -2656,13 +2634,6 @@ def make_prelu(self):
                 name='prelu')
             return (out)
 
-    def make_brelu(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu')
-            return (out)
-
     def make_soft_relu(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
diff --git a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py
new file mode 100644
index 0000000000000..1d7f986507ca9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+import unittest
+import numpy as np
+from op_test import OpTest, randomize_probability
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestLoadOpXpu(unittest.TestCase):
+    """Save an fc weight on an XPU place and read it back with the load op.
+
+    The parameter is written to a per-test temporary directory, so runs
+    neither share nor leave behind a fixed path on disk.
+    """
+
+    def setUp(self):
+        self.ones = np.ones((4, 4)).astype('float32')
+        # Fresh directory per test; addCleanup also fires if setUp fails later.
+        self.model_dir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, self.model_dir, ignore_errors=True)
+        main_prog = fluid.Program()
+        start_prog = fluid.Program()
+        with fluid.program_guard(main_prog, start_prog):
+            data = fluid.data('input', shape=[-1, 4], dtype='float32')
+            layers.fc(
+                data,
+                4,
+                param_attr=fluid.ParamAttr(
+                    name='w',
+                    initializer=fluid.initializer.NumpyArrayInitializer(
+                        self.ones)))
+        exe = fluid.Executor(fluid.XPUPlace(0))
+        exe.run(start_prog)
+        fluid.io.save_persistables(
+            exe, dirname=self.model_dir, main_program=main_prog)
+
+    def test_load_xpu(self):
+        main_prog = fluid.Program()
+        start_prog = fluid.Program()
+        with fluid.program_guard(main_prog, start_prog):
+            var = layers.create_tensor(dtype='float32')
+            # 'w' is the parameter name given to the fc weight in setUp.
+            layers.load(var, file_path=os.path.join(self.model_dir, 'w'))
+
+        exe = fluid.Executor(fluid.XPUPlace(0))
+        exe.run(start_prog)
+        ret = exe.run(main_prog, fetch_list=[var.name])
+        self.assertTrue(np.array_equal(self.ones, ret[0]))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
index fdc1e6b52aba1..35ad6fdb30e7b 100644
--- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
+++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
@@ -63,6 +63,8 @@ def setUp(self):
         self.epoch_num = 1
         self.batch_size = 128
         self.batch_num = 10
+        # enable static mode
+        paddle.enable_static()
 
     def train_and_save_model(self, only_params=False):
         with new_program_scope():
@@ -136,13 +138,12 @@ def test_load_with_model_filename(self):
         self.params_filename = None
         orig_param_dict = self.train_and_save_model()
 
-        config = paddle.SaveLoadConfig()
-        config.separate_params = True
-        config.model_filename = self.model_filename
-        load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config)
+        load_param_dict, _ = fluid.load_dygraph(
+            self.save_dirname, model_filename=self.model_filename)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
-        new_load_param_dict = paddle.load(self.save_dirname, config)
+        new_load_param_dict = paddle.load(
+            self.save_dirname, model_filename=self.model_filename)
         self.check_load_state_dict(orig_param_dict, new_load_param_dict)
 
     def test_load_with_param_filename(self):
@@ -151,12 +152,12 @@ def test_load_with_param_filename(self):
         self.params_filename = "static_mnist.params"
         orig_param_dict = self.train_and_save_model()
 
-        config = paddle.SaveLoadConfig()
-        config.params_filename = self.params_filename
-        load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config)
+        load_param_dict, _ = fluid.load_dygraph(
+            self.save_dirname, params_filename=self.params_filename)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
-        new_load_param_dict = paddle.load(self.save_dirname, config)
+        new_load_param_dict = paddle.load(
+            self.save_dirname, params_filename=self.params_filename)
         self.check_load_state_dict(orig_param_dict, new_load_param_dict)
 
     def test_load_with_model_and_param_filename(self):
@@ -165,13 +166,16 @@ def test_load_with_model_and_param_filename(self):
         self.params_filename = "static_mnist.params"
         orig_param_dict = self.train_and_save_model()
 
-        config = paddle.SaveLoadConfig()
-        config.params_filename = self.params_filename
-        config.model_filename = self.model_filename
-        load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config)
+        load_param_dict, _ = fluid.load_dygraph(
+            self.save_dirname,
+            params_filename=self.params_filename,
+            model_filename=self.model_filename)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
-        new_load_param_dict = paddle.load(self.save_dirname, config)
+        new_load_param_dict = paddle.load(
+            self.save_dirname,
+            params_filename=self.params_filename,
+            model_filename=self.model_filename)
         self.check_load_state_dict(orig_param_dict, new_load_param_dict)
 
     def test_load_state_dict_from_save_params(self):
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index 29a0fa55f7729..82443f8c5493b 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -20,14 +20,44 @@
 
 import paddle.fluid.core as core
 from op_test import OpTest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
+import random
+random.seed(2)
+np.set_printoptions(threshold=np.inf)
+paddle.enable_static()
 
 SIGMOID_THRESHOLD_MIN = -40.0
 SIGMOID_THRESHOLD_MAX = 13.0
 EXP_MAX_INPUT = 40.0
 
 
+class RandomWeight:
+    """Shared holder for the randomly drawn LSTM weights and biases."""
+
+    def updata_weight(self, hidden_size, input_size, dtype):
+        bound = 1.0 / math.sqrt(hidden_size)
+        self.hidden_size = hidden_size
+        self.input_size = input_size
+        self.dtype = dtype
+
+        def draw(shape):
+            # Uniform sample in [-bound, bound), cast to the requested dtype.
+            return np.random.uniform(
+                low=-bound, high=bound, size=shape).astype(dtype)
+
+        # Draw order (w_ih, w_hh, b_ih, b_hh) fixes the RNG stream; keep it.
+        gates = 4 * hidden_size
+        self.weight_ih = draw((gates, input_size))
+        self.weight_hh = draw((gates, hidden_size))
+        self.bias_ih = draw(gates)
+        self.bias_hh = draw(gates)
+
+
+weight = RandomWeight()
+
+
 class LayerMixin(object):
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
@@ -51,16 +81,13 @@ def __init__(self, input_size, hidden_size, bias=True):
         self.bias = bias
         self.dtype = np.float64
         self.parameters = dict()
-        std = 1.0 / math.sqrt(hidden_size)
-        self.weight_ih = np.ones(
-            (4 * hidden_size, input_size), dtype=self.dtype)
-        self.weight_hh = np.ones((4 * hidden_size,
-                                  hidden_size)).astype(self.dtype)
+        self.weight_ih = weight.weight_ih
+        self.weight_hh = weight.weight_hh
         self.parameters['weight_ih'] = self.weight_ih
         self.parameters['weight_hh'] = self.weight_hh
         if bias:
-            self.bias_ih = np.ones((4 * hidden_size)).astype(self.dtype)
-            self.bias_hh = np.ones((4 * hidden_size)).astype(self.dtype)
+            self.bias_ih = weight.bias_ih
+            self.bias_hh = weight.bias_hh
             self.parameters['bias_ih'] = self.bias_ih
             self.parameters['bias_hh'] = self.bias_hh
         else:
@@ -353,24 +380,26 @@ def __init__(self,
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNLstmOp(OpTest):
-    #TODO(GaoWei8): Need to satisfy the result through the new interface
+    def get_weight_names(self):
+        weight_names = []
+        for i in range(2 * self.num_layers):
+            weight_names.append('weight{}'.format(i))
+        for i in range(2 * self.num_layers):
+            weight_names.append('bias{}'.format(i))
+        return weight_names
+
     def setUp(self):
         self.op_type = "cudnn_lstm"
         self.dtype = np.float64
         self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32)
         self.num_layers = 1
+        self.set_attrs()
 
         seq_length = 12
         batch_size = 5
         input_size = 21
         hidden_size = 21
 
-        input_weight_size = (hidden_size * hidden_size) * 4
-        hidden_weight_size = (hidden_size * hidden_size) * 4
-        weight_size = input_weight_size + hidden_weight_size
-        weight_size += hidden_size * 8
-        weight_size *= self.num_layers
-
         input = np.random.uniform(
             low=-0.1, high=0.1,
             size=(seq_length, batch_size, input_size)).astype(self.dtype)
@@ -379,17 +408,39 @@ def setUp(self):
         input[9][3:][:] = 0
         input[8][4:][:] = 0
 
+        weight.updata_weight(hidden_size, input_size, self.dtype)
         rnn1 = LSTM(
             input_size,
             hidden_size,
-            self.num_layers,
+            num_layers=self.num_layers,
             time_major=True,
             direction="forward")
 
         output, (last_hidden, last_cell) = rnn1(
             input, sequence_length=self.sequence_length)
 
-        flat_w = np.ones((weight_size)).astype(self.dtype)
+        flat_w = []
+        num = 0
+        for i in range(self.num_layers):
+            if i == 0:
+                weight_ih = weight.weight_ih
+            else:
+                weight_ih = weight.weight_hh
+            flat_w.append(("weight" + str(num), weight_ih))
+            num += 1
+        for i in range(self.num_layers):
+            weight_hh = weight.weight_hh
+            flat_w.append(("weight" + str(num), weight_hh))
+            num += 1
+        num = 0
+        for i in range(self.num_layers):
+            bias_ih = weight.bias_ih
+            flat_w.append(("bias" + str(num), bias_ih))
+            num += 1
+        for i in range(self.num_layers):
+            bias_hh = weight.bias_hh
+            flat_w.append(("bias" + str(num), bias_hh))
+            num += 1
         init_h = np.zeros((self.num_layers, batch_size,
                            hidden_size)).astype(self.dtype)
         init_c = np.zeros((self.num_layers, batch_size,
@@ -398,7 +449,7 @@ def setUp(self):
 
         self.inputs = {
             'Input': input,
-            'W': flat_w,
+            'WeightList': flat_w,
             'InitH': init_h,
             'InitC': init_c,
             'SequenceLength': self.sequence_length
@@ -408,7 +459,7 @@ def setUp(self):
             'is_bidirec': False,
             'input_size': input_size,
             'hidden_size': hidden_size,
-            'num_layers': 1,
+            'num_layers': self.num_layers,
         }
         self.outputs = {
             'Out': output,
@@ -428,16 +479,42 @@ def test_output_with_place(self):
 
     def test_grad_with_place(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place,
-                                   set(['Input', 'W', 'InitH', 'InitC']),
-                                   ['Out', 'LastH', 'LastC'])
+        var_name_list = self.get_weight_names()
+        for var_name in var_name_list:
+            self.check_grad_with_place(
+                place,
+                set(['Input', var_name, 'InitH', 'InitC']),
+                ['Out', 'LastH', 'LastC'])
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestCUDNNLstmOp2(TestCUDNNLstmOp):
-    def set_attrs(self):
-        self.num_layers = 2
+class TestCUDNNlstmAPI(unittest.TestCase):
+    def test_lstm(self):
+        seq_len = 20
+        batch_size = 5
+        hidden_size = 20
+        dropout_prob = 0.0
+        num_layers = 1
+        input = fluid.data(
+            name='input',
+            shape=[seq_len, batch_size, hidden_size],
+            dtype='float64')
+        init_h = layers.fill_constant([num_layers, batch_size, hidden_size],
+                                      'float64', 0.0)
+        init_c = layers.fill_constant([num_layers, batch_size, hidden_size],
+                                      'float64', 0.0)
+        rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len,
+                                              hidden_size, num_layers,
+                                              dropout_prob, False)
+        exe = fluid.Executor(fluid.CUDAPlace(0))
+        exe.run(fluid.default_startup_program())
+        input_i = np.random.uniform(
+            low=-0.1, high=0.1, size=(seq_len, batch_size,
+                                      hidden_size)).astype("float64")
+        out = exe.run(fluid.default_main_program(),
+                      feed={'input': input_i},
+                      fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0'])
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -448,7 +525,7 @@ def test_lstm(self):
         batch_size = 5
         hidden_size = 20
         dropout_prob = 0.0
-        num_layers = 1
+        num_layers = 2
         input = fluid.data(
             name='input',
             shape=[seq_len, batch_size, hidden_size],
diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py
index 6781965b0b4e9..1d38c833773ca 100644
--- a/python/paddle/fluid/tests/unittests/test_maxout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py
@@ -16,32 +16,43 @@
 
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
 import paddle.fluid.core as core
+import paddle.nn.functional as F
 from op_test import OpTest
 
+paddle.enable_static()
+np.random.seed(1)
 
-def maxout_forward_naive(input, groups, channel_axis):
-    s0, s1, s2, s3 = input.shape
-    if channel_axis == 3:
-        return np.ndarray([s0, s1, s2, s3 // groups, groups], \
-            buffer = input, dtype=input.dtype).max(axis=(4))
-    return np.ndarray([s0, s1 // groups, groups, s2, s3], \
-        buffer = input, dtype=input.dtype).max(axis=(2))
+
+def maxout_forward_naive(x, groups, channel_axis):
+    """Reference maxout: max over each run of `groups` channels of 4-D x."""
+    s0, s1, s2, s3 = x.shape
+    # reshape accepts non-contiguous x and raises on a non-divisible channel.
+    if channel_axis == 1:
+        return x.reshape([s0, s1 // groups, groups, s2, s3]).max(axis=2)
+    return x.reshape([s0, s1, s2, s3 // groups, groups]).max(axis=4)
 
 
 class TestMaxOutOp(OpTest):
     def setUp(self):
         self.op_type = "maxout"
-        self.init_test_case()
-        input = np.random.random(self.shape)
-        output = self.MaxOut_forward_naive(input, self.groups, self.axis)
+        self.dtype = 'float64'
+        self.shape = [3, 6, 2, 4]
+        self.groups = 2
+        self.axis = 1
+        self.set_attrs()
+
+        x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        out = maxout_forward_naive(x, self.groups, self.axis)
 
-        self.inputs = {'X': input}
+        self.inputs = {'X': x}
         self.attrs = {'groups': self.groups, 'axis': self.axis}
+        self.outputs = {'Out': out}
 
-        self.outputs = {'Out': output}
+    def set_attrs(self):
+        pass
 
     def test_check_output(self):
         self.check_output()
@@ -49,65 +60,89 @@ def test_check_output(self):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
-    def init_test_case(self):
-        self.MaxOut_forward_naive = maxout_forward_naive
-        self.shape = [100, 6, 2, 2]
-        self.groups = 2
-        self.axis = 1
-
 
-class TestMaxOutOpAxis(TestMaxOutOp):
-    def init_test_case(self):
-        self.MaxOut_forward_naive = maxout_forward_naive
-        self.shape = [100, 2, 2, 6]  # NHWC format
-        self.groups = 2
-        self.axis = 3
+class TestMaxOutOpAxis0(TestMaxOutOp):
+    def set_attrs(self):
+        self.axis = -1
 
 
-class TestMaxOutOpAxisAPI(unittest.TestCase):
-    def test_axis(self):
-        data1 = fluid.data(name='data1', shape=[3, 6, 2, 2], dtype='float32')
-        data2 = fluid.data(name='data2', shape=[3, 2, 2, 6], dtype='float32')
-        out1 = fluid.layers.maxout(data1, groups=2, axis=1)
-        out2 = fluid.layers.maxout(data2, groups=2, axis=-1)
-        data1_np = np.random.random((3, 6, 2, 2)).astype("float32")
-        data2_np = np.transpose(data1_np, [0, 2, 3, 1])
+class TestMaxOutOpAxis1(TestMaxOutOp):
+    def set_attrs(self):
+        self.axis = 3
 
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        results = exe.run(fluid.default_main_program(),
-                          feed={"data1": data1_np,
-                                "data2": data2_np},
-                          fetch_list=[out1, out2],
-                          return_numpy=True)
 
-        self.assertTrue(
-            np.allclose(results[0], np.transpose(results[1], (0, 3, 1, 2))))
+class TestMaxOutOpFP32(TestMaxOutOp):
+    def set_attrs(self):
+        self.dtype = 'float32'
 
-    def test_exception(self):
-        input = fluid.data(name="input", shape=[2, 4, 6, 6], dtype="float32")
 
-        def _attr_axis():
-            out = fluid.layers.maxout(input, groups=2, axis=2)
+class TestMaxOutOpGroups(TestMaxOutOp):
+    def set_attrs(self):
+        self.groups = 3
 
-        self.assertRaises(ValueError, _attr_axis)
 
+class TestMaxoutAPI(unittest.TestCase):
+    # test paddle.nn.Maxout, paddle.nn.functional.maxout
+    def setUp(self):
+        self.x_np = np.random.uniform(-1, 1, [2, 6, 5, 4]).astype(np.float64)
+        self.groups = 2
+        self.axis = 1
+        self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', self.x_np.shape, self.x_np.dtype)
+            out1 = F.maxout(x, self.groups, self.axis)
+            m = paddle.nn.Maxout(self.groups, self.axis)
+            out2 = m(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
+        out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis)
+        for r in res:
+            self.assertTrue(np.allclose(out_ref, r))
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out1 = F.maxout(x, self.groups, self.axis)
+        m = paddle.nn.Maxout(self.groups, self.axis)
+        out2 = m(x)
+        out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis)
+        for r in [out1, out2]:
+            self.assertTrue(np.allclose(out_ref, r.numpy()))
+
+        out3 = F.maxout(x, self.groups, -1)
+        out3_ref = maxout_forward_naive(self.x_np, self.groups, -1)
+        self.assertTrue(np.allclose(out3_ref, out3.numpy()))
+        paddle.enable_static()
+
+    def test_fluid_api(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
+            out = fluid.layers.maxout(x, groups=self.groups, axis=self.axis)
+            exe = fluid.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis)
+        self.assertTrue(np.allclose(out_ref, res[0]))
+
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.x_np)
+        out = paddle.fluid.layers.maxout(x, groups=self.groups, axis=self.axis)
+        self.assertTrue(np.allclose(out_ref, out.numpy()))
+        paddle.enable_static()
 
-class TestMaxOutOpError(unittest.TestCase):
     def test_errors(self):
-        with program_guard(Program()):
+        with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, fluid.layers.maxout, 1, 2)
+            self.assertRaises(TypeError, F.maxout, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
-            self.assertRaises(TypeError, fluid.layers.maxout, x_int32, 2)
-            # support the input dtype is float32
-            x_fp32 = fluid.data(name='x_fp32', shape=[12, 10], dtype='float32')
-            fluid.layers.maxout(x_fp32, 2)
+            x_int32 = paddle.data(
+                name='x_int32', shape=[2, 4, 6, 8], dtype='int32')
+            self.assertRaises(TypeError, F.maxout, x_int32)
+
+            x_float32 = paddle.data(name='x_float32', shape=[2, 4, 6, 8])
+            self.assertRaises(ValueError, F.maxout, x_float32, 2, 2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py
index dbf167617a24f..abd0c15dc7235 100755
--- a/python/paddle/fluid/tests/unittests/test_multiply.py
+++ b/python/paddle/fluid/tests/unittests/test_multiply.py
@@ -26,6 +26,7 @@ class TestMultiplyAPI(unittest.TestCase):
 
     def __run_static_graph_case(self, x_data, y_data, axis=-1):
         with program_guard(Program(), Program()):
+            paddle.enable_static()
             x = paddle.static.data(
                 name='x', shape=x_data.shape, dtype=x_data.dtype)
             y = paddle.static.data(
@@ -42,6 +43,21 @@ def __run_static_graph_case(self, x_data, y_data, axis=-1):
             res = outs[0]
             return res
 
+    def __run_static_graph_case_with_numpy_input(self, x_data, y_data, axis=-1):
+        """Multiply raw numpy inputs in static mode; return the result."""
+        with program_guard(Program(), Program()):
+            paddle.enable_static()
+            product = tensor.multiply(x_data, y_data, axis=axis)
+            if fluid.core.is_compiled_with_cuda():
+                device = fluid.CUDAPlace(0)
+            else:
+                device = fluid.CPUPlace()
+            feed_dict = {'x': x_data, 'y': y_data}
+            fetched = fluid.Executor(device).run(fluid.default_main_program(),
+                                                 feed=feed_dict,
+                                                 fetch_list=[product])
+            return fetched[0]
+
     def __run_dynamic_graph_case(self, x_data, y_data, axis=-1):
         paddle.disable_static()
         x = paddle.to_tensor(x_data)
@@ -49,27 +65,52 @@ def __run_dynamic_graph_case(self, x_data, y_data, axis=-1):
         res = paddle.multiply(x, y, axis=axis)
         return res.numpy()
 
+    def __run_dynamic_graph_case_with_numpy_input(self, x_data, y_data,
+                                                  axis=-1):
+        """Multiply raw numpy inputs in dygraph mode; return a numpy array."""
+        paddle.disable_static()
+        return paddle.multiply(x_data, y_data, axis=axis).numpy()
+
     def test_multiply(self):
         """test_multiply."""
         np.random.seed(7)
+
         # test static computation graph: 1-d array
         x_data = np.random.rand(200)
         y_data = np.random.rand(200)
         res = self.__run_static_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
+        # test static computation graph with numpy input: 1-d array
+        x_data = np.random.rand(200)
+        y_data = np.random.rand(200)
+        res = self.__run_static_graph_case_with_numpy_input(x_data, y_data)
+        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
+
         # test static computation graph: 2-d array
         x_data = np.random.rand(2, 500)
         y_data = np.random.rand(2, 500)
         res = self.__run_static_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
+        # test static computation graph with numpy input: 2-d array
+        x_data = np.random.rand(2, 500)
+        y_data = np.random.rand(2, 500)
+        res = self.__run_static_graph_case_with_numpy_input(x_data, y_data)
+        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
+
         # test static computation graph: broadcast
         x_data = np.random.rand(2, 500)
         y_data = np.random.rand(500)
         res = self.__run_static_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
+        # test static computation graph with numpy input: broadcast
+        x_data = np.random.rand(2, 500)
+        y_data = np.random.rand(500)
+        res = self.__run_static_graph_case_with_numpy_input(x_data, y_data)
+        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
+
         # test static computation graph: broadcast with axis
         x_data = np.random.rand(2, 300, 40)
         y_data = np.random.rand(300)
@@ -77,24 +118,50 @@ def test_multiply(self):
         expected = np.multiply(x_data, y_data[..., np.newaxis])
         self.assertTrue(np.allclose(res, expected))
 
+        # test static computation graph with numpy input: broadcast with axis
+        x_data = np.random.rand(2, 300, 40)
+        y_data = np.random.rand(300)
+        res = self.__run_static_graph_case_with_numpy_input(
+            x_data, y_data, axis=1)
+        expected = np.multiply(x_data, y_data[..., np.newaxis])
+        self.assertTrue(np.allclose(res, expected))
+
         # test dynamic computation graph: 1-d array
         x_data = np.random.rand(200)
         y_data = np.random.rand(200)
         res = self.__run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
+        # test dynamic numpy input computation graph: 1-d array
+        x_data = np.random.rand(200)
+        y_data = np.random.rand(200)
+        res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data)
+        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
+
         # test dynamic computation graph: 2-d array
         x_data = np.random.rand(20, 50)
         y_data = np.random.rand(20, 50)
         res = self.__run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
+        # test dynamic numpy input computation graph: 2-d array
+        x_data = np.random.rand(20, 50)
+        y_data = np.random.rand(20, 50)
+        res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data)
+        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
+
         # test dynamic computation graph: broadcast
         x_data = np.random.rand(2, 500)
         y_data = np.random.rand(500)
         res = self.__run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
+        # test dynamic computation graph with numpy tensor: broadcast
+        x_data = np.random.rand(2, 500)
+        y_data = np.random.rand(500)
+        res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data)
+        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
+
         # test dynamic computation graph: broadcast with axis
         x_data = np.random.rand(2, 300, 40)
         y_data = np.random.rand(300)
@@ -102,6 +169,14 @@ def test_multiply(self):
         expected = np.multiply(x_data, y_data[..., np.newaxis])
         self.assertTrue(np.allclose(res, expected))
 
+        # test dynamic computation graph with numpy tensor: broadcast with axis
+        x_data = np.random.rand(2, 300, 40)
+        y_data = np.random.rand(300)
+        res = self.__run_dynamic_graph_case_with_numpy_input(
+            x_data, y_data, axis=1)
+        expected = np.multiply(x_data, y_data[..., np.newaxis])
+        self.assertTrue(np.allclose(res, expected))
+
 
 class TestMultiplyError(unittest.TestCase):
     """TestMultiplyError."""
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index 5d1e016287e07..1675f935f7d6a 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -22,8 +22,8 @@
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
 import gradient_checker
-
 from decorator_helper import prog_scope
+paddle.enable_static()
 
 
 class TestMulGradCheck(unittest.TestCase):
@@ -153,6 +153,38 @@ def test_grad(self):
             self.func(p)
 
 
+class TestMatmulDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        """Run double_grad_check on layers.matmul for every listed case."""
+        # Each case: (x_shape, y_shape, transpose_x, transpose_y, dtype name).
+        cases = [
+            ([2], [2], False, False, "float64"),
+            ([2, 3], [3, 2], True, True, "float64"),
+            ([2, 4, 3], [2, 4, 5], True, False, "float32"),
+            ([2, 3, 4, 5], [2, 3, 3, 5], False, True, "float32"),
+            ([2, 3, 4], [4, 3], False, False, "float64"),
+        ]
+        for i, case in enumerate(cases):
+            x_shape, y_shape, t_x, t_y, typename = case
+            x = layers.create_parameter(
+                dtype=typename, shape=x_shape, name='x{}'.format(i))
+            y = layers.create_parameter(
+                dtype=typename, shape=y_shape, name='y{}'.format(i))
+            out = layers.matmul(x, y, t_x, t_y, name='out{}'.format(i))
+            x_arr = np.random.uniform(-1, 1, x_shape).astype(typename)
+            y_arr = np.random.uniform(-1, 1, y_shape).astype(typename)
+            gradient_checker.double_grad_check(
+                [x, y], out, x_init=[x_arr, y_arr], place=place, eps=0.005)
+
+    def test_grad(self):
+        devices = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            devices.append(fluid.CUDAPlace(0))
+        for device in devices:
+            self.func(device)
+
+
 class TestReshapeDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
@@ -249,5 +281,53 @@ def test_grad(self):
             self.func(p)
 
 
+class TestSqueezeDoubleGradCheck(unittest.TestCase):
+    """Second-order gradient check for paddle.squeeze."""
+
+    @prog_scope()
+    def func(self, place):
+        # Squeeze axes 0 and 2 of a [1, 3, 1, 40] float64 input.
+        shape = [1, 3, 1, 40]
+        x = layers.data('x', shape, False, np.float64)
+        x.persistable = True
+        squeezed = paddle.squeeze(x, [0, 2])
+        init = np.random.uniform(-1, 1, shape).astype(np.float64)
+
+        gradient_checker.double_grad_check(
+            [x], squeezed, x_init=init, place=place, eps=0.005)
+
+    def test_grad(self):
+        # CPU is always checked; GPU 0 is added when CUDA is compiled in.
+        devices = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            devices.append(fluid.CUDAPlace(0))
+        for device in devices:
+            self.func(device)
+
+
+class TestUnsqueezeDoubleGradCheck(unittest.TestCase):
+    """Second-order gradient check for paddle.unsqueeze."""
+
+    @prog_scope()
+    def func(self, place):
+        # Insert new axes 1 and 2 into a [3, 40] float64 input.
+        shape = [3, 40]
+        x = layers.data('x', shape, False, np.float64)
+        x.persistable = True
+        expanded = paddle.unsqueeze(x, [1, 2])
+        init = np.random.uniform(-1, 1, shape).astype(np.float64)
+
+        gradient_checker.double_grad_check(
+            [x], expanded, x_init=init, place=place, eps=0.005)
+
+    def test_grad(self):
+        # CPU is always checked; GPU 0 is added when CUDA is compiled in.
+        devices = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            devices.append(fluid.CUDAPlace(0))
+        for device in devices:
+            self.func(device)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nonzero_api.py b/python/paddle/fluid/tests/unittests/test_nonzero_api.py
index 0e68f9d5be761..8569be82db09e 100644
--- a/python/paddle/fluid/tests/unittests/test_nonzero_api.py
+++ b/python/paddle/fluid/tests/unittests/test_nonzero_api.py
@@ -76,6 +76,14 @@ def test_nonzero_api(self):
         expect_out = np.array([[0], [1]])
         self.assertTrue(np.allclose(expect_out, np.array(res)))
 
+    def test_dygraph_api(self):
+        # Dygraph nonzero must yield the [row, col] index of each True entry.
+        data_x = np.array([[True, False], [False, True]])
+        with fluid.dygraph.guard():
+            np_z = paddle.nonzero(fluid.dygraph.to_variable(data_x)).numpy()
+        expect_out = np.array([[0, 0], [1, 1]])
+        self.assertTrue(np.allclose(expect_out, np_z))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index a12a328b653b2..5e8828c3e9126 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -102,14 +102,21 @@ def avg_pool2D_forward_naive(x,
                 c_start = adaptive_start_index(j, W, ksize[1])
                 c_end = adaptive_end_index(j, W, ksize[1])
             else:
-                r_start = np.max((i * strides[0] - paddings[0], 0))
-                r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-                c_start = np.max((j * strides[1] - paddings[1], 0))
-                c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                r_start = i * strides[0] - paddings[0]
+                r_end = i * strides[0] + ksize[0] - paddings[0]
+                c_start = j * strides[1] - paddings[1]
+                c_end = j * strides[1] + ksize[1] - paddings[1]
+                field_size = (r_end - r_start) * (c_end - c_start)
+                r_start = np.max((r_start, 0))
+                r_end = np.min((r_end, H))
+                c_start = np.max((c_start, 0))
+                c_end = np.min((c_end, W))
+
             x_masked = x[:, :, r_start:r_end, c_start:c_end]
 
-            field_size = ((r_end - r_start) * (c_end - c_start)) \
-                if (exclusive or adaptive) else (ksize[0] * ksize[1])
+            if (exclusive or adaptive):
+                field_size = (r_end - r_start) * (c_end - c_start)
+
             if data_type == np.int8 or data_type == np.uint8:
                 out[:, :, i, j] = (np.rint(
                     np.sum(x_masked, axis=(2, 3)) /
@@ -207,22 +214,34 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                 in_w_start = adaptive_start_index(j, W, ksize[1])
                 in_w_end = adaptive_end_index(j, W, ksize[1])
             else:
-                in_w_start = np.max((j * strides[1] - pad_w_left, 0))
-                in_w_end = np.min((j * strides[1] + ksize[1] - pad_w_left, W))
+                in_h_start = i * strides[0] - pad_h_up
+                in_w_start = j * strides[1] - pad_w_left
+                in_h_end = i * strides[0] + ksize[0] - pad_h_up
+                in_w_end = j * strides[1] + ksize[1] - pad_w_left
+
+                field_size = (in_h_end - in_h_start) * (in_w_end - in_w_start)
+                in_h_start = np.max((in_h_start, 0))
+                in_w_start = np.max((in_w_start, 0))
+                in_h_end = np.min((in_h_end, H))
+                in_w_end = np.min((in_w_end, W))
 
             if data_format == 'NCHW':
                 x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
                 if pool_type == 'avg':
-                    field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \
-                        if (exclusive or adaptive) else (ksize[0] * ksize[1])
+                    if (exclusive or adaptive):
+                        field_size = (in_h_end - in_h_start) * (
+                            in_w_end - in_w_start)
+
+                    # Otherwise keep the padded-window field_size from above.
                     out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
                 elif pool_type == 'max':
                     out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
             elif data_format == 'NHWC':
                 x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
                 if pool_type == 'avg':
-                    field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \
-                        if (exclusive or adaptive) else (ksize[0] * ksize[1])
+                    if (exclusive or adaptive):
+                        field_size = (in_h_end - in_h_start) * (
+                            in_w_end - in_w_start)
                     out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
                 elif pool_type == 'max':
                     out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 3d139e9b90c10..eab7126c7a422 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -116,32 +116,44 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         if adaptive:
             d_start = adaptive_start_index(k, D, ksize[0])
             d_end = adaptive_end_index(k, D, ksize[0])
-        else:
-            d_start = np.max((k * strides[0] - pad_d_forth, 0))
-            d_end = np.min((k * strides[0] + ksize[0] - pad_d_forth, D))
 
         for i in range(H_out):
             if adaptive:
                 h_start = adaptive_start_index(i, H, ksize[1])
                 h_end = adaptive_end_index(i, H, ksize[1])
-            else:
-                h_start = np.max((i * strides[1] - pad_h_up, 0))
-                h_end = np.min((i * strides[1] + ksize[1] - pad_h_up, H))
 
             for j in range(W_out):
                 if adaptive:
                     w_start = adaptive_start_index(j, W, ksize[2])
                     w_end = adaptive_end_index(j, W, ksize[2])
                 else:
-                    w_start = np.max((j * strides[2] - pad_w_left, 0))
-                    w_end = np.min((j * strides[2] + ksize[2] - pad_w_left, W))
 
+                    d_start = k * strides[0] - pad_d_forth
+                    d_end = np.min((k * strides[0] + ksize[0] - pad_d_forth,
+                                    D + pad_d_back))
+                    h_start = i * strides[1] - pad_h_up
+                    h_end = np.min(
+                        (i * strides[1] + ksize[1] - pad_h_up, H + pad_h_down))
+                    w_start = j * strides[2] - pad_w_left
+                    w_end = np.min((j * strides[2] + ksize[2] - pad_w_left,
+                                    W + pad_w_right))
+
+                    field_size = (d_end - d_start) * (h_end - h_start) * (
+                        w_end - w_start)
+                    w_start = np.max((w_start, 0))
+                    d_start = np.max((d_start, 0))
+                    h_start = np.max((h_start, 0))
+                    w_end = np.min((w_end, W))
+                    d_end = np.min((d_end, D))
+                    h_end = np.min((h_end, H))
                 if data_format == 'NCDHW':
                     x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:
                                  w_end]
                     if pool_type == 'avg':
-                        field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
-                            if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
+                        if (exclusive or adaptive):
+                            field_size = (d_end - d_start) * (
+                                h_end - h_start) * (w_end - w_start)
+
                         out[:, :, k, i, j] = np.sum(x_masked,
                                                     axis=(2, 3, 4)) / field_size
                     elif pool_type == 'max':
@@ -151,8 +163,10 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                     x_masked = x[:, d_start:d_end, h_start:h_end, w_start:
                                  w_end, :]
                     if pool_type == 'avg':
-                        field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
-                            if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
+                        if (exclusive or adaptive):
+                            field_size = (d_end - d_start) * (
+                                h_end - h_start) * (w_end - w_start)
+
                         out[:, k, i, j, :] = np.sum(x_masked,
                                                     axis=(1, 2, 3)) / field_size
                     elif pool_type == 'max':
@@ -564,7 +578,7 @@ def init_exclusive(self):
         self.exclusive = False
 
     def init_paddings(self):
-        self.paddings = [1, 2, 1, 1, 1, 0]
+        self.paddings = [2, 2, 1, 1, 0, 0]
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py
new file mode 100644
index 0000000000000..71e119739e777
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+from op_test import OpTest
+from test_sigmoid_focal_loss_op import sigmoid_focal_loss_forward
+
+
+def call_sfl_functional(logit,
+                        label,
+                        normalizer,
+                        alpha=0.25,
+                        gamma=2.0,
+                        reduction='sum'):
+    """Forward all arguments to paddle.nn.functional.sigmoid_focal_loss."""
+    return paddle.nn.functional.sigmoid_focal_loss(
+        logit, label, normalizer, alpha=alpha, gamma=gamma, reduction=reduction)
+
+
+def test_static(place,
+                logit_np,
+                label_np,
+                normalizer_np,
+                alpha=0.25,
+                gamma=2.0,
+                reduction='sum'):
+    """Evaluate the loss in a fresh static program via Executor.run."""
+    paddle.enable_static()
+    main_prog = paddle.static.Program()
+    init_prog = paddle.static.Program()
+    with paddle.static.program_guard(main_prog, init_prog):
+        logit = paddle.data(name='logit', shape=logit_np.shape, dtype='float64')
+        label = paddle.data(name='label', shape=label_np.shape, dtype='float64')
+        feeds = {"logit": logit_np, "label": label_np}
+        # The normalizer is optional: declare and feed it only when given.
+        if normalizer_np is None:
+            normalizer = None
+        else:
+            normalizer = paddle.data(
+                name='normalizer', shape=normalizer_np.shape, dtype='float64')
+            feeds["normalizer"] = normalizer_np
+        loss = call_sfl_functional(logit, label, normalizer, alpha, gamma,
+                                   reduction)
+        return paddle.static.Executor(place).run(
+            main_prog, feed=feeds, fetch_list=[loss])
+
+
+def test_dygraph(place,
+                 logit_np,
+                 label_np,
+                 normalizer_np,
+                 alpha=0.25,
+                 gamma=2.0,
+                 reduction='sum'):
+    """Evaluate the loss in dygraph mode on `place`; return a numpy array."""
+    paddle.disable_static(place)  # run on `place`, not the default device
+    try:
+        logit = paddle.to_tensor(logit_np)
+        label = paddle.to_tensor(label_np)
+        normalizer = (None if normalizer_np is None else
+                      paddle.to_tensor(normalizer_np))
+        return call_sfl_functional(logit, label, normalizer, alpha, gamma,
+                                   reduction).numpy()
+    finally:
+        paddle.enable_static()  # restore static mode even if the call raised
+
+
+def calc_sigmoid_focal_loss(logit_np,
+                            label_np,
+                            normalizer_np,
+                            alpha=0.25,
+                            gamma=2.0,
+                            reduction='sum'):
+    """NumPy reference for sigmoid focal loss.
+
+    Starts from the sigmoid cross entropy, optionally applies the alpha
+    class weight, the (1 - p_t)**gamma modulating factor and the
+    normalizer, then reduces with 'mean' or 'sum'; any other `reduction`
+    returns the element-wise loss.
+    """
+    # max(x, 0) - x * y + log(1 + exp(-|x|)): exp only sees values <= 0.
+    loss = np.maximum(
+        logit_np,
+        0) - logit_np * label_np + np.log(1 + np.exp(-np.abs(logit_np)))
+    prob = 1 / (1 + np.exp(-logit_np))
+    p_t = prob * label_np + (1 - prob) * (1 - label_np)
+    if alpha is not None:
+        loss = (alpha * label_np + (1 - alpha) * (1 - label_np)) * loss
+    if gamma is not None:
+        loss = loss * ((1 - p_t)**gamma)
+    if normalizer_np is not None:
+        loss = loss / normalizer_np
+    if reduction == 'mean':
+        return np.mean(loss)
+    if reduction == 'sum':
+        return np.sum(loss)
+    return loss
+
+
+class TestSigmoidFocalLoss(unittest.TestCase):
+    """Checks paddle.nn.functional.sigmoid_focal_loss in both modes.
+
+    Results from the static graph and from dygraph are compared with each
+    other and with the NumPy reference calc_sigmoid_focal_loss.
+    """
+
+    def test_SigmoidFocalLoss(self):
+        # Sweep every place/reduction/alpha/gamma/normalizer combination.
+        shape = (2, 3, 4, 10)
+        logit_np = np.random.uniform(0.1, 0.8, size=shape).astype(np.float64)
+        label_np = np.random.randint(0, 2, size=shape).astype(np.float64)
+        # Either normalize by the number of positive labels or not at all.
+        normalizer_nps = [
+            np.asarray(
+                [np.sum(label_np > 0)], dtype=label_np.dtype), None
+        ]
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            for reduction in ['sum', 'mean', 'none']:
+                for alpha in [0.25, 0.5]:
+                    for gamma in [3, 0.]:
+                        for normalizer_np in normalizer_nps:
+                            args = (logit_np, label_np, normalizer_np, alpha,
+                                    gamma, reduction)
+                            static_result = test_static(place, *args)
+                            dy_result = test_dygraph(place, *args)
+                            expected = calc_sigmoid_focal_loss(*args)
+                            self.assertTrue(
+                                np.allclose(static_result, expected))
+                            self.assertTrue(
+                                np.allclose(static_result, dy_result))
+                            self.assertTrue(np.allclose(dy_result, expected))
+
+    def test_SigmoidFocalLoss_error(self):
+        # An unknown reduction must raise; static mode is always restored.
+        paddle.disable_static()
+        try:
+            logit = paddle.to_tensor([[0.97], [0.91], [0.03]], dtype='float32')
+            label = paddle.to_tensor([[1.0], [1.0], [0.0]], dtype='float32')
+            with self.assertRaises(ValueError):
+                paddle.nn.functional.sigmoid_focal_loss(
+                    logit=logit, label=label, normalizer=None,
+                    reduction="unsupport reduction")
+        finally:
+            paddle.enable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py
index a1879c724597e..377f8597cca3b 100644
--- a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py
@@ -18,6 +18,8 @@
 import numpy as np
 
 from op_test import OpTest
+import paddle
+paddle.enable_static()
 
 
 # Correct: General.
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
index 5ab13cec540aa..830678fe8f6af 100644
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -20,6 +20,7 @@
 from paddle.fluid import compiler, Program, program_guard
 import paddle
 from op_test import OpTest
+paddle.enable_static()
 
 
 # Correct: General.
diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
index 37f11c449d21f..0fe6cd5e7e753 100644
--- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
@@ -16,6 +16,9 @@
 import numpy as np
 import unittest
 import paddle.fluid as fluid
+import paddle
+
+paddle.enable_static()
 
 
 def strided_slice_native_forward(input, axes, starts, ends, strides):
@@ -498,6 +501,16 @@ def test_1(self):
         assert np.array_equal(res_6, input[-3:3, 0:100:2, :, -1:2:-1])
         assert np.array_equal(res_7, input[-1, 0:100:2, :, -1:2:-1])
 
+    def test_dygraph_op(self):
+        # Slice axes 1..3 of a [3, 4, 5, 6] tensor down to extent 2 each.
+        x = paddle.zeros(shape=[3, 4, 5, 6], dtype="float32")
+        axes, strides_1 = [1, 2, 3], [1, 1, 1]
+        starts, ends = [-3, 0, 2], [3, 2, 4]
+        sliced_1 = paddle.strided_slice(
+            x, axes=axes, starts=starts, ends=ends, strides=strides_1)
+        # assertEqual survives `python -O`; tuple() also accepts a list shape.
+        self.assertEqual(tuple(sliced_1.shape), (3, 2, 2, 2))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
index f800f7b2ca857..1fbc0fc4604c2 100644
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -18,6 +18,7 @@
 import numpy as np
 from op_test import OpTest
 
+import paddle
 from paddle.fluid import core
 
 
@@ -77,5 +78,12 @@ def initTestCase(self):
         self.shift_ratio = 0.3
 
 
+class TestTemporalShiftAPI(unittest.TestCase):
+    def test_api(self):
+        # Smoke test: the call only has to complete without raising.
+        x = paddle.randn([6, 4, 2, 2])
+        paddle.nn.functional.temporal_shift(x=x, seg_num=2, shift_ratio=0.2)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
index 340d22acbfb51..eaecf91215cc6 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
@@ -18,6 +18,8 @@
 import numpy as np
 import paddle.fluid as fluid
 from op_test import OpTest
+import paddle
+paddle.enable_static()
 
 
 # Correct: General.
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 1975e4306026e..f8d27dd42f43b 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid as fluid
 from op_test import OpTest
+paddle.enable_static()
 
 
 # Correct: General.
diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
index aa692eb536736..83ca577faa5c6 100644
--- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
@@ -16,6 +16,7 @@
 import numpy as np
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.layers as layers
@@ -24,6 +25,8 @@
 from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.backward import append_backward
 
+paddle.enable_static()
+
 
 class TestApiWhileLoop(unittest.TestCase):
     def test_var_tuple(self):
@@ -199,16 +202,10 @@ def test_while_loop_backward(self):
         def cond(i, x):
             return layers.less_than(i, eleven)
 
-        def body(j, x):
-            # TODO: In while block, if the var created in parent block
-            # participates in the calculation of gradient, the result of gradient
-            # is incorrect because each step scope always returns the same value
-            # generated by last step.
-            # Here we call `assign` op in while block to avoid this bug, and working on fixing it in next PR.
-            i = layers.assign(j)
+        def body(i, x):
             x = layers.elementwise_mul(x=i, y=i)
-            j = layers.increment(j)
-            return [j, x]
+            i = layers.increment(i)
+            return [i, x]
 
         main_program = Program()
         startup_program = Program()
@@ -244,10 +241,10 @@ def body(j, x):
 
     def test_while_loop_backward2(self):
         def cond(i, x):
-            return i < 5
+            return i < 3
 
         def body(i, x):
-            x = x + i
+            x = x * i
             i = i + 1
             return [i, x]
 
@@ -269,17 +266,21 @@ def body(i, x):
 
         feed_i = np.ones(1).astype('float32')
         feed_x = np.ones(1).astype('float32')
-        data = np.asarray([11]).astype('float32')
-        i_grad = np.asarray([1]).astype('float32')
+        data = np.asarray([2]).astype('float32')
+        i_grad = np.asarray([3]).astype('float32')
+        x_grad = np.asarray([2]).astype('float32')
 
         res = exe.run(main_program,
                       feed={'i': feed_i,
                             'x': feed_x},
-                      fetch_list=[mean.name, i.grad_name])
+                      fetch_list=[mean.name, i.grad_name, x.grad_name])
         self.assertTrue(np.allclose(np.asarray(res[0]), data))
         self.assertTrue(
             np.allclose(np.asarray(res[1]), i_grad),
             msg=" \nres = \n{} \n\n ans = \n{}".format(res[1], i_grad))
+        self.assertTrue(
+            np.allclose(np.asarray(res[2]), x_grad),
+            msg=" \nres = \n{} \n\n ans = \n{}".format(res[2], x_grad))
 
 
 class TestApiWhileLoop_NestedWithBackwardAndLoDTensorArray(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py
index ee01bfb21f820..d6d52b7d604aa 100644
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
@@ -24,6 +24,8 @@
 import numpy
 from paddle.fluid import compiler, Program, program_guard
 
+paddle.enable_static()
+
 
 class TestWhileOp(unittest.TestCase):
     def simple_net(self):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
new file mode 100755
index 0000000000000..7aaa78856811f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
@@ -0,0 +1,63 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+import paddle
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUAccuracyOp(OpTest):
+    """Checks the `accuracy` op on XPUPlace(0) against a NumPy count."""
+
+    def setUp(self):
+        self.op_type = "accuracy"
+        self.init_dtype()
+        n = 8192
+        infer = np.random.random((n, 1)).astype(self.dtype)
+        # One candidate index per row, drawn from {0, 1} like the labels.
+        indices = np.random.randint(0, 2, (n, 1)).astype('int64')
+        label = np.random.randint(0, 2, (n, 1)).astype('int64')
+        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
+        # A row is correct when any of its indices equals its label.
+        num_correct = int(np.sum(np.any(indices == label, axis=1)))
+        self.outputs = {
+            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
+            'Correct': np.array([num_correct]).astype("int32"),
+            'Total': np.array([n]).astype("int32")
+        }
+        self.attrs = {'use_xpu': True}
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        # No-op unless Paddle was built with XPU support.
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
new file mode 100644
index 0000000000000..1a21b0f1972b7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
@@ -0,0 +1,207 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+paddle.enable_static()  # same setup as the sibling XPU op tests
+
+
+# situation 1: have shape( list, no tensor), no actual shape(Tensor)
+class TestReshapeOp(OpTest):
+    def setUp(self):
+        self.init_data()
+        self.op_type = "reshape2"
+        x = np.random.random(self.ori_shape).astype("float32")
+        self.inputs = {"X": x}
+        self.attrs = {"shape": self.new_shape, "use_xpu": True}
+        # XShape holds random data; test_check_output excludes it.
+        self.outputs = {
+            "Out": x.reshape(self.infered_shape),
+            'XShape': np.random.random(self.ori_shape).astype("float32")
+        }
+
+    def init_data(self):
+        self.ori_shape = (2, 60)
+        self.new_shape = self.infered_shape = (12, 10)
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place, no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_grad_with_place(paddle.XPUPlace(0), ["X"], "Out")
+
+
+class TestReshapeOpDimInfer1(TestReshapeOp):
+    def init_data(self):
+        # Requested and expected shapes are identical, -1 entry included.
+        self.ori_shape = (5, 25)
+        self.new_shape = self.infered_shape = (5, -1, 5)
+
+
+class TestReshapeOpDimInfer2(TestReshapeOp):
+    def init_data(self):
+        # infered_shape shows the 0 entry resolved to the input extent 2.
+        self.ori_shape = (10, 2, 6)
+        self.new_shape, self.infered_shape = (10, 0, 3, -1), (10, 2, 3, -1)
+
+
+# situation 2: have shape(list, no tensor), have actual shape(Tensor)
+class TestReshapeOpWithInputShape(OpTest):
+    def setUp(self):
+        self.init_data()
+        self.op_type = "reshape2"
+        x = np.random.random(self.ori_shape).astype("float32")
+        # Expected output follows the "Shape" tensor, not the `shape` attr.
+        self.inputs = {
+            "X": x,
+            "Shape": np.array(
+                self.actual_shape, dtype="int32")
+        }
+        self.attrs = {"shape": self.new_shape, "use_xpu": True}
+        self.outputs = {
+            "Out": x.reshape(self.actual_shape),
+            'XShape': np.random.random(self.ori_shape).astype("float32")
+        }
+
+    def init_data(self):
+        self.ori_shape = (6, 20)
+        self.new_shape = (0, -1, 20)
+        self.actual_shape = (2, 3, 20)
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place, no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_grad_with_place(paddle.XPUPlace(0), ["X"], "Out")
+
+
+# Situation 3: have shape(list, have tensor), no actual shape(Tensor)
+class TestReshapeOp_attr_ShapeTensor(OpTest):
+    """reshape2 case whose target shape is fed as a list of tensors."""
+
+    def setUp(self):
+        self.init_data()
+        self.op_type = "reshape2"
+        # One named, single-element int32 array per entry of new_shape.
+        shape_tensor = [("x" + str(index), np.ones((1)).astype('int32') * ele)
+                        for index, ele in enumerate(self.new_shape)]
+        x = np.random.random(self.ori_shape).astype("float32")
+        self.inputs = {
+            "X": x,
+            'ShapeTensor': shape_tensor
+        }
+        self.attrs = {'shape': self.shape, "use_xpu": True}
+        self.outputs = {
+            "Out": x.reshape(self.infered_shape),
+            'XShape': np.random.random(self.ori_shape).astype("float32")
+        }
+
+    def init_data(self):
+        self.ori_shape = (4, 25)
+        self.new_shape = (10, 10)
+        self.infered_shape = (10, 10)
+        self.shape = (-1, -1)
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place, no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ["X"], "Out")
+
+
+class TestReshapeOpDimInfer1_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor):
+    def init_data(self):
+        # The attr leaves the last extent as -1; new_shape supplies the 20.
+        self.ori_shape = (5, 20)
+        self.new_shape = self.infered_shape = (5, -1, 20)
+        self.shape = (5, -1, -1)
+
+
+class TestReshapeOpDimInfer2_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor):
+    def init_data(self):
+        # Attr and ShapeTensor list carry the same (10, 0, 3, -1) request.
+        self.ori_shape = (10, 2, 6)
+        self.new_shape = self.shape = (10, 0, 3, -1)
+        self.infered_shape = (10, 2, 3, -1)
+
+
+# Situation 4: have shape(Tensor), no actual shape(Tensor)
+class TestReshapeOp_attr_OnlyShape(OpTest):
+    def setUp(self):
+        self.init_data()
+        self.op_type = "reshape2"
+        x = np.random.random(self.ori_shape).astype("float32")
+        # No `shape` attr here: the target shape is fed via the "Shape" input.
+        self.inputs = {
+            "X": x,
+            "Shape": np.array(
+                self.new_shape, dtype="int32")
+        }
+        self.attrs = {"use_xpu": True}
+        self.outputs = {
+            "Out": x.reshape(self.infered_shape),
+            'XShape': np.random.random(self.ori_shape).astype("float32")
+        }
+
+    def init_data(self):
+        self.ori_shape = (4, 25)
+        self.new_shape = (10, 10)
+        self.infered_shape = (10, 10)
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place, no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_grad_with_place(paddle.XPUPlace(0), ["X"], "Out")
+
+
+class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
+    def init_data(self):
+        # The parent setUp reads only these three; no `shape` attr is set.
+        self.ori_shape = (5, 20)
+        self.new_shape = (5, -1, 10)
+        self.infered_shape = (5, -1, 10)
+
+
+class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
+    def init_data(self):
+        # The parent setUp reads only these three; no `shape` attr is set.
+        self.ori_shape = (10, 2, 6)
+        self.new_shape = (10, 0, 3, -1)
+        self.infered_shape = (10, 2, 3, -1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
new file mode 100644
index 0000000000000..1f74fa5e2d685
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
@@ -0,0 +1,54 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUScaleOp(OpTest):
+    """Runs the scale op on XPU device 0 against a NumPy product."""
+
+    def setUp(self):
+        self.op_type = "scale"
+        self.dtype = np.float32
+        factor = -2.3
+        data = np.random.random((10, 10)).astype(self.dtype)
+        self.inputs = {'X': data}
+        self.attrs = {'scale': factor, 'use_xpu': True}
+        self.outputs = {'Out': data * self.dtype(factor)}
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_output_with_place(paddle.XPUPlace(0))
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
new file mode 100644
index 0000000000000..f194f3ca80cf0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
@@ -0,0 +1,94 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+from paddle.fluid import core
+from paddle.fluid.op import Operator
+
+
+class TestShapeOp(OpTest):
+    """Checks that the shape op returns the dimensions of its input."""
+
+    def setUp(self):
+        self.op_type = "shape"
+        # config() is the subclass hook; do not overwrite self.shape after it.
+        self.config()
+        self.inputs = {'Input': np.zeros(self.shape)}
+        self.outputs = {'Out': np.array(self.shape)}
+
+    def config(self):
+        self.shape = [2, 3]
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_output_with_place(paddle.XPUPlace(0))
+
+
+class case1(TestShapeOp):
+    def config(self):
+        self.shape = [2]  # shape of a 1-D input
+
+
+class case2(TestShapeOp):
+    def config(self):
+        self.shape = [1, 2, 3]  # shape of a 3-D input
+
+
+class TestShapeWithSelectedRows(unittest.TestCase):
+    """Runs the shape op on a SelectedRows input for each available place."""
+
+    def get_places(self):
+        available = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            available.append(core.CUDAPlace(0))
+        if core.is_compiled_with_xpu():
+            available.append(core.XPUPlace(0))
+        return available
+
+    def check_with_place(self, place):
+        rows = [0, 1, 5, 4, 19]
+        height = 20
+        width = 2
+        values = np.ones((len(rows), width)).astype("float32")
+        scope = core.Scope()
+
+        # Fill input variable X: a SelectedRows whose tensor holds `values`.
+        selected_rows = scope.var('X').get_selected_rows()
+        selected_rows.set_rows(rows)
+        selected_rows.set_height(height)
+        selected_rows.get_tensor().set(values, place)
+
+        # Create output variable Out, then run the op on the given place.
+        out_tensor = scope.var("Out").get_tensor()
+        shape_op = Operator("shape", Input="X", Out="Out")
+        shape_op.run(scope, place)
+
+        # The expected shape is [len(rows), width] = [5, 2].
+        observed = np.array(out_tensor).tolist()
+        self.assertListEqual([5, 2], observed)
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
new file mode 100644
index 0000000000000..ab07221a07071
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
@@ -0,0 +1,54 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import paddle
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSignOp(OpTest):
+    """Runs the sign op on XPU device 0 against numpy.sign."""
+
+    def setUp(self):
+        self.op_type = "sign"
+        self.dtype = np.float32
+        data = np.random.uniform(-10, 10, (10, 10)).astype(self.dtype)
+        self.inputs = {'X': data}
+        # The reference result is NumPy's elementwise sign of the input.
+        self.outputs = {'Out': np.sign(data)}
+        self.attrs = {'use_xpu': True}
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_output_with_place(paddle.XPUPlace(0))
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
new file mode 100644
index 0000000000000..92842fbc2e65a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
@@ -0,0 +1,93 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numpy as np
+import sys
+import unittest
+sys.path.append("..")
+from op_test import OpTest
+
+paddle.enable_static()
+np.random.seed(10)
+
+
+def stable_softmax(x):
+    """Return softmax(x), shifting by max(x) first for numerical stability."""
+    # The shifted values are clipped from below at -64; otherwise exp() can
+    # reach 0 and a later log(exp(shifted)) in a loss may give log(0)=INF.
+    shifted = np.clip(x - np.max(x), -64., None)
+    weights = np.exp(shifted)
+    return weights / weights.sum()
+
+
+def ref_softmax(x, axis=None, dtype=None):
+    """NumPy reference softmax of x along `axis` (the last axis if None)."""
+    # Work on a private copy of the input, cast to `dtype` when one is given,
+    # so the caller's array is never modified.
+    data = x.copy() if dtype is None else x.copy().astype(dtype)
+    along = -1 if axis is None else axis
+    return np.apply_along_axis(stable_softmax, along, data)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSoftmaxOp(OpTest):
+    """Runs the softmax op on XPU device 0 against stable_softmax."""
+
+    def setUp(self):
+        # Defaults first; subclasses adjust them in set_attrs().
+        self.op_type = "softmax"
+        self.dtype, self.shape, self.axis = np.float32, [2, 3, 4, 5], -1
+        self.set_attrs()
+
+        data = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        self.inputs = {'X': data}
+        expected = np.apply_along_axis(stable_softmax, self.axis, data)
+        self.outputs = {'Out': expected}
+        self.attrs = {'axis': self.axis, 'use_xpu': True}
+
+    def set_attrs(self):
+        """Hook for subclasses to override shape/axis; no-op by default."""
+
+    def test_check_output(self):
+        self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out')
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSoftmaxAxis3(TestXPUSoftmaxOp):
+    def set_attrs(self):
+        self.axis = 3  # explicit index instead of the default -1
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSoftmax2D(TestXPUSoftmaxOp):
+    def set_attrs(self):
+        self.shape = [10, 12]  # 2-D input
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSoftmax3D(TestXPUSoftmaxOp):
+    def set_attrs(self):
+        self.shape = [4, 5, 6]  # 3-D input
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
new file mode 100644
index 0000000000000..3bafbf649e6ce
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
@@ -0,0 +1,61 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSumOp(OpTest):
+    """Runs the sum op on XPU device 0 over three equally-shaped inputs."""
+
+    def setUp(self):
+        self.op_type = "sum"
+        self.use_mkldnn = False
+        self.init_kernel_type()
+        # Three (3, 40) operands, fed to the op under the names x0, x1, x2.
+        operands = [
+            np.random.random((3, 40)).astype(self.dtype) for _ in range(3)
+        ]
+        self.inputs = {"X": list(zip(["x0", "x1", "x2"], operands))}
+        self.outputs = {'Out': operands[0] + operands[1] + operands[2]}
+        self.attrs = {'use_mkldnn': self.use_mkldnn, 'use_xpu': True}
+
+    def init_kernel_type(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_output_with_place(paddle.XPUPlace(0))
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            self.check_grad_with_place(paddle.XPUPlace(0), ['x0'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index 406baa9d7d65c..9565dd74d83e2 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -97,9 +97,9 @@ def generate(key):
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            name1 = fluid.unique_name.generate('fc')
-            name2 = fluid.unique_name.generate('fc')
+            import paddle
+            name1 = paddle.utils.unique_name.generate('fc')
+            name2 = paddle.utils.unique_name.generate('fc')
             print(name1, name2) # fc_0, fc_1
     """
     return generator(key)
@@ -154,19 +154,18 @@ def switch(new_generator=None, new_para_name_checker=None):
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            name1 = fluid.unique_name.generate('fc')
-            name2 = fluid.unique_name.generate('fc')
+            import paddle
+            name1 = paddle.utils.unique_name.generate('fc')
+            name2 = paddle.utils.unique_name.generate('fc')
             print(name1, name2) # fc_0, fc_1
 
-            pre_generator, pre_dygraph_name_checker = fluid.unique_name.switch() # switch to a new anonymous namespace.
-            name2 = fluid.unique_name.generate('fc')
+            pre_generator, pre_dygraph_name_checker = paddle.utils.unique_name.switch() # switch to a new anonymous namespace.
+            name2 = paddle.utils.unique_name.generate('fc')
             print(name2) # fc_0
 
-            fluid.unique_name.switch(pre_generator, pre_dygraph_name_checker) # switch back to pre_generator.
-            name3 = fluid.unique_name.generate('fc')
+            paddle.utils.unique_name.switch(pre_generator, pre_dygraph_name_checker) # switch back to pre_generator.
+            name3 = paddle.utils.unique_name.generate('fc')
             print(name3) # fc_2, since pre_generator has generated fc_0, fc_1.
-
     """
     global generator
     old_generator = generator
@@ -204,17 +203,17 @@ def guard(new_generator=None):
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            with fluid.unique_name.guard():
-              name_1 = fluid.unique_name.generate('fc')
-            with fluid.unique_name.guard():
-              name_2 = fluid.unique_name.generate('fc')
+            import paddle
+            with paddle.utils.unique_name.guard():
+                name_1 = paddle.utils.unique_name.generate('fc')
+            with paddle.utils.unique_name.guard():
+                name_2 = paddle.utils.unique_name.generate('fc')
             print(name_1, name_2) # fc_0, fc_0
 
-            with fluid.unique_name.guard('A'):
-              name_1 = fluid.unique_name.generate('fc')
-            with fluid.unique_name.guard('B'):
-              name_2 = fluid.unique_name.generate('fc') 
+            with paddle.utils.unique_name.guard('A'):
+                name_1 = paddle.utils.unique_name.generate('fc')
+            with paddle.utils.unique_name.guard('B'):
+                name_2 = paddle.utils.unique_name.generate('fc')
             print(name_1, name_2) # Afc_0, Bfc_0
     """
     if isinstance(new_generator, six.string_types):
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index 2ce442add2e02..7e2f0eb2fb8bb 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -20,8 +20,8 @@
 ]
 
 __all__ += [
-    'grad', 'LayerList', 'load', 'save', 'SaveLoadConfig', 'to_variable',
-    'no_grad', 'DataParallel'
+    'grad', 'LayerList', 'load', 'save', 'to_variable', 'no_grad',
+    'DataParallel'
 ]
 
 __all__ += [
@@ -50,7 +50,6 @@
 from ..fluid.dygraph.base import grad  #DEFINE_ALIAS
 from .io import save
 from .io import load
-from ..fluid.dygraph.jit import SaveLoadConfig  #DEFINE_ALIAS
 from ..fluid.dygraph.parallel import DataParallel  #DEFINE_ALIAS
 
 from ..fluid.dygraph.learning_rate_scheduler import NoamDecay  #DEFINE_ALIAS
diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py
index 7175f3101448f..c196c1d689bfe 100644
--- a/python/paddle/framework/io.py
+++ b/python/paddle/framework/io.py
@@ -26,7 +26,9 @@
 from paddle import fluid
 from paddle.fluid import core
 from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer
-from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME
+from paddle.fluid.dygraph.jit import _SaveLoadConfig
+from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers
+from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX
 
 __all__ = [
     'save',
@@ -55,19 +57,16 @@ def _load_state_dict_from_save_inference_model(model_path, config):
     # 2. load layer parameters & buffers
     with fluid.dygraph.guard():
         persistable_var_dict = _construct_params_and_buffers(
-            model_path,
-            programs,
-            config.separate_params,
-            config.params_filename,
-            append_suffix=False)
+            model_path, programs, config.params_filename, append_suffix=False)
 
         # 3. construct state_dict
         load_param_dict = dict()
         for var_name in persistable_var_dict:
             load_param_dict[var_name] = persistable_var_dict[var_name].numpy()
 
-        # if __variables.info__ exists, we can recover structured_name
-        var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
+        # if *.info exists, we can recover structured_name
+        var_info_filename = str(config.params_filename) + ".info"
+        var_info_path = os.path.join(model_path, var_info_filename)
         if os.path.exists(var_info_path):
             with open(var_info_path, 'rb') as f:
                 extra_var_info = pickle.load(f)
@@ -116,12 +115,99 @@ def _load_state_dict_from_save_params(model_path):
     return load_param_dict
 
 
+# NOTE(chenweihang): [ Handling of use cases of API paddle.load ]
+# `paddle.load` may be used to load saved results of:
+# 1. Expected cases:
+#   - need [full filename] when loading
+#       - paddle.save
+#       - paddle.static.save
+#       - paddle.fluid.save_dygraph
+#   - need [prefix] when loading [compatible for paddle 2.x]
+#       - paddle.jit.save
+#       - paddle.static.save_inference_model
+#   - need [directory] when loading [compatible for paddle 1.x]
+#       - paddle.fluid.io.save_inference_model
+#       - paddle.fluid.io.save_params/save_persistable
+# 2. Error cases:
+#   - no error case
+def _build_load_path_and_config(path, config):
+    """Map ``path`` (file prefix or directory) to a model dir and config."""
+    has_prefix_model = os.path.exists(path + INFER_MODEL_SUFFIX)
+    has_directory = os.path.isdir(path)
+    # NOTE(chenweihang): a prefix-format model and a same-named directory
+    # must not coexist, otherwise it is ambiguous which one should be loaded.
+    if has_prefix_model and has_directory:
+        raise ValueError(
+            "The %s.pdmodel and %s directory exist at the same time, "
+            "don't know which one to load, please make sure that the specified target "
+            "of ``path`` is unique." % (path, path))
+
+    if not (has_prefix_model or has_directory):
+        error_msg = "The ``path`` (%s) to load model not exists."
+        # When <path>.pdparams or <path>.pdopt exists, the caller probably
+        # passed the prefix of a `fluid.save_dygraph` result; extend the
+        # message with a hint to use the full file name.
+        hint_files = (path + ".pdparams", path + ".pdopt")
+        if os.path.exists(hint_files[0]) or os.path.exists(hint_files[1]):
+            error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \
+                "please specify the full file name, not just the file name prefix. For " \
+                "example, it should be written as `paddle.load('model.pdparams')` instead of " \
+                "`paddle.load('model')`."
+        raise ValueError(error_msg % path)
+
+    if not has_prefix_model:
+        # Compatible with the old save_inference_model format
+        return path, config
+
+    # Prefix format: both file names are derived from the prefix, so any
+    # user-supplied names are overridden (with a warning).
+    file_prefix = os.path.basename(path)
+    if config.model_filename is not None:
+        warnings.warn(
+            "When loading the result saved with the "
+            "specified file prefix, the ``model_filename`` config does "
+            "not take effect.")
+    config.model_filename = file_prefix + INFER_MODEL_SUFFIX
+    if config.params_filename is not None:
+        warnings.warn(
+            "When loading the result saved with the "
+            "specified file prefix, the ``params_filename`` config does "
+            "not take effect.")
+    config.params_filename = file_prefix + INFER_PARAMS_SUFFIX
+    return os.path.dirname(path), config
+
+
+def _parse_load_config(configs):
+    """Validate `paddle.load` keyword options and pack them in a config."""
+    known = ('model_filename', 'params_filename', 'keep_name_table')
+
+    # Reject the first option name that is not supported.
+    for name in configs:
+        if name not in known:
+            raise ValueError(
+                "The additional config (%s) of `paddle.load` is not supported."
+                % name)
+
+    # Options that were not passed are stored as None.
+    parsed = _SaveLoadConfig()
+    for name in known:
+        setattr(parsed, name, configs.get(name, None))
+
+    return parsed
+
+
 def save(obj, path):
     '''
     Save an object to the specified path.
     
     .. note::
         Now only supports save ``state_dict`` of Layer or Optimizer.
+
+    .. note::
+        ``paddle.save`` will not add a suffix to the saved results, 
+        but we recommend that you use the following paddle standard suffixes:
+        1. for ``Layer.state_dict`` -> ``.pdparams``
+        2. for ``Optimizer.state_dict`` -> ``.pdopt``
     
     Args:
         obj(Object) : The object to be saved.
@@ -178,7 +264,7 @@ def save(obj, path):
         pickle.dump(saved_obj, f, protocol=2)
 
 
-def load(path, config=None):
+def load(path, **configs):
     '''
     Load an object can be used in paddle from specified path.
 
@@ -186,21 +272,39 @@ def load(path, config=None):
         Now only supports load ``state_dict`` of Layer or Optimizer.
 
     .. note::
-        ``paddle.load`` supports loading ``state_dict`` from the result of several 
-        paddle1.x save APIs in static mode, but due to some historical reasons, 
-        if you load ``state_dict`` from the saved result of 
-        ``paddle.static.save_inference_model/paddle.fluid.io.save_params/paddle.fluid.io.save_persistables`` , 
+        ``paddle.load`` supports loading ``state_dict`` of Layer or Optimizer from 
+        the result of other save APIs except ``paddle.save`` , but the argument 
+        ``path`` format is different:
+        1. loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` ,  
+        ``path`` needs to be a complete file name, such as ``model.pdparams`` or 
+        ``model.pdopt`` ; 
+        2. loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model`` 
+        or ``paddle.Model().save(training=False)`` , ``path`` need to be a file prefix, 
+        such as ``model/mnist``, and ``paddle.load`` will get information from 
+        ``mnist.pdmodel`` and ``mnist.pdiparams`` ;
+        3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or 
+        ``paddle.fluid.io.save_params/save_persistables`` , ``path`` need to be a 
+        directory, such as ``model`` and model is a directory.
+
+    .. note::
+        If you load ``state_dict`` from the saved result of 
+        ``paddle.static.save`` or ``paddle.static.save_inference_model`` , 
         the structured variable name will cannot be restored. You need to set the argument 
         ``use_structured_name=False`` when using ``Layer.set_state_dict`` later.
 
     Args:
         path(str) : The path to load the target object. Generally, the path is the target 
-            file path, when compatible with loading the saved results of 
-            ``paddle.jit.save/paddle.static.save_inference_model`` , the path is a directory. 
-        config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig`
-            object that specifies additional configuration options, these options 
-            are for compatibility with ``paddle.jit.save/paddle.static.save_inference_model`` 
-            formats. Default None.
+            file path. When compatible with loading the saved results of other APIs, the path 
+            can be a file prefix or directory. 
+        **configs (dict, optional): other load configuration options for compatibility. We do not 
+            recommend using these configurations, they may be removed in the future. If not necessary, 
+            DO NOT use them. Default None.
+            The following options are currently supported:
+            (1) model_filename (string): The inference model file name of the paddle 1.x 
+            ``save_inference_model`` save format. Default file name is :code:`__model__` . 
+            (2) params_filename (string): The persistable variables file name of the paddle 1.x 
+            ``save_inference_model`` save format. No default file name, save variables separately 
+            by default.
 
     Returns:
         Object(Object): a target object can be used in paddle
@@ -227,26 +331,9 @@ def load(path, config=None):
             load_layer_state_dict = paddle.load("emb.pdparams")
             load_opt_state_dict = paddle.load("adam.pdopt")
     '''
-    # 1. input check
-    if not os.path.exists(path):
-        error_msg = "The path `%s` does not exist."
-        # if current path is a prefix, and the path.pdparams or path.pdopt
-        # is exist, users may want use `paddle.load` load the result of 
-        # `fluid.save_dygraph`, we raise error here for users
-        params_file_path = path + ".pdparams"
-        opti_file_path = path + ".pdopt"
-        if os.path.exists(params_file_path) or os.path.exists(opti_file_path):
-            error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \
-                "please specify the full file name, not just the file name prefix. For " \
-                "example, it should be written as `paddle.load('model.pdparams')` instead of " \
-                "`paddle.load('model')`."
-        raise ValueError(error_msg % path)
-
-    if config is None:
-        config = paddle.SaveLoadConfig()
-
-    # 2. load target
     load_result = None
+    config = _parse_load_config(configs)
+
     if os.path.isfile(path):
         # we think path is file means this file is created by paddle.save
         with open(path, 'rb') as f:
@@ -255,16 +342,15 @@ def load(path, config=None):
 
         if not config.keep_name_table and "StructuredToParameterName@@" in load_result:
             del load_result["StructuredToParameterName@@"]
-    elif os.path.isdir(path):
-        # we think path is directory means compatible with loading 
-        # store results of static mode related save APIs
-
+    else:
+        # file prefix and directory are compatible cases
+        model_path, config = _build_load_path_and_config(path, config)
         # check whether model file exists
         if config.model_filename is None:
             model_filename = '__model__'
         else:
             model_filename = config.model_filename
-        model_file_path = os.path.join(path, model_filename)
+        model_file_path = os.path.join(model_path, model_filename)
 
         if os.path.exists(model_file_path):
             # Load state dict by `jit.save/io.save_inference_model` save format
@@ -274,7 +360,7 @@ def load(path, config=None):
             # `save_inference_model` not save structured name, we need to remind 
             # the user to configure the `use_structured_name` argument when `set_state_dict`
             # NOTE(chenweihang): `jit.save` doesn't save optimizer state 
-            load_result = _load_state_dict_from_save_inference_model(path,
+            load_result = _load_state_dict_from_save_inference_model(model_path,
                                                                      config)
         else:
             # load state dict by `io.save_params/persistables` save format
@@ -283,9 +369,6 @@ def load(path, config=None):
             # mapping info will lost, so users need to give variable list, but users build 
             # variable list in dygraph mode is difficult, we recommend users to use
             # paddle.static.load_program_state in this case
-            load_result = _load_state_dict_from_save_params(path)
-    else:
-        raise ValueError(
-            "Unsupported path format, now only supports file or directory.")
+            load_result = _load_state_dict_from_save_params(model_path)
 
     return load_result
diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py
index 69b7fedd72eed..4a1751b331d21 100644
--- a/python/paddle/hapi/callbacks.py
+++ b/python/paddle/hapi/callbacks.py
@@ -13,12 +13,14 @@
 # limitations under the License.
 
 import os
+import numbers
 
 from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.utils import try_import
 
 from .progressbar import ProgressBar
 
-__all__ = ['Callback', 'ProgBarLogger', 'ModelCheckpoint']
+__all__ = ['Callback', 'ProgBarLogger', 'ModelCheckpoint', 'VisualDL']
 
 
 def config_callbacks(callbacks=None,
@@ -471,3 +473,111 @@ def on_train_end(self, logs=None):
             path = '{}/final'.format(self.save_dir)
             print('save checkpoint at {}'.format(os.path.abspath(path)))
             self.model.save(path)
+
+
+class VisualDL(Callback):
+    """Callback that logs train/eval metrics as VisualDL scalars (rank 0 only).
+
+    Args:
+        log_dir (str): The directory to save visualdl log file.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.static import InputSpec
+
+            inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')]
+            labels = [InputSpec([None, 1], 'int64', 'label')]
+
+            train_dataset = paddle.vision.datasets.MNIST(mode='train')
+            eval_dataset = paddle.vision.datasets.MNIST(mode='test')
+
+            net = paddle.vision.LeNet()
+            model = paddle.Model(net, inputs, labels)
+
+            optim = paddle.optimizer.Adam(0.001, parameters=net.parameters())
+            model.prepare(optimizer=optim,
+                        loss=paddle.nn.CrossEntropyLoss(),
+                        metrics=paddle.metric.Accuracy())
+
+            ## uncomment following lines to fit model with visualdl callback function
+            # callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir')
+            # model.fit(train_dataset, eval_dataset, batch_size=64, callbacks=callback)
+
+    """
+
+    def __init__(self, log_dir):
+        self.log_dir = log_dir
+        self.epochs = None
+        self.steps = None
+        self.epoch = 0
+
+    def _is_write(self):
+        return ParallelEnv().local_rank == 0
+
+    def on_train_begin(self, logs=None):
+        self.epochs = self.params['epochs']
+        assert self.epochs
+        self.train_metrics = self.params['metrics']
+        assert self.train_metrics
+        self._is_fit = True
+        self.train_step = 0
+
+    def on_epoch_begin(self, epoch=None, logs=None):
+        self.steps = self.params['steps']
+        self.epoch = epoch
+
+    def _updates(self, logs, mode):
+        """Write each metric present in ``logs`` as a '<mode>/<name>' scalar."""
+        if not self._is_write():
+            return
+        if not hasattr(self, 'writer'):
+            visualdl = try_import('visualdl')
+            self.writer = visualdl.LogWriter(self.log_dir)
+
+        metrics = getattr(self, '%s_metrics' % (mode))
+        current_step = getattr(self, '%s_step' % (mode))
+        # Train scalars are stepped by batch count, eval scalars by epoch.
+        total_step = current_step if mode == 'train' else self.epoch
+
+        for k in metrics:
+            if k in logs:
+                temp_tag = mode + '/' + k
+
+                if isinstance(logs[k], (list, tuple)):
+                    temp_value = logs[k][0]
+                elif isinstance(logs[k], numbers.Number):
+                    temp_value = logs[k]
+                else:
+                    continue
+
+                self.writer.add_scalar(
+                    tag=temp_tag, step=total_step, value=temp_value)
+
+    def on_train_batch_end(self, step, logs=None):
+        logs = logs or {}
+        self.train_step += 1
+
+        if self._is_write():
+            self._updates(logs, 'train')
+
+    def on_eval_begin(self, logs=None):
+        logs = logs or {}
+        self.eval_steps = logs.get('steps', None)
+        self.eval_metrics = logs.get('metrics', [])
+        self.eval_step = 0
+        self.evaled_samples = 0
+
+    def on_train_end(self, logs=None):
+        if hasattr(self, 'writer'):
+            self.writer.close()
+            delattr(self, 'writer')
+
+    def on_eval_end(self, logs=None):
+        if self._is_write():
+            self._updates(logs or {}, 'eval')
+            # `_is_fit` is only set in on_train_begin; without it, close here.
+            if (not hasattr(self, '_is_fit')) and hasattr(self, 'writer'):
+                self.writer.close()
+                delattr(self, 'writer')
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index 8505544a71f58..21e3054dde7d7 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -201,8 +201,11 @@ def _init_context():
 
 
 def _update_input_shapes(inputs):
+    "Get input shape list by given inputs in Model initialization."
     shapes = None
-    if isinstance(inputs, list):
+    if isinstance(inputs, Input):
+        shapes = [list(inputs.shape)]
+    elif isinstance(inputs, list):
         shapes = [list(input.shape) for input in inputs]
     elif isinstance(inputs, dict):
         shapes = [list(inputs[name].shape) for name in inputs]
@@ -638,19 +641,14 @@ def train_batch(self, inputs, labels=None):
 
         if self._nranks > 1:
             outputs = self.ddp_model.forward(* [to_variable(x) for x in inputs])
-            losses = self.model._loss(*(to_list(outputs) + labels))
-            losses = to_list(losses)
-            final_loss = fluid.layers.sum(losses)
-            final_loss = self.ddp_model.scale_loss(final_loss)
-            final_loss.backward()
-            self.ddp_model.apply_collective_grads()
         else:
             outputs = self.model.network.forward(
                 * [to_variable(x) for x in inputs])
-            losses = self.model._loss(*(to_list(outputs) + labels))
-            losses = to_list(losses)
-            final_loss = fluid.layers.sum(losses)
-            final_loss.backward()
+
+        losses = self.model._loss(*(to_list(outputs) + labels))
+        losses = to_list(losses)
+        final_loss = fluid.layers.sum(losses)
+        final_loss.backward()
 
         self.model._optimizer.minimize(final_loss)
         self.model.network.clear_gradients()
@@ -922,9 +920,7 @@ def train_batch(self, inputs, labels=None):
         """
         loss = self._adapter.train_batch(inputs, labels)
         if fluid.in_dygraph_mode() and self._input_shapes is None:
-            self._input_shapes = self._adapter._input_shapes
-            self._is_shape_inferred = True
-            self._inputs = self._verify_spec(None, self._input_shapes, True)
+            self._update_inputs()
         return loss
 
     def eval_batch(self, inputs, labels=None):
@@ -972,9 +968,7 @@ def eval_batch(self, inputs, labels=None):
         """
         loss = self._adapter.eval_batch(inputs, labels)
         if fluid.in_dygraph_mode() and self._input_shapes is None:
-            self._input_shapes = self._adapter._input_shapes
-            self._is_shape_inferred = True
-            self._inputs = self._verify_spec(None, self._input_shapes, True)
+            self._update_inputs()
         return loss
 
     def test_batch(self, inputs):
@@ -1017,9 +1011,7 @@ def test_batch(self, inputs):
         """
         loss = self._adapter.test_batch(inputs)
         if fluid.in_dygraph_mode() and self._input_shapes is None:
-            self._input_shapes = self._adapter._input_shapes
-            self._is_shape_inferred = True
-            self._inputs = self._verify_spec(None, self._input_shapes, True)
+            self._update_inputs()
         return loss
 
     def save(self, path, training=True):
@@ -1712,7 +1704,7 @@ def get_inout_spec(all_vars, return_name=False):
                 layer = self.network
                 if self._input_shapes is None:  # No provided or inferred
                     raise RuntimeError(
-                        "Saving inference model needs 'inputs' or running before saving. Please specify 'inputs' in Model initialization or input training zqqdata and perform a training for shape derivation."
+                        "Saving inference model needs 'inputs' or running before saving. Please specify 'inputs' in Model initialization or input training data and perform a training for shape derivation."
                     )
                 if self._is_shape_inferred:
                     warnings.warn(
@@ -1958,3 +1950,9 @@ def _len_data_loader(self, data_loader):
         except Exception:
             steps = None
         return steps
+
+    def _update_inputs(self):
+        "Take input shapes from the adapter, mark them inferred, rebuild self._inputs."
+        self._input_shapes = self._adapter._input_shapes
+        self._is_shape_inferred = True
+        self._inputs = self._verify_spec(None, self._input_shapes, True)
diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py
index 3ead3fc295c0b..30b22a2f32c34 100644
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
@@ -106,6 +106,12 @@ def forward(self, inputs):
         warnings.warn(
             "Your model was created in static mode, this may not get correct summary information!"
         )
+        in_train_mode = False
+    else:
+        in_train_mode = net.training
+
+    if in_train_mode:
+        net.eval()
 
     def _is_shape(shape):
         for item in shape:
@@ -143,9 +149,13 @@ def _check_input(input_size):
     result, params_info = summary_string(net, _input_size, dtypes)
     print(result)
 
+    if in_train_mode:
+        net.train()
+
     return params_info
 
 
+@paddle.no_grad()
 def summary_string(model, input_size, dtypes=None):
     def _all_is_numper(items):
         for item in items:
@@ -244,7 +254,7 @@ def build_input(input_size, dtypes):
                 dtype = dtypes[0]
             else:
                 dtype = dtypes
-            return paddle.rand(list(input_size), dtype)
+            return paddle.cast(paddle.rand(list(input_size)), dtype)
         else:
             return [
                 build_input(i, dtype) for i, dtype in zip(input_size, dtypes)
diff --git a/python/paddle/incubate/complex/tensor/manipulation.py b/python/paddle/incubate/complex/tensor/manipulation.py
index 7852260a31e3c..d1e0cbed82e99 100644
--- a/python/paddle/incubate/complex/tensor/manipulation.py
+++ b/python/paddle/incubate/complex/tensor/manipulation.py
@@ -128,16 +128,13 @@ def transpose(x, perm, name=None):
         .. code-block:: python
  
             import paddle
-            import numpy as np
-            import paddle.fluid.dygraph as dg
  
-            with dg.guard():
-                a = np.array([[1.0 + 1.0j, 2.0 + 1.0j], [3.0+1.0j, 4.0+1.0j]])
-                x = dg.to_variable(a)
-                y = paddle.complex.transpose(x, [1, 0])
-                print(y.numpy())
-                # [[1.+1.j 3.+1.j]
-                #  [2.+1.j 4.+1.j]]
+            x = paddle.to_tensor([[1.0 + 1.0j, 2.0 + 1.0j], [3.0+1.0j, 4.0+1.0j], [5.0+1.0j, 6.0+1.0j]])
+            x_transposed = paddle.complex.transpose(x, [1, 0])
+            print(x_transposed.numpy())
+            #[[1.+1.j 3.+1.j 5.+1.j]
+            # [2.+1.j 4.+1.j 6.+1.j]]
+
     """
     complex_variable_exists([x], "transpose")
     real = layers.transpose(x.real, perm, name)
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index b79b965f5b902..1dddef0cace1d 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -31,10 +31,9 @@
 __all__ += weight_norm_hook.__all__
 
 # TODO: define alias in nn directory
-# from .clip import ErrorClipByValue        #DEFINE_ALIAS
-from .clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from .clip import GradientClipByNorm  #DEFINE_ALIAS
-from .clip import GradientClipByValue  #DEFINE_ALIAS
+from .clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from .clip import ClipGradByNorm  #DEFINE_ALIAS
+from .clip import ClipGradByValue  #DEFINE_ALIAS
 # from .clip import set_gradient_clip        #DEFINE_ALIAS
 from .clip import clip  #DEFINE_ALIAS
 from .clip import clip_by_norm  #DEFINE_ALIAS
@@ -45,32 +44,36 @@
 # from .control_flow import rnn        #DEFINE_ALIAS
 # from .decode import BeamSearchDecoder        #DEFINE_ALIAS
 # from .decode import Decoder        #DEFINE_ALIAS
-from .decode import beam_search  #DEFINE_ALIAS
-from .decode import beam_search_decode  #DEFINE_ALIAS
+# from .decode import beam_search  #DEFINE_ALIAS
+# from .decode import beam_search_decode  #DEFINE_ALIAS
 # from .decode import crf_decoding        #DEFINE_ALIAS
 # from .decode import ctc_greedy_decoder        #DEFINE_ALIAS
 # from .decode import dynamic_decode        #DEFINE_ALIAS
 from .decode import gather_tree  #DEFINE_ALIAS
 # from .input import Input        #DEFINE_ALIAS
-from .layer.activation import ELU
-from .layer.activation import GELU
-from .layer.activation import Tanh
-from .layer.activation import Hardshrink
-from .layer.activation import Hardtanh
-from .layer.activation import PReLU
-from .layer.activation import ReLU
+from .layer.activation import ELU  #DEFINE_ALIAS
+from .layer.activation import GELU  #DEFINE_ALIAS
+from .layer.activation import Tanh  #DEFINE_ALIAS
+from .layer.activation import Hardshrink  #DEFINE_ALIAS
+from .layer.activation import Hardswish  #DEFINE_ALIAS
+from .layer.activation import Hardtanh  #DEFINE_ALIAS
+from .layer.activation import PReLU  #DEFINE_ALIAS
+from .layer.activation import ReLU  #DEFINE_ALIAS
 from .layer.activation import ReLU6  #DEFINE_ALIAS
 from .layer.activation import SELU  #DEFINE_ALIAS
 from .layer.activation import LeakyReLU  #DEFINE_ALIAS
 from .layer.activation import Sigmoid  #DEFINE_ALIAS
+from .layer.activation import Hardsigmoid  #DEFINE_ALIAS
 from .layer.activation import LogSigmoid
 from .layer.activation import Softmax  #DEFINE_ALIAS
 from .layer.activation import Softplus  #DEFINE_ALIAS
 from .layer.activation import Softshrink  #DEFINE_ALIAS
 from .layer.activation import Softsign  #DEFINE_ALIAS
+from .layer.activation import Swish  #DEFINE_ALIAS
 from .layer.activation import Tanhshrink  #DEFINE_ALIAS
+from .layer.activation import ThresholdedReLU  #DEFINE_ALIAS
 from .layer.activation import LogSoftmax  #DEFINE_ALIAS
-from .layer.activation import HSigmoid  #DEFINE_ALIAS
+from .layer.activation import Maxout  #DEFINE_ALIAS
 from .layer.common import BilinearTensorProduct  #DEFINE_ALIAS
 from .layer.common import Pool2D  #DEFINE_ALIAS
 from .layer.common import Pad2D  #DEFINE_ALIAS
@@ -129,6 +132,7 @@
 # from .layer.loss import NCELoss        #DEFINE_ALIAS
 from .layer.loss import BCEWithLogitsLoss  #DEFINE_ALIAS
 from .layer.loss import CrossEntropyLoss  #DEFINE_ALIAS
+from .layer.loss import HSigmoidLoss  #DEFINE_ALIAS
 from .layer.loss import MSELoss  #DEFINE_ALIAS
 from .layer.loss import L1Loss  #DEFINE_ALIAS
 from .layer.loss import NLLLoss  #DEFINE_ALIAS
@@ -149,10 +153,17 @@
 from .layer.norm import BatchNorm1d  #DEFINE_ALIAS
 from .layer.norm import BatchNorm2d  #DEFINE_ALIAS
 from .layer.norm import BatchNorm3d  #DEFINE_ALIAS
-from .layer.rnn import *
-# from .layer.rnn import RNNCell        #DEFINE_ALIAS
-# from .layer.rnn import GRUCell        #DEFINE_ALIAS
-# from .layer.rnn import LSTMCell        #DEFINE_ALIAS
+
+from .layer.rnn import RNNCellBase  #DEFINE_ALIAS
+from .layer.rnn import SimpleRNNCell  #DEFINE_ALIAS
+from .layer.rnn import LSTMCell  #DEFINE_ALIAS
+from .layer.rnn import GRUCell  #DEFINE_ALIAS
+from .layer.rnn import RNN  #DEFINE_ALIAS
+from .layer.rnn import BiRNN  #DEFINE_ALIAS
+from .layer.rnn import SimpleRNN  #DEFINE_ALIAS
+from .layer.rnn import LSTM  #DEFINE_ALIAS
+from .layer.rnn import GRU  #DEFINE_ALIAS
+
 from .layer.transformer import MultiHeadAttention
 from .layer.transformer import TransformerEncoderLayer
 from .layer.transformer import TransformerEncoder
diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py
index a50dad628cf32..9fd1241bd83e0 100644
--- a/python/paddle/nn/clip.py
+++ b/python/paddle/nn/clip.py
@@ -13,18 +13,18 @@
 # limitations under the License.
 
 # TODO: define the functions to clip gradient of parameter  
-from ..fluid.clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByValue  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByValue  #DEFINE_ALIAS
 from ..fluid.layers import clip  #DEFINE_ALIAS
 
 from ..fluid.layers import clip_by_norm  #DEFINE_ALIAS
 
 __all__ = [
     #       'ErrorClipByValue',
-    'GradientClipByGlobalNorm',
-    'GradientClipByNorm',
-    'GradientClipByValue',
+    'ClipGradByGlobalNorm',
+    'ClipGradByNorm',
+    'ClipGradByValue',
     #       'set_gradient_clip',
     'clip',
     'clip_by_norm'
diff --git a/python/paddle/nn/decode.py b/python/paddle/nn/decode.py
index f01a5ed15b650..214744217e957 100644
--- a/python/paddle/nn/decode.py
+++ b/python/paddle/nn/decode.py
@@ -13,16 +13,16 @@
 # limitations under the License.
 
 # TODO: define api to implement decoding algorithm  
-from ..fluid.layers import beam_search  #DEFINE_ALIAS
-from ..fluid.layers import beam_search_decode  #DEFINE_ALIAS
+# from ..fluid.layers import beam_search  #DEFINE_ALIAS
+# from ..fluid.layers import beam_search_decode  #DEFINE_ALIAS
 
 from ..fluid.layers import gather_tree  #DEFINE_ALIAS
 
 __all__ = [
     #       'BeamSearchDecoder',
     #       'Decoder',
-    'beam_search',
-    'beam_search_decode',
+    #       'beam_search',
+    #       'beam_search_decode',
     #       'crf_decoding',
     #       'ctc_greedy_decoder',
     #       'dynamic_decode',
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 13bc99875638d..30eefb2c3912b 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -29,15 +29,13 @@
 __all__ += pooling.__all__
 from . import loss
 __all__ += loss.__all__
-from .activation import brelu  #DEFINE_ALIAS
 from .activation import elu  #DEFINE_ALIAS
 from .activation import erf  #DEFINE_ALIAS
 from .activation import gelu  #DEFINE_ALIAS
 from .activation import hardshrink  #DEFINE_ALIAS
 from .activation import hardtanh  #DEFINE_ALIAS
-from .activation import hard_sigmoid  #DEFINE_ALIAS
-from .activation import hard_swish  #DEFINE_ALIAS
-from .activation import hsigmoid  #DEFINE_ALIAS
+from .activation import hardsigmoid  #DEFINE_ALIAS
+from .activation import hardswish  #DEFINE_ALIAS
 from .activation import leaky_relu  #DEFINE_ALIAS
 from .activation import log_sigmoid  #DEFINE_ALIAS
 from .activation import maxout  #DEFINE_ALIAS
@@ -141,7 +139,7 @@
 from .loss import cross_entropy  #DEFINE_ALIAS
 from .loss import dice_loss  #DEFINE_ALIAS
 from .loss import edit_distance  #DEFINE_ALIAS
-from .loss import huber_loss  #DEFINE_ALIAS
+from .loss import hsigmoid_loss  #DEFINE_ALIAS
 from .loss import iou_similarity  #DEFINE_ALIAS
 from .loss import kl_div  #DEFINE_ALIAS
 from .loss import l1_loss  #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index f7bbe0c94e03d..33ecd29162c12 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -13,27 +13,19 @@
 # limitations under the License.
 
 # TODO: define activation functions of neural network
-from ...fluid.layers import brelu  #DEFINE_ALIAS
 from ...fluid.layers import erf  #DEFINE_ALIAS
-from ...fluid.layers import hard_sigmoid  #DEFINE_ALIAS
-from ...fluid.layers import hard_swish  #DEFINE_ALIAS
-from ...fluid.layers import maxout  #DEFINE_ALIAS
 from ...fluid.layers import soft_relu  #DEFINE_ALIAS
-from ...fluid.layers import swish  #DEFINE_ALIAS
 from ...fluid.layers import sigmoid  #DEFINE_ALIAS
-from ...fluid.layers import thresholded_relu  #DEFINE_ALIAS
 from ...tensor.math import tanh  #DEFINE_ALIAS
 
 __all__ = [
-    'brelu',
     'elu',
     'erf',
     'gelu',
     'hardshrink',
     'hardtanh',
-    'hard_sigmoid',
-    'hard_swish',
-    'hsigmoid',
+    'hardsigmoid',
+    'hardswish',
     'leaky_relu',
     'log_sigmoid',
     'maxout',
@@ -75,10 +67,10 @@ def elu(x, alpha=1.0, name=None):
         alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Returns:
         A Tensor with the same data type and shape as ``x`` .
-    
+
     Examples:
         .. code-block:: python
 
@@ -89,7 +81,7 @@ def elu(x, alpha=1.0, name=None):
             paddle.disable_static()
 
             x = paddle.to_tensor(np.array([[-1,6],[1,15.6]]))
-            out = F.elu(x, alpha=0.2) 
+            out = F.elu(x, alpha=0.2)
             # [[-0.12642411  6.        ]
             #  [ 1.          15.6      ]]
     """
@@ -123,16 +115,16 @@ def gelu(x, approximate=False, name=None):
     .. math::
 
         gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}}))
-    
+
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
         approximate (bool, optional): Wether to enable approximation. Default is False.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Returns:
         A Tensor with the same data type and shape as ``x`` .
-    
+
     Examples:
         .. code-block:: python
 
@@ -265,125 +257,106 @@ def hardtanh(x, min=-1.0, max=1.0, name=None):
     return out
 
 
-def hsigmoid(input,
-             label,
-             weight,
-             bias,
-             num_classes,
-             path_table=None,
-             path_code=None,
-             is_sparse=False):
+def hardsigmoid(x, name=None):
     """
-	:alias_main: paddle.nn.functional.hsigmoid
-	:alias: paddle.nn.functional.hsigmoid,paddle.nn.functional.activation.hsigmoid
-
-    The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
-    and speed up the model training, especially the training of language model.
-    Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier.
-    For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on
-    the path, and sum them to get a total cost.
-    Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
-    represents the number of classes or the size of word dict.
-
-    The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
-    Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_. For the custom
-    tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example):
-
-    1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict.
-    2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table.
-    3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code.
-       Code means the label of each binary classifier, 1 indicate true, 0 indicate false.
-    4. Now, each word should has its path and code along the path, you can pass a batch of path and code related
-       to the same batch of inputs.
+    hardsigmoid activation.
+
+    A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
+    which is much faster than sigmoid.
+
+    .. math::
+
+        hardsigmoid(x)=
+            \\left\\{
+            \\begin{aligned}
+            &0, & & \\text{if } x \\leq -3 \\\\
+            &1, & & \\text{if } x \\geq 3 \\\\
+            &x/6 + 1/2, & & \\text{otherwise}
+            \\end{aligned}
+            \\right.
 
     Parameters:
-        input (Variable): A tensor with the shape [N, D], where N is the size of mini-batch,
-            and D is the feature size. Its data type supports float32 and float64.
-        label (Variable): A tensor contains the labels of training data. Its shape is [N, 1]
-            and data type is int64.
-        weight (Variable): A tensor with shape (num_classes - 1, D) if not using custom tree(path_code and path_table is None), or (num_classes, D) if using custom tree.
-        bias (Variable): A tensor with shape (num_classes - 1, 1) if not using custom tree(path_code and path_table is None), or (num_classes, 1) if using custom tree.
-        num_classes (int): The number of classes or the size of word dict, must be greater than 2.
-            If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes`
-            should not be None. If the custom tree is used (:attr:`is_custom` is set to True),
-            :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of
-            classes using by the binary classifier.
-        path_table (Variable, optional): A tensor that stores each batch of samples' path from leaf to root
-            node, its shape is [N, L] and data type is int64, where L is the length of path. For each sample i,
-            path_table[i] is a np.array like structure and each element in this array is the indexes in parent
-            nodes' weight matrix. Default: None.
-        path_code (Variable, optional): A tensor that stores each batch of samples' code of path from leaf
-            to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`.
-            Each code of path is consisted with the code of nodes from leaf to root node. Default: None.
-        is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the
-            gradient of W and input will be sparse. Default: False.
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`.
+        A Tensor with the same data type and shape as ``x`` .
 
     Examples:
         .. code-block:: python
 
-            from paddle import fluid, nn
-            import paddle.fluid.dygraph as dg
+            import paddle
             import paddle.nn.functional as F
-            import numpy as np
 
-            main = fluid.Program()
-            start = fluid.Program()
-            feature_size = 6
-            num_classes = 8
-            with fluid.unique_name.guard():
-                with fluid.program_guard(main, start):
-                    x = fluid.data("input", [-1, feature_size],
-                                  dtype="float32")
-                    label = fluid.data("labels", [-1, 1], dtype="int64")
-                    w = fluid.data("weight", (num_classes -1, feature_size), dtype="float32")
-                    b = fluid.data("bias", (num_classes -1, ), dtype="float32")
-                    y = F.hsigmoid(x, label, w, b, num_classes)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(start)
-            feed_dict = {
-                "input": np.random.randn(4, feature_size).astype(np.float32),
-                "labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64),
-                "weight": np.random.randn(num_classes - 1, feature_size).astype(np.float32),
-                "bias": np.random.randn(num_classes - 1, ).astype(np.float32),
-            }
-            y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
-            print(y_np.shape)
-
-          # (4, 1)
+            x = paddle.to_tensor([-4., 5., 1.])
+            out = F.hardsigmoid(x) # [0., 1., 0.666667]
     """
 
-    attrs = {
-        "num_classes": num_classes,
-        "is_sparse": is_sparse,
-        "remote_prefetch": is_sparse
-    }
+    if in_dygraph_mode():
+        return core.ops.hard_sigmoid(x, 'slope', 0.1666666666666667, 'offset',
+                                     0.5)
 
-    inputs = {
-        "X": input,
-        "W": weight,
-        "Bias": bias,
-        "PathTable": path_table,
-        "PathCode": path_code,
-        "Label": label
-    }
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'hardsigmoid')
 
-    helper = LayerHelper('hierarchical_sigmoid', **locals())
-    dtype = helper.input_dtype()
+    helper = LayerHelper('hardsigmoid', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='hard_sigmoid',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'slope': 0.1666666666666667,
+               'offset': 0.5})
+    return out
 
-    out = helper.create_variable_for_type_inference(dtype)
-    pre_out = helper.create_variable_for_type_inference(dtype)
-    outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight}
 
-    helper.append_op(
-        type="hierarchical_sigmoid",
-        inputs=inputs,
-        outputs=outputs,
-        attrs=attrs)
+def hardswish(x, name=None):
+    """
+    hardswish activation
+
+    hardswish is proposed in MobileNetV3, and performs better in computational stability
+    and efficiency compared to swish function. For more details please refer
+    to: https://arxiv.org/pdf/1905.02244.pdf
+
+    .. math::
+
+        hardswish(x)=
+            \\left\\{
+            \\begin{aligned}
+            &0, & & \\text{if } x \\leq -3 \\\\
+            &x, & & \\text{if } x \\geq 3 \\\\
+            &\\frac{x(x+3)}{6}, & & \\text{otherwise}
+            \\end{aligned}
+            \\right.
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+
+            x = paddle.to_tensor([-4., 5., 1.])
+            out = F.hardswish(x) # [0., 5., 0.666667]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.hard_swish(x)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'hardswish')
+
+    helper = LayerHelper('hardswish', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(type='hard_swish', inputs={'X': x}, outputs={'Out': out})
     return out
 
 
@@ -489,7 +462,7 @@ def prelu(x, weight, name=None):
     assert len(weight.shape
                ) == 1, "The dim count of weight shape should be 1 in prelu()."
 
-    # NOTE(): The input of this API should be ``N,C,...`` format, 
+    # NOTE(): The input of this API should be ``N,C,...`` format,
     # which means x.shape[0] is batch_size and x.shape[0] is channel.
     mode = 'all'
     if weight.shape[0] > 1:
@@ -559,15 +532,15 @@ def log_sigmoid(x, name=None):
     .. math::
 
         log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}}
-    
+
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Returns:
         A Tensor with the same data type and shape as ``x`` .
-    
+
     Examples:
         .. code-block:: python
 
@@ -591,6 +564,81 @@ def log_sigmoid(x, name=None):
     return out
 
 
+def maxout(x, groups, axis=1, name=None):
+    """
+    maxout activation.
+
+    Assumed the input shape is (N, Ci, H, W).
+    The output shape is (N, Co, H, W).
+    Then Co = Ci/groups and the operator formula is as follows:
+
+    .. math::
+
+        &out_{si+j} = \\max_{k} x_{gsi + sk + j} \\\\
+        &g = groups \\\\
+        &s = \\frac{input.size}{num\\_channels} \\\\
+        &0 \\le i < \\frac{num\\_channels}{groups} \\\\
+        &0 \\le j < s \\\\
+        &0 \\le k < groups
+
+    Parameters:
+        x (Tensor): The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C], the data type
+            of input is float32 or float64.
+        groups (int): The groups number of maxout. Every ``groups`` input channels
+            are reduced to one output channel by taking their maximum, so this must
+            be a factor of the number of channels. This argument is required.
+        axis (int, optional): The axis along which to perform maxout calculations.
+            It should be 1 when data format is NCHW, be -1 or 3 when data format
+            is NHWC. If ``axis`` < 0, it works the same way as :math:`axis + D` ,
+            where D is the dimensions of ``x`` . ``axis`` only supports 1, 3 or -1.
+            Default is 1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+
+            x = paddle.rand([1, 2, 3, 4])
+            # [[[[0.5002636  0.22272532 0.17402348 0.2874594 ]
+            #    [0.95313174 0.6228939  0.7129065  0.7087491 ]
+            #    [0.02879342 0.88725346 0.61093384 0.38833922]]
+            #   [[0.5231306  0.03807496 0.91661984 0.15602879]
+            #    [0.666127   0.616567   0.30741522 0.24044901]
+            #    [0.7142536  0.7351477  0.31588817 0.23782359]]]]
+            out = F.maxout(x, groups=2)
+            # [[[[0.5231306  0.22272532 0.91661984 0.2874594 ]
+            #    [0.95313174 0.6228939  0.7129065  0.7087491 ]
+            #    [0.7142536  0.88725346 0.61093384 0.38833922]]]]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.maxout(x, 'groups', groups, 'axis', axis)
+
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout')
+    if axis not in [1, -1, 3]:
+        raise ValueError(
+            "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received "
+            "Attr(axis): %s." % str(axis))
+    if axis == -1:
+        axis = 3
+
+    helper = LayerHelper('maxout', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='maxout',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'groups': groups,
+               'axis': axis})
+    return out
+
+
 def relu6(x, name=None):
     """
     relu6 activation
@@ -614,8 +662,6 @@ def relu6(x, name=None):
             import paddle.nn.functional as F
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-1, 0.3, 6.5]))
             out = F.relu6(x) # [0, 0.3, 6]
     """
@@ -666,8 +712,6 @@ def selu(x,
             import paddle.nn.functional as F
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]]))
             out = F.selu(x) # [[0, 1.050701],[2.101402, 3.152103]]
     """
@@ -778,7 +822,7 @@ def softmax(x, axis=-1, dtype=None, name=None):
             :math:`axis + D` . Default is -1.
         dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
             type of the output tensor. If dtype is specified, ``x`` is casted
-            to ``dtype`` before the operation is performed. This is useful for 
+            to ``dtype`` before the operation is performed. This is useful for
             preventing data type overflows. Supported dtype: float32, float64.
             If ``dtype`` is None, the output Tensor has the same dtype as x.
             Default is None.
@@ -881,8 +925,6 @@ def softplus(x, beta=1, threshold=20, name=None):
             import paddle.nn.functional as F
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
             out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355]
     """
@@ -930,8 +972,6 @@ def softshrink(x, threshold=0.5, name=None):
             import paddle.nn.functional as F
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8]))
             out = F.softshrink(x) # [-0.4, 0, 0, 0.3]
     """
@@ -978,8 +1018,6 @@ def softsign(x, name=None):
             import paddle.nn.functional as F
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
             out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
     """
@@ -994,6 +1032,47 @@ def softsign(x, name=None):
     return out
 
 
+def swish(x, name=None):
+    """
+    swish activation.
+
+    .. math::
+
+        swish(x) = \\frac{x}{1 + e^{-x}}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            x = paddle.to_tensor(np.array([-2., 0., 1.]))
+            out = F.swish(x) # [-0.238406, 0., 0.731059]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.swish(x, 'beta', 1.0)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish')
+    helper = LayerHelper('swish', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='swish',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'beta': 1.0})
+    return out
+
+
 def tanhshrink(x, name=None):
     """
     tanhshrink activation
@@ -1017,8 +1096,6 @@ def tanhshrink(x, name=None):
             import paddle.nn.functional as F
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
             out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739]
     """
@@ -1033,6 +1110,52 @@ def tanhshrink(x, name=None):
     return out
 
 
+def thresholded_relu(x, threshold=1.0, name=None):
+    """
+    thresholded relu activation.
+
+    .. math::
+
+        thresholded\\_relu(x) = \\begin{cases}
+                                 x, \\text{if } x > threshold \\\\
+                                 0, \\text{otherwise}
+                                \\end{cases}
+
+    Parameters:
+        x (Tensor): The input Tensor with data type float32, float64.
+        threshold (float, optional): The value of threshold for thresholded_relu. Default is 1.0
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            x = paddle.to_tensor(np.array([2., 0., 1.]))
+            out = F.thresholded_relu(x) # [2., 0., 0.]
+    """
+
+    if in_dygraph_mode():
+        return core.ops.thresholded_relu(x, 'threshold', threshold)
+
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'thresholded_relu')
+    helper = LayerHelper('thresholded_relu', **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='thresholded_relu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold})
+    return out
+
+
 def log_softmax(x, axis=-1, dtype=None, name=None):
     """
     This operator implements the log_softmax layer. The calculation process is
@@ -1051,13 +1174,13 @@ def log_softmax(x, axis=-1, dtype=None, name=None):
             :math:`axis + D` . Default is -1.
         dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
             type of the output tensor. If dtype is specified, ``x`` is casted
-            to ``dtype`` before the operation is performed. This is useful for 
+            to ``dtype`` before the operation is performed. This is useful for
             preventing data type overflows. Supported dtype: float32, float64.
             If ``dtype`` is None, the output Tensor has the same dtype as x.
             Default is None.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
- 
+
     Returns:
         A Tensor with the same shape and data type (use ``dtype`` if it is
         specified) as x.
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 7d2ed0cdcf83a..81c38c0be6557 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -1406,46 +1406,53 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8):
 def linear(x, weight, bias=None, name=None):
     """
 
-    Fully-connected linear transformation op
+    Fully-connected linear transformation operator. For each input :math:`X` ,
+    the equation is:
 
     .. math::
 
-        Out = {XW + b}
+        Out = XW + b
 
-    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.
+    where :math:`W` is the weight and :math:`b` is the bias.
 
-    The linear op multiplies input tensor with weight matrix and
-    produces an output Tensor of shape [N, *, output_dim], 
-    where N is batch size and `*` means any number of additional dimensions and output_dim is the last dim of ``weight``.
-    If ``bias`` is not None, a bias will be added to the output.
+    If the weight is a 2-D tensor of shape :math:`[in\\_features, out\\_features]` ,
+    input should be a multi-dimensional tensor of shape
+    :math:`[batch\\_size, *, in\\_features]` , where :math:`*` means any number of
+    additional dimensions. The linear operator multiplies input tensor with
+    weight and produces an output tensor of shape :math:`[batch\\_size, *, out\\_features]` .
+    If :math:`bias` is not None, the bias should be a 1-D tensor of shape
+    :math:`[out\\_features]` and will be added to the output.
 
-    Args:
-        x(Tensor): Input tensor, its data type is float16, float32 or float64
-        weight(Tensor): Weight tensor, its data type is float16, float32 or float64
-        bias(Tensor|None, optional): Bias tensor, its data type is float16, float32 or float64. If it is set to None, no bias will be added to the output units.
-        name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Default: None.
+    Parameters:
+        x (Tensor): Input tensor. The data type should be float16, float32 or float64.
+        weight (Tensor): Weight tensor. The data type should be float16, float32 or float64.
+        bias (Tensor, optional): Bias tensor. The data type should be float16, float32 or float64.
+                                 If it is set to None, no bias will be added to the output units.
+        name (str, optional): Normally there is no need for user to set this parameter.
+                              For detailed information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        Output tensor
+        Tensor, the shape is :math:`[batch\\_size, *, out\\_features]` and the
+        data type is the same as input :math:`x` .
 
     Examples:
         .. code-block:: python
           
-          import numpy as np
           import paddle
-          import paddle.nn.functional as F
           
-          input = np.ones((3,1,2), dtype=np.float32)
-          weight = np.ones((2,2), dtype=np.float32)
-          bias = np.ones((2), dtype=np.float32)
-          place = paddle.CPUPlace()
-          paddle.disable_static(place)
-          input = paddle.to_tensor(input)
-          weight = paddle.to_tensor(weight)
-          bias = paddle.to_tensor(bias)
-          out = F.linear(input, weight, bias)
-          print(out) #[3 3 3 3 3 3]
-    
+          x = paddle.randn((3, 2), dtype="float32")
+          # x: [[-0.32342386 -1.200079  ]
+          #     [ 0.7979031  -0.90978354]
+          #     [ 0.40597573  1.8095392 ]]
+          weight = paddle.full(shape=[2, 4], fill_value=0.5, dtype="float32", name="weight")
+          # weight: [[0.5 0.5 0.5 0.5]
+          #          [0.5 0.5 0.5 0.5]]
+          bias = paddle.ones(shape=[4], dtype="float32", name="bias")
+          # bias: [1. 1. 1. 1.]
+          y = paddle.nn.functional.linear(x, weight, bias)
+          # y: [[0.23824859 0.23824859 0.23824859 0.23824859]
+          #     [0.9440598  0.9440598  0.9440598  0.9440598 ]
+          #     [2.1077576  2.1077576  2.1077576  2.1077576 ]]
     """
     if in_dygraph_mode():
         pre_bias = _varbase_creator(dtype=x.dtype)
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index d27bac14d0a84..d085213dffc23 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -32,7 +32,6 @@
 from ...fluid.layers import rank_loss  #DEFINE_ALIAS
 from ...fluid.layers import reshape
 from ...fluid.layers import sigmoid_cross_entropy_with_logits  #DEFINE_ALIAS
-from ...fluid.layers import sigmoid_focal_loss  #DEFINE_ALIAS
 from ...fluid.layers import smooth_l1  #DEFINE_ALIAS
 from ...fluid.layers import softmax_with_cross_entropy  #DEFINE_ALIAS
 from ...fluid.layers import square_error_cost  #DEFINE_ALIAS
@@ -40,8 +39,8 @@
 from ...fluid.layers import teacher_student_sigmoid_loss  #DEFINE_ALIAS
 
 from ...fluid.layers import edit_distance  #DEFINE_ALIAS
-from ...fluid.layers import huber_loss  #DEFINE_ALIAS
 from ...fluid.layers import sampled_softmax_with_cross_entropy  #DEFINE_ALIAS
+from ...fluid.layers import huber_loss
 from ...fluid.layer_helper import LayerHelper
 from ...fluid.framework import in_dygraph_mode
 from ...fluid.framework import _varbase_creator
@@ -55,7 +54,7 @@
     'cross_entropy',
     'dice_loss',
     'edit_distance',
-    'huber_loss',
+    'hsigmoid_loss',
     'iou_similarity',
     'kl_div',
     'l1_loss',
@@ -345,6 +344,138 @@ def binary_cross_entropy_with_logits(logit,
     return out
 
 
+def hsigmoid_loss(input,
+                  label,
+                  num_classes,
+                  weight,
+                  bias=None,
+                  path_table=None,
+                  path_code=None,
+                  is_sparse=False,
+                  name=None):
+    """
+    The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
+    and speed up the model training, especially the training of language model.
+    Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier.
+    For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on
+    the path, and sum them to get a total cost.
+    Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
+    represents the number of classes or the size of word dict.
+
+    The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
+    Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_. For the custom
+    tree, you need to pass :attr:`path_table` and :attr:`path_code` , and do the following steps (take the language model as an example):
+
+    1. Use a custom word dict to build a binary tree; each leaf node should be a word in the word dict.
+    2. Create a dict that maps word_id to the path from the word to the root node; we call it path_table.
+    3. Create a dict that maps word_id to the code of the path from the word to the root node; we call it path_code.
+       Code means the label of each binary classifier: 1 indicates true, 0 indicates false.
+    4. Now each word has its path and the code along the path, and you can pass a batch of paths and codes related
+       to the same batch of inputs.
+
+    Parameters:
+        input (Tensor): A tensor with the shape [N, D], where N is the size of mini-batch,
+            and D is the feature size. Its data type supports float32 or float64.
+        label (Tensor): A tensor contains the labels of training data. Its shape is [N, 1]
+            and data type is int64.
+        num_classes (int): The number of classes or the size of word dict, must be greater than 2.
+            If the default tree is used (`path_code` and `path_table` are None), `num_classes`
+            should not be None. If the custom tree is used (`path_code` and `path_table` are not None),
+            `num_classes` should be the number of non-leaf nodes, which indicates the number of
+            classes used by the binary classifier.
+        weight (Tensor): A tensor with shape (num_classes - 1, D), with the same data type as `input`.
+        bias (Tensor, optional): A tensor with shape (num_classes - 1, 1), with the same data type as `input`.
+            If `bias` is None, no bias will be added. Default is None.
+        path_table (Tensor, optional): A tensor that stores each batch of samples' path from leaf to root
+            node, its shape is [N, L] and data type is int64, where L is the length of path. For each sample i,
+            path_table[i] is a np.array like structure and each element in this array is the indexes in parent
+            nodes' weight matrix. If `path_table` and `path_code` are None, the default tree will be used.
+            Default is None.
+        path_code (Tensor, optional): A tensor that stores each batch of samples' code of path from leaf
+            to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`.
+            Each code of path is consisted with the code of nodes from leaf to root node. If `path_table` and
+            `path_code` are None, the default tree will be used. Default is None.
+        is_sparse (bool, optional): Whether use sparse updating instead of dense updating. If `is_sparse` is True,
+            the gradient of `weight` and `input` will be sparse. Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as `input`.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+
+            paddle.set_device('cpu')
+
+            input = paddle.uniform([2, 3])
+            # [[-0.8018668   0.8736385  -0.9064771 ] # random
+            #  [-0.10228515 -0.87188244 -0.8783718 ]] # random
+            label = paddle.to_tensor([[0], [1]])
+            num_classes = 5
+            weight=paddle.uniform([num_classes-1, 3])
+            # [[-0.24148715  0.8449961  -0.7399121 ] # random
+            #  [-0.9800559   0.43509364  0.9091208 ] # random
+            #  [ 0.60194826  0.10430074 -0.4521166 ] # random
+            #  [-0.4469818  -0.01536179 -0.604454  ]] # random
+
+            out=F.hsigmoid_loss(input, label, num_classes, weight)
+            # [[3.0159328]
+            #  [2.2407534]]
+    """
+
+    if in_dygraph_mode():
+        out, _, _ = core.ops.hierarchical_sigmoid(
+            input, weight, label, path_table, path_code, bias, 'num_classes',
+            num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse)
+        return out
+
+    check_variable_and_dtype(input, 'input', ['float32', 'float64'],
+                             'hsigmoid_loss')
+    check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid_loss')
+    check_variable_and_dtype(weight, 'weight', ['float32', 'float64'],
+                             'hsigmoid_loss')
+    if bias is not None:
+        check_variable_and_dtype(bias, 'bias', ['float32', 'float64'],
+                                 'hsigmoid_loss')
+    if path_table is not None:
+        check_variable_and_dtype(path_table, 'path_table', ['int64'],
+                                 'hsigmoid_loss')
+    if path_code is not None:
+        check_variable_and_dtype(path_code, 'path_code', ['int64'],
+                                 'hsigmoid_loss')
+
+    attrs = {
+        "num_classes": num_classes,
+        "is_sparse": is_sparse,
+        "remote_prefetch": is_sparse
+    }
+
+    inputs = {
+        "X": input,
+        "W": weight,
+        "Bias": bias,
+        "PathTable": path_table,
+        "PathCode": path_code,
+        "Label": label
+    }
+
+    helper = LayerHelper('hsigmoid_loss', **locals())
+    out = helper.create_variable_for_type_inference(input.dtype)
+    pre_out = helper.create_variable_for_type_inference(input.dtype)
+    outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight}
+
+    helper.append_op(
+        type="hierarchical_sigmoid",
+        inputs=inputs,
+        outputs=outputs,
+        attrs=attrs)
+    return out
+
+
 def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
     """
     This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
@@ -801,6 +932,16 @@ def kl_div(input, label, reduction='mean', name=None):
             # shape=[5, 20]
 
     """
+    # ugly type promotion
+    if fluid.data_feeder.convert_dtype(
+            input.dtype) == 'float32' and fluid.data_feeder.convert_dtype(
+                label.dtype) == 'float64':
+        input = fluid.layers.cast(input, 'float64')
+    elif fluid.data_feeder.convert_dtype(
+            input.dtype) == 'float64' and fluid.data_feeder.convert_dtype(
+                label.dtype) == 'float32':
+        label = fluid.layers.cast(label, 'float64')
+
     if paddle.in_dynamic_mode():
         out = core.ops.kldiv_loss(input, label, 'reduction', reduction)
         return out
@@ -1142,3 +1283,165 @@ def cross_entropy(input,
         out = reshape(out, shape=out_shape)
 
     return out
+
+
+def sigmoid_focal_loss(logit,
+                       label,
+                       normalizer=None,
+                       alpha=0.25,
+                       gamma=2.0,
+                       reduction='sum',
+                       name=None):
+    """
+    `Focal Loss <https://arxiv.org/abs/1708.02002>`_ is proposed to address the
+    foreground-background class imbalance for classification tasks. It down-weights
+    easily-classified examples and thus focuses training on hard examples. For example,
+    it is used in one-stage object detection where the foreground-background class
+    imbalance is extremely high.
+
+    This operator measures focal loss function as follows: 
+
+    .. math::
+           Out = -Labels * alpha * {(1 - \\sigma(Logit))}^{gamma}\\log(\\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\\sigma(Logit)}^{gamma}\\log(1 - \\sigma(Logit))
+
+    We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\exp(-Logit)}`. 
+
+    Then, if :attr:`normalizer` is not None, this operator divides the
+    normalizer tensor on the loss `Out`:
+
+    .. math::
+           Out = \\frac{Out}{normalizer}
+
+    Finally, this operator applies reduce operation on the loss.
+    If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`.
+    If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
+    If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
+
+    Note that the target ``label`` is 0 for the negative class and is 1 for the positive class.
+
+    Args:
+        logit (Tensor): The input logit tensor. The shape is [N, *], where N is batch_size,
+            `*` means any number of additional dimensions. The ``logit`` is usually the
+            output of a convolution layer. Available dtype is float32, float64.
+        label (Tensor): The target label tensor with the same shape as
+            ``logit``. The target label whose value should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+        normalizer (Tensor, optional): The number normalizes the focal loss. It has to be
+            a 1-D Tensor whose shape is `[1, ]`. The data type is float32, float64.
+            For object detection task, it is the number of positive samples.
+            If set to None, the focal loss will not be normalized. Default is None.
+        alpha(int|float, optional): Hyper-parameter to balance the positive and negative example,
+            it should be between 0 and 1.  Default value is set to 0.25. 
+        gamma(int|float, optional): Hyper-parameter to modulate the easy and hard examples.
+            Default value is set to 2.0.
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default is ``'sum'``.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as ``logit``. The same dtype as ``logit`` tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            logit = paddle.to_tensor([[0.97, 0.91, 0.03], [0.55, 0.43, 0.71]], dtype='float32')
+            label = paddle.to_tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32')
+            one = paddle.to_tensor([1.], dtype='float32')
+            fg_label = paddle.greater_equal(label, one)
+            fg_num = paddle.reduce_sum(paddle.cast(fg_label, dtype='float32'))
+            output = paddle.nn.functional.sigmoid_focal_loss(logit, label, normalizer=fg_num)
+            print(output.numpy())  # [0.65782464]
+
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "The value of 'reduction' in sigmoid_focal_loss "
+            "should be 'sum', 'mean' or 'none', but received %s, which is not allowed."
+            % reduction)
+
+    if normalizer is not None:
+        fluid.data_feeder.check_variable_and_dtype(normalizer, 'normalizer',
+                                                   ['float32', 'float64'],
+                                                   'sigmoid_focal_loss')
+        normalizer_shape = list(normalizer.shape)
+        normalizer_dims = len(normalizer_shape)
+        if normalizer_dims > 1:
+            raise ValueError(
+                "Expected one dimension of normalizer in sigmoid_focal_loss but got {}.".
+                format(normalizer_dims))
+
+    if in_dygraph_mode():
+        one = _varbase_creator(dtype=logit.dtype)
+        core.ops.fill_constant(one, 'value',
+                               float(1.0), 'force_cpu', False, 'dtype',
+                               one.dtype, 'str_value', '1.0', 'shape',
+                               logit.shape)
+        loss = core.ops.sigmoid_cross_entropy_with_logits(logit, label)
+        pred = core.ops.sigmoid(logit)
+        p_t = core.ops.elementwise_add(
+            core.ops.elementwise_mul(pred, label),
+            core.ops.elementwise_mul(
+                core.ops.elementwise_sub(one, pred),
+                core.ops.elementwise_sub(one, label)))
+
+        alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype)
+        alpha_t = core.ops.elementwise_add(
+            core.ops.elementwise_mul(alpha, label),
+            core.ops.elementwise_mul(
+                core.ops.elementwise_sub(one, alpha),
+                core.ops.elementwise_sub(one, label)))
+        loss = core.ops.elementwise_mul(alpha_t, loss)
+
+        gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype)
+        gamma_t = core.ops.elementwise_pow(
+            core.ops.elementwise_sub(one, p_t), gamma)
+        loss = core.ops.elementwise_mul(gamma_t, loss)
+
+        if normalizer is not None:
+            loss = core.ops.elementwise_div(loss, normalizer)
+
+        if reduction == "sum":
+            return core.ops.reduce_sum(loss, 'reduce_all', True)
+        elif reduction == "mean":
+            return core.ops.mean(loss)
+
+        return loss
+
+    fluid.data_feeder.check_variable_and_dtype(
+        logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss')
+    fluid.data_feeder.check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss')
+
+    bce_name = None
+    if reduction == 'none' and normalizer is None:
+        bce_name = name
+    loss = paddle.nn.functional.binary_cross_entropy_with_logits(
+        logit, label, reduction='none', name=bce_name)
+
+    pred = fluid.layers.sigmoid(logit)
+    p_t = pred * label + (1 - pred) * (1 - label)
+
+    alpha_t = alpha * label + (1 - alpha) * (1 - label)
+    loss = paddle.multiply(alpha_t, loss)
+
+    gamma_t = paddle.pow((1 - p_t), gamma)
+    loss = paddle.multiply(gamma_t, loss)
+
+    if normalizer is not None:
+        normalizer_name = name if reduction == 'none' else None
+        loss = paddle.divide(loss, normalizer, name=normalizer_name)
+
+    if reduction == 'mean':
+        loss = paddle.mean(loss, name=name)
+    elif reduction == 'sum':
+        loss = paddle.sum(loss, name=name)
+
+    return loss
diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py
index a74a98d5ed45b..7f86e56df1b54 100644
--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -34,7 +34,6 @@
 from ...fluid.layers import generate_mask_labels  #DEFINE_ALIAS
 from ...fluid.layers import generate_proposal_labels  #DEFINE_ALIAS
 from ...fluid.layers import generate_proposals  #DEFINE_ALIAS
-from ...fluid.layers import grid_sampler  #DEFINE_ALIAS
 from ...fluid.layers import image_resize  #DEFINE_ALIAS
 from ...fluid.layers import prior_box  #DEFINE_ALIAS
 from ...fluid.layers import prroi_pool  #DEFINE_ALIAS
@@ -74,7 +73,7 @@
     'generate_mask_labels',
     'generate_proposal_labels',
     'generate_proposals',
-    'grid_sampler',
+    'grid_sample',
     'image_resize',
     'image_resize_short',
     #       'multi_box_head',
@@ -205,25 +204,35 @@ def grid_sample(x,
     data x and y is indexing the 3rd dimension (in height dimension),
     finally results is the bilinear interpolation or nearest value of 4 nearest corner
     points. The output tensor shape will be [N, C, H, W].
+
+
+    Step 1:
+
+    Get (x, y) grid coordinates and scale to [0, H-1/W-1].
+
+    .. code-block:: text
+
+        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+
+    Step 2:
+    
+    Index input data X with grid (x, y) in each [H, W] area, and compute each
+    output value either by bilinear interpolation of the 4 nearest points or by
+    taking the value of the single nearest point.
+
     .. code-block:: text
-        Step 1:
-        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
-        .. code-block:: text
-            grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
-            grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
-        Step 2:
-        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
-        interpolate point value by 4 nearest points or nearest interpolate point value
-        by nearest point.
-          wn ------- y_n ------- en
-          |           |           |
-          |          d_n          |
-          |           |           |
-         x_w --d_w-- grid--d_e-- x_e
-          |           |           |
-          |          d_s          |
-          |           |           |
-          ws ------- y_s ------- wn
+
+        wn ------- y_n ------- en
+        |           |           |
+        |          d_n          |
+        |           |           |
+        x_w --d_w-- grid--d_e-- x_e
+        |           |           |
+        |          d_s          |
+        |           |           |
+        ws ------- y_s ------- es
+
         For bilinear interpolation:
         x_w = floor(x)              // west side x coord
         x_e = x_w + 1               // east side x coord
@@ -237,8 +246,10 @@ def grid_sample(x,
         en = X[:, :, y_n, x_e]      // north-east point value
         ws = X[:, :, y_s, x_w]      // south-east point value
         es = X[:, :, y_s, x_w]      // north-east point value
+
         output = wn * d_e * d_s + en * d_w * d_s
-               + ws * d_e * d_n + es * d_w * d_n
+                + ws * d_e * d_n + es * d_w * d_n
+
     Args:
         x(Tensor): The input tensor, which is a 4-d tensor with shape
                      [N, C, H, W], N is the batch size, C is the channel
@@ -262,7 +273,9 @@ def grid_sample(x,
         Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.
 
     Examples:
+
         .. code-block:: python
+        
             import paddle
             import paddle.nn.functional as F
             import numpy as np
@@ -287,7 +300,7 @@ def grid_sample(x,
                             [ 0.7,  0.4],
                             [ 0.2,  0.8]]]]).astype("float64")
             
-            paddle.disable_static()
+            
             x = paddle.to_tensor(x)
             grid = paddle.to_tensor(grid)
             y_t = F.grid_sample(
@@ -304,13 +317,10 @@ def grid_sample(x,
             #    [ 0.596  0.38   0.52   0.24 ]]]]
     """
     helper = LayerHelper("grid_sample", **locals())
-    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler')
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sample')
     check_variable_and_dtype(grid, 'grid', ['float32', 'float64'],
-                             'grid_sampler')
-    if not isinstance(x, Variable):
-        raise ValueError("The x should be a Variable")
-    if not isinstance(grid, Variable):
-        raise ValueError("The grid should be a Variable")
+                             'grid_sample')
+
     _modes = ['bilinear', 'nearest']
     _padding_modes = ['zeros', 'reflection', 'border']
     if mode not in _modes:
diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py
index 489f324868a3e..db0f5dbff2b80 100644
--- a/python/paddle/nn/initializer/__init__.py
+++ b/python/paddle/nn/initializer/__init__.py
@@ -13,21 +13,23 @@
 # limitations under the License.
 
 # TODO: define the initializers to create a Parameter in neural network
-
 from ...fluid.initializer import Bilinear  #DEFINE_ALIAS
-from ...fluid.initializer import Constant  #DEFINE_ALIAS
 from ...fluid.initializer import MSRA  #DEFINE_ALIAS
 from ...fluid.initializer import Normal  #DEFINE_ALIAS
 from ...fluid.initializer import TruncatedNormal  #DEFINE_ALIAS
 from ...fluid.initializer import Uniform  #DEFINE_ALIAS
 from ...fluid.initializer import Xavier  #DEFINE_ALIAS
 
+from . import constant
+from .constant import Constant  #DEFINE_ALIAS
+
 __all__ = [
     'Bilinear',
-    'Constant',
     'MSRA',
     'Normal',
     'TruncatedNormal',
     'Uniform',
     'Xavier',
 ]
+
+__all__ += constant.__all__
diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py
new file mode 100644
index 0000000000000..6d21ddae0d16b
--- /dev/null
+++ b/python/paddle/nn/initializer/constant.py
@@ -0,0 +1,46 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO: define the initializers of Constant in neural network
+from ...fluid.initializer import ConstantInitializer
+
+__all__ = ['Constant']
+
+
+class Constant(ConstantInitializer):
+    """Initializer that sets a parameter to a constant value.
+
+    Args:
+        value (float, optional): Constant value used to initialize the parameter. Default is 0.0.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn as nn
+
+            data = paddle.rand([30, 10, 2], dtype='float32')
+            linear = nn.Linear(2,
+                               4,
+                               weight_attr=nn.initializer.Constant(value=2.0))
+            res = linear(data)
+            print(linear.weight.numpy())
+            # result is [[2. 2. 2. 2.],[2. 2. 2. 2.]]
+
+    """
+
+    def __init__(self, value=0.0):
+        if value is None:  # None is rejected up front with a ValueError
+            raise ValueError("value must not be none.")
+        super(Constant, self).__init__(value=value, force_cpu=False)  # force_cpu is fixed to False
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index 760af09f1f2f5..3a5bcaa21fe5b 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -41,7 +41,6 @@
 from .activation import Sigmoid  #DEFINE_ALIAS
 # from .activation import Softmax        #DEFINE_ALIAS
 from .activation import LogSoftmax  #DEFINE_ALIAS
-from .activation import HSigmoid  #DEFINE_ALIAS
 from .common import BilinearTensorProduct  #DEFINE_ALIAS
 from .common import Bilinear  #DEFINE_ALIAS
 from .common import Pool2D  #DEFINE_ALIAS
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index 585d369c607e5..dbb9d00f365cf 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -18,6 +18,7 @@
     'ELU',
     'GELU',
     'Hardshrink',
+    'Hardswish',
     'Tanh',
     'Hardtanh',
     'PReLU',
@@ -26,14 +27,17 @@
     'SELU',
     'LeakyReLU',
     'Sigmoid',
+    'Hardsigmoid',
     'Softmax',
     'Softplus',
     'Softshrink',
     'Softsign',
+    'Swish',
     'Tanhshrink',
+    'ThresholdedReLU',
     'LogSigmoid',
     'LogSoftmax',
-    'HSigmoid',
+    'Maxout',
 ]
 
 from ...fluid.dygraph import layers
@@ -50,18 +54,18 @@ class ELU(layers.Layer):
     ELU Activation.
 
     .. math::
-    
+
         ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1))
 
     Parameters:
         alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Shape:
         - input: Tensor with any shape.
         - output: Tensor with the same shape as input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -106,11 +110,11 @@ class GELU(layers.Layer):
         approximate (bool, optional): Wether to enable approximation. Default is False.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Shape:
         - input: Tensor with any shape.
         - output: Tensor with the same shape as input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -120,7 +124,7 @@ class GELU(layers.Layer):
             paddle.disable_static()
 
             x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]]))
-            
+
             m = paddle.nn.GELU()
             out = m(x) # [-0.158655 0.345731 0.841345 1.39979]
 
@@ -184,6 +188,52 @@ def forward(self, x):
         return F.hardshrink(x, self._threshold, self._name)
 
 
+class Hardswish(layers.Layer):
+    """
+    Hardswish Activation.
+
+    Hardswish was proposed in MobileNetV3 and is described as more computationally
+    stable and efficient than the swish function. For more details, please refer
+    to: https://arxiv.org/pdf/1905.02244.pdf
+
+    .. math::
+
+        Hardswish(x)=
+            \\left\\{
+            \\begin{aligned}
+            &0, & & \\text{if } x \\leq -3 \\\\
+            &x, & & \\text{if } x \\geq 3 \\\\
+            &\\frac{x(x+3)}{6}, & & \\text{otherwise}
+            \\end{aligned}
+            \\right.
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.to_tensor([-4., 5., 1.])
+            m = paddle.nn.Hardswish()
+            out = m(x) # [0., 5., 0.666667]
+    """
+
+    def __init__(self, name=None):
+        super(Hardswish, self).__init__()
+        self._name = name  # kept only so forward() can pass it to F.hardswish
+
+    def forward(self, x):
+        return F.hardswish(x, self._name)  # delegates entirely to the functional API
+
+
 class Tanh(layers.Layer):
     """
     Tanh Activation.
@@ -240,11 +290,11 @@ class Hardtanh(layers.Layer):
         max (float, optional): The value of max for Hardtanh. Default is 1.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Shape:
         - input: Tensor with any shape.
         - output: Tensor with the same shape as input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -268,142 +318,6 @@ def forward(self, x):
         return F.hardtanh(x, self._min, self._max, self._name)
 
 
-class HSigmoid(layers.Layer):
-    """
-	:alias_main: paddle.nn.HSigmoid
-	:alias: paddle.nn.HSigmoid,paddle.nn.layer.HSigmoid,paddle.nn.layer.activation.HSigmoid
-
-    Hierarchical Sigmoid Layer.
-    
-    The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
-    and speed up the model training, especially the training of language model.
-    Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier.
-    For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on
-    the path, and sum them to get a total cost.
-    Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
-    represents the number of classes or the size of word dict.
-
-    The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
-    Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>_`. For the custom
-    tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example):
-
-    1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict.
-    2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table.
-    3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code.
-       Code means the label of each binary classifier, 1 indicate true, 0 indicate false.
-    4. Now, each word should has its path and code along the path, you can pass a batch of path and code related
-       to the same batch of inputs.
-
-    Parameters:
-        feature_size (int): The feature size.
-        num_classes (int): The number of classes or the size of word dict, must be greater than 2.
-            If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes`
-            should not be None. If the custom tree is used (:attr:`is_custom` is set to True),
-            :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of
-            classes using by the binary classifier.
-        param_attr (ParamAttr, optional): The parameter attribute for the learnable parameters/weights
-            of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a
-            ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is
-            initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it
-            is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr,
-            hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not
-            set, the bias is initialized zero. Default: None.
-        is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and 
-            `path_code` should be passed to its forward method, otherwise `path_table` and `path_code`
-            should not be passed to its forward method. Default: False.
-        is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the
-            gradient of W and input will be sparse. Default: False.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-          from paddle import fluid, nn
-          import paddle.fluid.dygraph as dg
-          import paddle.nn.functional as F
-          import numpy as np
-
-          main = fluid.Program()
-          start = fluid.Program()
-          feature_size = 6
-          num_classes = 8
-          with fluid.unique_name.guard():
-              with fluid.program_guard(main, start):
-                  x = fluid.data("input", [-1, feature_size],
-                              dtype="float32")
-                  label = fluid.data("labels", [-1, 1], dtype="int64")
-                  hsm = nn.HSigmoid(feature_size, num_classes)
-                  y = hsm(x, label)
-
-          place = fluid.CPUPlace()
-          exe = fluid.Executor(place)
-          exe.run(start)
-          feed_dict = {
-              "input": np.random.randn(4, feature_size).astype(np.float32),
-              "labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64),
-          }
-          y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
-          print(y_np.shape)
-
-          # (4, 1)
-    """
-
-    def __init__(self,
-                 feature_size,
-                 num_classes,
-                 param_attr=None,
-                 bias_attr=None,
-                 is_custom=False,
-                 is_sparse=False,
-                 dtype="float32"):
-        super(HSigmoid, self).__init__()
-        if (num_classes < 2) and (not is_custom):
-            raise ValueError(
-                "num_classes must not be less than 2 with default tree")
-
-        if (not is_custom) and (is_sparse):
-            print("Sparse mode should not be used without custom tree")
-            is_sparse = False
-
-        self._feature_size = feature_size
-        self._num_classes = num_classes
-        self._is_custom = is_custom
-        self._is_sparse = is_sparse
-
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-
-        self._dtype = dtype
-
-        remote_prefetch = is_sparse
-        print("With sparse mode, if your models has only"
-              " small parameter prefetch may cause speed down")
-
-        C = self._num_classes if is_custom else self._num_classes - 1
-        self.weight = self.create_parameter(
-            [C, self._feature_size],
-            attr=self._param_attr,
-            is_bias=False,
-            dtype=self._dtype)
-        self.bias = self.create_parameter(
-            [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype)
-
-    def forward(self, input, label, path_table=None, path_code=None):
-        out = F.hsigmoid(
-            input,
-            label,
-            self.weight,
-            self.bias,
-            self._num_classes,
-            path_table=path_table,
-            path_code=path_code,
-            is_sparse=self._is_sparse)
-        return out
-
-
 class PReLU(layers.Layer):
     """
     PReLU Activation.
@@ -414,19 +328,19 @@ class PReLU(layers.Layer):
 
     Parameters:
         num_parameters (int, optional): Number of `weight` to learn. The supported values are:
-            1 - a single parameter `alpha` is used for all input channels; 
+            1 - a single parameter `alpha` is used for all input channels;
             Number of channels - a seperate `alpha` is used for each input channel.
             Default is 1.
         init (float, optional): Init value of learnable `weight`. Default is 0.25.
-        weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. 
+        weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`.
             Default is None. For more information, please refer to :ref:`api_fluid_ParamAttr`.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Shape:
         - input: Tensor with any shape. Default dtype is float32.
         - output: Tensor with the same shape as input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -487,7 +401,7 @@ class ReLU(layers.Layer):
     Shape:
         - input: Tensor with any shape.
         - output: Tensor with the same shape as input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -531,8 +445,6 @@ class ReLU6(layers.Layer):
             import paddle
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-1, 0.3, 6.5]))
             m = paddle.nn.ReLU6()
             out = m(x) # [0, 0.3, 6]
@@ -574,8 +486,6 @@ class SELU(layers.Layer):
             import paddle
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]]))
             m = paddle.nn.SELU()
             out = m(x) # [[0, 1.050701],[2.101402, 3.152103]]
@@ -613,11 +523,11 @@ class LeakyReLU(layers.Layer):
             :math:`x < 0` . Default is 0.01.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Shape:
         - input: Tensor with any shape.
         - output: Tensor with the same shape as input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -643,11 +553,11 @@ def forward(self, x):
 class Sigmoid(layers.Layer):
     """
     this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calcluate the `sigmoid` of input x.
-    
+
     .. math::
 
         Sigmoid(x) = \frac{1}{1 + e^{-x}}
-    
+
     Parameters:
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
@@ -656,7 +566,7 @@ class Sigmoid(layers.Layer):
 
     Returns:
         A callable object of Sigmoid.
-    
+
     Examples:
 
         .. code-block:: python
@@ -680,6 +590,53 @@ def forward(self, x):
         return F.sigmoid(x, self.name)
 
 
+class Hardsigmoid(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``Hardsigmoid`` class.
+    This layer calculates the `hardsigmoid` of input x.
+
+    A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
+    which is much faster than sigmoid.
+
+    .. math::
+
+        Hardsigmoid(x)=
+            \\left\\{
+            \\begin{aligned}
+            &0, & & \\text{if } x \\leq -3 \\\\
+            &1, & & \\text{if } x \\geq 3 \\\\
+            &x/6 + 1/2, & & \\text{otherwise}
+            \\end{aligned}
+            \\right.
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        x: N-D tensor, available dtype is float32, float64.
+
+    Returns:
+        A callable object of Hardsigmoid.
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+
+          m = paddle.nn.Hardsigmoid()
+          x = paddle.to_tensor([-4., 5., 1.])
+          out = m(x) # [0., 1., 0.666667]
+    """
+
+    def __init__(self, name=None):
+        super(Hardsigmoid, self).__init__()
+        self.name = name  # note: stored as `name`, not `_name`; forward() reads this attribute
+
+    def forward(self, x):
+        return F.hardsigmoid(x, self.name)  # delegates entirely to the functional API
+
+
 class Softplus(layers.Layer):
     """
     Softplus Activation
@@ -705,8 +662,6 @@ class Softplus(layers.Layer):
             import paddle
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
             m = paddle.nn.Softplus()
             out = m(x) # [0.513015, 0.598139, 0.744397, 0.854355]
@@ -749,8 +704,6 @@ class Softshrink(layers.Layer):
             import paddle
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8]))
             m = paddle.nn.Softshrink()
             out = m(x) # [-0.4, 0, 0, 0.3]
@@ -787,8 +740,6 @@ class Softsign(layers.Layer):
             import paddle
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
             m = paddle.nn.Softsign()
             out = m(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
@@ -802,6 +753,41 @@ def forward(self, x):
         return F.softsign(x, self._name)
 
 
+class Swish(layers.Layer):
+    """
+    Swish Activation.
+
+    .. math::
+
+        Swish(x) = \\frac{x}{1 + e^{-x}}
+
+    Parameters:
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            x = paddle.to_tensor(np.array([-2., 0., 1.]))
+            m = paddle.nn.Swish()
+            out = m(x) # [-0.238406, 0., 0.731059]
+    """
+
+    def __init__(self, name=None):
+        super(Swish, self).__init__()
+        self._name = name  # kept only so forward() can pass it to F.swish
+
+    def forward(self, x):
+        return F.swish(x, self._name)  # delegates entirely to the functional API
+
+
 class Tanhshrink(layers.Layer):
     """
     Tanhshrink Activation
@@ -824,8 +810,6 @@ class Tanhshrink(layers.Layer):
             import paddle
             import numpy as np
 
-            paddle.disable_static()
-
             x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
             m = paddle.nn.Tanhshrink()
             out = m(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739]
@@ -839,10 +823,50 @@ def forward(self, x):
         return F.tanhshrink(x, self._name)
 
 
+class ThresholdedReLU(layers.Layer):
+    """
+    Thresholded ReLU Activation.
+
+    .. math::
+
+        ThresholdedReLU(x) = \\begin{cases}
+                               x, \\text{if } x > threshold \\\\
+                               0, \\text{otherwise}
+                              \\end{cases}
+
+    Parameters:
+        threshold (float, optional): The value of threshold for ThresholdedReLU. Default is 1.0.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: Tensor with any shape.
+        - output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            x = paddle.to_tensor(np.array([2., 0., 1.]))
+            m = paddle.nn.ThresholdedReLU()
+            out = m(x) # [2., 0., 0.]
+    """
+
+    def __init__(self, threshold=1.0, name=None):
+        super(ThresholdedReLU, self).__init__()
+        self._threshold = threshold  # passed unchanged to F.thresholded_relu on every call
+        self._name = name
+
+    def forward(self, x):
+        return F.thresholded_relu(x, self._threshold, self._name)  # delegates to the functional API
+
+
 class LogSigmoid(layers.Layer):
     """
     LogSigmoid Activation.
-    
+
     .. math::
 
         LogSigmoid(x) = log \\frac{1}{1 + e^{-x}}
@@ -851,11 +875,11 @@ class LogSigmoid(layers.Layer):
         x (Tensor): The input Tensor with data type float32, or float64.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Shape:
         - input: Tensor with any shape.
         - output: Tensor with the same shape as input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -961,7 +985,7 @@ class Softmax(layers.Layer):
             :math:`axis + D` . Default is -1.
         dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data
             type of the output tensor. If dtype is specified, ``x`` is casted
-            to ``dtype`` before the operation is performed. This is useful for 
+            to ``dtype`` before the operation is performed. This is useful for
             preventing data type overflows. Supported dtype: float32, float64.
             If ``dtype`` is None, the output Tensor has the same dtype as x.
             Default is None.
@@ -1013,7 +1037,7 @@ class LogSoftmax(layers.Layer):
 
     .. math::
 
-        Out[i, j] = log(softmax(x)) 
+        Out[i, j] = log(softmax(x))
                   = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])})
 
     Parameters:
@@ -1023,7 +1047,7 @@ class LogSoftmax(layers.Layer):
             same way as :math:`axis + D` . Default is -1.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
- 
+
     Shape:
         - input: Tensor with any shape.
         - output: Tensor with the same shape as input.
@@ -1060,3 +1084,64 @@ def __init__(self, axis=-1, name=None):
 
     def forward(self, x):
         return F.log_softmax(x, self._axis)
+
+
+class Maxout(layers.Layer):
+    """
+    Maxout Activation.
+
+    Assume the input shape is (N, Ci, H, W).
+    The output shape is (N, Co, H, W).
+    Then Co = Ci/groups and the operator formula is as follows:
+
+    .. math::
+
+        &out_{si+j} = \\max_{k} x_{gsi + sk + j} \\\\
+        &g = groups \\\\
+        &s = \\frac{input.size}{num\\_channels} \\\\
+        &0 \\le i < \\frac{num\\_channels}{groups} \\\\
+        &0 \\le j < s \\\\
+        &0 \\le k < groups
+
+    Parameters:
+        groups (int): The groups number of maxout. The output has ``Ci / groups``
+            channels, so this must be a factor of the number of input channels.
+            Required: the constructor defines no default for it.
+        axis (int, optional): The axis along which to perform maxout calculations.
+            It should be 1 when data format is NCHW, be -1 or 3 when data format
+            is NHWC. If ``axis`` < 0, it works the same way as :math:`axis + D` ,
+            where D is the dimensions of ``x`` . Default is 1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - input: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.rand([1, 2, 3, 4])
+            # [[[[0.5002636  0.22272532 0.17402348 0.2874594 ]
+            #    [0.95313174 0.6228939  0.7129065  0.7087491 ]
+            #    [0.02879342 0.88725346 0.61093384 0.38833922]]
+            #   [[0.5231306  0.03807496 0.91661984 0.15602879]
+            #    [0.666127   0.616567   0.30741522 0.24044901]
+            #    [0.7142536  0.7351477  0.31588817 0.23782359]]]]
+            m = paddle.nn.Maxout(groups=2)
+            out = m(x)
+            # [[[[0.5231306  0.22272532 0.91661984 0.2874594 ]
+            #    [0.95313174 0.6228939  0.7129065  0.7087491 ]
+            #    [0.7142536  0.88725346 0.61093384 0.38833922]]]]
+    """
+
+    def __init__(self, groups, axis=1, name=None):
+        super(Maxout, self).__init__()
+        self._groups = groups  # required positional argument; no default is supplied
+        self._axis = axis
+        self._name = name
+
+    def forward(self, x):
+        return F.maxout(x, self._groups, self._axis, self._name)  # delegates to the functional API
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 433443fee1765..05cbd96863c28 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -15,7 +15,6 @@
 # TODO: define the common classes to build a neural network
 from ...fluid.dygraph import BilinearTensorProduct  #DEFINE_ALIAS
 from ...fluid.dygraph import Pool2D  #DEFINE_ALIAS
-from ...fluid.dygraph import Linear  #DEFINE_ALIAS
 from ...fluid.dygraph import Flatten  #DEFINE_ALIAS
 from ...fluid.dygraph import layers
 from .. import functional as F
@@ -50,56 +49,74 @@
 
 class Linear(layers.Layer):
     """
-    
-    Fully-connected linear transformation layer:
+
+    Fully-connected linear transformation layer. For each input :math:`X` ,
+    the equation is:
 
     .. math::
 
-        Out = {XW + b}
+        Out = XW + b
 
-    where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively.
+    where :math:`W` is the weight and :math:`b` is the bias.
 
-    Linear layer takes only one ``Tensor`` input.
-    The Linear layer multiplies input tensor with weight matrix and
-    produces an output Tensor of shape [N, *, `output_dim`],
-    where N is batch size and `*` means any number of additional dimensions.
-    If ``bias_attr`` is not None, a bias variable will be created and added to the output.
+    Linear layer takes only one multi-dimensional tensor as input with the
+    shape :math:`[batch\_size, *, in\_features]` , where :math:`*` means any
+    number of additional dimensions. It multiplies input tensor with the weight
+    (a 2-D tensor of shape :math:`[in\_features, out\_features]` ) and produces
+    an output tensor of shape :math:`[batch\_size, *, out\_features]` .
+    If :math:`bias\_attr` is not False, the bias (a 1-D tensor of
+    shape :math:`[out\_features]` ) will be created and added to the output.
 
     Parameters:
-        in_features(int): The number of input units in this layer.
-        out_features(int): The number of output units in this layer.
-        weight_attr(ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
-            weights(Parameter) of this layer. Default: None.
-        bias_attr(ParamAttr or list of ParamAttr, optional): The attribute for the bias
-            of this layer. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-        name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. Default: None.
-
-    Attributes:
-        **weight** (Parameter): the learnable weights of this layer.
+        in_features (int): The number of input units.
+        out_features (int): The number of output units.
+        weight_attr (ParamAttr, optional): The attribute for the learnable
+            weight of this layer. The default value is None and the weight will be
+            initialized to zero. For detailed information, please refer to
+            paddle.ParamAttr.
+        bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias
+            of this layer. If it is set to False, no bias will be added to the output.
+            If it is set to None or one kind of ParamAttr, a bias parameter will
+            be created according to ParamAttr. For detailed information, please refer
+            to paddle.ParamAttr. The default value is None and the bias will be
+            initialized to zero.
+        name (str, optional): Normally there is no need for user to set this parameter.
+            For detailed information, please refer to :ref:`api_guide_Name` .
 
-        **bias** (Parameter or None): the learnable bias of this layer.
+    Attribute:
+        **weight** (Parameter): the learnable weight of this layer.
 
-    Returns:
-        None
+        **bias** (Parameter): the learnable bias of this layer.
+
+    Shape:
+        - input: Multi-dimensional tensor with shape :math:`[batch\_size, *, in\_features]` .
+        - output: Multi-dimensional tensor with shape :math:`[batch\_size, *, out\_features]` .
 
     Examples:
         .. code-block:: python
 
           import paddle
-          from paddle import nn
-          import numpy as np
-
-          data = np.ones((3,1,2), np.float32)
-          place = paddle.CPUPlace()
-          paddle.disable_static(place)
-          data = paddle.to_tensor(data)
-          weight_attr=paddle.framework.ParamAttr(name="linear_weight", learning_rate=1.0,
-          trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
-          bias_attr=paddle.framework.ParamAttr(name="linear_bias", learning_rate=1.0,
-          trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
-          linear = nn.Linear(2,2,weight_attr=weight_attr, bias_attr=bias_attr)
-          res = linear(data)  # [3 3 3 3 3 3]
+
+          # Define the linear layer.
+          weight_attr = paddle.ParamAttr(
+              name="weight",
+              initializer=paddle.nn.initializer.Constant(value=0.5))
+          bias_attr = paddle.ParamAttr(
+              name="bias",
+              initializer=paddle.nn.initializer.Constant(value=1.0))
+          linear = paddle.nn.Linear(2, 4, weight_attr=weight_attr, bias_attr=bias_attr)
+          # linear.weight: [[0.5 0.5 0.5 0.5]
+          #                 [0.5 0.5 0.5 0.5]]
+          # linear.bias: [1. 1. 1. 1.]
+
+          x = paddle.randn((3, 2), dtype="float32")
+          # x: [[-0.32342386 -1.200079  ]
+          #     [ 0.7979031  -0.90978354]
+          #     [ 0.40597573  1.8095392 ]]
+          y = linear(x)
+          # y: [[0.23824859 0.23824859 0.23824859 0.23824859]
+          #     [0.9440598  0.9440598  0.9440598  0.9440598 ]
+          #     [2.1077576  2.1077576  2.1077576  2.1077576 ]]
     """
 
     def __init__(self,
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 98048bb7e64cf..5ce4baca55749 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -23,6 +23,7 @@
 __all__ = [
     'BCEWithLogitsLoss',
     'CrossEntropyLoss',
+    'HSigmoidLoss',
     'MSELoss',
     'L1Loss',
     'NLLLoss',
@@ -251,6 +252,128 @@ def forward(self, input, label):
             reduction=self.reduction)
 
 
+class HSigmoidLoss(fluid.dygraph.Layer):
+    """
+    Hierarchical Sigmoid Layer.
+
+    The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
+    and speed up the model training, especially the training of language models.
+    Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier.
+    For each class(word), there's a unique path from root to itself; hsigmoid calculates the cost for each non-leaf node on
+    the path, and sums them to get a total cost.
+    Compared to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
+    represents the number of classes or the size of word dict.
+
+    The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
+    Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_ . For the custom
+    tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example):
+
+    1. Using a custom word dict to build a binary tree, each leaf node should be a word in the word dict.
+    2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table.
+    3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code.
+       Code means the label of each binary classifier, 1 indicates true, 0 indicates false.
+    4. Now, each word should have its path and code along the path, you can pass a batch of path and code related
+       to the same batch of inputs.
+
+    Parameters:
+        feature_size (int): The number of features.
+        num_classes (int): The number of classes or the size of word dict. With the default tree it must not be less than 2.
+            If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes`
+            should not be None. If the custom tree is used (:attr:`is_custom` is set to True),
+            :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of
+            classes used by the binary classifier.
+        weight_attr (ParamAttr, optional): The parameter attribute for the learnable weights
+            of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a
+            ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is
+            initialized with Xavier. Default is None.
+        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it
+            is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr,
+            hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not
+            set, the bias is initialized to zero. Default is None.
+        is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and
+            `path_code` should be passed to its forward method, otherwise `path_table` and `path_code`
+            should not be passed to its forward method. Default is False.
+        is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True,
+            the gradient of weight and input will be sparse. It is reset to False when :attr:`is_custom` is False. Default is False.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        input (Tensor): The input tensor. The shape is [N, D], where N is batch size and D is feature size. Its data type should be float32 or float64.
+        label (Tensor): Its shape is [N, 1]. Its data type should be int64.
+        output (Tensor): The HSigmoid Loss of ``input`` and ``label``. Shape is [N, 1].
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.set_device('cpu')
+
+            input = paddle.uniform([2, 3])
+            # [[-0.2820413   0.9528898  -0.81638825] # random
+            #  [-0.6733154  -0.33866507  0.25770962]] # random
+            label = paddle.to_tensor([0, 1])
+            m = paddle.nn.HSigmoidLoss(3, 5)
+            out = m(input, label)
+            # [[2.4543471] # random
+            #  [1.9359267]] # random
+    """
+
+    def __init__(self,
+                 feature_size,
+                 num_classes,
+                 weight_attr=None,
+                 bias_attr=None,
+                 is_custom=False,
+                 is_sparse=False,
+                 name=None):
+        super(HSigmoidLoss, self).__init__()
+        if (num_classes < 2) and (not is_custom):
+            raise ValueError(
+                "num_classes must not be less than 2 with default tree")
+        # Sparse updating is only honoured together with a custom tree.
+        if (not is_custom) and (is_sparse):
+            print("Sparse mode should not be used without custom tree")
+            is_sparse = False
+
+        self._feature_size = feature_size
+        self._num_classes = num_classes
+        self._is_custom = is_custom
+        self._is_sparse = is_sparse
+
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+
+        self._name = name
+        self._dtype = paddle.get_default_dtype()
+
+        if is_sparse:
+            print("With sparse mode, if your models has only"
+                  " small parameter prefetch may cause speed down")
+        # Default tree: num_classes - 1 rows; custom tree: num_classes rows (documented as the non-leaf node count).
+        num_nodes = self._num_classes if is_custom else self._num_classes - 1
+        self.weight = self.create_parameter(
+            [num_nodes, self._feature_size],
+            attr=self._weight_attr,
+            is_bias=False,
+            dtype=self._dtype)
+        self.bias = self.create_parameter(
+            [num_nodes, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype)
+
+    def forward(self, input, label, path_table=None, path_code=None):
+        """Compute the hierarchical sigmoid loss of ``input`` and ``label``."""
+        return F.hsigmoid_loss(
+            input,
+            label,
+            self._num_classes,
+            self.weight,
+            self.bias,
+            path_table=path_table,
+            path_code=path_code,
+            is_sparse=self._is_sparse,
+            name=self._name)
+
+
 class MSELoss(fluid.dygraph.layers.Layer):
     """
     **Mean Square Error Loss**
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index e6df5366d216c..ea4f6970bc686 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -644,7 +644,7 @@ class TransformerDecoderLayer(Layer):
             `weight_attr` to create parameters. Default: None, which means the
             default weight parameter property is used. See usage for details
             in :ref:`api_fluid_ParamAttr` . 
-        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
             If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
             self attention, `bias_attr[1]` would be used as `bias_attr` for
             cross attention, and `bias_attr[2]` would be used as `bias_attr`
@@ -982,12 +982,12 @@ class Transformer(Layer):
     applies another layer normalization on the output of last encoder/decoder layer.
 
     Parameters:
-        d_model (int): The expected feature size in the encoder/decoder input
-            and output.
-        nhead (int): The number of heads in multi-head attention(MHA).
-        num_encoder_layers (int): The number of layers in encoder.
-        num_encoder_layers (int): The number of layers in decoder.
-        dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
+        d_model (int, optional): The expected feature size in the encoder/decoder input
+            and output. Default 512
+        nhead (int, optional): The number of heads in multi-head attention(MHA). Default 8
+        num_encoder_layers (int, optional): The number of layers in encoder. Default 6
+        num_decoder_layers (int, optional): The number of layers in decoder. Default 6
+        dim_feedforward (int, optional): The hidden layer size in the feedforward network(FFN). Default 2048
         dropout (float, optional): The dropout probability used in pre-process
             and post-precess of MHA and FFN sub-layer. Default 0.1
         activation (str, optional): The activation function in the feedforward
@@ -1015,7 +1015,7 @@ class Transformer(Layer):
             Default: None, which means the default weight parameter property is used. 
             See usage for details
             in :code:`ParamAttr` . 
-        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
             If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, 
             `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` 
             would be used as `bias_attr` for cross attention of `TransformerDecoder`, 
@@ -1028,9 +1028,9 @@ class Transformer(Layer):
             The `False` value means the corresponding layer would not have trainable 
             bias parameter. See usage for details in :code:`ParamAttr` . 
             Default: None,which means the default bias parameter property is used.
-        custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
+        custom_encoder (Layer, optional): If custom encoder is provided, use it as the encoder.
             Default None
-        custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
+        custom_decoder (Layer, optional): If custom decoder is provided, use it as the decoder.
             Default None
 
     Examples:
diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py
index 30de88cc29e76..6f485e2e9d62f 100644
--- a/python/paddle/optimizer/__init__.py
+++ b/python/paddle/optimizer/__init__.py
@@ -15,19 +15,17 @@
 __all__ = [
     'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
     'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd',
-    'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'LookaheadOptimizer',
-    'ModelAverage', 'Momentum', 'MomentumOptimizer', 'RMSProp', 'SGD',
-    'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR', 'PiecewiseLR',
-    'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', 'LinearLrWarmup',
-    'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', 'ReduceLROnPlateau',
-    'CosineAnnealingLR'
+    'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'Momentum', 'MomentumOptimizer',
+    'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR',
+    'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
+    'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
+    'ReduceLROnPlateau', 'CosineAnnealingLR'
 ]
 
 
 from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\
             AdagradOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, \
-            FtrlOptimizer, AdadeltaOptimizer, ModelAverage, \
-            LookaheadOptimizer
+            FtrlOptimizer, AdadeltaOptimizer
 
 from .optimizer import Optimizer
 from .adam import Adam
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 9cbb45ce60d14..366d8b953e3d4 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -29,7 +29,7 @@ class Adam(Optimizer):
     of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
     it can dynamically adjusts the learning rate of each parameter using
     the 1st moment estimates and the 2nd moment estimates of the gradient.
-    
+
     The parameter ``param_out`` update rule with gradient ``grad``:
 
     .. math::
@@ -68,13 +68,10 @@ class Adam(Optimizer):
 	    the regularization setting here in optimizer will be ignored for this parameter. \
 	    Otherwise, the regularization setting here in optimizer will take effect. \
 	    Default None, meaning there is no regularization.
-        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
-            some derived class of ``GradientClipBase`` . There are three cliping strategies 
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        name (str, optional): Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
         lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
             The accumulators are updated at every step. Every element of the two moving-average
             is updated in both dense mode and sparse mode. If the size of parameter is very large,
@@ -82,17 +79,17 @@ class Adam(Optimizer):
             gradient in current mini-batch, so it will be much more faster. But this mode has
             different semantics with the original Adam algorithm and may lead to different result.
             The default value is False.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
 
     Examples:
         .. code-block:: python
 
             import paddle
-            import numpy as np
 
-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)
             adam = paddle.optimizer.Adam(learning_rate=0.1,
@@ -105,12 +102,9 @@ class Adam(Optimizer):
 
             # Adam with beta1/beta2 as Tensor and weight_decay as float
             import paddle
-            import numpy as np
 
-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)
 
@@ -140,8 +134,8 @@ def __init__(self,
                  parameters=None,
                  weight_decay=None,
                  grad_clip=None,
-                 name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -258,7 +252,7 @@ def _append_optimize_op(self, block, param_and_grad):
     def step(self):
         """
         Execute the optimizer and update parameters once.
-        
+
         Returns:
             None
 
@@ -266,13 +260,11 @@ def step(self):
             .. code-block:: python
 
                 import paddle
-                import numpy as np
-                paddle.disable_static()
-                value = np.arange(26).reshape(2, 13).astype("float32")
-                a = paddle.to_tensor(value)
+                
+                a = paddle.rand([2,13], dtype="float32")
                 linear = paddle.nn.Linear(13, 5)
                 # This can be any optimizer supported by dygraph.
-                adam = paddle.optimizer.Adam(learning_rate = 0.01, 
+                adam = paddle.optimizer.Adam(learning_rate = 0.01,
                                             parameters = linear.parameters())
                 out = linear(a)
                 out.backward()
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 0b04f03eb14da..00c197a58b3dd 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -23,7 +23,7 @@
 
 class AdamW(Adam):
     """
-    The AdamW optimizer is implemented based on the AdamW Optimization 
+    The AdamW optimizer is implemented based on the AdamW Optimization
     in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
     it can resolves the problem of L2 regularization failure in the Adam optimizer.
 
@@ -32,7 +32,7 @@ class AdamW(Adam):
         t & = t + 1
 
         moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
-        
+
         moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
 
         learning\_rate & = learning\_rate * \\
@@ -57,16 +57,13 @@ class AdamW(Adam):
             The default value is 1e-08.
         weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
         apply_decay_param_fun (function|None, optional): If it is not None,
-            only tensors that makes apply_decay_param_fun(Tensor)==True 
+            only tensors that makes apply_decay_param_fun(Tensor)==True
             will be updated. It only works when we want to specify tensors.
             Default: None.
-        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
-            some derived class of ``GradientClipBase`` . There are three cliping strategies 
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        name (str, optional): Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
         lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
             The accumulators are updated at every step. Every element of the two moving-average
             is updated in both dense mode and sparse mode. If the size of parameter is very large,
@@ -74,18 +71,18 @@ class AdamW(Adam):
             gradient in current mini-batch, so it will be much more faster. But this mode has
             different semantics with the original Adam algorithm and may lead to different result.
             The default value is False.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
     **Notes**:
         **Currently, AdamW doesn't support sparse parameter optimization.**
 
     Examples:
         .. code-block:: python
             import paddle
-            import numpy as np
 
-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)
 
@@ -112,8 +109,8 @@ def __init__(self,
                  weight_decay=0.01,
                  apply_decay_param_fun=None,
                  grad_clip=None,
-                 name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
index 2609972d85ccd..5e17ca34ff218 100644
--- a/python/paddle/optimizer/rmsprop.py
+++ b/python/paddle/optimizer/rmsprop.py
@@ -90,9 +90,9 @@ class RMSProp(Optimizer):
 	    the regularization setting here in optimizer will be ignored for this parameter. \
 	    Otherwise, the regularization setting here in optimizer will take effect. \
 	    Default None, meaning there is no regularization.
-        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
-            some derived class of ``GradientClipBase`` . There are three cliping strategies 
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
@@ -104,24 +104,18 @@ class RMSProp(Optimizer):
           .. code-block:: python
 
             import paddle
-            import numpy as np
 
-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            inp = paddle.rand([10,10], dtype="float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
             out = linear(inp)
             loss = paddle.mean(out)
 
-            beta1 = paddle.to_tensor([0.9], dtype="float32")
-            beta2 = paddle.to_tensor([0.99], dtype="float32")
-
-            adam = paddle.optimizer.RMSProp(learning_rate=0.1,
-                    parameters=linear.parameters(),
-                    weight_decay=0.01)
+            rmsprop = paddle.optimizer.RMSProp(learning_rate=0.1,
+                                               parameters=linear.parameters(),
+                                               weight_decay=0.01)
             out.backward()
-            adam.step()
-            adam.clear_grad()
+            rmsprop.step()
+            rmsprop.clear_grad()
 
     """
 
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index 0f65083dc52e7..909a1b6f39503 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -23,6 +23,7 @@
 ]
 
 from . import nn
+from ..fluid import Scope  #DEFINE_ALIAS
 from .input import data  #DEFINE_ALIAS
 from .input import InputSpec  #DEFINE_ALIAS
 from ..fluid.executor import Executor  #DEFINE_ALIAS
@@ -50,3 +51,5 @@
 from ..fluid.io import load_inference_model  #DEFINE_ALIAS
 from ..fluid.io import load_program_state  #DEFINE_ALIAS
 from ..fluid.io import set_program_state  #DEFINE_ALIAS
+from ..fluid.layers import create_parameter  #DEFINE_ALIAS
+from ..fluid.layers import create_global_var  #DEFINE_ALIAS
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
index 510e11312f4ce..3ae65e879f723 100644
--- a/python/paddle/static/nn/__init__.py
+++ b/python/paddle/static/nn/__init__.py
@@ -25,20 +25,22 @@
     'create_parameter',
     'crf_decoding',
     'data_norm',
-    'deformable_conv',
+    'deform_conv2d',
     'group_norm',
-    'hsigmoid',
     'instance_norm',
     'layer_norm',
     'multi_box_head',
     'nce',
     'prelu',
+    'py_func',
     'row_conv',
     'spectral_norm',
     'switch_case',
 ]
 
-from ...fluid.layers import fc  #DEFINE_ALIAS
+from .common import fc  #DEFINE_ALIAS
+from .common import deform_conv2d  #DEFINE_ALIAS
+
 from ...fluid.layers import batch_norm  #DEFINE_ALIAS
 from ...fluid.layers import bilinear_tensor_product  #DEFINE_ALIAS
 from ...fluid.layers import case  #DEFINE_ALIAS
@@ -49,14 +51,13 @@
 from ...fluid.layers import create_parameter  #DEFINE_ALIAS
 from ...fluid.layers import crf_decoding  #DEFINE_ALIAS
 from ...fluid.layers import data_norm  #DEFINE_ALIAS
-from ...fluid.layers import deformable_conv  #DEFINE_ALIAS
 from ...fluid.layers import group_norm  #DEFINE_ALIAS
-from ...fluid.layers import hsigmoid  #DEFINE_ALIAS
 from ...fluid.layers import instance_norm  #DEFINE_ALIAS
 from ...fluid.layers import layer_norm  #DEFINE_ALIAS
 from ...fluid.layers import multi_box_head  #DEFINE_ALIAS
 from ...fluid.layers import nce  #DEFINE_ALIAS
 from ...fluid.layers import prelu  #DEFINE_ALIAS
+from ...fluid.layers import py_func  #DEFINE_ALIAS
 from ...fluid.layers import row_conv  #DEFINE_ALIAS
 from ...fluid.layers import spectral_norm  #DEFINE_ALIAS
 from ...fluid.layers import switch_case  #DEFINE_ALIAS
diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py
new file mode 100644
index 0000000000000..93a603f4770a7
--- /dev/null
+++ b/python/paddle/static/nn/common.py
@@ -0,0 +1,342 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.fluid.framework import static_only
+
+__all__ = ['fc', 'deform_conv2d']
+
+
+@static_only
+def fc(x,
+       size,
+       num_flatten_dims=1,
+       weight_attr=None,
+       bias_attr=None,
+       activation=None,
+       name=None):
+    """
+
+    Fully-Connected layer can take a tensor or a list of tensors as its inputs.
+    It creates a 2-D weight tensor for each input tensor, which represents its
+    weight matrix from each input unit to each output unit. The fully connected
+    layer multiplies each input tensor with its corresponding weight to produce
+    an output tensor with shape :math:`[batch\\_size, *, size]` , where :math:`*`
+    means any number of additional dimensions. If a list of tensors is given,
+    the results of multiple output tensors with shape :math:`[batch\\_size, *, size]`
+    will be summed up. If :attr:`bias_attr` is not False, a 1-D bias tensor will
+    be created and added to the output. Finally, if :attr:`activation` is not None,
+    it will be applied to the output as well.
+
+    For a single input tensor :math:`X` , the equation is:
+
+    .. math::
+
+        Out = Act({XW + b})
+
+    For a list of input tensors, the equation is:
+
+    .. math::
+
+        Out = Act({\\sum_{i=0}^{N-1}X_iW_i + b})
+
+    where:
+
+    * :math:`N`: The number of the input tensors. :math:`N` equals :math:`len(X)` if :math:`X` is a list of tensors.
+    * :math:`X_i`: The i-th input tensor.
+    * :math:`W_i`: The i-th weight matrix corresponding to the i-th input tensor.
+    * :math:`b`: The bias created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output tensor.
+
+    .. code-block:: text
+
+        # Case 1, input is a single tensor:
+        x.data = [[[0.1, 0.2],
+                   [0.3, 0.4]]]
+        x.shape = (1, 2, 2) # 1 is batch_size
+
+        out = paddle.static.nn.fc(x=x, size=1, num_flatten_dims=2)
+
+        # Get the output:
+        out.data = [[[0.83234344], [0.34936576]]]
+        out.shape = (1, 2, 1)
+
+        # Case 2, input is a list of tensors:
+        x0.data = [[[0.1, 0.2],
+                    [0.3, 0.4]]]
+        x0.shape = (1, 2, 2) # 1 is batch_size
+
+        x1.data = [[[0.1, 0.2, 0.3]]]
+        x1.shape = (1, 1, 3)
+
+        out = paddle.static.nn.fc(x=[x0, x1], size=2)
+
+        # Get the output:
+        out.data = [[0.18669507, 0.1893476]]
+        out.shape = (1, 2)
+
+    Args:
+        x (Tensor|list of Tensor): A tensor or a list of tensors. The number of dimensions
+            of each tensor is at least 2. The data type should be float16, float32 or float64.
+        size (int): The number of output units in this layer, which also means the feature
+            size of output tensor.
+        num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multi-dimensional tensor will first be flattened
+            into a 2-D matrix. The parameter :attr:`num_flatten_dims` determines how the input
+            tensor is flattened: the first :math:`num\\_flatten\\_dims` (inclusive, index starts from 1)
+            dimensions will be flattened to form the first dimension of the final matrix (height of
+            the matrix), and the rest :math:`rank(x) - num\\_flatten\\_dims` dimensions are
+            flattened to form the second dimension of the final matrix (width of the matrix).
+            For example, assuming that :attr:`x` is a 5-dimensional tensor with a shape
+            :math:`[2, 3, 4, 5, 6]` , and :attr:`num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape :math:`[2 * 3 * 4, 5 * 6] = [24, 30]` .
+            Default: 1.
+        weight_attr (ParamAttr, optional): The attribute for the learnable weight.
+            The default value is None, and the weight will be initialized to zero.
+            For detailed information, please refer to :attr:`paddle.ParamAttr`.
+        bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias.
+            If it is set to False, no bias will be added to the output.
+            If it is set to None or one kind of ParamAttr, a bias parameter will
+            be created according to ParamAttr. For detailed information, please refer
+            to :attr:`paddle.ParamAttr`. The default value is None and the bias will be
+            initialized to zero.
+        activation (str, optional): Activation to be applied to the output of
+            this layer, such as tanh, softmax, sigmoid, relu. For more information,
+            please refer to :ref:`api_guide_activations_en` . Default: None.
+        name (str, optional): The default value is None. Normally there is no need for user to set
+            it. For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Tensor, its shape is :math:`[batch\\_size, *, size]` , and the data type is the same as the input.
+
+    Raises:
+        ValueError: If the number of dimensions of the input tensor is less than 2.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          paddle.enable_static()
+
+          # When input is a single tensor
+          x = paddle.static.data(name="x", shape=[1, 2, 2], dtype="float32")
+          # x: [[[0.1 0.2]
+          #      [0.3 0.4]]]
+          out = paddle.static.nn.fc(
+              x=x,
+              size=1,
+              num_flatten_dims=2,
+              weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=0.5)),
+              bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0)))
+          # out: [[[1.15]
+          #        [1.35]]]
+
+          # When input is multiple tensors
+          x0 = paddle.static.data(name="x0", shape=[1, 2, 2], dtype="float32")
+          # x0: [[[0.1 0.2]
+          #       [0.3 0.4]]]
+          x1 = paddle.static.data(name="x1", shape=[1, 1, 3], dtype="float32")
+          # x1: [[[0.1 0.2 0.3]]]
+          out = paddle.static.nn.fc(
+              x=[x0, x1],
+              size=2,
+              weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=0.5)),
+              bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0)))
+          # out: [[1.8 1.8]]
+    """
+    return paddle.fluid.layers.fc(input=x,  # fluid's keyword for ``x``
+                                  size=size,
+                                  num_flatten_dims=num_flatten_dims,
+                                  param_attr=weight_attr,  # fluid's keyword for ``weight_attr``
+                                  bias_attr=bias_attr,
+                                  act=activation,  # fluid's keyword for ``activation``
+                                  name=name)
+
+
+@static_only
+def deform_conv2d(x,
+                  offset,
+                  mask,
+                  num_filters,
+                  filter_size,
+                  stride=1,
+                  padding=0,
+                  dilation=1,
+                  groups=1,
+                  deformable_groups=1,
+                  im2col_step=1,
+                  weight_attr=None,
+                  bias_attr=None,
+                  name=None):
+    """
+
+    Compute 2-D deformable convolution on 4-D input.
+    Given input image x, output feature map y, the deformable convolution operation can be expressed as follows:
+
+
+    Deformable Convolution v2:
+
+    .. math::
+
+        y(p) = \\sum_{k=1}^{K}{w_k * x(p + p_k + \\Delta p_k) * \\Delta m_k}
+
+    Deformable Convolution v1:
+
+    .. math::
+
+        y(p) = \\sum_{k=1}^{K}{w_k * x(p + p_k + \\Delta p_k)}
+
+    Where :math:`\\Delta p_k` and :math:`\\Delta m_k` are the learnable offset and modulation scalar for the k-th location,
+    where :math:`\\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results
+    <https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.
+
+    Example:
+        - Input:
+
+          X shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+
+          Offset shape: :math:`(N, 2 * deformable\\_groups * H_f * W_f, H_{in}, W_{in})`
+
+          Mask shape: :math:`(N, deformable\\_groups * H_f * W_f, H_{in}, W_{in})`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        x (Tensor): The input image with [N, C, H, W] format. A Tensor with type
+            float32, float64.
+        offset (Tensor): The input coordinate offset of deformable convolution layer.
+            A Tensor with type float32, float64.
+        mask (Tensor|None): The input mask of deformable convolution layer.
+            A Tensor with type float32, float64. It must be passed explicitly;
+            pass None to use deformable convolution v1.
+        num_filters(int): The number of filters. It is the same as the output
+            image channel.
+        filter_size (int|tuple): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the deformable conv layer. According to
+            grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1.
+        deformable_groups (int): The number of deformable group partitions.
+            Default: deformable_groups = 1.
+        im2col_step (int): Maximum number of images per im2col computation;
+            The total batch size should be divisible by this value or smaller
+            than this value; if you face out of memory problem, you can try
+            to use a smaller value here.
+            Default: im2col_step = 1.
+        weight_attr (ParamAttr, Optional): The parameter attribute for learnable parameters/weights
+            of deformable conv. If it is set to None or one attribute of ParamAttr,
+            deformable conv will create ParamAttr as weight_attr.
+            If the Initializer of the weight_attr is not set, the parameter is
+            initialized with :math:`Normal(0.0, std)`, and the
+            :math:`std` is :math:`(\\frac{2.0 }{filter\\_elem\\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr|bool, Optional): The parameter attribute for the bias of
+            deformable conv layer. If it is set to False, no bias will be added
+            to the output units. If it is set to None or one attribute of ParamAttr, deformable conv
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        name(str, Optional): For details, please refer to :ref:`api_guide_Name`.
+                        Generally, no setting is required. Default: None.
+    Returns:
+        Tensor: The tensor storing the deformable convolution \
+                  result. A Tensor with type float32, float64.
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+    Examples:
+        .. code-block:: python
+
+          #deformable conv v2:
+
+          import paddle
+          paddle.enable_static()
+
+          C_in, H_in, W_in = 3, 32, 32
+          filter_size, deformable_groups = 3, 1
+          data = paddle.static.data(name='data', shape=[None, C_in, H_in, W_in], dtype='float32')
+          offset = paddle.static.data(name='offset', shape=[None, 2*deformable_groups*filter_size**2, H_in, W_in], dtype='float32')
+          mask = paddle.static.data(name='mask', shape=[None, deformable_groups*filter_size**2, H_in, W_in], dtype='float32')
+          out = paddle.static.nn.deform_conv2d(x=data, offset=offset, mask=mask,
+                                             num_filters=2, filter_size=filter_size, padding=1)
+
+          #deformable conv v1:
+
+          import paddle
+          paddle.enable_static()
+
+          C_in, H_in, W_in = 3, 32, 32
+          filter_size, deformable_groups = 3, 1
+          data = paddle.static.data(name='data', shape=[None, C_in, H_in, W_in], dtype='float32')
+          offset = paddle.static.data(name='offset', shape=[None, 2*deformable_groups*filter_size**2, H_in, W_in], dtype='float32')
+          out = paddle.static.nn.deform_conv2d(x=data, offset=offset, mask=None,
+                                             num_filters=2, filter_size=filter_size, padding=1)
+    """
+
+    if mask is None:  # no mask -> deformable conv v1 (modulated=False)
+        return paddle.fluid.layers.deformable_conv(
+            input=x,
+            offset=offset,
+            mask=mask,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            deformable_groups=deformable_groups,
+            im2col_step=im2col_step,
+            param_attr=weight_attr,
+            bias_attr=bias_attr,
+            modulated=False,
+            name=name)
+    else:  # mask given -> deformable conv v2 (modulated=True)
+        return paddle.fluid.layers.deformable_conv(
+            input=x,
+            offset=offset,
+            mask=mask,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            deformable_groups=deformable_groups,
+            im2col_step=im2col_step,
+            param_attr=weight_attr,
+            bias_attr=bias_attr,
+            modulated=True,
+            name=name)
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index c41c9226d16b4..2745464995f5d 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -687,27 +687,24 @@ def t(input, name=None):
 
 def cross(x, y, axis=None, name=None):
     """
-	:alias_main: paddle.cross
-	:alias: paddle.cross,paddle.tensor.cross,paddle.tensor.linalg.cross
-
     Computes the cross product between two tensors along an axis.
+    
     Inputs must have the same shape, and the length of their axes should be equal to 3.
     If `axis` is not given, it defaults to the first axis found with the length 3.
     
     Args:
-        x (Variable): The first input tensor variable.
-        y (Variable): The second input tensor variable.
+        x (Tensor): The first input tensor.
+        y (Tensor): The second input tensor.
         axis (int, optional): The axis along which to compute the cross product. It defaults to the first axis found with the length 3.
-        name (str, optional): The default value is None.  Normally there is no need for
-            user to set this property.  For more information, please refer to :ref:`api_guide_Name`
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: A Tensor with same data type as `x`.
+        Tensor. A Tensor with same data type as `x`.
         
     Examples:
         .. code-block:: python
+
             import paddle
-            paddle.disable_static()
 
             x = paddle.to_tensor([[1.0, 1.0, 1.0],
                                   [2.0, 2.0, 2.0],
@@ -715,14 +712,13 @@ def cross(x, y, axis=None, name=None):
             y = paddle.to_tensor([[1.0, 1.0, 1.0],
                                   [1.0, 1.0, 1.0],
                                   [1.0, 1.0, 1.0]])
+
             z1 = paddle.cross(x, y)
-            print(z1.numpy())
             # [[-1. -1. -1.]
             #  [ 2.  2.  2.]
             #  [-1. -1. -1.]]
 
             z2 = paddle.cross(x, y, axis=1)
-            print(z2.numpy())
             # [[0. 0. 0.]
             #  [0. 0. 0.]
             #  [0. 0. 0.]]
@@ -848,6 +844,10 @@ def bmm(x, y, name=None):
         raise ValueError(
             "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}".
             format(x_shape, y_shape))
+    if x_shape[0] != y_shape[0]:
+        raise ValueError(
+            "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}".
+            format(x_shape, y_shape))
     helper = LayerHelper('bmm', **locals())
     if in_dygraph_mode():
         return core.ops.bmm(x, y)
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 86bf9b31f9a9e..531629c573fb6 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -25,7 +25,6 @@
 # TODO: define functions to manipulate a tensor  
 from ..fluid.layers import cast  #DEFINE_ALIAS
 from ..fluid.layers import slice  #DEFINE_ALIAS
-from ..fluid.layers import strided_slice  #DEFINE_ALIAS
 from ..fluid.layers import transpose  #DEFINE_ALIAS
 from ..fluid.layers import unstack  #DEFINE_ALIAS
 
@@ -1461,3 +1460,89 @@ def gather_nd(x, index, name=None):
     """
 
     return paddle.fluid.layers.gather_nd(input=x, index=index, name=name)
+
+
+def strided_slice(x, axes, starts, ends, strides, name=None):
+    """
+    This operator produces a slice of ``x`` along multiple axes. Similar to numpy:
+    https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
+    Slice uses ``axes``, ``starts`` and ``ends`` attributes to specify the start and
+    end dimension for each axis in the list of axes and Slice uses this information
+    to slice the input data tensor. If a negative value is passed to
+    ``starts`` or ``ends`` such as :math:`-i`,  it represents the reverse position of the
+    axis :math:`i-1` th(here 0 is the initial position). The ``strides`` represents steps of
+    slicing and if the ``strides`` is negative, slice operation is in the opposite direction.
+    If the value passed to ``starts`` or ``ends`` is greater than n
+    (the number of elements in this dimension), it represents n.
+    For slicing to the end of a dimension with unknown size, it is recommended
+    to pass in INT_MAX. The size of ``axes`` must be equal to ``starts`` , ``ends`` and ``strides``.
+    Following examples will explain how strided_slice works:
+
+    .. code-block:: text
+
+        Case1:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [1, 0]
+                ends = [2, 3]
+                strides = [1, 1]
+            Then:
+                result = [ [5, 6, 7], ]
+
+        Case2:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [0, 1]
+                ends = [2, 0]
+                strides = [1, -1]
+            Then:
+                result = [ [8, 7, 6], ]
+        Case3:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [0, 1]
+                ends = [-1, 1000]
+                strides = [1, 3]
+            Then:
+                result = [ [2], ]
+    Args:
+        x (Tensor): An N-D ``Tensor``. The data type is ``float32``, ``float64``, ``int32`` or ``int64``.
+        axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to.
+                            It must have the same length as ``starts``, ``ends`` and ``strides``.
+        starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``starts`` is a Tensor, it should be a 1-D Tensor. It represents starting indices of corresponding axis in ``axes``.
+        ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of
+                it should be integers or Tensors with shape [1]. If ``ends`` is a Tensor, it should be a 1-D Tensor. It represents ending indices of corresponding axis in ``axes``.
+        strides (list|tuple|Tensor): The data type is ``int32`` . If ``strides`` is a list or tuple, the elements of
+                it should be integers or Tensors with shape [1]. If ``strides`` is a Tensor, it should be a 1-D Tensor. It represents slice step of corresponding axis in ``axes``.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
+                        For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Tensor:  A ``Tensor`` with the same dimension as ``x``. The data type is same as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x = paddle.zeros(shape=[3,4,5,6], dtype="float32")
+            # example 1:
+            # attr starts is a list which doesn't contain Tensor.
+            axes = [1, 2, 3]
+            starts = [-3, 0, 2]
+            ends = [3, 2, 4]
+            strides_1 = [1, 1, 1]
+            strides_2 = [1, 1, 2]
+            sliced_1 = paddle.strided_slice(x, axes=axes, starts=starts, ends=ends, strides=strides_1)
+            # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1].
+            # example 2:
+            # attr starts is a list which contains a Tensor.
+            minus_3 = paddle.fill_constant([1], "int32", -3)
+            sliced_2 = paddle.strided_slice(x, axes=axes, starts=[minus_3, 0, 2], ends=ends, strides=strides_2)
+            # sliced_2 is x[:, 1:3:1, 0:2:1, 2:4:2].
+    """
+
+    return paddle.fluid.layers.strided_slice(  # NOTE(review): ``name`` is accepted but not forwarded here
+        input=x, axes=axes, starts=starts, ends=ends, strides=strides)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 51dc771281393..138841fcf074b 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -472,15 +472,27 @@ def multiply(x, y, axis=-1, name=None):
     """
     op_type = 'elementwise_mul'
     act = None
+
     if x.dtype != y.dtype:
         raise TypeError(
             'Input tensors must be same type, but received type of x: %s, type of y: %s '
             % (x.dtype, y.dtype))
 
     if in_dygraph_mode():
+        if not isinstance(x, (paddle.Tensor)):
+            x = paddle.to_tensor(x)
+        if not isinstance(y, (paddle.Tensor)):
+            y = paddle.to_tensor(y)
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
 
+    if not isinstance(x, (paddle.Tensor, Variable)):
+        x = paddle.static.data(
+            name='x', shape=x.shape, dtype=x.dtype)
+    if not isinstance(y, (paddle.Tensor, Variable)):
+        y = paddle.static.data(
+            name='y', shape=y.shape, dtype=y.dtype)
+
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
 def maximum(x, y, axis=-1, name=None):
@@ -999,7 +1011,7 @@ def logsumexp(x, axis=None, keepdim=False, name=None):
     This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` .
 
     .. math::
-       logsumexp(x) = \log\sum exp(x)
+       logsumexp(x) = \\log\\sum exp(x)
 
     Args:
         x (Tensor): The input Tensor with data type float32, float64.
@@ -1030,8 +1042,6 @@ def logsumexp(x, axis=None, keepdim=False, name=None):
 
         import paddle
 
-        paddle.disable_static()
-
         x = paddle.to_tensor([[-1.5, 0., 2.], [3., 1.2, -2.4]])
         out1 = paddle.logsumexp(x) # [3.4691226]
         out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602]
@@ -1624,39 +1634,37 @@ def kron(x, y, name=None):
 
 def cumsum(x, axis=None, dtype=None, name=None):
     """
-    The cumulative sum of the elements along a given axis. The first element of the result is the same of the first element of the input. 
+    The cumulative sum of the elements along a given axis. 
+    
+    **Note**:
+    The first element of the result is the same as the first element of the input. 
 
     Args:
-        x (Tensor): Input of cumsum operator, the Tensor needed to be cumsumed. 
+        x (Tensor): The input tensor needed to be cumsumed.
         axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array.
         dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. 
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Tensor, the result of cumsum operator, output of cumsum operator. 
+        Tensor, the result of cumsum operator. 
 
     Examples:
         .. code-block:: python
             
             import paddle
-            import numpy as np
-
-            paddle.disable_static()
-            data_np = np.arange(12).reshape(3, 4)
-            data = paddle.to_tensor(data_np)
+            
+            data = paddle.arange(12)
+            data = paddle.reshape(data, (3, 4))
 
             y = paddle.cumsum(data)
-            print(y.numpy())
             # [ 0  1  3  6 10 15 21 28 36 45 55 66]
 
             y = paddle.cumsum(data, axis=0)
-            print(y.numpy())
             # [[ 0  1  2  3]
             #  [ 4  6  8 10]
             #  [12 15 18 21]]
             
             y = paddle.cumsum(data, axis=-1)
-            print(y.numpy())
             # [[ 0  1  3  6]
             #  [ 4  9 15 22]
             #  [ 8 17 27 38]]
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index f55d285586f0e..19d8fc58b0e7e 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -339,11 +339,8 @@ def index_select(x, index, axis=0, name=None):
     return out
 
 
-def nonzero(input, as_tuple=False):
+def nonzero(x, as_tuple=False):
     """
-	:alias_main: paddle.nonzero
-	:alias: paddle.nonzero,paddle.tensor.nonzero,paddle.tensor.search.nonzero
-
     Return a tensor containing the indices of all non-zero elements of the `input` 
     tensor. If as_tuple is True, return a tuple of 1-D tensors, one for each dimension 
     in `input`, each containing the indices (in that dimension) of all non-zero elements 
@@ -353,17 +350,17 @@ def nonzero(input, as_tuple=False):
     a 1-D tensor tuple of length `n`, and the shape of each 1-D tensor is [z, 1].
 
     Args:
-        inputs (Variable): The input tensor variable.
+        x (Tensor): The input tensor variable.
         as_tuple (bool): Return type, Tensor or tuple of Tensor.
 
     Returns:
-        Variable. The data type is int64.
+        Tensor. The data type is int64.
 
     Examples:
+    
         .. code-block:: python
-            import paddle
 
-            paddle.disable_static()
+            import paddle
 
             x1 = paddle.to_tensor([[1.0, 0.0, 0.0],
                           [0.0, 2.0, 0.0],
@@ -402,13 +399,13 @@ def nonzero(input, as_tuple=False):
             #[]                    
     """
     list_out = []
-    shape = input.shape
+    shape = x.shape
     rank = len(shape)
 
     if in_dygraph_mode():
-        outs = core.ops.where_index(input)
+        outs = core.ops.where_index(x)
     else:
-        outs = layers.where(input)
+        outs = layers.where(x)
 
     if not as_tuple:
         return outs
diff --git a/python/paddle/tests/test_callbacks.py b/python/paddle/tests/test_callbacks.py
index f0d9a132b90eb..b9442c46b8fd4 100644
--- a/python/paddle/tests/test_callbacks.py
+++ b/python/paddle/tests/test_callbacks.py
@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
 import unittest
 import time
 import random
 import tempfile
 import shutil
+import paddle
 
 from paddle import Model
 from paddle.static import InputSpec
@@ -102,6 +104,32 @@ def test_callback_verbose_2(self):
         self.verbose = 2
         self.run_callback()
 
+    def test_visualdl_callback(self):
+        # Skip on Python 2 (presumably visualdl only supports Python 3).
+        if sys.version_info < (3, ):
+            return
+
+        inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')]
+        labels = [InputSpec([None, 1], 'int64', 'label')]
+
+        train_dataset = paddle.vision.datasets.MNIST(mode='train')
+        eval_dataset = paddle.vision.datasets.MNIST(mode='test')
+
+        net = paddle.vision.LeNet()
+        model = paddle.Model(net, inputs, labels)
+
+        optim = paddle.optimizer.Adam(0.001, parameters=net.parameters())
+        model.prepare(
+            optimizer=optim,
+            loss=paddle.nn.CrossEntropyLoss(),
+            metrics=paddle.metric.Accuracy())
+
+        callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir')
+        model.fit(train_dataset,
+                  eval_dataset,
+                  batch_size=64,
+                  callbacks=callback)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index 96c4483a35ba8..56105b6d7f15a 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -501,6 +501,11 @@ def test_summary_nlp(self):
         rnn = paddle.nn.LSTM(16, 32, 2)
         paddle.summary(rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))])
 
+    def test_summary_dtype(self):
+        input_shape = (3, 1)
+        net = paddle.nn.Embedding(10, 3, sparse=True)
+        paddle.summary(net, input_shape, dtypes='int64')  # smoke test: no assertions, only checks the call succeeds
+
     def test_summary_error(self):
         with self.assertRaises(TypeError):
             nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
@@ -551,9 +556,10 @@ def test_export_deploy_model(self):
                 shutil.rmtree(save_dir)
             paddle.enable_static()
 
-    def test_dygraph_export_deploy_model_without_inputs(self):
+    def test_dygraph_export_deploy_model_about_inputs(self):
         mnist_data = MnistDataset(mode='train')
         paddle.disable_static()
+        # without inputs
         for initial in ["fit", "train_batch", "eval_batch", "test_batch"]:
             save_dir = tempfile.mkdtemp()
             if not os.path.exists(save_dir):
@@ -579,6 +585,18 @@ def test_dygraph_export_deploy_model_without_inputs(self):
 
             model.save(save_dir, training=False)
             shutil.rmtree(save_dir)
+        # with inputs, and the type of inputs is InputSpec
+        save_dir = tempfile.mkdtemp()
+        if not os.path.exists(save_dir):
+            os.makedirs(save_dir)
+        net = LeNet()
+        inputs = InputSpec([None, 1, 28, 28], 'float32', 'x')
+        model = Model(net, inputs)
+        optim = fluid.optimizer.Adam(
+            learning_rate=0.001, parameter_list=model.parameters())
+        model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
+        model.save(save_dir, training=False)
+        shutil.rmtree(save_dir)
 
 
 class TestRaiseError(unittest.TestCase):
diff --git a/python/setup.py.in b/python/setup.py.in
index 414258a3b3756..f09c189a68e1c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -192,6 +192,7 @@ packages=['paddle',
           'paddle.fluid.incubate.fleet.parameter_server.ir',
           'paddle.fluid.incubate.fleet.collective',
           'paddle.fluid.incubate.fleet.utils',
+          'paddle.amp',
           'paddle.hapi',
           'paddle.vision',
           'paddle.vision.models',
diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt
index 56c8be862f887..b61ba138441c9 100644
--- a/python/unittest_py/requirements.txt
+++ b/python/unittest_py/requirements.txt
@@ -2,3 +2,5 @@ PyGithub
 coverage
 pycrypto ; platform_system != "Windows"
 mock
+opencv-python<=4.2.0.32
+visualdl ; python_version>="3.5"
diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu
index 9fe58885fa553..9b5602d4943ad 100644
--- a/tools/dockerfile/Dockerfile.ubuntu
+++ b/tools/dockerfile/Dockerfile.ubuntu
@@ -156,19 +156,14 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
 
 RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
-    pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip --no-cache-dir install opencv-python==4.2.0.32
+    pip --no-cache-dir install 'ipykernel==4.6.0'
 
 #For docstring checker
 RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
diff --git a/tools/dockerfile/ubuntu16_dev.sh b/tools/dockerfile/ubuntu16_dev.sh
index e7827b6598eeb..212e9acfea541 100755
--- a/tools/dockerfile/ubuntu16_dev.sh
+++ b/tools/dockerfile/ubuntu16_dev.sh
@@ -28,11 +28,13 @@ function ref_whl(){
     ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
   else
     ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
     ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
   fi
   
   if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then
@@ -40,11 +42,13 @@ function ref_whl(){
     ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
   else
     ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
     ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
   fi
 }
 
@@ -55,6 +59,7 @@ function install_whl(){
   sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle3_whl} && pip3.5 install ${ref_paddle3_whl} && rm  -f ${ref_paddle3_whl}" Dockerfile.tmp
   sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle36_whl} && pip3.6 install ${ref_paddle36_whl} && rm -f ${ref_paddle36_whl}" Dockerfile.tmp
   sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle37_whl} && pip3.7 install ${ref_paddle37_whl} && rm -f ${ref_paddle37_whl}" Dockerfile.tmp
+  sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle38_whl} && pip3.8 install ${ref_paddle38_whl} && rm -f ${ref_paddle38_whl}" Dockerfile.tmp
 }
 
 function install_gcc(){
diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh
index a1881f551da1c..81eb19dc0661e 100755
--- a/tools/get_cpu_info.sh
+++ b/tools/get_cpu_info.sh
@@ -36,7 +36,7 @@ if [ $numa_nodes -lt $sockets ]; then
 fi
 
 echo "********** Software Information **********"
-echo "OS Version             : `cat /proc/version`"
+echo "OS Version             : `uname -o`"
 echo "Kernel Release Version : `uname -r`"
 echo "Kernel Patch Version   : `uname -v`"
 echo "GCC Version            :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
index c27fdcea2401c..55c30579fb91e 100644
--- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
+++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
@@ -174,16 +174,12 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
 
 RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0  && \
     pip3 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0  && \
-    pip3 --no-cache-dir install opencv-python  && \
     pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0  && \
     pip3.6 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0  && \
-    pip3.6 --no-cache-dir install opencv-python  && \
     pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0  && \
     pip3.7 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0  && \
-    pip3.7 --no-cache-dir install opencv-python  && \
     pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0  && \
-    pip --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0  && \
-    pip --no-cache-dir install  opencv-python
+    pip --no-cache-dir install ipykernel==4.6.0 
 
 #For docstring checker
 RUN pip3 --no-cache-dir install pylint pytest astroid isort 
diff --git a/tools/wlist.json b/tools/wlist.json
index 3ca14cd1dd6f9..22bab658464cb 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -279,7 +279,6 @@
         "thresholded_relu",
         "group_norm",
         "random_crop",
-        "py_func",
         "row_conv",
         "hard_shrink",
         "ssd_loss",