diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d8112837dc962..c8af603d53363 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -48,5 +48,13 @@ repos:
         name: copyright_checker
         entry: python ./tools/codestyle/copyright.hook
         language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$
         exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
+-   repo: local
+    hooks:
+    -   id: shellcheck
+        name: shellcheck
+        entry: shellcheck
+        language: system
+        files: .sh$
+        exclude: (paddle_build.sh|fast_install.sh|check_file_diff_approvals.sh)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fa87cc14f2668..06d687fc9c4f3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -240,6 +240,12 @@ if(WITH_AMD_GPU)
   include(hip)
 endif(WITH_AMD_GPU)
 
+if(WITH_DISTRIBUTE)
+  if(LINUX)
+    set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
+  endif()
+endif()
+
 if(WITH_ARM)
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
index 895bc0849a2a3..ace71a7f63413 100644
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -14,55 +14,41 @@
 INCLUDE(ExternalProject)
 
-execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
-
 SET(GLOO_PROJECT "extern_gloo")
-IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(GLOO_VER "master" CACHE STRING "" FORCE)
-  SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-
-  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
-    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
-  else()
-    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
-  endif()
-ENDIF()
-
-MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
-SET(GLOO_SOURCE_DIR "${THIRD_PARTY_PATH}/gloo")
-SET(GLOO_DOWNLOAD_DIR "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
-SET(GLOO_DST_DIR "gloo")
-SET(GLOO_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
-SET(GLOO_INSTALL_DIR ${GLOO_INSTALL_ROOT}/${GLOO_DST_DIR})
-SET(GLOO_ROOT ${GLOO_INSTALL_DIR})
-SET(GLOO_INC_DIR ${GLOO_ROOT}/include)
-SET(GLOO_LIB_DIR ${GLOO_ROOT}/lib)
-SET(GLOO_LIB ${GLOO_LIB_DIR}/libgloo.a)
-#SET(GLOO_IOMP_LIB ${GLOO_LIB_DIR}/libiomp5.so) #todo what is this
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${GLOO_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${GLOO_INC_DIR})
-
-FILE(WRITE ${GLOO_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(GLOO)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${GLOO_NAME}/include ${GLOO_NAME}/lib \n"
-  " DESTINATION ${GLOO_DST_DIR})\n")
+SET(GLOO_PREFIX_DIR ${THIRD_PARTY_PATH}/gloo)
+SET(GLOO_SOURCE_DIR ${THIRD_PARTY_PATH}/gloo/src/extern_gloo/gloo)
+SET(GLOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gloo)
+SET(GLOO_INCLUDE_DIR "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE)
+SET(GLOO_LIBRARY_DIR "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE)
+# As we add extra features for gloo, we use the non-official repo
+SET(GLOO_REPOSITORY https://github.com/sandyhouse/gloo.git)
+SET(GLOO_TAG v0.0.2)
+SET(GLOO_LIBRARIES "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library."
FORCE) + +INCLUDE_DIRECTORIES(${GLOO_INCLUDE_DIR}) + +cache_third_party(extern_gloo + REPOSITORY ${GLOO_REPOSITORY} + TAG ${GLOO_TAG} + DIR GLOO_SOURCE_DIR) ExternalProject_Add( - ${GLOO_PROJECT} + extern_gloo ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${GLOO_SOURCE_DIR} - DOWNLOAD_DIR ${GLOO_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${GLOO_URL} -c -q -O ${GLOO_NAME}.tar.gz - && tar zxvf ${GLOO_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOO_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOO_INSTALL_ROOT} + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" ) -ADD_LIBRARY(gloo SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIB}) + +ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES}) ADD_DEPENDENCIES(gloo ${GLOO_PROJECT}) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index d31943289d7a1..b40cbdcc1b1bf 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -247,7 +247,8 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy collective_helper fast_threaded_ssa_graph_executor variable_helper) -cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS executor) +cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor) cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index f6f3098613ba1..4951ada9bd55a 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include // NOLINT +#include #include #include // NOLINT #include // NOLINT @@ -313,6 +314,10 @@ class DownpourWorker : public HogwildWorker { std::map> dense_value_names_; std::map table_dependency_; std::vector> copy_dense_tables_; + // multitask + std::map cond2table_map_; + std::set condvalue_set_; + bool flag_partial_push_; private: // std::vector dump_param_; diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 00f721701a4a5..e2c85ab3905ff 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/platform/cpu_helper.h" @@ -65,6 +67,13 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) { } } + flag_partial_push_ = false; + for (auto& m : param_.program_config(0).partial_pushdense_condtable_map()) { + cond2table_map_[m.key()] = m.value(); + condvalue_set_.insert(m.value()); + flag_partial_push_ = true; + } + skip_ops_.resize(param_.skip_ops_size()); for (int i = 0; i < param_.skip_ops_size(); ++i) { skip_ops_[i] = param_.skip_ops(i); @@ -876,14 +885,42 @@ void DownpourWorker::TrainFiles() { #endif if (need_to_push_dense_) { - for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); - ++i) { - uint64_t tid = static_cast( - param_.program_config(0).push_dense_table_id(i)); - fleet_ptr_->PushDenseVarsAsync( - *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_, - scale_datanorm_, cur_batch); + if (flag_partial_push_) { + Variable* var = (*thread_scope_).FindVar("cond_tag"); + LoDTensor* tensor = var->GetMutable(); + // check type in python code + int64_t* cond_value_batch = tensor->data(); + + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_dense_table_id(i)); + if (condvalue_set_.find(tid) != condvalue_set_.end()) { + // common dense table must push dense + if (cond2table_map_[cond_value_batch[0]] != tid) { + // can't push dense + continue; + } + } + + VLOG(3) << "push multitask dense gradient " << tid; + fleet_ptr_->PushDenseVarsAsync( + *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_, + scale_datanorm_, cur_batch); + } + + } else { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_dense_table_id(i)); + + fleet_ptr_->PushDenseVarsAsync( + *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_, + scale_datanorm_, cur_batch); + } } + VLOG(3) << "push dense gradient done."; // the following code should be more precise and clean diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 693073d1fc73a..25086001598b4 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -29,6 +29,12 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include #include +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/timer.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 8e232560ab687..6ed58d96333ca 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -27,6 +27,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include +#include +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/timer.h" #ifdef PADDLE_WITH_PSLIB namespace paddle { diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc index 747fd434ae783..83838f4df67d0 100644 --- a/paddle/fluid/framework/hetercpu_worker.cc +++ b/paddle/fluid/framework/hetercpu_worker.cc @@ -12,6 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/string/string_helper.h" + #ifdef PADDLE_WITH_PSLIB #if defined _WIN32 || defined __APPLE__ diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 1117d676a5ece..9aea9d4a83284 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/lodtensor_printer.h" @@ -47,6 +48,8 @@ void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { ops_.push_back(local_op_ptr); continue; } + operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + program, 0, ops_); } void HogwildWorker::CreateThreadScope(const ProgramDesc &program) { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 63524294b68ef..dfb030a7cc768 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -84,19 +84,6 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "do not perform " + type() + "+bias fuse"; return; } - if (conv->Op()->HasAttr("dilations")) { - auto dilations = - BOOST_GET_CONST(std::vector, conv->Op()->GetAttr("dilations")); - for (const auto& d : dilations) { - if (d != 1) { - LOG(WARNING) - << "dilation conv not supported in MKLDNN, fuse not apply " - << "and set conv attribute use_mkldnn = false"; - conv->Op()->SetAttr("use_mkldnn", false); - return; - } - } - } auto* eltwise_bias_tensor = scope->FindVar(eltwise_bias->Name())->GetMutable(); diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc index 92e995579fa9e..b9bd660043bf1 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under 
the License. +#include + #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 6aeef8a39b533..bfb5aa4a26aec 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -14,14 +14,11 @@ limitations under the License. */ #include #include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" namespace paddle { namespace framework { -class LoDTensor; -class Scope; -class Variable; - std::shared_ptr PullDenseWorker::s_instance_ = NULL; std::mutex PullDenseWorker::mutex_for_version_; std::map PullDenseWorker::last_versions_; @@ -70,7 +67,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { } void PullDenseWorker::CreatePinVar() { -#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 4fe01aff79e52..4730f6a4ec887 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" + #include #include #include +#include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/profiler.h" @@ -81,6 +84,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, } #endif #ifdef PADDLE_WITH_CUDA + else if (platform::is_cuda_pinned_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, + size); + } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, @@ -282,6 +291,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, } #endif #ifdef PADDLE_WITH_CUDA + else if (platform::is_cuda_pinned_place(src_place) && // NOLINT + platform::is_cuda_pinned_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr, + size); + } else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, @@ -943,6 +958,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { #endif } +template +std::string format_tensor(const framework::Tensor& tensor) { + // TODO(zhiqiu): use the print option to format tensor. + return "NOT IMPLEMENTED"; +} + template std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { auto inspect = tensor.data(); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 1e811a41e90af..50644370bc6b6 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -25,6 +25,26 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +class PrintOptions { + public: + static PrintOptions& Instance() { + static PrintOptions instance; + return instance; + } + ~PrintOptions() {} + PrintOptions(const PrintOptions& o) = delete; + const PrintOptions& operator=(const PrintOptions& o) = delete; + + int precision = 8; + int threshold = 1000; + int edgeitems = 3; + int linewidth = 75; + bool sci_mode = false; + + private: + PrintOptions() {} +}; + // NOTE(zcd): Because TensorCopy is an async operation, when the src_place // and dst_place are two different GPU, to ensure that the operation can // be carried out correctly, there is a src_ctx wait operation in TensorCopy. diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 1985742fc4aa6..87de436617e11 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -148,12 +148,17 @@ message CopyTableConfig { repeated TableDependencyMap table_denpendency_map = 12; } +message CondTableMap { + required int32 key = 1; + required int32 value = 2; +} message ProgramConfig { required string program_id = 1; repeated int32 push_sparse_table_id = 2; repeated int32 push_dense_table_id = 3; repeated int32 pull_sparse_table_id = 4; repeated int32 pull_dense_table_id = 5; + repeated CondTableMap partial_pushdense_condtable_map = 10; } message PullDenseWorkerParameter { diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 892acffb712d9..bb6a48c6e649c 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -17,10 +17,12 @@ // #include + #include #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" @@ -286,9 +288,9 @@ TEST(test_tracer, test_unique_name_generator) { ASSERT_STREQ("fc_1", fc_2.c_str()); // use `eager_tmp` as key if not specify it. auto tmp_var_2 = tracer.GenerateUniqueName(); - ASSERT_STREQ("eager_tmp_2", tmp_var_2.c_str()); - auto tmp_var_3 = tracer.GenerateUniqueName("eager_tmp"); - ASSERT_STREQ("eager_tmp_3", tmp_var_3.c_str()); + ASSERT_STREQ("dygraph_tmp_2", tmp_var_2.c_str()); + auto tmp_var_3 = tracer.GenerateUniqueName("dygraph_tmp"); + ASSERT_STREQ("dygraph_tmp_3", tmp_var_3.c_str()); } TEST(test_tracer, test_current_tracer) { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 71996b3e1ac99..dd3950e7e0347 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -20,6 +20,7 @@ #include #include #include + #include "ThreadPool.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/jit/program_desc_tracer.h" @@ -32,7 +33,7 @@ namespace imperative { class UniqueNameGenerator { public: explicit UniqueNameGenerator(std::string prefix = "") : prefix_(prefix) {} - std::string Generate(std::string key = "eager_tmp") { + std::string Generate(std::string key = "dygraph_tmp") { return prefix_ + key + "_" + std::to_string(id_++); } @@ -83,7 +84,7 @@ class Tracer { // name like `tmp_0` in some cases when transform dygraph into static layers. // So we modify the default prefix key into `eager_tmp` to distinguish with // static graph. 
- std::string GenerateUniqueName(std::string key = "eager_tmp") { + std::string GenerateUniqueName(std::string key = "dygraph_tmp") { return generator_->Generate(key); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 40ca3e85868fb..cd0fc03852a4d 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -178,6 +178,10 @@ struct Argument { // Scales for variables to be quantized DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); + + // A set of op types to enable their bfloat16 kernels + DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types, Bfloat16EnabledOpTypes, + std::unordered_set); #endif // Passed from config. diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 4abe293c930e2..6965a0c904105 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" @@ -125,6 +126,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); CP_MEMBER(mkldnn_cache_capacity_); + // Bfloat16 related. + CP_MEMBER(use_mkldnn_bfloat16_); + CP_MEMBER(bfloat16_enabled_op_types_); // Quantization related. CP_MEMBER(use_mkldnn_quantizer_); CP_MEMBER(mkldnn_quantizer_config_); @@ -219,7 +223,12 @@ void AnalysisConfig::EnableMkldnnQuantizer() { void AnalysisConfig::EnableMkldnnBfloat16() { #ifdef PADDLE_WITH_MKLDNN - use_mkldnn_bfloat16_ = true; + if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) { + use_mkldnn_bfloat16_ = true; + } else { + LOG(INFO) << "CPU does not support BFLOAT16 calculations"; + use_mkldnn_bfloat16_ = false; + } #else LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnBfloat16"; use_mkldnn_bfloat16_ = false; @@ -417,6 +426,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_mkldnn_quantizer_; ss << use_mkldnn_bfloat16_; + for (auto &item : bfloat16_enabled_op_types_) ss << item; + ss << ";"; ss << model_from_memory_; ss << with_profile_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 98bee2d4bb471..5dae7368a8e7d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -501,6 +501,10 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetQuantizeExcludedOpIds( config_.mkldnn_quantizer_config()->excluded_op_ids()); } + if (config_.use_mkldnn_bfloat16_) { + LOG(INFO) << "Bfloat16 is enabled"; + argument_.SetBfloat16EnabledOpTypes(config_.bfloat16_enabled_op_types_); + } #endif auto passes = config_.pass_builder()->AllPasses(); diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 6a3760e1f749b..b7e8f40e40859 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -47,19 +47,24 @@ if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) if (MSVC_STATIC_CRT) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE 
"${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + if (WITH_MKL) + set(FLAG_OPENMP "/openmp") + endif() + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") safe_set_static_flag() if (WITH_STATIC_LIB) add_definitions(-DSTATIC_LIB) endif() endif() else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + if(WITH_MKL) + set(FLAG_OPENMP "-fopenmp") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}") endif() -message("flags" ${CMAKE_CXX_FLAGS}) if(WITH_GPU) if(NOT WIN32) diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index b1244e4e3dfdd..7ad3aaf1f9d08 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -414,6 +414,14 @@ struct PD_INFER_DECL AnalysisConfig { /// bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; } + /// \brief Specify the operator type list to use Bfloat16 acceleration. + /// + /// \param op_list The operator type list. + /// + void SetBfloat16Op(std::unordered_set op_list) { + bfloat16_enabled_op_types_ = op_list; + } + /// /// \brief A boolean state telling whether the thread local CUDA stream is /// enabled. @@ -606,6 +614,7 @@ struct PD_INFER_DECL AnalysisConfig { bool use_mkldnn_quantizer_{false}; std::shared_ptr mkldnn_quantizer_config_; bool use_mkldnn_bfloat16_{false}; + std::unordered_set bfloat16_enabled_op_types_; // If the config is already used on a predictor, it becomes invalid. // Any config can only be used with one predictor. 
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 5d6970fc4e385..17d2c0c0eef8b 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -21,6 +21,12 @@ function(download_int8_data install_dir data_file) endif() endfunction() +function(download_bfloat16_data install_dir data_file) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file}) + endif() +endfunction() + function(download_GRU_data install_dir data_file) if (NOT EXISTS ${install_dir}/${data_file}) inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file}) @@ -60,6 +66,7 @@ function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir --infer_data=${data_path} --warmup_batch_size=${WARMUP_BATCH_SIZE} --batch_size=50 + --enable_int8=true --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} --iterations=2) endfunction() @@ -69,6 +76,17 @@ function(inference_analysis_api_int8_test_run_custom_warmup_batch_size TARGET_NA inference_analysis_api_int8_test_run(${TARGET_NAME} ${test_binary} ${model_dir} ${data_path}) endfunction() +function(inference_analysis_api_bfloat16_test_run TARGET_NAME test_binary model_dir data_path) + inference_analysis_test_run(${TARGET_NAME} + COMMAND ${test_binary} + ARGS --infer_model=${model_dir}/model + --infer_data=${data_path} + --batch_size=50 + --enable_bf16=true + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=2) +endfunction() + function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_binary model_dir data_path) inference_analysis_test_run(${TARGET_NAME} COMMAND ${test_binary} @@ -76,6 +94,7 @@ function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_bi --infer_data=${data_path} --warmup_batch_size=10 --batch_size=300 + --enable_int8=true --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} --iterations=1) endfunction() @@ -99,6 +118,7 @@ function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_mode --int8_model=${int8_model_dir} --infer_data=${data_path} --batch_size=50 + --enable_int8=true --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} --with_accuracy_layer=false --iterations=2) @@ -346,6 +366,16 @@ if(WITH_MKLDNN) download_int8_data(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) + ### BFLOAT16 tests + + # build test binary to be used in subsequent tests + set(BF16_IMG_CLASS_TEST_APP "test_analyzer_bfloat16_image_classification") + set(BF16_IMG_CLASS_TEST_APP_SRC "analyzer_bfloat16_image_classification_tester.cc") + inference_analysis_api_test_build(${BF16_IMG_CLASS_TEST_APP} ${BF16_IMG_CLASS_TEST_APP_SRC}) + + # resnet50 bfloat16 + inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_resnet50 ${BF16_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) + ### Object detection models set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin") set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection") diff --git a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc new file mode 100644 index 0000000000000..3621477148fff --- /dev/null +++ 
b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/inference/api/paddle_analysis_config.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_num_threads); + cfg->EnableMKLDNN(); +} + +TEST(Analyzer_int8_image_classification, bfloat16) { + AnalysisConfig cfg; + SetConfig(&cfg); + + AnalysisConfig q_cfg; + SetConfig(&q_cfg); + + // read data from file and prepare batches with test data + std::vector> input_slots_all; + SetInputs(&input_slots_all); + q_cfg.SwitchIrDebug(); + q_cfg.EnableMkldnnBfloat16(); + q_cfg.SetBfloat16Op({"conv2d"}); + CompareBFloat16AndAnalysis(&cfg, &q_cfg, input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index 5f2c879fe0a0c..6bfa8a821ae8c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -30,123 +30,6 @@ void SetConfig(AnalysisConfig *cfg) { cfg->EnableMKLDNN(); } -template -class TensorReader { - public: - TensorReader(std::ifstream &file, size_t beginning_offset, - std::vector shape, std::string name) - : file_(file), position_(beginning_offset), shape_(shape), name_(name) { - numel_ = std::accumulate(shape_.begin(), shape_.end(), size_t{1}, - std::multiplies()); - } - - PaddleTensor NextBatch() { - PaddleTensor tensor; - tensor.name = name_; - tensor.shape = shape_; - tensor.dtype = GetPaddleDType(); - tensor.data.Resize(numel_ * sizeof(T)); - - file_.seekg(position_); - file_.read(static_cast(tensor.data.data()), numel_ * sizeof(T)); - position_ = file_.tellg(); - - if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream"; - if (file_.fail()) - throw std::runtime_error(name_ + ": failed reading file."); - - return tensor; - } - - protected: - std::ifstream &file_; - size_t position_; - std::vector shape_; - std::string name_; - size_t numel_; -}; - -std::shared_ptr> GetWarmupData( - const std::vector> &test_data, - int num_images = FLAGS_warmup_batch_size) { - int test_data_batch_size = test_data[0][0].shape[0]; - auto iterations = test_data.size(); - auto all_test_data_size = iterations * test_data_batch_size; - PADDLE_ENFORCE_LE(static_cast(num_images), all_test_data_size, - platform::errors::InvalidArgument( - "The requested quantization warmup data size must be " - "lower or equal to the test data size. 
But received" - "warmup size is %d and test data size is %d. Please " - "use --warmup_batch_size parameter to set smaller " - "warmup batch size.", - num_images, all_test_data_size)); - - PaddleTensor images; - images.name = "image"; - images.shape = {num_images, 3, 224, 224}; - images.dtype = PaddleDType::FLOAT32; - images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224); - - PaddleTensor labels; - labels.name = "label"; - labels.shape = {num_images, 1}; - labels.dtype = PaddleDType::INT64; - labels.data.Resize(sizeof(int64_t) * num_images); - - for (int i = 0; i < num_images; i++) { - auto batch = i / test_data_batch_size; - auto element_in_batch = i % test_data_batch_size; - std::copy_n(static_cast(test_data[batch][0].data.data()) + - element_in_batch * 3 * 224 * 224, - 3 * 224 * 224, - static_cast(images.data.data()) + i * 3 * 224 * 224); - - std::copy_n(static_cast(test_data[batch][1].data.data()) + - element_in_batch, - 1, static_cast(labels.data.data()) + i); - } - - auto warmup_data = std::make_shared>(2); - (*warmup_data)[0] = std::move(images); - (*warmup_data)[1] = std::move(labels); - return warmup_data; -} - -void SetInput(std::vector> *inputs, - int32_t batch_size = FLAGS_batch_size) { - std::ifstream file(FLAGS_infer_data, std::ios::binary); - if (!file) { - FAIL() << "Couldn't open file: " << FLAGS_infer_data; - } - - int64_t total_images{0}; - file.read(reinterpret_cast(&total_images), sizeof(total_images)); - LOG(INFO) << "Total images in file: " << total_images; - - std::vector image_batch_shape{batch_size, 3, 224, 224}; - std::vector label_batch_shape{batch_size, 1}; - auto images_offset_in_file = static_cast(file.tellg()); - auto labels_offset_in_file = - images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224; - - TensorReader image_reader(file, images_offset_in_file, - image_batch_shape, "image"); - TensorReader label_reader(file, labels_offset_in_file, - label_batch_shape, "label"); - - auto iterations_max = total_images / batch_size; - auto iterations = iterations_max; - if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) { - iterations = FLAGS_iterations; - } - for (auto i = 0; i < iterations; i++) { - auto images = image_reader.NextBatch(); - auto labels = label_reader.NextBatch(); - inputs->emplace_back( - std::vector{std::move(images), std::move(labels)}); - } -} - TEST(Analyzer_int8_image_classification, quantization) { AnalysisConfig cfg; SetConfig(&cfg); @@ -156,13 +39,13 @@ TEST(Analyzer_int8_image_classification, quantization) { // read data from file and prepare batches with test data std::vector> input_slots_all; - SetInput(&input_slots_all); + SetInputs(&input_slots_all); if (FLAGS_enable_int8) { // prepare warmup batch from input data read earlier // warmup batch size can be different than batch size std::shared_ptr> warmup_data = - GetWarmupData(input_slots_all); + paddle::inference::GetWarmupData(input_slots_all); // configure quantizer q_cfg.EnableMkldnnQuantizer(); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 252bca2d5522e..c9292ddc710e7 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -17,10 +17,12 @@ #include #include +#include #include #include #include // NOLINT #include +#include #include #ifdef WITH_GPERFTOOLS #include @@ -48,7 +50,8 @@ DEFINE_bool(ernie_large, false, "Test ernie large"); DEFINE_bool(with_accuracy_layer, true, "Calculate the accuracy while label is in 
the input"); DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction"); -DEFINE_bool(enable_int8, true, "Enable INT8 type prediction"); +DEFINE_bool(enable_bf16, false, "Enable BF16 type prediction"); +DEFINE_bool(enable_int8, false, "Enable INT8 type prediction"); DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup"); // setting iterations to 0 means processing the whole dataset DEFINE_int32(iterations, 0, "number of batches to process"); @@ -124,6 +127,123 @@ class Barrier { std::size_t _count; }; +template +class TensorReader { + public: + TensorReader(std::ifstream &file, size_t beginning_offset, + std::vector shape, std::string name) + : file_(file), position_(beginning_offset), shape_(shape), name_(name) { + numel_ = std::accumulate(shape_.begin(), shape_.end(), size_t{1}, + std::multiplies()); + } + + PaddleTensor NextBatch() { + PaddleTensor tensor; + tensor.name = name_; + tensor.shape = shape_; + tensor.dtype = GetPaddleDType(); + tensor.data.Resize(numel_ * sizeof(T)); + + file_.seekg(position_); + file_.read(static_cast(tensor.data.data()), numel_ * sizeof(T)); + position_ = file_.tellg(); + + if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream"; + if (file_.fail()) + throw std::runtime_error(name_ + ": failed reading file."); + + return tensor; + } + + protected: + std::ifstream &file_; + size_t position_; + std::vector shape_; + std::string name_; + size_t numel_; +}; + +std::shared_ptr> GetWarmupData( + const std::vector> &test_data, + int num_images = FLAGS_warmup_batch_size) { + int test_data_batch_size = test_data[0][0].shape[0]; + auto iterations = test_data.size(); + auto all_test_data_size = iterations * test_data_batch_size; + PADDLE_ENFORCE_LE(static_cast(num_images), all_test_data_size, + platform::errors::InvalidArgument( + "The requested quantization warmup data size must be " + "lower or equal to the test data size. But received" + "warmup size is %d and test data size is %d. 
Please " + "use --warmup_batch_size parameter to set smaller " + "warmup batch size.", + num_images, all_test_data_size)); + + PaddleTensor images; + images.name = "image"; + images.shape = {num_images, 3, 224, 224}; + images.dtype = PaddleDType::FLOAT32; + images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224); + + PaddleTensor labels; + labels.name = "label"; + labels.shape = {num_images, 1}; + labels.dtype = PaddleDType::INT64; + labels.data.Resize(sizeof(int64_t) * num_images); + + for (int i = 0; i < num_images; i++) { + auto batch = i / test_data_batch_size; + auto element_in_batch = i % test_data_batch_size; + std::copy_n(static_cast(test_data[batch][0].data.data()) + + element_in_batch * 3 * 224 * 224, + 3 * 224 * 224, + static_cast(images.data.data()) + i * 3 * 224 * 224); + + std::copy_n(static_cast(test_data[batch][1].data.data()) + + element_in_batch, + 1, static_cast(labels.data.data()) + i); + } + + auto warmup_data = std::make_shared>(2); + (*warmup_data)[0] = std::move(images); + (*warmup_data)[1] = std::move(labels); + return warmup_data; +} + +void SetInputs(std::vector> *inputs, + int32_t batch_size = FLAGS_batch_size) { + std::ifstream file(FLAGS_infer_data, std::ios::binary); + if (!file) { + FAIL() << "Couldn't open file: " << FLAGS_infer_data; + } + + int64_t total_images{0}; + file.read(reinterpret_cast(&total_images), sizeof(total_images)); + LOG(INFO) << "Total images in file: " << total_images; + + std::vector image_batch_shape{batch_size, 3, 224, 224}; + std::vector label_batch_shape{batch_size, 1}; + auto images_offset_in_file = static_cast(file.tellg()); + auto labels_offset_in_file = + images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224; + + TensorReader image_reader(file, images_offset_in_file, + image_batch_shape, "image"); + TensorReader label_reader(file, labels_offset_in_file, + label_batch_shape, "label"); + + auto iterations_max = total_images / batch_size; + auto iterations = iterations_max; + if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) { + iterations = FLAGS_iterations; + } + for (auto i = 0; i < iterations; i++) { + auto images = image_reader.NextBatch(); + auto labels = label_reader.NextBatch(); + inputs->emplace_back( + std::vector{std::move(images), std::move(labels)}); + } +} + // Compare result between two PaddleTensor void CompareResult(const std::vector &outputs, const std::vector &ref_outputs) { @@ -519,8 +639,9 @@ void TestPrediction(const PaddlePredictor::Config *config, } } -void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8, - int compared_idx) { +void SummarizeAccuracy(float avg_acc_ref, float avg_acc, int compared_idx) { + std::string data_type_name = "INT8"; + if (FLAGS_enable_bf16) data_type_name = "BF16"; PADDLE_ENFORCE_LE( compared_idx, 2, platform::errors::InvalidArgument( @@ -539,12 +660,12 @@ void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8, LOG(INFO) << "--- Accuracy summary --- "; LOG(INFO) << "Accepted " << prefix << "drop threshold: " << FLAGS_quantized_accuracy - << ". (condition: (FP32_" << prefix << " - INT8_" << prefix - << ") <= threshold)"; + << ". 
(condition: (FP32_" << prefix << " - " << data_type_name + << "_" << prefix << ") <= threshold)"; LOG(INFO) << "FP32: avg " << prefix << std::fixed << std::setw(6) - << std::setprecision(4) << avg_acc_fp32; - LOG(INFO) << "INT8: avg " << prefix << std::fixed << std::setw(6) - << std::setprecision(4) << avg_acc_int8; + << std::setprecision(4) << avg_acc_ref; + LOG(INFO) << data_type_name << ": avg " << prefix << std::fixed + << std::setw(6) << std::setprecision(4) << avg_acc; } void SummarizePerformance(const char *title, float sample) { @@ -555,10 +676,11 @@ void SummarizePerformance(const char *title, float sample) { << " ms"; } -void SummarizePerformance(float sample_latency_fp32, - float sample_latency_int8) { - if (FLAGS_enable_fp32) SummarizePerformance("FP32", sample_latency_fp32); - if (FLAGS_enable_int8) SummarizePerformance("INT8", sample_latency_int8); +void SummarizePerformance(const char *title_fp32, float sample_latency_fp32, + const char *title, float sample_latency) { + if (FLAGS_enable_fp32) SummarizePerformance(title_fp32, sample_latency_fp32); + if (FLAGS_enable_int8 || FLAGS_enable_bf16) + SummarizePerformance(title, sample_latency); } float CompareAccuracyOne( @@ -613,7 +735,7 @@ void CompareAccuracy( const std::vector> &output_slots_quant, const std::vector> &output_slots_ref, int compared_idx) { - if ((FLAGS_enable_fp32 && FLAGS_enable_int8) && + if ((FLAGS_enable_fp32 && (FLAGS_enable_int8 || FLAGS_enable_bf16)) && (output_slots_quant.size() == 0 || output_slots_ref.size()) == 0) throw std::invalid_argument( "CompareAccuracy: output_slots vector is empty."); @@ -621,7 +743,7 @@ void CompareAccuracy( float avg_acc_quant = 0.0; float avg_acc_ref = 0.0; - if (FLAGS_enable_int8) + if (FLAGS_enable_int8 || FLAGS_enable_bf16) avg_acc_quant = CompareAccuracyOne(output_slots_quant, compared_idx); if (FLAGS_enable_fp32) @@ -631,9 +753,9 @@ void CompareAccuracy( if (FLAGS_enable_fp32) CHECK_GT(avg_acc_ref, 0.0); - if (FLAGS_enable_int8) CHECK_GT(avg_acc_quant, 0.0); + if (FLAGS_enable_int8 || FLAGS_enable_bf16) CHECK_GT(avg_acc_quant, 0.0); - if (FLAGS_enable_fp32 && FLAGS_enable_int8) + if (FLAGS_enable_fp32 && (FLAGS_enable_int8 || FLAGS_enable_bf16)) CHECK_LE(avg_acc_ref - avg_acc_quant, FLAGS_quantized_accuracy); } @@ -708,11 +830,51 @@ void CompareQuantizedAndAnalysis( TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true, VarType::INT8, &sample_latency_int8); } - SummarizePerformance(sample_latency_fp32, sample_latency_int8); + SummarizePerformance("FP32", sample_latency_fp32, "INT8", + sample_latency_int8); CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); } +void CompareBFloat16AndAnalysis( + const AnalysisConfig *config, const AnalysisConfig *qconfig, + const std::vector> &inputs, + const int compared_idx = 1) { + PADDLE_ENFORCE_EQ( + inputs[0][0].shape[0], FLAGS_batch_size, + platform::errors::InvalidArgument( + "Input data has to be packed batch by batch. 
The batchsize is set to " + "%d, but the real input is packed with batchsize = %d", + FLAGS_batch_size, inputs[0][0].shape[0])); + LOG(INFO) << "FP32 & BF16 prediction run: batch_size " << FLAGS_batch_size; + + LOG(INFO) << "--- FP32 prediction start ---"; + auto *cfg = reinterpret_cast(config); + PrintConfig(cfg, true); + std::vector> analysis_outputs; + float sample_latency_fp32{-1}; + + if (FLAGS_enable_fp32) { + TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32, + &sample_latency_fp32); + } + + LOG(INFO) << "--- BF16 prediction start ---"; + auto *qcfg = reinterpret_cast(qconfig); + PrintConfig(qcfg, true); + std::vector> bf16_outputs; + float sample_latency_bf16{-1}; + + if (FLAGS_enable_bf16) { + TestOneThreadPrediction(qcfg, inputs, &bf16_outputs, true, VarType::FP32, + &sample_latency_bf16); + } + SummarizePerformance("FP32", sample_latency_fp32, "BF16", + sample_latency_bf16); + + CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); +} + void CompareAnalysisAndAnalysis( const AnalysisConfig *config1, const AnalysisConfig *config2, const std::vector> &inputs, @@ -749,7 +911,8 @@ void CompareAnalysisAndAnalysis( TestOneThreadPrediction(cfg2, inputs, &int8_outputs, true, VarType::INT8, &sample_latency_int8); } - SummarizePerformance(sample_latency_fp32, sample_latency_int8); + SummarizePerformance("FP32", sample_latency_fp32, "INT8", + sample_latency_int8); if (with_accuracy_layer) { CompareAccuracy(int8_outputs, analysis_outputs, compared_idx); } diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 736483c3304ac..cd83443f0522f 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -13,12 +13,49 @@ // limitations under the License. #include "paddle/fluid/operators/allclose_op.h" +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { +template +struct GetTensorValue { + T operator()(const platform::CPUDeviceContext& dev_ctx, + const framework::Tensor& tensor) const { + return *(tensor.data()); + } +}; + +template +struct AllcloseFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& other, + const double rtol, const double atol, bool equal_nan, + framework::Tensor* output) { + auto* in_a = in.data(); + auto* in_b = other.data(); + auto* out_data = output->mutable_data(ctx.GetPlace()); + auto num = in.numel(); + *out_data = true; + for (int i = 0; i < num; i++) { + const T a = in_a[i], b = in_b[i]; + bool val; + if (std::isnan(a) || std::isnan(b)) { + val = equal_nan && std::isnan(a) == std::isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + *out_data &= val; + } + } +}; + class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -26,12 +63,9 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { "The input tensor, it's data type should be float32, float64."); AddInput("Other", "The input tensor, it's data type should be float32, float64."); + AddInput("Rtol", "The relative tolerance."); + AddInput("Atol", "The absolute tolerance."); AddOutput("Out", "The output tensor, it's data type is bool."); - - AddAttr("rtol", "The relative tolerance. 
Default: :math:`1e-5` .") - .SetDefault(1e-5); - AddAttr("atol", "The absolute tolerance. Default: :math:`1e-8` .") - .SetDefault(1e-8); AddAttr("equal_nan", "If :math:`True` , then two :math:`NaNs` will be " "compared as equal. Default: :math:`False` .") @@ -54,16 +88,12 @@ class AllcloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::NotFound( - "Input(Input) of allclose op should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Other"), true, - platform::errors::NotFound( - "Input(Other) of allclose op should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "The output(Out) of allclose op must not be null.")); + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose"); + OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose"); + OP_INOUT_CHECK(ctx->HasInput("Rtol"), "Input", "Rtol", "Allclose"); + OP_INOUT_CHECK(ctx->HasInput("Atol"), "Input", "Atol", "Allclose"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose"); auto input_dim = ctx->GetInputDim("Input"); auto other_dim = ctx->GetInputDim("Other"); @@ -96,7 +126,7 @@ class AllcloseOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.device_context()); @@ -105,7 +135,7 @@ class AllcloseOp : public framework::OperatorWithKernel { class AllcloseOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext *ctx) const override { + void operator()(framework::InferVarTypeContext* ctx) const override { ctx->SetOutputDataType("Out", framework::proto::VarType::BOOL); } }; diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu index aaca4e5b1226d..f98fe75cd681a 100644 --- a/paddle/fluid/operators/allclose_op.cu +++ b/paddle/fluid/operators/allclose_op.cu @@ -12,12 +12,70 @@ // See the License for the specific language governing permissions and // limitations under the License. -#define EIGEN_USE_GPU - +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/allclose_op.h" +namespace paddle { +namespace operators { + +template +struct GetTensorValue { + T operator()(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor& tensor) const { + const T* data = tensor.data(); + T value; + const auto gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), + dev_ctx.stream()); + return value; + } +}; + +template +__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data, + const double rtol, const double atol, + bool equal_nan, int num, bool* out_data) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const T a = in_data[i], b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + T left = (a > b ? 
a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + if (!val) *out_data = false; + } +} + +template +struct AllcloseFunctor { + void operator()(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor& in, const framework::Tensor& other, + const double rtol, const double atol, bool equal_nan, + framework::Tensor* output) { + int num = in.numel(); + const T* in_data = in.data(); + const T* other_data = other.data(); + bool* out_data = output->mutable_data(dev_ctx.GetPlace()); + int block = 1024; + int grid = (block - 1 + num) / block; + grid = (grid > block) ? block : grid; + cudaMemset(out_data, true, sizeof(bool)); + AllcloseCUDAKernel<<>>( + in_data, other_data, rtol, atol, equal_nan, num, out_data); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel, diff --git a/paddle/fluid/operators/allclose_op.h b/paddle/fluid/operators/allclose_op.h index 51893c087ce72..a08ddca9eb679 100644 --- a/paddle/fluid/operators/allclose_op.h +++ b/paddle/fluid/operators/allclose_op.h @@ -22,38 +22,38 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +struct GetTensorValue { + T operator()(const platform::DeviceContext& ctx, + const framework::Tensor& tensor) const; +}; + +template +struct AllcloseFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& in, + const framework::Tensor& other, const float rtol, + const float atol, bool equal_nan, framework::Tensor* output); +}; + template class AllcloseKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // get attrs - float rtol = ctx.Attr("rtol"); - float atol = ctx.Attr("atol"); bool equal_nan = ctx.Attr("equal_nan"); // get input/output - auto* input = ctx.Input("Input"); - auto* other = ctx.Input("Other"); + const auto* input = ctx.Input("Input"); + const auto* other = ctx.Input("Other"); + const auto* rtol = ctx.Input("Rtol"); + const auto* atol = ctx.Input("Atol"); auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - // get place - auto& place = *ctx.template device_context().eigen_device(); - - auto input_v = framework::EigenVector::Flatten(*input); - auto other_v = framework::EigenVector::Flatten(*other); - auto out_v = framework::EigenScalar::From(*out); - - auto left = (input_v - other_v).abs(); - auto right = static_cast(atol) + static_cast(rtol) * other_v.abs(); - auto compare_res = left <= right; - - if (equal_nan) { - auto input_nan = input_v.isnan(); - auto other_nan = other_v.isnan(); - out_v.device(place) = - (input_nan == other_nan).all() && (compare_res != input_nan).all(); - } else { - out_v.device(place) = compare_res.all(); - } + auto& dev_ctx = ctx.template device_context(); + + GetTensorValue get_tensor_value; + double rtol_v = get_tensor_value(dev_ctx, *rtol); + double atol_v = get_tensor_value(dev_ctx, *atol); + AllcloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, + equal_nan, out); } }; diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index ed4ba24a74bea..001a18859618c 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -54,7 +54,7 @@ class AssignFunctor { out_rows.set_height(rows.height()); auto &t = rows.value(); auto *m = 
out_rows.mutable_value(); - framework::TensorCopy(t, dev_ctx_.GetPlace(), dev_ctx_, m); + framework::TensorCopy(t, t.place(), m); } template @@ -70,7 +70,7 @@ class AssignFunctor { framework::LoDTensor *out) const { if (lod_tensor.numel() == 0) return; auto &out_tensor = *out; - TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } diff --git a/paddle/fluid/operators/assign_op_xpu.cc b/paddle/fluid/operators/assign_op_xpu.cc new file mode 100644 index 0000000000000..6255b5d341e09 --- /dev/null +++ b/paddle/fluid/operators/assign_op_xpu.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/assign_op.h" + +#include + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { + +class AssignOp : public framework::OperatorWithKernel { + public: + AssignOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->HasInput("X")) { + auto type = ctx->GetInputsVarType("X")[0]; + if (type == framework::proto::VarType::SELECTED_ROWS || + type == framework::proto::VarType::LOD_TENSOR) { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + if (type == framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("X", /*->*/ "Out"); + } + } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { + if (ctx->IsRuntime()) { + // The runtime output shape is determined in kernel. + return; + } else { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + } + } + } + + protected: + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + expected_kernel_type.place_, + tensor.layout()); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const framework::Variable *var = ctx.InputVar("X"); + if (var->IsType()) { + auto t_arr = var->Get(); + // NOTE(liym27): Support an empty tensor array as Input. + // And set the kernel type is float. 
+ if (t_arr.size() == 0) { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.device_context()); + } + } + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } +}; + +class AssignInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SyncTypeAndDataType("X", "Out"); + } +}; + +class AssignKernel { + public: + void operator()(const framework::ExecutionContext &ctx) const { + auto *x = ctx.InputVar("X"); + if (x == nullptr) { + return; + } + PADDLE_ENFORCE_EQ( + ctx.HasOutput("Out"), true, + platform::errors::NotFound("Output(Out) of assign_op is not found.")); + auto *out = ctx.OutputVar("Out"); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); + + framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); + } +}; + +class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, SelectedRows or LoDTensorArray) The input variable " + "could be LoDTensor, SelectedRows or LoDTensorArray.") + .AsDispensable(); + AddOutput("Out", + "(LoDTensor, SelectedRows or LoDTensorArray) The type of output " + "is the same as input X."); + AddComment(R"DOC(Assign Operator + +Out = X, when type in [LoDTensor/SelectedRows/LoDTensorArray] +raise error if the type is not listed above. +)DOC"); + } +}; + +template +class AssignGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("assign"); + op->SetInput("X", this->OutputGrad("Out")); + op->SetOutput("Out", this->InputGrad("X")); + } +}; + +DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, + ops::AssignKernel, int, ops::AssignKernel, + int64_t, ops::AssignKernel, bool, + ops::AssignKernel); +#endif diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 7a88403aa9daa..370ba8619f188 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -381,7 +381,8 @@ class BatchNormKernel break; } default: - PADDLE_THROW("Unknown storage order: %s", data_layout_str); + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown storage order: %s", data_layout_str)); } // if MomentumTensor is set, use MomentumTensor value, momentum @@ -446,7 +447,8 @@ class BatchNormKernel break; } default: - PADDLE_THROW("Unknown storage order: %d", data_layout); + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown storage order: %d", data_layout)); } } }; @@ -799,7 +801,8 @@ class BatchNormGradKernel break; } default: - PADDLE_THROW("Unknown storage order: %s", data_layout_str); + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown storage order: %s", data_layout_str)); } } }; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 1440b74290ce4..32e956e15282a 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/layout_utils.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" @@ -41,127 +42,6 @@ template using ConstEigenVectorArrayMap = Eigen::Map>; -template -inline void ResizeToChannelFirst(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - - } else if (dim == 2) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - } else if (dim == 1) { - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[1]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - } -} - -template -inline void TransToChannelFirst(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - math::Transpose trans5; - trans5(dev_ctx, *input, transformed_input, axis); - - } else if (dim == 2) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 3, 1, 2}; - math::Transpose trans4; - trans4(dev_ctx, *input, transformed_input, axis); - } else if (dim == 1) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 2, 1}; - math::Transpose trans3; - trans3(dev_ctx, *input, transformed_input, axis); - } -} - -template -inline void ResizeToChannelLast(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[3]; - in_dims_vec[3] = input->dims()[4]; - in_dims_vec[4] = input->dims()[1]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - - } else if (dim == 2) { - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[3]; - in_dims_vec[3] = input->dims()[1]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - } else if (dim == 1) { - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[1]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); 
- transformed_input->mutable_data(context.GetPlace()); - } -} - -template -inline void TransToChannelLast(const framework::ExecutionContext& context, - const Tensor* input, Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - math::Transpose trans5; - trans5(dev_ctx, *input, transformed_input, axis); - - } else if (dim == 2) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 2, 3, 1}; - math::Transpose trans4; - trans4(dev_ctx, *input, transformed_input, axis); - } else if (dim == 1) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 2, 1}; - math::Transpose trans3; - trans3(dev_ctx, *input, transformed_input, axis); - } -} - class BatchNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc new file mode 100644 index 0000000000000..624d5fe65ead7 --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/batch_norm_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class BatchNormXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto epsilon = ctx.Attr("epsilon"); + const auto momentum = ctx.Attr("momentum"); + const auto is_test = ctx.Attr("is_test"); + const auto use_global_stats = ctx.Attr("use_global_stats"); + const auto trainable_stats = ctx.Attr("trainable_statistics"); + bool test_mode = is_test && (!trainable_stats); + bool global_stats = test_mode || use_global_stats; + const auto& data_layout_str = ctx.Attr("data_layout"); + const auto data_layout = framework::StringToDataLayout(data_layout_str); + PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + platform::errors::InvalidArgument( + "The 'data_layout' attribute must be NCHW. But " + "recevived 'data_layout' is [%s].", + data_layout_str)); + const auto* x = ctx.Input("X"); + const auto& x_dims = x->dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 4. 
But " + "received X's shape = [%s], X's dimension = [%d].", + x_dims, x_dims.size())); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int H = x_dims[2]; + const int W = x_dims[3]; + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + const auto* x_data = x->data(); + const auto* scale_data = scale->data(); + const auto* bias_data = bias->data(); + auto* y = ctx.Output("Y"); + auto* y_data = y->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + if (!global_stats) { + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + auto* mean_out_data = mean_out->data(); + auto* variance_out_data = variance_out->data(); + auto* saved_mean_data = saved_mean->data(); + auto* saved_variance_data = saved_variance->data(); + int r = xpu::batch_norm_train_forward( + dev_ctx.x_context(), epsilon, momentum, N, C, H, W, x_data, y_data, + scale_data, bias_data, mean_out_data, variance_out_data, + saved_mean_data, saved_variance_data); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_train_forward) return " + "wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } else { + const auto* mean = ctx.Input("Mean"); + const auto* variance = ctx.Input("Variance"); + const auto* mean_data = mean->data(); + const auto* variance_data = variance->data(); + int r = xpu::batch_norm_infer_forward( + dev_ctx.x_context(), epsilon, N, C, H, W, x_data, y_data, scale_data, + bias_data, mean_data, variance_data); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_infer_forward) return " + "wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } + } +}; + +template +class BatchNormGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* x = ctx.Input("X"); + const auto* dy = ctx.Input(framework::GradVarName("Y")); + const auto* scale = ctx.Input("Scale"); + const auto* saved_mean = ctx.Input("SavedMean"); + // SavedVariance have been reverted in forward operator + const auto* saved_inv_variance = ctx.Input("SavedVariance"); + const auto& data_layout_str = ctx.Attr("data_layout"); + const auto data_layout = framework::StringToDataLayout(data_layout_str); + PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + platform::errors::InvalidArgument( + "The 'data_layout' attribute must be NCHW. But " + "recevived 'data_layout' is [%s].", + data_layout_str)); + const auto& x_dims = x->dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 4. 
But " + "received X's shape = [%s], X's dimension = [%d].", + x_dims, x_dims.size())); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int H = x_dims[2]; + const int W = x_dims[3]; + const auto* x_data = x->data(); + const auto* dy_data = dy->data(); + const auto* scale_data = scale->data(); + const auto* saved_mean_data = saved_mean->data(); + const auto* saved_inv_variance_data = saved_inv_variance->data(); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto* dscale_data = dscale->mutable_data(ctx.GetPlace()); + auto* dbias_data = dbias->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::batch_norm_backward(dev_ctx.x_context(), N, C, H, W, x_data, + dy_data, scale_data, saved_mean_data, + saved_inv_variance_data, dx_data, + dscale_data, dbias_data); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_infer_forward) return " + "wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + batch_norm, + ops::BatchNormXPUKernel); +REGISTER_OP_XPU_KERNEL( + batch_norm_grad, + ops::BatchNormGradXPUKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 16db4f05e31d3..1a967c57385a0 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -70,7 +70,7 @@ class BCELossCUDAKernel : public framework::OpKernel { auto x_numel = x->numel(); platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(x_numel, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel); Tensor x_cpu; framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); @@ -89,9 +89,9 @@ class BCELossCUDAKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); - GPUBCELossForward< - T><<>>( - x_data, labels->data(), out_data, x_numel); + GPUBCELossForward<<>>(x_data, labels->data(), + out_data, x_numel); } }; @@ -106,12 +106,12 @@ class BCELossGradCUDAKernel : public framework::OpKernel { auto dx_data = dx->mutable_data(ctx.GetPlace()); int x_numel = x->numel(); - platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(x_numel, ctx); auto& dev_ctx = ctx.cuda_device_context(); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); - GPUBCELossBackward< - T><<>>( + GPUBCELossBackward<<>>( x->data(), labels->data(), dout->data(), dx_data, x_numel); } }; diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index e46950f61887d..e56a4be53d149 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -165,10 +165,11 @@ class BilateralSliceOpCUDAKernel : public framework::OpKernel { int total_count = batch_size * h * w * output_dims[1]; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(total_count, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), total_count); - BilateralSliceCudaForwardKernel<<>>( + BilateralSliceCudaForwardKernel< + T><<>>( output_data, grid_data, guide_data, input_data, grid_sizes, has_offset, total_count, output_dims[1]); } @@ -472,24 +473,29 @@ class 
BilateralSliceGradOpCUDAKernel : public framework::OpKernel { grid_sizes.input_chans = input_chans; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(grid_count, ctx, 512); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count); - BilateralSliceCudaGridGradKernel<<>>( + BilateralSliceCudaGridGradKernel< + T><<>>( grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes, has_offset, grid_count, output_chans); - config = platform::getGpuLaunchConfig(guide_count, ctx, 512); + config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count); - BilateralSliceCudaGuideGradKernel<<< - config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>( + BilateralSliceCudaGuideGradKernel< + T><<>>( guide_grad_data, output_grad_data, grid_data, guide_data, input_data, grid_sizes, has_offset, guide_count, output_chans); - config = platform::getGpuLaunchConfig(input_count, ctx, 512); + config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count); - BilateralSliceCudaInputGradKernel<<< - config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>( + BilateralSliceCudaInputGradKernel< + T><<>>( input_grad_data, output_grad_data, grid_data, guide_data, grid_sizes, has_offset, input_count, output_chans); } diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc new file mode 100644 index 0000000000000..56160bd297e28 --- /dev/null +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/cast_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +class CastXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + auto in_type = static_cast( + context.Attr("in_dtype")); + auto out_type = static_cast( + context.Attr("out_dtype")); + auto* in_data = in->data(); + auto numel = in->numel(); + auto& dev_ctx = context.template device_context(); + int r = -1; + if (out_type == framework::proto::VarType::FP32) { + auto* out_data = out->mutable_data(context.GetPlace()); + r = xpu::cast(dev_ctx.x_context(), in_data, out_data, numel); + } else if (out_type == framework::proto::VarType::INT32) { + auto* out_data = out->mutable_data(context.GetPlace()); + r = xpu::cast(dev_ctx.x_context(), in_data, out_data, numel); + } else if (out_type == framework::proto::VarType::INT64) { + auto* out_data = out->mutable_data(context.GetPlace()); + r = xpu::cast(dev_ctx.x_context(), in_data, out_data, + numel); + } else { + PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d", + in_type, out_type)); + } + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + cast, ops::CastXPUKernel, + ops::CastXPUKernel, + ops::CastXPUKernel); +#endif diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index d67d90c348e6f..a7c0f12711d50 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -67,6 +67,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { } auto in_tensors = context.MultiInput("Input"); + bool use_align = context.Attr("use_align"); if (context.Attr("check_name")) { for (size_t i = 0; i < in_var_names.size(); ++i) { @@ -93,7 +94,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { context.Attr("dtype")); size_t size_of_dtype = framework::SizeOfType(dtype); GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype, - context.GetPlace()); + context.GetPlace(), use_align); // Alloc the continuous space auto fused_tensor = context.Output("FusedOutput"); @@ -111,8 +112,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel { framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor); - offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) / - size_of_dtype; + offset += + use_align + ? platform::Alignment(len * size_of_dtype, context.GetPlace()) / + size_of_dtype + : len; } } else if (context.Attr("set_constant")) { math::SetConstant set_constant; @@ -131,8 +135,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel { ->ShareDataWith(fused_tensor->Slice( static_cast(offset), static_cast(offset + len))) .Resize(dim); - len = platform::Alignment(len * size_of_dtype, context.GetPlace()) / - size_of_dtype; + len = use_align + ? 
platform::Alignment(len * size_of_dtype, context.GetPlace()) / + size_of_dtype + : len; offset += len; ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" << " address: " << out_tensors[i]->data() << ", "; @@ -144,7 +150,8 @@ class CoalesceTensorOpKernel : public framework::OpKernel { void GetMemSizeAndDtype( const std::vector &lod_tensors, const std::vector var_names, size_t *numel, - const size_t &size_of_dtype, const platform::Place &place) const { + const size_t &size_of_dtype, const platform::Place &place, + const bool use_align = true) const { PADDLE_ENFORCE_EQ( lod_tensors.size(), var_names.size(), platform::errors::InvalidArgument( @@ -167,9 +174,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel { ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " << " addres:" << lod_tensors[i]->data() << ", "; - *numel += platform::Alignment(static_cast(size) * size_of_dtype, - place) / - size_of_dtype; + *numel += use_align + ? platform::Alignment( + static_cast(size) * size_of_dtype, place) / + size_of_dtype + : static_cast(size); } VLOG(10) << ss.str(); @@ -223,6 +232,10 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker { "Whether to check the name of Input and Output to ensure " "they are the same separately.") .SetDefault(false); + AddAttr("use_align", + "Whether to consider memory chunk and take alignment into " + "account for inputs and outputs.") + .SetDefault(true); AddComment(R"DOC( CoalesceTensor Operator. diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc new file mode 100644 index 0000000000000..9c9c72c7f6f78 --- /dev/null +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -0,0 +1,185 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/concat_op.h" + +#include +#include +#include + +#ifdef PADDLE_WITH_MKLDNN +#include +#endif + +#ifdef PADDLE_WITH_XPU + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class ConcatXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::Tensor* out = ctx.Output("Out"); + int axis = ctx.Attr("axis"); + PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument( + "The input should not be null.")); + PADDLE_ENFORCE_NE(ctx.HasInput("AxisTensor"), true, + platform::errors::InvalidArgument( + "XPU donot surpport AxisTensor for now")); + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + PADDLE_ENFORCE_GE( + axis, 0, platform::errors::InvalidArgument("concat: axis shoud >= 0!")); + PADDLE_ENFORCE_LT(axis, ins[0]->dims().size(), + platform::errors::InvalidArgument( + "concat: axis shoud < ins[0]->dims()!")); + auto place = ctx.GetPlace(); + out->mutable_data(place); + std::vector choose_idx; + int n = 0; + for (unsigned int i = 0; i < ins.size(); ++i) { + if (ins[i] && ins[i]->numel() > 0) { + choose_idx.push_back(i); + n++; + } + } + PADDLE_ENFORCE_LE(n, 8, platform::errors::InvalidArgument( + "XPU only surpport at most 8 tensors for now")); + PADDLE_ENFORCE_GT( + n, 0, platform::errors::InvalidArgument("No tensor need concat?")); + int h = 1; + int w_except_axis = 1; + for (int i = 0; i < axis; ++i) { + h *= (ins[choose_idx[0]]->dims())[i]; + } + for (int i = axis + 1; i < ins[0]->dims().size(); ++i) { + w_except_axis *= (ins[choose_idx[0]]->dims())[i]; + } + for (int i = 1; i < n; ++i) { + int hh = 1; + int ww = 1; + for (int j = 0; j < axis; ++j) { + hh *= (ins[choose_idx[i]]->dims())[j]; + } + for (int j = axis + 1; j < ins[i]->dims().size(); ++j) { + ww *= (ins[choose_idx[i]]->dims())[j]; + } + PADDLE_ENFORCE_EQ(hh, h, platform::errors::InvalidArgument( + "concat: h should be eual!")); + PADDLE_ENFORCE_EQ(ww, w_except_axis, + platform::errors::InvalidArgument( + "concat: w should be eual except for axis!")); + } + auto& dev_ctx = ctx.template device_context(); + std::unique_ptr in_w_host(new int[n]); + std::unique_ptr ptrs(new const float*[n]); + for (int i = 0; i < n; ++i) { + ptrs[i] = ins[choose_idx[i]]->data(); + in_w_host[i] = w_except_axis * (ins[choose_idx[i]]->dims())[axis]; + } + int r = + xpu::concat(dev_ctx.x_context(), h, (const int*)in_w_host.get(), + n, (const float**)ptrs.get(), out->data()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; +template +class ConcatGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + { + auto dx = outs; + auto x = ins; + for (size_t i = 0; i < dx.size(); ++i) { + if (dx[i] != nullptr) { + dx[i]->set_lod(x[i]->lod()); + } + } + } + PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument( + "The input should not be null.")); + auto axis = ctx.Attr("axis"); + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = 
GetDataFromTensor(axis_tensor)[0]; + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + // get output tensors whose names are not kEmptyVarName + std::vector outputs; + for (size_t j = 0; j < outs.size(); ++j) { + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { + outs[j]->mutable_data(ctx.GetPlace()); + outputs.push_back(outs[j]); + } else { + outputs.push_back(nullptr); + } + } + PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( + "concat_grad: axis should >= 0!")); + PADDLE_ENFORCE_LT(axis, out_grad->dims().size(), + platform::errors::InvalidArgument( + "concat_grad: axis should < ins[0]->dims()!")); + auto out_grad_stride = framework::stride_numel(out_grad->dims()); + int n = outputs.size(); + PADDLE_ENFORCE_LE(n, 16, + platform::errors::InvalidArgument( + "XPU only supports at most 16 tensors for now")); + int h = out_grad_stride[0] / out_grad_stride[axis]; + auto& dev_ctx = ctx.template device_context(); + std::unique_ptr in_w_host(new int[n]); + std::unique_ptr ptrs(new float*[n]); + for (int i = 0; i < n; ++i) { + auto out_stride = framework::stride_numel(outputs[i]->dims()); + ptrs[i] = outputs[i]->data(); + in_w_host[i] = out_stride[axis]; + } + int r = xpu::concat_grad(dev_ctx.x_context(), h, in_w_host.get(), n, + reinterpret_cast(ptrs.get()), + out_grad->data()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + concat, ops::ConcatXPUKernel); +REGISTER_OP_XPU_KERNEL( + concat_grad, + ops::ConcatGradXPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 680abc5ddffc3..4d409ed00a0b3 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,5 +1,7 @@ include(operators) -register_operators(DEPS naive_executor) +register_operators(EXCLUDES conditional_block_op DEPS naive_executor) + +cc_library(conditional_block_op SRCS conditional_block_op.cc DEPS executor) cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc) cc_library(conditional_block_op_helper SRCS conditional_block_op_helper.cc DEPS operator op_variant conditional_block_op) cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op) diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc index 00b86121c0dda..500e1ccea92c7 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc @@ -162,6 +162,32 @@ void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl( program, &fwd_ops, &bwd_ops); } +void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + const framework::ProgramDesc &program, int block_id, + const std::vector &all_ops) { + // If block_id is not 0, return immediately. + // This is because all conditional_block_ops and conditional_block_grad_ops + // in the whole program would be processed when block_id is 0 (i.e. + // when Executor::Run() or ParallelExecutor constructs).
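+  // Running the pass again for sub-blocks would only repeat the work already + // done by the block 0 pass.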
+ + // What's more, all conditional_block_ops and conditional_block_grad_ops + // must be processed when block_id is zero. If not, conditional_block_op + // may run first and erase variables used in conditional_block_grad_op, + // and in this moment, conditional_block_grad_ops may be not constructed yet. + if (block_id != 0) return; + + std::vector fwd_ops, bwd_ops; + for (auto *op : all_ops) { + if (op->Type() == "conditional_block") { + fwd_ops.emplace_back(op); + } else if (op->Type() == "conditional_block_grad") { + bwd_ops.emplace_back(op); + } + } + + PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl( + program, &fwd_ops, &bwd_ops); +} void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( const framework::ProgramDesc &program, diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h index abaaa8976065c..22eb2ece4b05b 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h @@ -33,6 +33,10 @@ void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( const framework::ProgramDesc &program, int block_id, const std::vector> &all_ops); +void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + const framework::ProgramDesc &program, int block_id, + const std::vector &all_ops); + void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( const framework::ProgramDesc &program, const std::vector &ifelse_ops, diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index fac8e24251033..55502eaf4e549 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/operator_kernel_configs.h" @@ -90,6 +91,61 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { return out; } +inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { + int max_algos = 0; +#if CUDNN_VERSION_MIN(7, 0, 1) + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + cudnn_handle, &max_algos)); +#endif + return max_algos; +} + +template +void ChooseAlgo(const std::vector& perf_results, + size_t workspace_byte, AlgoType* algo) { + VLOG(3) << "=========BwdFilterAlgo Perf result========="; + for (const auto& result : perf_results) { + auto math_type_str = "False"; + if (result.mathType == CUDNN_TENSOR_OP_MATH) { + math_type_str = "True"; + } + VLOG(3) << " algo: " << result.algo << ", TensorCore: " << math_type_str + << ", time: " << result.time << " ms" + << ", wksp = " << result.memory << ", status = " << result.status; + } + + for (size_t i = 0; i != perf_results.size(); ++i) { + const auto& result = perf_results[i]; + if (result.status == CUDNN_STATUS_SUCCESS && + (result.memory <= workspace_byte)) { + if ((result.mathType == CUDNN_TENSOR_OP_MATH) && + (i != perf_results.size() - 1)) { + const auto& next_result = perf_results[i + 1]; + if (next_result.status == CUDNN_STATUS_SUCCESS && + next_result.algo == result.algo && + next_result.memory == result.memory && + next_result.mathType != CUDNN_TENSOR_OP_MATH && + next_result.time < 1.01 * result.time) { + // Skip over this result- it's not really a Tensor Core algo. + // Because it is only 1% performance difference. 
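+          // A real Tensor Core algorithm would normally be clearly faster + // than its plain counterpart, not within 1% of it.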
+ // Prefer to choose the next equivalent non-Tensor Core algo. + continue; + } + } + *algo = result.algo; + auto math_type_str = "0"; + if (result.mathType == CUDNN_TENSOR_OP_MATH) { + math_type_str = "1"; + } + VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str + << ", time: " << result.time << " ms" + << ", wksp = " << result.memory << ", status = " << result.status; + return; + } + } +} + using framework::ConvSearchCache; struct ConvArgs { @@ -401,7 +457,6 @@ struct SearchAlgorithm { bool deterministic, const framework::ExecutionContext& ctx) { auto dtype = platform::CudnnDataType::type; - bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF); size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; bool has_got_workspace_size = true; @@ -422,7 +477,7 @@ struct SearchAlgorithm { #endif algo_t algo; - if (!exhaustive && !deterministic) { + if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; int perf_count; @@ -462,34 +517,57 @@ struct SearchAlgorithm { VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + if (dtype != CUDNN_DATA_HALF) { + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, + static_cast(args.cudnn_dtype), [&]() { + int returned_algo_count; + std::array perf_stat; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload:: + cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, args.idesc.desc(), args.x->data(), + args.odesc.desc(), args.o->data(), + args.cdesc.desc(), args.wdesc.desc(), + const_cast(args.w->data()), + kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, + perf_stat.data(), cudnn_workspace_ptr, + workspace_size_limit)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, + workspace_size_limit); + + VLOG(3) + << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time + << " " << stat.memory; + } + return perf_stat[0].algo; + }); + } else { + auto max_algos = MaxBwdFilterAlgos(args.handle); + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, + static_cast(args.cudnn_dtype), [&]() { + algo_t chosen_algo; + std::vector perf_results(max_algos); + int actual_algos = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload:: - cudnnFindConvolutionBackwardFilterAlgorithmEx( - args.handle, args.idesc.desc(), args.x->data(), - args.odesc.desc(), args.o->data(), + cudnnFindConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), - const_cast(args.w->data()), - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << 
stat.time - << " " << stat.memory; - } - return perf_stat[0].algo; - }); + perf_results.size(), &actual_algos, + perf_results.data())); + perf_results.resize(actual_algos); + ChooseAlgo(perf_results, workspace_size_limit, + &chosen_algo); + return chosen_algo; + }); + } } VLOG(3) << "choose algo " << algo; return algo; diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 662fac9e77e02..364e3ab8d26c3 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/layout_utils.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" @@ -138,102 +139,6 @@ inline bool IsExpand(const std::vector& filter_dim, return !(filter_1 && strides_1 && padding_0 && dilation_1); } -template -inline void ResizeToChannelFirst(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - - } else if (dim == 2) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - } -} - -template -inline void ResizeToChannelLast(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[3]; - in_dims_vec[3] = input->dims()[4]; - in_dims_vec[4] = input->dims()[1]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - - } else if (dim == 2) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = framework::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[3]; - in_dims_vec[3] = input->dims()[1]; - transformed_input->Resize(framework::make_ddim(in_dims_vec)); - transformed_input->mutable_data(context.GetPlace()); - } -} - -template -inline void TransToChannelFirst(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - math::Transpose trans5; - trans5(dev_ctx, *input, transformed_input, axis); - - } else if (dim == 2) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 3, 1, 2}; - math::Transpose trans4; - trans4(dev_ctx, *input, transformed_input, axis); - } -} - -template -inline void TransToChannelLast(const 
framework::ExecutionContext& context, - const Tensor* input, Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - math::Transpose trans5; - trans5(dev_ctx, *input, transformed_input, axis); - - } else if (dim == 2) { - auto& dev_ctx = context.template device_context(); - std::vector axis{0, 2, 3, 1}; - math::Transpose trans4; - trans4(dev_ctx, *input, transformed_input, axis); - } -} // Define Op classes in .h file so that other conv // operator implementations can reuse the code. class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc new file mode 100644 index 0000000000000..65ed34e8a5e4c --- /dev/null +++ b/paddle/fluid/operators/conv_op_xpu.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/conv_op.h" +#include +#include +#include +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#ifdef PADDLE_WITH_XPU +namespace paddle { +namespace operators { + +template +class GemmConvXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. 
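+    // The assignment copies only the tensor metadata; the underlying + // allocation stays shared, so resizing the local copy leaves the + // variable in the Scope intact.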
+ Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + // Tensor* max_input = context.Output("MaxInput"); + // Tensor* max_filter = context.Output("MaxFilter"); + // max_input->mutable_data(context.GetPlace()); + // max_filter->mutable_data(context.GetPlace()); + output->mutable_data(context.GetPlace()); + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + const int batch_size = static_cast(input->dims()[0]); + const int img_c = static_cast(input->dims()[1]); + const int img_h = static_cast(input->dims()[2]); + const int img_w = static_cast(input->dims()[3]); + const int f = static_cast(filter.dims()[0]); + const int win_h = static_cast(filter.dims()[2]); + const int win_w = static_cast(filter.dims()[3]); + PADDLE_ENFORCE_EQ( + dilations[0] == 1 && dilations[1] == 1, true, + platform::errors::InvalidArgument("XPU only support dilation == 1.")); + auto& dev_ctx = context.template device_context(); + // PADDLE_ENFORCE_EQ( + // xpu::findmax(dev_ctx.x_context(), input->data(), input->numel(), + // max_input->data()) == xpu::Error_t::SUCCESS, + // true, platform::errors::InvalidArgument( + // "XPU conv kernel error,can not finde max_input,please " + // "check whether Baidu Kunlun " + // "Card is properly installed.")); + // PADDLE_ENFORCE_EQ( + // xpu::findmax(dev_ctx.x_context(), filter.data(), filter.numel(), + // max_filter->data()) == xpu::Error_t::SUCCESS, + // true, platform::errors::InvalidArgument( + // "XPU conv kernel error,can not find max_filter,please " + // "check whether Baidu Kunlun " + // "Card is properly installed.")); + if (groups == 1) { + int r = xpu::conv2d_forward_int16( + dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w, + strides[0], strides[1], paddings[0], paddings[1], dilations[0], + dilations[1], groups, input->data(), filter.data(), + output->data(), nullptr, nullptr, xpu::Activation_t::LINEAR, + nullptr, nullptr); + // max_input->data(), max_filter->data()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU conv kernel return wrong value[%d], " + "please check whether Baidu Kunlun Card " + "is properly installed.", + r)); + } else { + int r = xpu::conv2d_int16_with_group( + dev_ctx.x_context(), input->data(), filter.data(), + output->data(), batch_size, img_c, img_h, img_w, f, win_h, + win_w, groups, strides[0], strides[1], paddings[0], paddings[1], + nullptr, nullptr); + // max_input->data(), max_filter->data()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU conv kernel return wrong value[%d], " + "please check whether Baidu Kunlun Card " + "is properly installed.", + r)); + } + } +}; +template +class GemmConvGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // const Tensor* max_input = context.Input("MaxInput"); + // const Tensor* max_filter = context.Input("MaxFilter"); + // Tensor* max_output_grad = context.Output("MaxOutputGrad"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + // The filter and filter_grad will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids 
modifying the variable in the Scope. + Tensor filter = *context.Input("Filter"); + if (!input_grad && !filter_grad) return; + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + const int batch_size = static_cast(input->dims()[0]); + PADDLE_ENFORCE_EQ(groups == 1, true, platform::errors::InvalidArgument( + "XPU only support groups == 1.")); + PADDLE_ENFORCE_EQ( + dilations[0] == 1 && dilations[1] == 1, true, + platform::errors::InvalidArgument("XPU only support dilation == 1.")); + const int img_c = static_cast(input->dims()[1]); + const int img_h = static_cast(input->dims()[2]); + const int img_w = static_cast(input->dims()[3]); + const int f = static_cast(filter.dims()[0]); + const int win_h = static_cast(filter.dims()[2]); + const int win_w = static_cast(filter.dims()[3]); + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + } + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + } + auto& dev_ctx = context.template device_context(); + // max_output_grad->Resize({4}); + // max_output_grad->mutable_data(context.GetPlace()); + // PADDLE_ENFORCE_EQ( + // xpu::findmax(dev_ctx.x_context(), output_grad->data(), + // output_grad->numel(), + // max_output_grad->data()) == xpu::Error_t::SUCCESS, + // true, + // platform::errors::External( + // "XPU conv kernel error, can not find max_output_grad, please + // check " + // "whether Baidu Kunlun Card is " + // "properly installed.")); + if (input_grad) { + int r = xpu::conv2d_backward_int16( + dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w, + strides[0], strides[1], paddings[0], paddings[1], dilations[0], + dilations[1], groups, output_grad->data(), + filter.data(), input_grad->data(), nullptr, nullptr); + // max_output_grad->data(), max_filter->data()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU conv kernel return wrong value[%d], " + "please check whether Baidu Kunlun Card " + "is properly installed.", + r)); + } + if (filter_grad) { + int r = xpu::conv2d_backward_weight_int16( + dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w, + strides[0], strides[1], paddings[0], paddings[1], dilations[0], + dilations[1], groups, output_grad->data(), + input->data(), filter_grad->data(), nullptr, nullptr); + // max_output_grad->data(), max_input->data()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU conv kernel return wrong value[%d], " + "please check whether Baidu Kunlun Card " + "is properly installed.", + r)); + } + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +// TODO(xingzhaolong): neon kernel for mobile +REGISTER_OP_XPU_KERNEL( + depthwise_conv2d, + ops::GemmConvXPUKernel); +REGISTER_OP_XPU_KERNEL( + conv2d, ops::GemmConvXPUKernel); +REGISTER_OP_XPU_KERNEL( + conv2d_grad, + ops::GemmConvGradXPUKernel); +#endif diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index bea7d9c02ca7d..e935a3c0aac13 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cudnn_lstm_cache.h" #include "paddle/fluid/operators/math/math_function.h" @@ -156,6 +157,21 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { bool is_test = ctx.Attr("is_test"); int seed = ctx.Attr("seed"); + if (!is_test) { + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed == 0) { + // If perform `manual_seed` in python and inner seed is not specified + // (equals 0), use global generator generated seed. + seed = static_cast(gen_cuda->Random64()); + } else if (seed == 0) { + // use random generated seed + std::random_device rd; + seed = rd(); + } // else use `ctx.Attr("seed")` specified seed + } + bool has_seq_length = ctx.HasInput("SequenceLength"); std::vector SequenceLength; if (has_seq_length) { @@ -194,13 +210,25 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { if (!continuous) { LOG_FIRST_N(WARNING, 2) - << "If the memory space of the Input WeightList is not " - "continuous, less efficient calculation will be " - "called. Please call coalesce_tensor op to make the " - "input memory continuous."; + << "If the memory space of the Input WeightList is not continuous, " + "less efficient calculation will be called. Please call " + "flatten_parameters() to make the input memory continuous."; weight_whole.mutable_data({weight_numel}, place); weight_to_tensor(place, stream, weight_list, &weight_whole); w_data = weight_whole.data(); + if (is_test) { // maybe also reset small weights' ptr for training + int offset = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + size_t len = weight_list[i]->numel(); + auto dim = weight_list[i]->dims(); + const_cast(weight_list[i]) + ->ShareDataWith( + weight_whole.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + } } else { w_data = const_cast(weight_list[0]->data()); } @@ -226,12 +254,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { LSTMInferece(has_seq_length, handle, seq_length, &rnn, x_data, init_h_data, init_c_data, w_data, out_data, last_h_data, last_c_data, &workspace_data_, workspace_size); - if (!w_initialized && ctx.HasInput("W") && ctx.HasInput("WeightList")) { - auto *W = const_cast(ctx.Input("W")); - auto weight_list = ctx.MultiInput("WeightList"); - W->mutable_data({weight_numel}, place); - weight_to_tensor(place, stream, weight_list, W); - } } else { if (!has_seq_length) { // for train diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu index cff0a101e03d5..85cbf444a564e 100644 --- a/paddle/fluid/operators/cumsum_op.cu +++ b/paddle/fluid/operators/cumsum_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/gpu_launch_param_config.h" +#include "paddle/fluid/platform/gpu_launch_config.h" using Tensor = paddle::framework::Tensor; using LoDTensor = paddle::framework::LoDTensor; diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 9ebdb369feb8c..e29b057ed57a7 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -41,7 +41,8 @@ struct StridedMemcpyFunctor { memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream()); #else - PADDLE_THROW("Paddle is not compiled with GPU"); + PADDLE_THROW( + platform::errors::Unavailable("Paddle is not compiled with GPU.")); #endif } } @@ -64,7 +65,8 @@ struct StridedMemcpyFunctor { memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], cuda_ctx.stream()); #else - PADDLE_THROW("Paddle is not compiled with GPU"); + PADDLE_THROW( + platform::errors::Unavailable("Paddle is not compiled with GPU.")); #endif } } diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 9fdf39a7a0b8a..01edf7b41b2a8 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -26,8 +26,9 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, T* scores, const float conf_thresh, const int* anchors, const int n, const int h, const int w, const int an_num, const int class_num, - const int box_num, int input_size, bool clip_bbox, - const float scale, const float bias) { + const int box_num, int input_size_h, + int input_size_w, bool clip_bbox, const float scale, + const float bias) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; T box[4]; @@ -51,8 +52,9 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); - GetYoloBox(box, input, anchors, l, k, j, h, input_size, box_idx, - grid_num, img_height, img_width, scale, bias); + GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, + input_size_w, box_idx, grid_num, img_height, img_width, scale, + bias); box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); @@ -86,7 +88,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { const int w = input->dims()[3]; const int box_num = boxes->dims()[1]; const int an_num = anchors.size() / 2; - int input_size = downsample_ratio * h; + int input_size_h = downsample_ratio * h; + int input_size_w = downsample_ratio * w; auto& dev_ctx = ctx.cuda_device_context(); int bytes = sizeof(int) * anchors.size(); @@ -111,8 +114,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { KeYoloBoxFw<<>>( input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, n, h, w, an_num, class_num, box_num, input_size, - clip_bbox, scale, bias); + anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, + input_size_w, clip_bbox, scale, bias); } }; diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 388467d37ba64..1cfef142bca73 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -27,17 +27,18 @@ HOSTDEVICE inline T sigmoid(T x) { template HOSTDEVICE inline void 
GetYoloBox(T* box, const T* x, const int* anchors, int i, - int j, int an_idx, int grid_size, - int input_size, int index, int stride, + int j, int an_idx, int grid_size_h, + int grid_size_w, int input_size_h, + int input_size_w, int index, int stride, int img_height, int img_width, float scale, float bias) { - box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size; + box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size_w; box[1] = (j + sigmoid(x[index + stride]) * scale + bias) * img_height / - grid_size; + grid_size_h; box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size; + input_size_w; box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size; + img_height / input_size_h; } HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, @@ -99,7 +100,8 @@ class YoloBoxKernel : public framework::OpKernel { const int w = input->dims()[3]; const int box_num = boxes->dims()[1]; const int an_num = anchors.size() / 2; - int input_size = downsample_ratio * h; + int input_size_h = downsample_ratio * h; + int input_size_w = downsample_ratio * w; const int stride = h * w; const int an_stride = (class_num + 5) * stride; @@ -134,8 +136,9 @@ class YoloBoxKernel : public framework::OpKernel { int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); - GetYoloBox(box, input_data, anchors_data, l, k, j, h, input_size, - box_idx, stride, img_height, img_width, scale, bias); + GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, + input_size_h, input_size_w, box_idx, stride, + img_height, img_width, scale, bias); box_idx = (i * box_num + j * stride + k * w + l) * 4; CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, clip_bbox); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 67dc284334568..ae78517182a22 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -32,16 +32,28 @@ class DiagV2Op : public framework::OperatorWithKernel { auto offset = ctx->Attrs().Get("offset"); if (x_dims.size() == 1UL) { - int64_t size = x_dims[0] + std::abs(offset); - ctx->SetOutputDim("Out", {size, size}); + int64_t size_ = x_dims[0] + std::abs(offset); + ctx->SetOutputDim("Out", {size_, size_}); } else if (x_dims.size() == 2UL) { - int64_t size; + int64_t size_ = 0; if (offset >= 0) { - size = std::min(x_dims[0], x_dims[1] - offset); + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] < x_dims[1] - offset) { + size_ = x_dims[0]; + } else { + size_ = x_dims[1] - offset; + } } else { - size = std::min(x_dims[0] + offset, x_dims[1]); + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] + offset < x_dims[1]) { + size_ = x_dims[0] + offset; + } else { + size_ = x_dims[1]; + } } - ctx->SetOutputDim("Out", {size}); + ctx->SetOutputDim("Out", {size_}); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The input tensor X's dimensions of DiagV2Op should be either 1 or " diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index e584e02508815..47fbb42fd6a81 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -56,7 +56,7 @@ endif() cc_test(rpc_server_test SRCS 
rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op scale_op) + DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op checkpoint_notify_op scale_op ) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index cb93b8d910a23..b2a26089c8689 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -448,6 +448,7 @@ VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, const std::string& dirname, const std::string& varname, + const int mode, int64_t time_out) { sendrecv::VariableMessage req; req.set_varname(varname); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 2ea90d560f568..91f94b4c9d5a3 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -103,7 +103,7 @@ class BRPCClient : public RPCClient { VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dirname, - const std::string& varname, + const std::string& varname, const int mode, int64_t time_out = FLAGS_rpc_deadline) override; bool Wait() override; diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 92959cb22ed1b..8fa6673e2a2aa 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -466,41 +466,34 @@ void GeoCommunicator::Send(const std::vector &var_names, const std::vector &var_tables, const framework::Scope &scope) { waiting_ = false; + PADDLE_ENFORCE_EQ( + var_tables.size(), 1, + platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); + + auto table_name = var_tables[0]; + if (table_name == STEP_COUNTER) return; auto before_send = GetCurrentUS(); - std::unordered_map> ids_table; + size_t splited_var_nums = + send_varname_to_ctx_[table_name].splited_varnames.size(); - for (size_t i = 0; i < var_tables.size(); i++) { - auto table_name = var_tables[i]; - if (table_name == STEP_COUNTER) { - continue; - } else { - size_t splited_var_nums = - send_varname_to_ctx_[table_name].splited_varnames.size(); - - for (size_t j = 0; j < splited_var_nums; j++) { - if (ids_table.find( - send_varname_to_ctx_[table_name].splited_varnames[j]) == - ids_table.end()) { - ids_table.insert(std::pair>( - send_varname_to_ctx_[table_name].splited_varnames[j], - std::unordered_set())); - } - } + std::unordered_map> ids_table; - auto *var = scope.FindVar(var_names[i]); - auto var_tensor = var->Get(); - int element_number = var_tensor.numel(); - const int64_t *var_mutable_data = var_tensor.data(); + for (size_t j = 0; j < splited_var_nums; j++) { + ids_table.insert(std::pair>( + send_varname_to_ctx_[table_name].splited_varnames[j], + std::unordered_set())); + } + auto *var = scope.FindVar(var_names[0]); + auto &rows = var->Get().rows(); - // insert ids which has not been record - for (int j = 0; j < element_number; j++) { - auto ep_idx = var_mutable_data[j] % splited_var_nums; - 
ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) - .insert(var_mutable_data[j]); - } - } + // insert ids which has not been record + for (size_t j = 0; j < rows.size(); j++) { + auto ep_idx = rows[j] % splited_var_nums; + ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) + .insert(rows[j]); } + auto before_push = GetCurrentUS(); for (auto &iter : ids_table) { auto &key = iter.first; @@ -512,8 +505,8 @@ void GeoCommunicator::Send(const std::vector &var_names, << "'s queue"; } auto after_send = GetCurrentUS(); - VLOG(3) << "run send_op finish. using " << (before_push - before_send) << "; " - << (after_send - before_push); + VLOG(3) << "run send " << table_name << " op finish. using " + << (before_push - before_send) << "; " << (after_send - before_push); } void GeoCommunicator::MainThread() { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index 9fd828bfa55c2..0320ef6595deb 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -422,6 +422,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, const std::string& dirname, const std::string& varname, + const int mode, int64_t time_out) { const auto ch = GetChannel(ep); @@ -435,6 +436,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(varname); + req.set_table_name(std::to_string(mode)); req.set_out_varname(dirname); platform::RecordRPCEvent record_event(method); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index 22ca74a67e72b..7b269f4d80c60 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -258,7 +258,7 @@ class GRPCClient : public RPCClient { VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dirname, - const std::string& varname, + const std::string& varname, const int mode, int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncDistributeNotify( diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index a1cbf7db7e275..912520d782d75 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -106,9 +106,8 @@ class RequestSend final : public RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse( - request_handler->scope(), request_handler->dev_ctx(), - request_handler->distributed_mode())); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), true)); int method_id = static_cast(distributed::GrpcMethod::kSendVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, @@ -399,12 +398,13 @@ class RequestCheckpointNotify final : public RequestBase { std::string checkpoint_notify = request_->Varname(); std::string checkpoint_dir = request_->OutVarname(); int trainer_id = request_->GetTrainerId(); + std::string table_name = request_->TableName(); VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify << ", dir: " << 
checkpoint_dir; request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir); + trainer_id, checkpoint_dir, table_name); Finish(reply_, &responder_); } @@ -420,9 +420,8 @@ class RequestNotify final : public RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse( - request_handler->scope(), request_handler->dev_ctx(), - request_handler->distributed_mode())); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), true)); int method_id = static_cast(distributed::GrpcMethod::kRequestNotify); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, @@ -455,9 +454,8 @@ class RequestSendAndRecv final : public RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse( - request_handler->scope(), request_handler->dev_ctx(), - request_handler->distributed_mode())); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), true)); int method_id = static_cast(distributed::GrpcMethod::kRequestSendAndRecv); diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h index b4388c0002a78..52b76b7bfe7d6 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ b/paddle/fluid/operators/distributed/large_scale_kv.h @@ -245,6 +245,7 @@ struct VALUE { std::vector names_; int count_; + bool seen_after_last_save_; int unseen_days_; bool is_entry_; std::vector> values_; @@ -321,6 +322,7 @@ class ValueBlock { auto value = new VALUE(value_names_); value->set(values); + value->seen_after_last_save_ = true; value->count_ = count; values_[id] = value; } @@ -589,9 +591,9 @@ class SparseVariable { } } - void Save(const std::string &dirname) { + void Save(const std::string &dirname, const int mode = 0) { rwlock_->WRLock(); - VLOG(1) << "save " << meta_.name << " in dir: " << dirname << " begin"; + VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " begin"; MkDirRecursively(dirname.c_str()); @@ -600,22 +602,15 @@ class SparseVariable { auto filename = string::Sprintf("%s/%s", dirname, value_name); filenames.push_back(filename); } - SaveToSelectedRows(filenames, meta_.value_names); - // // save sparse to text - // std::vector txt_filenames; - // for (auto &value_name : meta_.value_names) { - // auto filename = string::Sprintf("%s/%s.txt", dirname, value_name); - // txt_filenames.push_back(filename); - // } - // SaveToText(txt_filenames, meta_.value_names); - - VLOG(1) << "save " << meta_.name << " in dir: " << dirname << " done"; + SaveToSelectedRows(filenames, meta_.value_names, mode); + VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " done"; rwlock_->UNLock(); } void SaveToSelectedRows(const std::vector &filenames, - const std::vector &valuenames) { + const std::vector &valuenames, + const int mode) { for (auto &value_name : valuenames) { auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), value_name); @@ -629,14 +624,34 @@ class SparseVariable { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - int64_t ids_num = 0; + std::vector ids; + for (auto &block : shard_blocks_) { - ids_num += 
block->values_.size(); + for (auto value : block->values_) { + if (mode == 0) { + ids.push_back(value.first); + } else { + bool id_need_save = false; + // save all params + if (mode == 1) { + id_need_save = true; + } else { + id_need_save = value.second->seen_after_last_save_; + } + + if (id_need_save) { + ids.push_back(value.first); + } + value.second->seen_after_last_save_ = false; + } + } } + VLOG(3) << "save " << ids.size() << " feasigns for " << meta_.name + << " with mode: " << mode; + std::vector> variables; std::vector tensors; - std::vector ids; std::vector dims; for (int i = 0; i < static_cast(filenames.size()); i++) { @@ -645,7 +660,7 @@ class SparseVariable { auto *slr = var->GetMutable(); auto *src_t = slr->mutable_value(); - src_t->Resize({ids_num, dim}); + src_t->Resize({static_cast(ids.size()), dim}); auto *value = src_t->mutable_data(place); dims.push_back(dim); @@ -653,20 +668,17 @@ class SparseVariable { tensors.push_back(value); } - int64_t offset = 0; - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - ids.push_back(value.first); - std::vector *> vss = value.second->get(valuenames); - - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::memcpy(tensors[i] + offset * dims[i], vs->data(), - sizeof(float) * dims[i]); - } + std::vector *>> values; + Get(ids, valuenames, &values); - offset += 1; + int64_t offset = 0; + for (auto &vss : values) { + for (int i = 0; i < static_cast(vss.size()); i++) { + auto &vs = vss[i]; + std::memcpy(tensors[i] + offset * dims[i], vs->data(), + sizeof(float) * dims[i]); } + offset += 1; } for (auto &var : variables) { diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index 51b13bc2c569d..d5d3c9c3c7c48 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -52,22 +52,25 @@ void RecvSparseLodTensor(const CommContext &rpc_ctx, std::unique_ptr local_scope = scope.NewTmpScope(); std::vector tensors; std::vector rets; + std::vector recv_varnames; for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *local_var = local_scope->Var(recv_var_name); VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; + local_scope->Var(recv_var_name); // sparse param in recv_scope is LoDTensor rets.push_back(rpc_client->AsyncGetVarNoBarrier( rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, recv_var_name)); - - const auto *value = local_var->Get().data(); - tensors.push_back(value); + recv_varnames.push_back(recv_var_name); } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( "internal error in RPCClient")); + auto &recv_var_name = recv_varnames[i]; + auto *local_var = local_scope->FindVar(recv_var_name); + const auto *value = local_var->Get().data(); + tensors.push_back(value); } auto *merged_var = scope.FindVar(rpc_ctx.var_name); @@ -83,8 +86,10 @@ void RecvSparseLodTensor(const CommContext &rpc_ctx, height += splited_var->Get().dims()[0]; } - PADDLE_ENFORCE_EQ(merged_var->Get().dims()[0], height, - "recved var must has same dims with local var"); + PADDLE_ENFORCE_EQ( + merged_var->Get().dims()[0], height, + platform::errors::InvalidArgument( + "Received variable must has same dimension with local variable.")); auto *merged_t = merged_var->GetMutable(); auto *merged_d = merged_t->mutable_data(cpu_place); diff 
--git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 0d67fc0021a53..8c4f2ef57a32c 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -274,13 +274,13 @@ bool RequestCheckpointHandler::Handle(const std::string &varname, const int trainer_id, const std::string &out_var_name, const std::string &table_name) { - VLOG(4) << "receive save var " << varname << " with path " << out_var_name; + VLOG(4) << "receive save var " << varname << " with path " << out_var_name + << " mode " << table_name; + + int mode = std::stoi(table_name); auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Save(out_var_name); - // auto checkpoint_op = BuildCheckpointOp(varname, out_var_name); - // paddle::platform::CPUPlace cpu_place; - // checkpoint_op->Run(*scope_, cpu_place); + ins->Get(varname)->Save(out_var_name, mode); return true; } diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 6a6a795a46b52..2c756a6f71ff9 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -87,7 +87,8 @@ class RPCClient { virtual VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dirname, - const std::string& varname, int64_t time_out = FLAGS_rpc_deadline) = 0; + const std::string& varname, const int mode, + int64_t time_out = FLAGS_rpc_deadline) = 0; virtual VarHandlePtr AsyncDistributeNotify( const std::string& ep, const platform::DeviceContext& ctx, diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index b6d4d59485520..f59285400033d 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include // NOLINT #include #include #include // NOLINT @@ -26,6 +27,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/heart_beat_monitor.h" +#include "paddle/fluid/operators/distributed/large_scale_kv.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_server.h" @@ -35,6 +37,7 @@ namespace platform = paddle::platform; namespace distributed = paddle::operators::distributed; USE_NO_KERNEL_OP(lookup_sparse_table_read); +USE_NO_KERNEL_OP(checkpoint_notify); USE_OP(scale); std::unique_ptr g_rpc_service; @@ -122,7 +125,7 @@ void StartServer(const std::string& rpc_name) { g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - distributed::HeartBeatMonitor::Init(2, true, "w@grad"); + // distributed::HeartBeatMonitor::Init(1, true, "w@grad"); g_req_handler->SetRPCServer(g_rpc_service.get()); @@ -232,3 +235,110 @@ TEST(SENDANDRECV, CPU) { g_rpc_service.reset(nullptr); g_req_handler.reset(nullptr); } + +void StartCheckpointServer(const std::string& rpc_name) { + framework::ProgramDesc program; + framework::Scope scope; + platform::CPUPlace place; + framework::Executor exe(place); + platform::CPUDeviceContext ctx(place); + + std::vector metas; + + auto meta = distributed::SparseMeta(); + meta.name = "embedding.block0"; + meta.value_names = {"Param"}; + meta.value_dims = {64}; + meta.mode = distributed::Mode::training; + meta.grad_name = "embedding@Grad"; + meta.cached_varnames = {"kSparseIds"}; + meta.initializer_attrs = {"fill_constant&1.0"}; + meta.entry = "none"; + + metas.push_back(meta); + distributed::LargeScaleKV::Init(metas); + + auto* ins = distributed::LargeScaleKV::GetInstance(); + ins->Get("embedding.block0")->Init({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + + std::unordered_map> + prefetch_var_name_to_prepared; + + g_req_handler->SetProgram(&program); + g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); + g_req_handler->SetDevCtx(&ctx); + g_req_handler->SetScope(&scope); + g_req_handler->SetExecutor(&exe); + + g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); + + g_req_handler->SetRPCServer(g_rpc_service.get()); + + std::thread server_thread( + std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); + + server_thread.join(); +} + +TEST(LARGE_SCALE_CHECKPOINT, CPU) { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + g_req_handler.reset(new distributed::RequestCheckpointHandler( + distributed::DistributedMode::kAsync)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); + + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(0); + + PADDLE_ENFORCE_NE(client, nullptr, + platform::errors::InvalidArgument( + "Client Start Fail, Check Your Code & Env")); + + std::thread server_thread(StartCheckpointServer, + distributed::kRequestCheckpoint); + g_rpc_service->WaitServerReady(); + + int port = g_rpc_service->GetSelectedPort(); + std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); + + auto save_path = + paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/base", + "embedding", "embedding.block0"); + int mode = 0; + client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); + client->Wait(); + + save_path = + paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/delta", + "embedding", "embedding.block0"); + mode = 1; + client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); + 
client->Wait(); + + paddle::framework::AttributeMap attrs; + + std::vector eps = {ep}; + attrs["endpoints"] = eps; + attrs["dirname"] = std::string("/tmp/large_scale_table/delta1"); + attrs["varname"] = std::string("embedding"); + attrs["mode"] = 2; + std::vector slices = {"embedding.block0"}; + attrs["slice_varnames"] = slices; + std::vector remotes = {"embedding.block0"}; + attrs["remote_varnames"] = remotes; + + auto ops = + framework::OpRegistry::CreateOp("checkpoint_notify", {}, {}, attrs, true); + ops->Run(scope, place); + + g_rpc_service->ShutDown(); + server_thread.join(); + LOG(INFO) << "begin reset"; + g_rpc_service.reset(nullptr); + g_req_handler.reset(nullptr); +} diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc index abc8d912840db..051d9d65c7714 100644 --- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc @@ -42,8 +42,12 @@ class CheckpointNotifyOp : public framework::OperatorBase { Attr>("endpoints"); std::string dirname = Attr("dirname"); std::string varname = Attr("varname"); - auto is_slice = Attr("is_slice"); - VLOG(1) << "is_slice: " << is_slice; + auto mode = Attr("mode"); + + if (mode != 0 && mode != 1 && mode != 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "mode expected in [0/1/2], but got %d", mode)); + } std::vector slice_varnames = Attr>("slice_varnames"); @@ -58,11 +62,12 @@ class CheckpointNotifyOp : public framework::OperatorBase { auto save_path = string::Sprintf("%s/%s/%s", dirname, varname, slice_varnames[i]); - rpc_client->AsyncCheckpointNotify(epmap[i], save_path, - remote_varnames[i]); + rpc_client->AsyncCheckpointNotify(epmap[i], save_path, remote_varnames[i], + mode); VLOG(3) << "checkpoint notify sending with path: " << save_path - << " and var:" << slice_varnames[i] << " to " << epmap[i]; + << " and var:" << slice_varnames[i] << " to " << epmap[i] + << " with mode " << mode; } PADDLE_ENFORCE_EQ( rpc_client->Wait(), true, @@ -85,9 +90,8 @@ class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker { "slice_varnames", "(string vector) the slice vars need to be saved"); AddAttr>( "remote_varnames", "(string vector) the slice vars need to be saved"); - AddAttr( - "is_slice", - "is_slice=True means the var has been slice by parameter server"); + AddAttr("mode", "mode=0/1/2 means nothing/save base/save delta") + .SetDefault(0); AddComment(R"DOC( CheckpointNotify operator This operator will send lookup table and it's checkpoint direcoty to listen_and_serve op at diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc index 80b322fbe6773..2e54bb3961cd2 100644 --- a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc @@ -108,7 +108,8 @@ void FlListenAndServOp::RunSyncLoop(framework::Executor *executor, auto optimize_blocks = Attr>(kOptimizeBlocks); PADDLE_ENFORCE_GE(num_blocks, 2, - "server program should have at least 2 blocks"); + platform::errors::InvalidArgument( + "server program should have at least 2 blocks")); // Prepare all the server block std::vector optimize_blocks_list; @@ -192,7 +193,8 @@ void FlListenAndServOp::RunImpl(const framework::Scope &scope, auto fan_in = Attr("Fanin"); auto inputs = Inputs("X"); - PADDLE_ENFORCE_EQ(!rpc_service_, true, "rpc_service_ must null"); + 
PADDLE_ENFORCE_EQ(!rpc_service_, true, platform::errors::InvalidArgument( + "rpc_service_ must null")); std::string endpoint = Attr("endpoint"); VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in @@ -215,7 +217,8 @@ void FlListenAndServOp::RunImpl(const framework::Scope &scope, Attr>(kOptimizeBlocks); PADDLE_ENFORCE_GE( optimize_blocks.size(), 1, - "optimize blocks should be 1 at least on the pserver side."); + platform::errors::InvalidArgument( + "optimize blocks should be 1 at least on the pserver side.")); auto *program = optimize_blocks[0]->Program(); framework::Executor executor(dev_place); diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc index e53ce8cc67c08..b8328b88da7d1 100644 --- a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc +++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc @@ -23,22 +23,27 @@ class LargeScaleFuseAdamOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of LargeScaleFuseAdamOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("Grad"), + platform::errors::InvalidArgument( + "Input(Grad) of LargeScaleFuseAdamOp should not be null.")); PADDLE_ENFORCE( ctx->HasInput("LearningRate"), - "Input(LearningRate) of LargeScaleFuseAdamOp should not be null."); + platform::errors::InvalidArgument( + "Input(LearningRate) of LargeScaleFuseAdamOp should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function."); + platform::errors::InvalidArgument( + "Maybe the Input variable LearningRate has not " + "been initialized. 
You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 element"); + platform::errors::InvalidArgument( + "Learning rate should have 1 element")); } protected: diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc index 010658b5280d7..8794b87f3ff40 100644 --- a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc +++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc @@ -23,22 +23,27 @@ class LargeScaleFuseSGDOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of LargeScaleFuseSGDOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("Grad"), + platform::errors::InvalidArgument( + "Input(Grad) of LargeScaleFuseSGDOp should not be null.")); PADDLE_ENFORCE( ctx->HasInput("LearningRate"), - "Input(LearningRate) of LargeScaleFuseSGDOp should not be null."); + platform::errors::InvalidArgument( + "Input(LearningRate) of LargeScaleFuseSGDOp should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function."); + platform::errors::InvalidArgument( + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 element"); + platform::errors::InvalidArgument( + "Learning rate should have 1 element")); } protected: diff --git a/paddle/fluid/operators/distributed_ops/sparse_tensor_load_op.cc b/paddle/fluid/operators/distributed_ops/sparse_tensor_load_op.cc new file mode 100644 index 0000000000000..6cd01089f9bc2 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/sparse_tensor_load_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +struct DeserializedDataFunctor { + DeserializedDataFunctor(void **buf, Tensor *tensor, + const platform::Place &place) + : buf_(buf), tensor_(tensor), place_(place) {} + + template + void apply() { + *buf_ = tensor_->mutable_data(place_); + } + + void **buf_; + Tensor *tensor_; + platform::Place place_; +}; + +template +class SparseTensorLoadKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + auto filename = ctx.Attr("file_path"); + std::ifstream fin(filename, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fin), true, + platform::errors::Unavailable( + "Load operator fails to open file %s, please check " + "whether the model file is complete or damaged.", + filename)); + auto name = ctx.OutputNames("Out")[0]; + VLOG(4) << "Sparse Load Var name: " << name; + auto *out_var = ctx.OutputVar("Out"); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::InvalidArgument( + "The variable %s to be loaded cannot be found.", name)); + PADDLE_ENFORCE_EQ(out_var->IsType(), true, + platform::errors::InvalidArgument( + "SparseLoad OP only supports LoDTensor")); + LoadLodTensor(fin, place, out_var, ctx); + } + + void LoadLodTensor(std::istream &is, const platform::Place &place, + paddle::framework::Variable *var, + const paddle::framework::ExecutionContext &ctx) const { + auto *tensor = var->GetMutable(); + + auto node_index = ctx.Attr("node_index"); + auto node_num = ctx.Attr("node_num"); + auto shape = ctx.Attr>("shape"); + VLOG(4) << "Sparse LoadLodTensor node_num" << node_num; + VLOG(4) << "Sparse LoadLodTensor node_index" << node_index; + VLOG(4) << "Sparse LoadLodTensor shape[0]" << shape[0]; + PADDLE_ENFORCE_GE(node_index, 0, platform::errors::InvalidArgument( + "node_index should be greater than or equal to 0")); + PADDLE_ENFORCE_GE(node_num, 1, platform::errors::InvalidArgument( + "node_num should be greater than or equal to 1")); + + { + // the 1st field, uint32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(paddle::framework::IsTensorVersionSupported(version), + true, + platform::errors::InvalidArgument( + "Tensor version %u is not supported.", version)); + PADDLE_ENFORCE_EQ(version, 0U, platform::errors::InvalidArgument( + "Tensor version %u is not supported, " + "only version 0 is supported.", + version)); + } + + { + // the 2nd field, LoD information + // TODO: sparse load needs to change LoDTensor's lod level + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + } + + // the 3rd field, Tensor + + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + + PADDLE_ENFORCE_EQ( + version, 0U, + platform::errors::InvalidArgument( + "tensor version %u is not supported, only version 0 is supported", + version)); + + paddle::framework::proto::VarType::TensorDesc desc; + + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr 
buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE_EQ( + desc.ParseFromArray(buf.get(), size), true, + platform::errors::InvalidArgument("Cannot parse tensor desc")); + } + + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), + std::back_inserter(dims)); + + int64_t line_numel = 1; + for (size_t dim = 1; dim < dims.size(); dim++) { + line_numel *= dims[dim]; + } + auto total_line = dims[0]; + + tensor->Resize(paddle::framework::make_ddim(shape)); + + void *buf; + auto ctx = platform::CPUDeviceContext(); + + paddle::framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + + auto line_size = + line_numel * paddle::framework::SizeOfType(desc.data_type()); + char *cur_buf = static_cast(buf); + char *temp_row = new char[line_size]; + VLOG(4) << "TensorFromStream: line_size " << line_size; + VLOG(4) << "TensorFromStream: total_line " << total_line; + for (size_t line_index = 0; line_index < static_cast(total_line); + ++line_index) { + is.read(temp_row, line_size); + if (static_cast(line_index) % node_num == node_index) { + memcpy(cur_buf, temp_row, line_size); + cur_buf += line_size; + } + } + } + } +}; + +class SparseTensorLoadOp : public paddle::framework::OperatorWithKernel { + public: + using paddle::framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(paddle::framework::InferShapeContext *ctx) const override {} + + protected: + paddle::framework::OpKernelType GetExpectedKernelType( + const paddle::framework::ExecutionContext &ctx) const override { + paddle::framework::OpKernelType kt = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, ctx.GetPlace()); + return kt; + } +}; + +class SparseTensorLoadOpMaker + : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddOutput("Out", "The LoDTensor / SelectedRows to be loaded"); + AddAttr("file_path", + R"(Variable will be loaded from "file_path")") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + AddAttr("node_index", "role id from 0 ~ node_num.").SetDefault(0); + AddAttr("node_num", "number of roles which need to load the current variable.") + .SetDefault(0); + AddAttr>("shape", + "(vector) The shape of the output") + .SetDefault({}); + AddComment(R"DOC( + SparseTensorLoad OP, load sparse tensor on parameter server + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sparse_tensor_load, ops::SparseTensorLoadOp, + ops::SparseTensorLoadOpMaker); + +REGISTER_OP_CPU_KERNEL( + sparse_tensor_load, + ops::SparseTensorLoadKernel) diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cc b/paddle/fluid/operators/distributed_ops/split_byref_op.cc index 5d26c80f8830a..042a22b8ff199 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.cc +++ b/paddle/fluid/operators/distributed_ops/split_byref_op.cc @@ -40,9 +40,9 @@ class SplitByrefOp : public framework::OperatorWithKernel { if (ctx->IsRuntime()) { in_axis_dim = in_dims[0]; } - PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, - "tensor split does not result" - " in an equal division"); + PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, platform::errors::InvalidArgument( + "tensor split does not result" + " in an equal division")); size_t out_axis_dim = in_axis_dim / num; for (size_t i = 0; i < outs_number; ++i) { auto dim = in_dims; diff --git 
a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc new file mode 100644 index 0000000000000..f5d831fa24012 --- /dev/null +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/dropout_op.h" +#include +#include +#include "paddle/fluid/platform/xpu_header.h" +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_XPU +static std::map mask_data_tables; +static const int max_data_size = 32 * 1024 * 1024; +static std::mutex s_mask_data_table_lock; +template +class DropoutXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + const auto* x_data = x->data(); + auto* y_data = y->mutable_data(context.GetPlace()); + float dropout_prob = context.Attr("dropout_prob"); + auto dropout_implementation = + context.Attr("dropout_implementation"); + float* mask_data_table = nullptr; + PADDLE_ENFORCE_EQ(!context.HasInput("Seed"), true, + platform::errors::InvalidArgument( + ("Input(Seed) not supported on XPU"))); + if (!context.Attr("is_test")) { + int dev_id = + BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId(); + int prop = static_cast(dropout_prob * 100); + int is_upscale = (dropout_implementation == "upscale_in_train"); + /* mask_data_tables key contains 3 part: + * | 31-16 | 15-8 | 7-0 | + * | dev_id | prob | is_upscale | + */ + int index = (dev_id << 16) + (prop << 8) + is_upscale; + std::lock_guard lock(s_mask_data_table_lock); + if (mask_data_tables.find(index) == mask_data_tables.end()) { + float* mask_data_host = new float[max_data_size]; + std::random_device rnd; + std::minstd_rand engine; + int seed = + context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + engine.seed(seed); + std::uniform_real_distribution dist(0, 1); + for (size_t i = 0; i < max_data_size; ++i) { + if (dist(engine) < dropout_prob) { + mask_data_host[i] = 0.0f; + } else { + if (is_upscale) { + mask_data_host[i] = 1.0f / static_cast(1.0f - dropout_prob); + } else { + mask_data_host[i] = 1.0; + } + } + } + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&mask_data_table), + max_data_size * sizeof(float)), + XPU_SUCCESS, + platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU, Cannot" + "allocate %s memory on XPU. 
\n\nPlease " + "check whether there is any other process " + "using XPU.\n", + string::HumanReadableSize(max_data_size * sizeof(void*)))); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), + mask_data_table, platform::CPUPlace(), mask_data_host, + max_data_size * sizeof(float)); + mask_data_tables[index] = mask_data_table; + free(mask_data_host); + } else { + mask_data_table = mask_data_tables[index]; + } + } + if (!context.Attr("is_test")) { // Train + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + size_t size = framework::product(mask->dims()); + auto& dev_ctx = context.template device_context(); + int r = xpu::dropout(dev_ctx.x_context(), mask_data_table, x_data, + mask_data, y_data, max_data_size, size); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU dropout return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } else { // Infer + float scale = 0.0f; + if (dropout_implementation == "upscale_in_train") { + scale = 1.0f; + } else { + scale = static_cast(1.0f - dropout_prob); + } + auto& dev_ctx = context.template device_context(); + int r = xpu::scale(dev_ctx.x_context(), x->numel(), scale, 0.0f, 0, + x_data, y_data); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU dropout return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } + } +}; +template +class DropoutGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ(!context.Attr("is_test"), true, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + auto* grad_x = context.Output(framework::GradVarName("X")); + auto* grad_y = context.Input(framework::GradVarName("Out")); + auto* mask = context.Input("Mask"); + grad_x->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + int r = xpu::elementwise_mul(dev_ctx.x_context(), grad_y->data(), + mask->data(), grad_x->data(), + grad_y->numel()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU dropout return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + dropout, ops::DropoutXPUKernel); +REGISTER_OP_XPU_KERNEL( + dropout_grad, + ops::DropoutGradXPUKernel); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index 9ff7a71d7f03a..ad4a16c6e06cd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -49,7 +49,8 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { int axis = ctx.Attr("axis"); PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(), - "Rank of first input must >= rank of second input."); + platform::errors::InvalidArgument( + "Rank of first input must >= rank of second input.")); if (dx != nullptr) { dx->mutable_data(ctx.GetPlace()); @@ -69,8 +70,9 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { n = dout->numel(); } else { axis = (axis == -1 ? 
dx_dims.size() - dy_dims_untrimed.size() : axis); - PADDLE_ENFORCE(axis >= 0 && axis < dx_dims.size(), - "Axis should be in range [0, dx_dims)"); + PADDLE_ENFORCE_EQ(axis >= 0 && axis < dx_dims.size(), true, + platform::errors::InvalidArgument( + "Axis should be in range [0, dx_dims)")); auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed); axis = (dy_dims.size() == 0) ? dx_dims.size() : axis; get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post, @@ -84,30 +86,46 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { int r = xpu::matrix_vector_add_grad( dev_ctx.x_context(), dout->data(), dout->data(), dout->data(), dout->data(), dx_data, dy_data, pre, n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of ElementWiseAddOp, error " + "message: INVALID_PARAM, " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of ElementWiseAddOp, error " + "message: RUNTIME_ERROR, " + "please check whether Baidu Kunlun card is " + "properly installed.")); + } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of ElementWiseAddOp, error message: " + "NO_ENOUGH_WORKSPACE, XPU has no enough memory.")); + } return; } if (dx == nullptr) { PADDLE_ENFORCE_EQ( xpu_malloc(reinterpret_cast(&dx_data), len * sizeof(float)), - XPU_SUCCESS, platform::errors::External("XPU has no enough memory")); + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } if (dy == nullptr) { PADDLE_ENFORCE_EQ( xpu_malloc(reinterpret_cast(&dy_data), len * sizeof(float)), - XPU_SUCCESS, platform::errors::External("XPU has no enough memory")); + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } else { if (len != n) { PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dy_data), len * sizeof(float)), - XPU_SUCCESS, platform::errors::External( + XPU_SUCCESS, platform::errors::ResourceExhausted( "XPU has no enough memory")); } } @@ -115,22 +133,50 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { int r = xpu::elementwise_add_grad( dev_ctx.x_context(), dout->data() /*x*/, dout->data() /*y*/, dout->data() /*out*/, dout->data(), dx_data, dy_data, len); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of ElementWiseAddOp, error " + "message: INVALID_PARAM, " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of ElementWiseAddOp, error message: " + "RUNTIME_ERROR, " + "please check whether Baidu Kunlun card is properly installed.")); + } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of ElementWiseAddOp, error message: " + 
"NO_ENOUGH_WORKSPACE, XPU has no enough memory.")); + } if ((dy != nullptr) && (len != n)) { r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data(), pre, n, post, xpu::ElementwiseOp::ASSIGN); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of ElementWiseAddOp, error " + "message: INVALID_PARAM, " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of ElementWiseAddOp, error " + "message: RUNTIME_ERROR, " + "please check whether Baidu Kunlun card is " + "properly installed.")); + } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of ElementWiseAddOp, error message: " + "NO_ENOUGH_WORKSPACE, XPU has no enough memory.")); + } dev_ctx.Wait(); xpu_free(dy_data); } diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc new file mode 100644 index 0000000000000..6cc4276680010 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +namespace paddle { +namespace operators { + +template +struct XPUDivFunctor { + int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { + return xpu::elementwise_div(ctx, x, y, z, len); + } +}; + +template +class ElementwiseDivXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise>(ctx); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + elementwise_div, + ops::ElementwiseDivXPUKernel); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc new file mode 100644 index 0000000000000..232cfa023970d --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +namespace paddle { +namespace operators { + +template +struct XPUMaxFunctor { + int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { + return xpu::elementwise_max(ctx, x, y, z, len); + } +}; + +template +class ElementwiseMaxXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise>(ctx); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + elementwise_max, + ops::ElementwiseMaxXPUKernel); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc new file mode 100644 index 0000000000000..d9a6ca844aecd --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +namespace paddle { +namespace operators { +template +class ElementwiseMulXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise>(ctx); + } +}; +DEFINE_XPU_GRAD_KERNEL(Mul, mul, true); +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + elementwise_mul, + ops::ElementwiseMulXPUKernel); +REGISTER_OP_XPU_KERNEL(elementwise_mul_grad, + ops::ElementwiseMulGradXPUKernel< + paddle::platform::XPUDeviceContext, float>); + +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc new file mode 100644 index 0000000000000..4e205fe49216f --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +namespace paddle { +namespace operators { + +template +struct XPUSubFunctor { + int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { + return xpu::elementwise_sub(ctx, x, y, z, len); + } +}; + +template +class ElementwiseSubXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise>(ctx); + } +}; + +DEFINE_XPU_GRAD_KERNEL(Sub, sub, false); +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + elementwise_sub, + ops::ElementwiseSubXPUKernel); +REGISTER_OP_XPU_KERNEL(elementwise_sub_grad, + ops::ElementwiseSubGradXPUKernel< + paddle::platform::XPUDeviceContext, float>); + +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h index 53c4332e9190d..53f2cd2dcccf1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_xpu.h +++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h @@ -13,9 +13,153 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifdef PADDLE_WITH_XPU +#include +#include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" +inline std::string get_xpu_error_message(int error_type) { + static std::unordered_map xpu_error_map = { + {baidu::xpu::api::INVALID_PARAM, "Parameter is invalid."}, + {baidu::xpu::api::RUNTIME_ERROR, + "Please check whether Baidu Kunlun Card " + "is properly installed."}, + {baidu::xpu::api::NO_ENOUGH_WORKSPACE, + "There is not enough memory in Baidu" + " Kunlun Card."}}; + if (xpu_error_map.find(error_type) == xpu_error_map.end()) { + return "Unknown error type!"; + } + return xpu_error_map[error_type]; +} + +#define XPU_MALLOC(addr, num_bytes) \ + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(addr), num_bytes), \ + XPU_SUCCESS, \ + platform::errors::ResourceExhausted( \ + "\n\nOut of memory error on XPU, Cannot" \ + "allocate %s memory on XPU. 
\n\nPlease " \ + "check whether there is any other process " \ + "using XPU.\n", \ + string::HumanReadableSize(num_bytes))) + +#define DEFINE_XPU_GRAD_KERNEL(kernel_type, kernel_name, use_x_y_data) \ + template \ + class Elementwise##kernel_type##GradXPUKernel \ + : public ElemwiseGradKernel { \ + public: \ + void Compute(const framework::ExecutionContext& ctx) const override { \ + ElemwiseGradKernel::Compute(ctx); \ + using Tensor = framework::Tensor; \ + auto* dout = ctx.Input(framework::GradVarName("Out")); \ + auto* dx = ctx.Output(framework::GradVarName("X")); \ + auto* dy = ctx.Output(framework::GradVarName("Y")); \ + auto dx_dims = dout->dims(); \ + auto dy_dims_untrimed = dout->dims(); \ + T* dx_data = NULL; \ + T* dy_data = NULL; \ + const T* y_data = nullptr; \ + const T* x_data = nullptr; \ + T* y_broadcast = nullptr; \ + if (use_x_y_data) { \ + auto* x = ctx.Input("X"); \ + auto* y = ctx.Input("Y"); \ + y_data = y->data(); \ + x_data = x->data(); \ + } else { \ + x_data = dout->data(); \ + y_data = dout->data(); \ + } \ + int axis = ctx.Attr("axis"); \ + PADDLE_ENFORCE_GE( \ + dx_dims.size(), dy_dims_untrimed.size(), \ + platform::errors::InvalidArgument( \ + "Rank of first input must >= rank of second input.")); \ + if (dx != nullptr) { \ + dx->mutable_data(ctx.GetPlace()); \ + dx_dims = dx->dims(); \ + dx_data = dx->data(); \ + } \ + if (dy != nullptr) { \ + dy->mutable_data(ctx.GetPlace()); \ + dy_dims_untrimed = dy->dims(); \ + dy_data = dy->data(); \ + } \ + int pre, n, post, is_run_common_broadcast; \ + if (dx_dims == dy_dims_untrimed) { \ + pre = post = 1; \ + n = dout->numel(); \ + } else { \ + axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis); \ + PADDLE_ENFORCE_EQ(axis >= 0 && axis < dx_dims.size(), true, \ + platform::errors::InvalidArgument( \ + "Axis should be in range [0, dx_dims)")); \ + auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed); \ + axis = (dy_dims.size() == 0) ? dx_dims.size() : axis; \ + get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post, \ + &is_run_common_broadcast); \ + } \ + int len = pre * n * post; \ + auto& dev_ctx = \ + ctx.template device_context(); \ + if (dx == nullptr) { \ + XPU_MALLOC(&dx_data, len * sizeof(float)); \ + } \ + if (dy == nullptr) { \ + XPU_MALLOC(&dy_data, len * sizeof(float)); \ + } else { \ + if (len != n) { \ + XPU_MALLOC(&dy_data, len * sizeof(float)); \ + } \ + } \ + if (use_x_y_data) { \ + if (len != n) { \ + XPU_MALLOC(&y_broadcast, len * sizeof(float)); \ + int res = \ + xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre, \ + n, post, xpu::ElementwiseOp::ASSIGN); \ + PADDLE_ENFORCE_EQ( \ + res, xpu::Error_t::SUCCESS, \ + platform::errors::External("XPU kernel error occur! %s", \ + get_xpu_error_message(res))); \ + y_data = y_broadcast; \ + } \ + } \ + int res = xpu::elementwise_##kernel_name##_grad( \ + dev_ctx.x_context(), x_data, y_data, dout->data() /*out*/, \ + dout->data(), dx_data, dy_data, len); \ + PADDLE_ENFORCE_EQ( \ + res, xpu::Error_t::SUCCESS, \ + platform::errors::External("XPU kernel error occur! %s", \ + get_xpu_error_message(res))); \ + if ((dy != nullptr) && (len != n)) { \ + int res = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data(), \ + pre, n, post, xpu::ElementwiseOp::ASSIGN); \ + PADDLE_ENFORCE_EQ( \ + res, xpu::Error_t::SUCCESS, \ + platform::errors::External("XPU kernel error occur! 
%s", \ + get_xpu_error_message(res))); \ + dev_ctx.Wait(); \ + xpu_free(dy_data); \ + } \ + if ((len != n || dx == nullptr || dy == nullptr) && \ + !(dy != nullptr && len != n)) { \ + dev_ctx.Wait(); \ + } \ + if (dx == nullptr) { \ + xpu_free(dx_data); \ + } \ + if (dy == nullptr) { \ + xpu_free(dy_data); \ + } \ + if (use_x_y_data) { \ + if (len != n) { \ + xpu_free(y_broadcast); \ + } \ + } \ + } \ + } + namespace paddle { namespace operators { @@ -35,13 +179,16 @@ struct XPUMulFunctor { template void XPUElementwise(const framework::ExecutionContext& ctx) { - PADDLE_ENFORCE(platform::is_xpu_place(ctx.GetPlace()), - "This kernel only runs on XPU device."); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "This kernel only runs on XPU device.")); auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NE(x_var, nullptr, - platform::errors::Fatal("Cannot get input Variable X")); - PADDLE_ENFORCE(x_var->IsType(), - "XPU only support LoDTensor"); + PADDLE_ENFORCE_NE(x_var, nullptr, platform::errors::InvalidArgument( + "Cannot get input Variable X")); + PADDLE_ENFORCE_EQ( + x_var->IsType(), true, + platform::errors::InvalidArgument( + "XPU only support LoDTensor, Input(X) is not LoDTensor")); auto x = x_var->Get(); auto* y = ctx.Input("Y"); @@ -52,14 +199,21 @@ void XPUElementwise(const framework::ExecutionContext& ctx) { auto x_dims = x.dims(); auto y_dims_untrimed = y->dims(); PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), - "Rank of first input must >= rank of second input."); + platform::errors::InvalidArgument( + "Rank of first input must >= rank of second input.")); axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); - PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), - "Axis should be in range [0, x_dims)"); + PADDLE_ENFORCE_EQ( + axis >= 0 && axis < x_dims.size(), true, + platform::errors::InvalidArgument("Axis should be in range [0, x_dims)")); auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post, is_common_broadcast; get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast); + + PADDLE_ENFORCE_NE(is_common_broadcast, 1, + platform::errors::Unimplemented( + "X's shape should be equal to Y's shape.")); + int len = pre * n * post; const T* x_data = x.data(); @@ -74,33 +228,36 @@ void XPUElementwise(const framework::ExecutionContext& ctx) { if (std::is_same>::value) { int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data, z_data, pre, n); - PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", - res); + PADDLE_ENFORCE_EQ(res, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error occur! %s", + get_xpu_error_message(res))); return; } if (std::is_same>::value) { int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data, z_data, pre, n); - PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", - res); + PADDLE_ENFORCE_EQ(res, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error occur! %s", + get_xpu_error_message(res))); return; } } if (pre != 1 || post != 1) { - PADDLE_ENFORCE(xpu_malloc(reinterpret_cast(&y_broadcast), - len * sizeof(T)) == XPU_SUCCESS); + XPU_MALLOC(&y_broadcast, len * sizeof(T)); int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre, n, post, xpu::ElementwiseOp::ASSIGN); - PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! 
res = %d", - res); + PADDLE_ENFORCE_EQ(res, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error occur! %s", + get_xpu_error_message(res))); y_data = y_broadcast; } Functor functor; int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len); - PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", - res); + PADDLE_ENFORCE_EQ(res, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error occur! %s", + get_xpu_error_message(res))); if (pre != 1 || post != 1) { dev_ctx.Wait(); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 6fea8fe98bf0e..41fcf3750878e 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -66,7 +66,9 @@ class FillConstantKernel : public framework::OpKernel { value_tensor->numel())); const T *tensor_data = value_tensor->data(); framework::Tensor cpu_tensor; - if (platform::is_gpu_place(value_tensor->place())) { + auto tmp_place = value_tensor->place(); + if (platform::is_gpu_place(tmp_place) || + platform::is_xpu_place(tmp_place)) { TensorCopySync(*value_tensor, platform::CPUPlace(), &cpu_tensor); tensor_data = cpu_tensor.data(); } @@ -102,6 +104,14 @@ class FillConstantKernel : public framework::OpKernel { functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); } +#endif +#ifdef PADDLE_WITH_XPU + if (!cpu_place) { + tensor->mutable_data(ctx.GetPlace(), data_type); + math::SetConstant functor; + functor(reinterpret_cast(dev_ctx), + tensor, static_cast(value)); + } #endif } }; diff --git a/paddle/fluid/operators/fill_constant_op_xpu.cc b/paddle/fluid/operators/fill_constant_op_xpu.cc new file mode 100644 index 0000000000000..2bf836272a400 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op_xpu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
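// NOTE: a short usage sketch for DEFINE_XPU_GRAD_KERNEL above. The macro stamps
// out an Elementwise##Name##GradXPUKernel<DeviceContext, T> class; its third
// argument says whether the backward pass needs the forward inputs X/Y (true for
// mul, false for sub). Registration follows the same device/type pattern as the
// forward kernels:

DEFINE_XPU_GRAD_KERNEL(Mul, mul, true);   // dX = dOut * Y, dY = dOut * X
DEFINE_XPU_GRAD_KERNEL(Sub, sub, false);  // dX/dY are computed from dOut alone

REGISTER_OP_XPU_KERNEL(
    elementwise_mul_grad,
    ops::ElementwiseMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);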
*/ +#include "paddle/fluid/operators/fill_constant_op.h" + +namespace ops = paddle::operators; +#ifdef PADDLE_WITH_XPU +REGISTER_OP_XPU_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); +#endif diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc index 08909bcb6fcb9..6fbe3b8d3bbed 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc @@ -218,9 +218,11 @@ class FusedFCElementwiseLayerNormOpMaker .SetDefault(1e-5) .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_GE(epsilon, 0.0f, - "'epsilon' should be between 0.0 and 0.001."); + platform::errors::InvalidArgument( + "'epsilon' should be between 0.0 and 0.001.")); PADDLE_ENFORCE_LE(epsilon, 0.001f, - "'epsilon' should be between 0.0 and 0.001."); + platform::errors::InvalidArgument( + "'epsilon' should be between 0.0 and 0.001.")); }); AddAttr("begin_norm_axis", "the axis of `begin_norm_axis ... Rank(Y) - 1` will be " @@ -228,8 +230,10 @@ class FusedFCElementwiseLayerNormOpMaker "matrix [N,H]. [default 1].") .SetDefault(1) .AddCustomChecker([](const int &begin_norm_axis) { - PADDLE_ENFORCE_GT(begin_norm_axis, 0, - "'begin_norm_axis' should be greater than zero."); + PADDLE_ENFORCE_GT( + begin_norm_axis, 0, + platform::errors::InvalidArgument( + "'begin_norm_axis' should be greater than zero.")); }); AddComment(R"DOC( fc_out <= fc(X, W, Bias0) diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc new file mode 100644 index 0000000000000..ae3d0f2633bb1 --- /dev/null +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
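// NOTE: the new XPU branch in FillConstantKernel (fill_constant_op.h above) with
// its template arguments spelled out; a sketch assuming SetConstant is used for
// XPUDeviceContext the same way the CUDA branch uses CUDADeviceContext:

#ifdef PADDLE_WITH_XPU
    if (!cpu_place) {
      tensor->mutable_data(ctx.GetPlace(), data_type);
      math::SetConstant<platform::XPUDeviceContext, T> functor;
      functor(reinterpret_cast<const platform::XPUDeviceContext&>(dev_ctx),
              tensor, static_cast<T>(value));
    }
#endif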
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/gather_op.h" +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_version_registry.h" +namespace paddle { +namespace operators { + +template +class GatherOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("This kernel only runs on XPU.")); + + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + if (ctx.HasInput("Axis")) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Now, it doesn't support XPU with Axis.")); + } + + output->mutable_data(ctx.GetPlace()); + if (x->numel() == 0) return; + // check index type is INT32 + const auto &index_type = index->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "XPU only support INT32, it holds %s, but desires to be %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32))); + + const auto index_dims = index->dims(); + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), 1, + platform::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + int slice_size = x->numel() / x->dims()[0]; + auto &dev_ctx = ctx.template device_context(); + int r = + xpu::gather(dev_ctx.x_context(), x->data(), index->data(), + index->dims()[0], slice_size, output->data()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error! error code=%d", r)); + } +}; + +template +class GatherGradOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("This kernel only runs on XPU.")); + + auto *index = ctx.Input("Index"); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto &dev_ctx = ctx.template device_context(); + + if (ctx.HasInput("Axis")) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Now, it doesn't support XPU with Axis.")); + } + + dx->mutable_data(ctx.GetPlace()); + const int zero = 0; + int r_dx = xpu::memset(dev_ctx.x_context(), dx->data(), zero, + dx->numel() * sizeof(T)); + PADDLE_ENFORCE_EQ( + r_dx, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error! 
error code=%d", r_dx)); + + if (dout->numel() == 0) { + return; + } + bool overwrite = ctx.Attr("overwrite"); + // check index type is INT32 + const auto &index_type = index->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "XPU only support INT32, it holds %s, but desires to be %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32))); + + const auto index_dims = index->dims(); + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), 1, + platform::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + + int index_size = index_dims[0]; + int slice_size = dout->numel() / dout->dims()[0]; + + int r = xpu::scatter(dev_ctx.x_context(), dout->data(), + index->data(), index_size, slice_size, + dx->data(), overwrite); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error! error code=%d", r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(gather, ops::GatherOpXPUKernel); +REGISTER_OP_XPU_KERNEL(gather_grad, ops::GatherGradOpXPUKernel); +#endif diff --git a/paddle/fluid/operators/gaussian_random_op_xpu.cc b/paddle/fluid/operators/gaussian_random_op_xpu.cc new file mode 100644 index 0000000000000..5d3ba84b05f5e --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_op_xpu.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class XPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + + std::normal_distribution dist(mean, std); + int64_t size = tensor->numel(); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + // TODO(pangyoki): implement GetXPURandomEngine to set different seeds on + // corresponding XPU device. 
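// NOTE: the context accessors in gather_op_xpu.cc above take explicit template
// arguments; a sketch of the fetch/allocate lines as they would normally read,
// assuming Tensor is the framework::Tensor alias brought in by gather_op.h:

auto *x = ctx.Input<Tensor>("X");
auto *index = ctx.Input<Tensor>("Index");
auto *output = ctx.Output<Tensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
// xpu::gather then views X as [x->dims()[0], slice_size] with
// slice_size = x->numel() / x->dims()[0], and gathers index->dims()[0] rows.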
+ auto engine = framework::GetCPURandomEngine(seed); + + std::unique_ptr data_cpu(new T[size]); + for (int64_t i = 0; i < size; ++i) { + data_cpu[i] = dist(*engine); + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), data, + platform::CPUPlace(), reinterpret_cast(data_cpu.get()), + size * sizeof(T)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(gaussian_random, ops::XPUGaussianRandomKernel); +#endif diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 5f17f2960573c..6be7dbdc110d5 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -887,10 +887,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_cw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("linear" == interp_method) { - KeLinearInterpFw<<<<>>( input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w, align_corners, align_mode, data_layout); @@ -981,21 +981,22 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_chw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw<<>>( + KeNearestNeighborInterpFw< + T><<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpFw<<<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpFw< - T><<>>( + KeBicubicInterpFw<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } @@ -1097,10 +1098,10 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_cdhw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("trilinear" == interp_method) { - KeTrilinearInterpFw<<<<>>( input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, @@ -1176,10 +1177,10 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_cw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("linear" == interp_method) { - KeLinearInterpBw<<<<>>( input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c, ratio_w, align_corners, align_mode, data_layout); @@ -1267,22 +1268,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_chw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw<<>>( + KeNearestNeighborInterpBw< + T><<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, 
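// NOTE: a condensed sketch of the host-side pattern used by
// XPUGaussianRandomKernel above: sample with the framework's CPU random engine,
// then copy the buffer to the XPU device. Template arguments are written out
// here (std::normal_distribution<T>, std::unique_ptr<T[]>):

std::normal_distribution<T> dist(mean, std);
auto engine = framework::GetCPURandomEngine(seed);
std::unique_ptr<T[]> data_cpu(new T[size]);
for (int64_t i = 0; i < size; ++i) {
  data_cpu[i] = dist(*engine);
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), data,
             platform::CPUPlace(), reinterpret_cast<void *>(data_cpu.get()),
             size * sizeof(T));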
data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpBw<<<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpBw< - T><<>>( + KeBicubicInterpBw<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } @@ -1378,10 +1380,10 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_cdhw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("trilinear" == interp_method) { - KeTrilinearInterpBw<<<<>>( input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 1f7dde9b931da..3362f2474fe25 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -118,9 +118,10 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE( "bilinear" == interp_method || "nearest" == interp_method || "bicubic" == interp_method, - "Interpolation method can only be \"bilinear\" or \"nearest\" when " - "Input(X) dimension is 4, but got method = %s .", - interp_method); + platform::errors::InvalidArgument( + "Interpolation method can only be \"bilinear\" or \"nearest\" when " + "Input(X) dimension is 4, but got method = %s.", + interp_method)); const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); @@ -305,12 +306,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { auto out_size_dim = ctx->GetInputDim("OutSize"); - PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, - "OutSize's dimension size must be 1, but got size =%d .", - out_size_dim.size()); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), 1, + platform::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got size is %d.", + out_size_dim.size())); PADDLE_ENFORCE_EQ(out_size_dim[0], 3, - "OutSize's dim[0] must be 3, but got size = %d .", - out_size_dim[0]); + platform::errors::InvalidArgument( + "OutSize's dim[0] must be 3, but got size is %d.", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } @@ -330,10 +334,8 @@ class InterpolateV2Op : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of InterpolateV2Op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of InterpolationOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Interpolate"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Interpolate"); auto dim_x = ctx->GetInputDim("X"); // NCHW format PADDLE_ENFORCE( @@ -576,9 +578,10 @@ class InterpolateV2OpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InterpolateGrad"); + 
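// NOTE: the CUDA launch sites in interpolate_op.cu above switch from
// getGpuLaunchConfig to GetGpuLaunchConfig1D. A sketch of one call site; the
// config member names (block_per_grid / thread_per_block) and the explicit
// stream argument are assumptions based on Paddle's GpuLaunchConfig of this
// period, not verified here:

platform::GpuLaunchConfig config =
    platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
KeLinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
                      ctx.cuda_device_context().stream()>>>(
    input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w,
    align_corners, align_mode, data_layout);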
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "InterpolateGrad"); + auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), dim_x); diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 816539c3b5fdb..90abcaa8b472a 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -899,10 +899,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_cw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("linear" == interp_method) { - KeLinearInterpFw<<<<>>( input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w, align_corners, align_mode, data_layout); @@ -1018,21 +1018,22 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_chw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw<<>>( + KeNearestNeighborInterpFw< + T><<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpFw<<<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpFw< - T><<>>( + KeBicubicInterpFw<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } @@ -1167,10 +1168,10 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_cdhw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("trilinear" == interp_method) { - KeTrilinearInterpFw<<<<>>( input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, @@ -1259,10 +1260,10 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_cw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("linear" == interp_method) { - KeLinearInterpBw<<<<>>( input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c, ratio_w, align_corners, align_mode, data_layout); @@ -1376,22 +1377,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_chw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw<<>>( + KeNearestNeighborInterpBw< + T><<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpBw<<<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if 
("bicubic" == interp_method) { - KeBicubicInterpBw< - T><<>>( + KeBicubicInterpBw<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } @@ -1520,10 +1522,10 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, int pixelNum = n * out_cdhw; platform::GpuLaunchConfig config = - platform::getGpuLaunchConfig(pixelNum, ctx); + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("trilinear" == interp_method) { - KeTrilinearInterpBw<<<<>>( input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc new file mode 100644 index 0000000000000..5a3c865e26c35 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/layer_norm_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class LayerNormXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto epsilon = ctx.Attr("epsilon"); + const auto* x = ctx.Input("X"); + const auto& x_dims = x->dims(); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); + const auto* x_data = x->data(); + const auto* scale_data = (scale == nullptr ? nullptr : scale->data()); + const auto* bias_data = (bias == nullptr ? 
nullptr : bias->data()); + auto* y_data = y->mutable_data(ctx.GetPlace()); + auto* mean_data = mean->mutable_data(ctx.GetPlace()); + auto* variance_data = variance->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::layer_norm(dev_ctx.x_context(), left, right, x_data, y_data, + scale_data, bias_data, epsilon, mean_data, + variance_data, false); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(layer_norm) return wrong " + "value[%d], please check whether Baidu " + "Kunlun Card is properly installed.", + r)); + } +}; + +template +class LayerNormGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto epsilon = ctx.Attr("epsilon"); + const auto* x = ctx.Input("X"); + const auto& x_dims = x->dims(); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + const auto* mean = ctx.Input("Mean"); + const auto* variance = ctx.Input("Variance"); + const auto* scale = ctx.Input("Scale"); + const auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + const auto* x_data = x->data(); + const auto* dy_data = dy->data(); + const auto* mean_data = mean->data(); + const auto* variance_data = variance->data(); + const auto* scale_data = (scale == nullptr ? nullptr : scale->data()); + auto* dscale_data = + (dscale == nullptr ? nullptr : dscale->mutable_data(ctx.GetPlace())); + auto* dbias_data = + (dbias == nullptr ? nullptr : dbias->mutable_data(ctx.GetPlace())); + auto* dx_data = + (dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace())); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::layer_norm_backward( + dev_ctx.x_context(), left, right, x_data, scale_data, variance_data, + mean_data, dy_data, dx_data, dscale_data, dbias_data, epsilon); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(layer_norm_backward) return wrong " + "value[%d], please check whether Baidu " + "Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + layer_norm, + ops::LayerNormXPUKernel); +REGISTER_OP_XPU_KERNEL( + layer_norm_grad, + ops::LayerNormGradXPUKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h new file mode 100644 index 0000000000000..52fa7fd1079a7 --- /dev/null +++ b/paddle/fluid/operators/layout_utils.h @@ -0,0 +1,155 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
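// NOTE: a small worked example for the left/right computation in the layer_norm
// XPU kernels above (shapes are illustrative only):

auto matrix_dim = framework::flatten_to_2d(framework::make_ddim({2, 3, 4, 5}),
                                           /*begin_norm_axis=*/2);
int left = static_cast<int>(matrix_dim[0]);   // 2 * 3 = 6 rows
int right = static_cast<int>(matrix_dim[1]);  // 4 * 5 = 20 elements per row
// xpu::layer_norm normalizes each of the 6 rows over its 20 elements and writes
// one mean/variance entry per row.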
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +inline void ResizeToChannelFirst(const framework::ExecutionContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } +} + +template +inline void ResizeToChannelLast(const framework::ExecutionContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[4]; + in_dims_vec[4] = input->dims()[1]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[1]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } +} + +template +inline void TransToChannelFirst(const framework::ExecutionContext& context, + const Tensor* input, + Tensor* transformed_input) { + VLOG(5) << "Why am I called?"; + int dim = input->dims().size() - 2; + if (dim == 3) { + auto& dev_ctx = context.template device_context(); + std::vector axis{0, 4, 1, 2, 3}; + math::Transpose trans5; + trans5(dev_ctx, *input, transformed_input, axis); + + } else if (dim == 2) { + auto& dev_ctx = context.template device_context(); + std::vector axis{0, 3, 1, 2}; + math::Transpose trans4; + trans4(dev_ctx, *input, transformed_input, axis); + } else if (dim == 1) { + auto& dev_ctx = context.template device_context(); + std::vector axis{0, 2, 1}; + 
math::Transpose trans3; + trans3(dev_ctx, *input, transformed_input, axis); + } +} + +template +inline void TransToChannelLast(const framework::ExecutionContext& context, + const Tensor* input, Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + auto& dev_ctx = context.template device_context(); + std::vector axis{0, 2, 3, 4, 1}; + math::Transpose trans5; + trans5(dev_ctx, *input, transformed_input, axis); + + } else if (dim == 2) { + auto& dev_ctx = context.template device_context(); + std::vector axis{0, 2, 3, 1}; + math::Transpose trans4; + trans4(dev_ctx, *input, transformed_input, axis); + } else if (dim == 1) { + auto& dev_ctx = context.template device_context(); + std::vector axis{0, 2, 1}; + math::Transpose trans3; + trans3(dev_ctx, *input, transformed_input, axis); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc new file mode 100644 index 0000000000000..b2e68e9870d3c --- /dev/null +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/log_loss_op.h" +#include +namespace paddle { +namespace operators { + +template +class LogLossXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* predict = ctx.Input("Predicted"); + auto* labels = ctx.Input("Labels"); + auto* loss = ctx.Output("Loss"); + auto epsilon = static_cast(ctx.Attr("epsilon")); + loss->mutable_data(ctx.GetPlace()); + int n = predict->numel(); + auto& dev_ctx = ctx.template device_context(); + int r = + xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon, predict->data(), + labels->data(), loss->data()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU log_loss kernel return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; +template +class LogLossGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* predict = ctx.Input("Predicted"); + auto* labels = ctx.Input("Labels"); + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + if (!dpred) { + return; + } + auto epsilon = static_cast(ctx.Attr("epsilon")); + dpred->mutable_data(ctx.GetPlace()); + int n = predict->numel(); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon, + predict->data(), labels->data(), + dloss->data(), dpred->data()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU log_loss kernel return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = 
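// NOTE: a minimal usage sketch for the layout_utils.h helpers above, with
// assumed shapes; context is the framework::ExecutionContext available inside a
// kernel. For a 4-D NHWC tensor, ResizeToChannelFirst prepares the destination
// dims/allocation and TransToChannelFirst performs the transpose with axis
// {0, 3, 1, 2}:

framework::Tensor x_nhwc;  // e.g. dims [8, 32, 32, 3]
framework::Tensor x_nchw;  // becomes  [8, 3, 32, 32]
ResizeToChannelFirst<platform::CPUDeviceContext, float>(context, &x_nhwc, &x_nchw);
TransToChannelFirst<platform::CPUDeviceContext, float>(context, &x_nhwc, &x_nchw);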
paddle::operators; +REGISTER_OP_XPU_KERNEL( + log_loss, ops::LogLossXPUKernel); +REGISTER_OP_XPU_KERNEL( + log_loss_grad, + ops::LogLossGradXPUKernel); + +#endif diff --git a/paddle/fluid/operators/lookup_table_v2_op_xpu.cc b/paddle/fluid/operators/lookup_table_v2_op_xpu.cc new file mode 100644 index 0000000000000..2284401ba1bae --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op_xpu.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lookup_table_v2_op.h" +#include +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_XPU +template +class LookupTableV2XPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int + auto *output_t = context.Output("Out"); // float + auto *table_var = context.InputVar("W"); + PADDLE_ENFORCE_EQ( + (std::is_same::value), true, + platform::errors::PreconditionNotMet("Unsupported place! only support " + "xpu place , please check your " + "place.")); + + PADDLE_ENFORCE_EQ(table_var->IsType(), true, + platform::errors::PermissionDenied( + "Unsupported Variable Type , idx in " + "LookupTableV2XPUKernel should be LoDTensor.")); + + int64_t padding_idx = context.Attr("padding_idx"); + int64_t ids_numel = ids_t->numel(); + + auto *table_t = context.Input("W"); + auto &dev_ctx = context.template device_context(); + // size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + const int64_t *ids = ids_t->data(); + + PADDLE_ENFORCE_EQ( + ids_numel <= std::numeric_limits::max(), true, + platform::errors::OutOfRange( + "Number of ids greater than int32_t::max , please check " + "number of ids in LookupTableV2XPUKernel.")); + int ids_numel_int32 = static_cast(ids_numel); + int r = xpu::embedding(dev_ctx.x_context(), ids_numel_int32, ids, D, + table, output, padding_idx); + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External( + "XPU API return wrong value[%d] , please check where " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +template +class LookupTableV2GradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + PADDLE_ENFORCE_EQ(table_var->IsType(), true, + platform::errors::PermissionDenied( + "Unsupported Variable Type , idx in " + "LookupTableV2GradXPUKernel should be LoDTensor.")); + table_dim = context.Input("W")->dims(); + + bool is_sparse = context.Attr("is_sparse"); + PADDLE_ENFORCE_EQ( + is_sparse, false, + platform::errors::InvalidArgument( + "LookupTableV2GradXPUKernel dose NOT 
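// NOTE: for reference, the element-wise quantity the log_loss XPU kernels above
// compute, assuming they match Paddle's existing log_loss definition (epsilon
// guards log(0)); a scalar sketch:

#include <cmath>

inline float log_loss_ref(float pred, float label, float eps) {
  return -label * std::log(pred + eps) -
         (1.0f - label) * std::log(1.0f - pred + eps);
}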
support is_sparse = True.")); + + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + auto d_table_t = context.Output(framework::GradVarName("W")); + + int64_t ids_numel = ids_t->numel(); + PADDLE_ENFORCE_EQ( + ids_numel <= std::numeric_limits::max(), true, + platform::errors::OutOfRange( + "Number of ids greater than int32_t::max , please check " + "number of ids in LookupTableV2GradXPUKernel.")); + int ids_numel_int32 = static_cast(ids_numel); + const int64_t *ids_data = ids_t->data(); + + int D = d_table_t->dims()[1]; + const T *d_output_data = d_output_t->data(); + T *d_table_data = d_table_t->mutable_data(context.GetPlace()); + auto &dev_ctx = context.template device_context(); + // set zeros for d_table_data + const int zero = 0; + int r = xpu::memset(dev_ctx.x_context(), d_table_data, zero, + d_table_t->numel() * sizeof(T)); + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External( + "XPU API return wrong value[%d], please check where " + "Baidu Kunlun Card is properly installed.", + r)); + + r = xpu::embedding_backward(dev_ctx.x_context(), + ids_numel_int32, ids_data, D, + d_output_data, d_table_data); + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External( + "XPU API return wrong value[%d] , please check where " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; +#endif + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +#ifdef PADDLE_WITH_XPU +REGISTER_OP_XPU_KERNEL( + lookup_table_v2, + ops::LookupTableV2XPUKernel); +REGISTER_OP_XPU_KERNEL( + lookup_table_v2_grad, + ops::LookupTableV2GradXPUKernel); +#endif diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index b8af5a21ca581..8c7437e4b5e72 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include #endif +#include #include #include #include "paddle/fluid/framework/data_type.h" @@ -44,6 +45,15 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; +#ifdef PADDLE_WITH_XPU +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +#endif + #define DEFINE_CPU_TRANS(RANK) \ template struct Transpose; \ diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 6af0278d82503..1ad1c29ddd879 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -14,6 +14,7 @@ limitations under the License. 
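// NOTE: shape contract for the lookup_table_v2 XPU kernels above (standard
// embedding semantics, shapes illustrative): W is [N, D], Ids holds ids_numel
// int64 indices, Out is [ids_numel, D]; the backward pass memsets d_W to zero
// and scatter-adds rows of d_Out. The registrations presumably instantiate float
// on XPUDeviceContext, matching the other XPU kernels in this change:

REGISTER_OP_XPU_KERNEL(
    lookup_table_v2,
    ops::LookupTableV2XPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
    lookup_table_v2_grad,
    ops::LookupTableV2GradXPUKernel<paddle::platform::XPUDeviceContext, float>);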
*/ #pragma once #include +#include #include #include "paddle/fluid/framework/eigen.h" @@ -84,6 +85,33 @@ struct RowwiseMean { framework::Tensor* vec); }; +#ifdef PADDLE_WITH_XPU +template +struct TensorSetConstantXPU { + TensorSetConstantXPU(framework::Tensor* tensor, U value) + : tensor_(tensor), value_(value) {} + template + void apply() const { + int dev_id = -1; + xpu_current_device(&dev_id); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real + // dev_id + dev_id -= 64; + } + auto xpu = platform::XPUPlace(dev_id); + auto* begin = tensor_->mutable_data(xpu); + int numel = tensor_->numel(); + std::unique_ptr data_cpu(new T[numel]); + std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast(value_)); + memory::Copy(xpu, begin, platform::CPUPlace(), + static_cast(data_cpu.get()), numel * sizeof(T)); + } + framework::Tensor* tensor_; + U value_; +}; +#endif + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 869a3054598da..d2480763dcf12 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function.h" @@ -27,8 +28,18 @@ template void SetConstant::operator()(const DeviceContext& context, framework::Tensor* tensor, T num) { - auto t = framework::EigenVector::Flatten(*tensor); - t.device(*context.eigen_device()) = t.constant(static_cast(num)); + bool xpu_place = false; +#ifdef PADDLE_WITH_XPU + if (context.GetPlace() == platform::XPUPlace()) { + xpu_place = true; + framework::VisitDataType(tensor->type(), + TensorSetConstantXPU(tensor, num)); + } +#endif + if (!xpu_place) { + auto t = framework::EigenVector::Flatten(*tensor); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); + } } template diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index 37155fa184e23..0b615cefac4ee 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_param_config.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc new file mode 100644 index 0000000000000..d6f3cc226e655 --- /dev/null +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
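// NOTE: skeleton of TensorSetConstantXPU from math_function.h above with its
// template parameters written out; presumably the struct is templated on the
// attribute type U and VisitDataType instantiates apply<T>() once per tensor
// dtype:

template <typename U>
struct TensorSetConstantXPU {
  TensorSetConstantXPU(framework::Tensor* tensor, U value)
      : tensor_(tensor), value_(value) {}
  template <typename T>
  void apply() const {
    // fill a host buffer with static_cast<T>(value_) and memory::Copy it into
    // the tensor's XPU allocation (full body as in the hunk above)
  }
  framework::Tensor* tensor_;
  U value_;
};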
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/matmul_v2_op.h" +#include +#include + +namespace paddle { +namespace operators { + +template +void MatMulXPUFunction(const Tensor* X, const Tensor* Y, + const std::vector& x_dims, + const std::vector& y_dims, Tensor* Out, + bool trans_x, bool trans_y, + const paddle::framework::ExecutionContext& ctx) { + const int x_ndim = x_dims.size(); + const int y_ndim = y_dims.size(); + + auto& dev_ctx = + ctx.template device_context(); + + // currently only support x_ndim == y_dim and non-broadcast case + PADDLE_ENFORCE_EQ(x_ndim, y_ndim, platform::errors::InvalidArgument( + "Shape mistake in matmul_v2_op")); + for (int i = 0; i < x_ndim - 2; i++) { + PADDLE_ENFORCE_EQ( + x_dims.data()[i], y_dims.data()[i], + platform::errors::InvalidArgument("Shape mistake in matmul_v2_op")); + } + + int ret = 0; + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ(X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers is not equal to Y's numbers," + "when X/Y's dims =1")); + VLOG(3) << "MatMul's case 1"; + Out->Resize({1}); + Out->mutable_data(ctx.GetPlace()); + ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, false, 1, 1, + X->numel(), 1.0f, X->data(), + Y->data(), 0.0f, Out->data()); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d] in matmul_v2, please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + return; + } + + if (x_ndim == 1) { + const int N = X->numel(); + if (trans_y) { + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 1], N, + platform::errors::InvalidArgument("Input(Y) has error dim.")); + } else { + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 2], N, + platform::errors::InvalidArgument("Input(Y) has error dim.")); + } + std::vector out_dims(y_ndim - 1); + if (trans_y) { + std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); + } else { + std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); + out_dims.back() = y_dims.back(); + } + Out->Resize(framework::make_ddim(out_dims)); + Out->mutable_data(ctx.GetPlace()); + if (trans_y) { + const int M = Y->numel() / N; + VLOG(3) << "MatMul's case 2"; + ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, true, 1, M, N, + 1.0f, X->data(), Y->data(), 0.0f, + Out->data()); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External("XPU API return wrong value[%d] in " + "matmul_v2, please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } else { + const int M = y_dims[y_ndim - 1]; + const int batch_size = Y->numel() / (M * N); + for (int i = 0; i < batch_size; i++) { + ret = baidu::xpu::api::fc_int16( + dev_ctx.x_context(), false, false, 1, M, N, 1.0f, X->data(), + Y->data() + i * M * N, 0.0f, Out->data() + i * M); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d] in matmul_v2, " + "please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + } + return; + } + + if (y_ndim == 1) { + const int N = Y->numel(); + if (trans_x) { + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 2], N, + platform::errors::InvalidArgument("Input(X) has error dim.")); + } else { + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 1], N, + platform::errors::InvalidArgument("Input(X) has error dim.")); + } + std::vector out_dims(x_ndim - 1); + if (trans_x) { + std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); + 
out_dims.back() = x_dims.back(); + } else { + std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); + } + Out->Resize(framework::make_ddim(out_dims)); + Out->mutable_data(ctx.GetPlace()); + + if (trans_x) { + const int M = x_dims[x_ndim - 1]; + const int batch_size = X->numel() / (M * N); + for (int i = 0; i < batch_size; i++) { + ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), true, false, M, 1, + N, 1.0f, X->data() + i * M * N, + Y->data(), 0.0f, + Out->data() + i * M); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d] in matmul_v2, " + "please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + } else { + const int M = X->numel() / N; + VLOG(3) << "MatMul's case 7"; + ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, false, M, 1, + N, 1.0f, X->data(), Y->data(), 0.0f, + Out->data()); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External("XPU API return wrong value[%d] in " + "matmul_v2, please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + return; + } + + const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (trans_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, platform::errors::InvalidArgument( + "Input(X) has error dim.")); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, platform::errors::InvalidArgument( + "Input(X) has error dim.")); + } + const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; + const int ndim = (std::max)(x_ndim, y_ndim); + std::vector out_broadcast_dims(ndim); + int batch_size = 1; + for (int i = 0; i < ndim - 2; i++) { + PADDLE_ENFORCE_EQ( + x_dims.data()[i], y_dims.data()[i], + platform::errors::InvalidArgument("Shape mistake in matmul_v2_op")); + out_broadcast_dims[i] = x_dims.data()[i]; + batch_size *= x_dims.data()[i]; + } + + out_broadcast_dims[ndim - 2] = M; + out_broadcast_dims[ndim - 1] = N; + + Out->Resize(framework::make_ddim(out_broadcast_dims)); + Out->mutable_data(ctx.GetPlace()); + ret = baidu::xpu::api::batched_gemm_int16( + dev_ctx.x_context(), trans_x, trans_y, batch_size, M, N, K, 1.0f, + X->data(), Y->data(), Out->data(), nullptr, nullptr); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d] in matmul_v2, please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); +} + +template +class MatMulV2XPUKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + MatMulXPUFunction(X, Y, vectorize(X->dims()), vectorize(Y->dims()), Out, + trans_x, trans_y, ctx); + } +}; + +template +class MatMulV2XPUGradKernel : public framework::OpKernel { + public: + void MatMul(const framework::ExecutionContext& context, + const framework::Tensor& a, bool trans_a, + const framework::Tensor& b, bool trans_b, + framework::Tensor* out) const { + out->mutable_data(context.GetPlace()); + MatMulXPUFunction(&a, &b, vectorize(a.dims()), vectorize(b.dims()), out, + trans_a, trans_b, context); + } + + void CalcInputGrad(const framework::ExecutionContext& context, + const framework::Tensor& a, bool trans_a, + bool is_fold_init_dims_a, const framework::Tensor& b, + bool trans_b, bool is_fold_init_dims_b, + framework::Tensor* out) const { + 
if (out == nullptr) return; + bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && + out->dims().size() == 2; + if (!need_combine) { + MatMul(context, a, trans_a, b, trans_b, out); + } else { + // currently not support this case + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + bool transpose_x = ctx.Attr("trans_x"); + bool transpose_y = ctx.Attr("trans_y"); + + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + + // get dims + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + std::vector dout_dims = vectorize(dout.dims()); + + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + auto& dev_ctx = + ctx.template device_context(); + // Case1 : x's or y's dim = 1 + int ret = 0; + if (x_ndim == 1 && y_ndim == 1) { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, false, + dx->numel(), 1, 1, 1.0f, y.data(), + dout.data(), 0.0f, dx->data()); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d] in " + "matmul_v2_grad, please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, false, + dy->numel(), 1, 1, 1.0f, x.data(), + dout.data(), 0.0f, dy->data()); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d] in " + "matmul_v2_grad, please check whether " + "Baidu Kunlun Card is properly installed.", + ret)); + } + return; + } + + bool is_broadcast = true; + if (x_ndim <= 2 || y_ndim <= 2) { + is_broadcast = false; + } else if (x_ndim != y_ndim) { + is_broadcast = true; + } else { + is_broadcast = !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, + y_dims.cbegin()); + } + + // currently only support non-broadcast case + PADDLE_ENFORCE_EQ( + is_broadcast, false, + platform::errors::InvalidArgument("Shape mistake in matmul_v2_op")); + + // Case2: no broadcast or no batch size, it aims to speed and it is same as + // matmul in old version. 
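+    // Note: in the non-broadcast path below, x, y and dout are first reshaped
+    // into plain matrices; each gradient is then just another matmul, e.g. for
+    // the untransposed case dX = dOut * Y^T and dY = X^T * dOut. The four
+    // branches cover every trans_x / trans_y combination.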
+ if (!is_broadcast) { + ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + if (transpose_x && transpose_y) { + CalcInputGrad(ctx, y, true, true, dout, true, false, dx); + CalcInputGrad(ctx, dout, true, true, x, true, false, dy); + } else if (transpose_x) { + CalcInputGrad(ctx, y, false, false, dout, true, false, dx); + CalcInputGrad(ctx, x, false, false, dout, false, true, dy); + } else if (transpose_y) { + CalcInputGrad(ctx, dout, false, false, y, false, true, dx); + CalcInputGrad(ctx, dout, true, true, x, false, true, dy); + } else { + CalcInputGrad(ctx, dout, false, false, y, true, false, dx); + CalcInputGrad(ctx, x, true, true, dout, false, true, dy); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel); +REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel); + +#endif diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc new file mode 100644 index 0000000000000..71bcc4be15ce5 --- /dev/null +++ b/paddle/fluid/operators/mean_op_xpu.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/mean_op.h" +#ifdef PADDLE_WITH_XPU +#include +#include +#include + +namespace paddle { +namespace operators { + +template +class MeanXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + const float* x_data = input->data(); + float* y_data = output->data(); + int r = xpu::mean(dev_ctx.x_context(), x_data, y_data, input->numel()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU kernel error, Mean op execution not succeed, error code=%d", + r)); + } +}; +template +class MeanGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto OG = context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(OG->numel(), 1, platform::errors::InvalidArgument( + "Mean Gradient should be scalar")); + auto IG = context.Output(framework::GradVarName("X")); + IG->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + float* dx = IG->data(); + const float* dy = OG->data(); + int r = xpu::mean_grad(dev_ctx.x_context(), dx, dy, IG->numel()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU kernel error. Mean_grad execution not succeed, error code=%d", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + mean, ops::MeanXPUKernel); +REGISTER_OP_XPU_KERNEL( + mean_grad, + ops::MeanGradXPUKernel); +#endif diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index c0aa00e79341e..d73e46df3491b 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -81,9 +81,9 @@ class AccuracyXPUKernel : public framework::OpKernel { memory::Copy(platform::CPUPlace(), label_int64_host, BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), label_data, label_int64_size); - for (int i = 0; i < num_samples; ++i) { + for (size_t i = 0; i < num_samples; ++i) { label_int32_host[i] = label_int64_host[i]; - for (int j = 0; j < class_dim; ++j) { + for (size_t j = 0; j < class_dim; ++j) { indices_int32_host[i * class_dim + j] = indices_int64_host[i * class_dim + j]; } @@ -98,7 +98,7 @@ class AccuracyXPUKernel : public framework::OpKernel { label_int32_device, num_samples, class_dim, correct_data, total_data, accuracy_data); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU kernel error!")); + platform::errors::Fatal("XPU accuracy kernel error!")); dev_ctx.Wait(); xpu_free(indices_int32_device); xpu_free(label_int32_device); diff --git a/paddle/fluid/operators/mish_op.cu b/paddle/fluid/operators/mish_op.cu index 77817e526e13d..6513e5d95e4ac 100644 --- a/paddle/fluid/operators/mish_op.cu +++ b/paddle/fluid/operators/mish_op.cu @@ -87,8 +87,9 @@ class MishCUDAKernel : public framework::OpKernel { const int numel = x->numel(); - platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx); - KeMishFw<<<<>>(x_data, out_data, numel, threshold); } @@ -108,8 +109,9 @@ class MishFP32CUDAKernel : public framework::OpKernel { const int numel = x->numel(); - platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx); - 
KeMishFwFP32<<>>(x_data, out_data, numel, threshold); } @@ -131,8 +133,9 @@ class MishGradCUDAKernel : public framework::OpKernel { const int numel = x->numel(); - platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx); - KeMishBw<<<<>>( x_data, dout_data, dx_data, numel, threshold); } @@ -154,8 +157,9 @@ class MishGradFP32CUDAKernel : public framework::OpKernel { const int numel = x->numel(); - platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx); - KeMishBwFP32<<>>( x_data, dout_data, dx_data, numel, threshold); } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 7a4e11091fd3a..f44ce8c56733a 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -193,13 +193,8 @@ class ConvMKLDNNHandlerT data_dims, strides, ksize); const bool is_conv3d = strides.size() == 3U; - PADDLE_ENFORCE_EQ( - is_conv3d - ? dilations.size() == 3 && dilations[0] == 1 && - dilations[1] == 1 && dilations[2] == 1 - : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - true, platform::errors::Unimplemented( - "Dilation in oneDNN convolution is not implemented yet")); + std::transform(dilations.begin(), dilations.end(), dilations.begin(), + [](int64_t i) { return i - 1; }); const auto src_tz = paddle::framework::vectorize(input->dims()); @@ -210,6 +205,7 @@ class ConvMKLDNNHandlerT const mkldnn::memory::dims stride_dims = strides; const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); + const mkldnn::memory::dims dilations_dims = dilations; /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -256,13 +252,13 @@ class ConvMKLDNNHandlerT this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, - src_md, weights_md, bias_md, dst_md, stride_dims, + src_md, weights_md, bias_md, dst_md, stride_dims, dilations_dims, mkldnn_paddings[0], mkldnn_paddings[1]); } else { this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, - src_md, weights_md, dst_md, stride_dims, mkldnn_paddings[0], - mkldnn_paddings[1]); + src_md, weights_md, dst_md, stride_dims, dilations_dims, + mkldnn_paddings[0], mkldnn_paddings[1]); } } } @@ -619,9 +615,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bool is_conv3d = strides.size() == 3U; PADDLE_ENFORCE_NE(is_conv3d, true, - platform::errors::InvalidArgument( - "int8 does not support conv3d currently, should " - "set param is_conv3d as False")); + platform::errors::Unimplemented( + "int8 does not support conv3d currently")); auto input_dims = input->dims(); auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); @@ -641,13 +636,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { GetWeightsTz(weights_tz, g); auto dst_tz = paddle::framework::vectorize(output->dims()); - PADDLE_ENFORCE_EQ( - is_conv3d - ? 
dilations.size() == 3 && dilations[0] == 1 && - dilations[1] == 1 && dilations[2] == 1 - : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - true, platform::errors::Unimplemented( - "dilation in convolution is not implemented yet")); + std::transform(dilations.begin(), dilations.end(), dilations.begin(), + [](int64_t i) { return i - 1; }); const K* filter_data = filter->data(); auto scale_in_data = ctx.Attr("Scale_in"); @@ -710,13 +700,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, MKLDNNMemoryFormat::x); conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, bias_md, dst_md, strides, paddings, + src_md, weights_md, bias_md, dst_md, strides, dilations, paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, propagation, output_shift_scale, sum_scale); } else { conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, boost::none, dst_md, strides, paddings, - mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, + src_md, weights_md, boost::none, dst_md, strides, dilations, + paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, propagation, output_shift_scale, sum_scale); } @@ -1019,11 +1009,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { "Fail to find conv_pd in device context")); auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); - + std::transform(dilations.begin(), dilations.end(), dilations.begin(), + [](int64_t i) { return i - 1; }); + const mkldnn::memory::dims dilations_dims = dilations; // create backward convolution weights primitive descriptor auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( mkldnn::algorithm::convolution_direct, src_md, diff_weights_md, - diff_dst_md, strides, mkldnn_paddings[0], mkldnn_paddings[1]); + diff_dst_md, strides, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); auto conv_bwd_weights_pd = std::make_shared( @@ -1032,7 +1025,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // create backward convolution data primitive descriptor auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( mkldnn::algorithm::convolution_direct, diff_src_md, weights_md, - diff_dst_md, strides, mkldnn_paddings[0], mkldnn_paddings[1]); + diff_dst_md, strides, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); auto conv_bwd_data_pd = std::make_shared( diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 56537900216a8..e9f32e7ac25d8 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -104,6 +104,11 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { int groups = ctx.Attr("groups"); std::string padding_algorithm = ctx.Attr("padding_algorithm"); + PADDLE_ENFORCE_EQ( + strides.size(), 2, + platform::errors::Unimplemented( + "Now we only support 2d oneDNN convolution transpose op")); + auto input_dims = input->dims(); auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); auto filter_dims = filter->dims(); @@ -115,10 +120,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); - PADDLE_ENFORCE( - dilations.size() == 2 && dilations[0] 
== 1 && dilations[1] == 1, - platform::errors::Unimplemented( - "dilation in convolution is not implemented yet")); + std::transform(dilations.begin(), dilations.end(), dilations.begin(), + [](int64_t i) { return i - 1; }); const T* input_data = input->data(); const T* filter_data = filter->data(); @@ -210,11 +213,12 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - fuse_activation, fuse_alpha, fuse_beta, false, fwd_prop_kind); + src_md, weights_md, bias_md, dst_md, strides, dilations, paddings, + mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, false, + fwd_prop_kind); } else { conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, boost::none, dst_md, strides, paddings, + src_md, weights_md, boost::none, dst_md, strides, dilations, paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, false, fwd_prop_kind); } diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 94c9fc2d9742b..165d402342162 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -53,12 +53,27 @@ class MultinomialOp : public framework::OperatorWithKernel { auto x_dim = ctx->GetInputDim("X"); int64_t x_rank = x_dim.size(); + PADDLE_ENFORCE_GT(x_rank, 0, + platform::errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be > 0, but got %d.", + x_rank)); + PADDLE_ENFORCE_LE(x_rank, 2, + platform::errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be <= 2, but got %d.", + x_rank)); + std::vector out_dims(x_rank); for (int64_t i = 0; i < x_rank - 1; i++) { out_dims[i] = x_dim[i]; } int64_t num_samples = ctx->Attrs().Get("num_samples"); + PADDLE_ENFORCE_GT( + num_samples, 0, + platform::errors::InvalidArgument( + "The number of samples should be > 0, but got %d.", num_samples)); out_dims[x_rank - 1] = num_samples; ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/multinomial_op.cu b/paddle/fluid/operators/multinomial_op.cu index 2762f0ce9bd46..92f7c992ed976 100644 --- a/paddle/fluid/operators/multinomial_op.cu +++ b/paddle/fluid/operators/multinomial_op.cu @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/multinomial_op.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/transform.h" namespace paddle { @@ -31,6 +32,14 @@ __global__ void NormalizeProbability(T* norm_probs, const T* in_data, T* sum_rows) { int id = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; + PADDLE_ENFORCE( + in_data[id] >= 0.0, + "The input of multinomial distribution should be >= 0, but got %f.", + in_data[id]); + PADDLE_ENFORCE(sum_rows[blockIdx.y] > 0.0, + "The sum of one multinomial distribution probability should " + "be > 0, but got %f.", + sum_rows[blockIdx.y]); norm_probs[id] = in_data[id] / sum_rows[blockIdx.y]; } diff --git a/paddle/fluid/operators/multinomial_op.h b/paddle/fluid/operators/multinomial_op.h index 420d2cd11e37d..14cfbd268389e 100644 --- a/paddle/fluid/operators/multinomial_op.h +++ b/paddle/fluid/operators/multinomial_op.h @@ -44,28 +44,29 @@ void MultinomialFunctor(int64_t* out_data, const T* in_data, int64_t num_zeros = 0; for (int64_t j = 0; j < num_categories; j++) { prob_value = in_data[i * num_categories + j]; - PADDLE_ENFORCE_GE( - prob_value, 0.0, - platform::errors::OutOfRange( - "The input of multinomial distribution should be >= 0")); - PADDLE_ENFORCE_EQ((std::isinf(static_cast(prob_value)) || - std::isnan(static_cast(prob_value))), - false, platform::errors::OutOfRange( - "The input of multinomial distribution " - "shoud not be infinity or NaN")); + PADDLE_ENFORCE_GE(prob_value, 0.0, + platform::errors::InvalidArgument( + "The input of multinomial distribution " + "should be >= 0, but got %f.", + prob_value)); + probs_sum += prob_value; if (prob_value == 0) { num_zeros += 1; } cumulative_probs[j] = probs_sum; } - PADDLE_ENFORCE_GT(probs_sum, 0.0, platform::errors::OutOfRange( - "The sum of input should not be 0")); + PADDLE_ENFORCE_GT(probs_sum, 0.0, + platform::errors::InvalidArgument( + "The sum of one multinomial distribution " + "probability should be > 0, but got %f.", + probs_sum)); PADDLE_ENFORCE_EQ( (replacement || (num_categories - num_zeros >= num_samples)), true, - platform::errors::OutOfRange("When replacement is False, number of " - "samples should be less than non-zero " - "categories")); + platform::errors::InvalidArgument( + "When replacement is False, number of " + "samples should be less than non-zero " + "categories.")); for (int64_t j = 0; j < num_categories; j++) { cumulative_probs[j] /= probs_sum; diff --git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu index b6d829392e3ba..ee638ede22b64 100644 --- a/paddle/fluid/operators/mv_op.cu +++ b/paddle/fluid/operators/mv_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mv_op.h" -#include "paddle/fluid/platform/gpu_launch_param_config.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 8748078109f16..3357db8454227 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -135,7 +135,12 @@ class NCEKernel : public framework::OpKernel { alias_data, alias_probs_data, seed); break; } - default: { PADDLE_THROW("Unsupported SamplerType."); } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported SamplerType. 
SamplerType should be 0: Uniform, " + "1: LogUniform or 2: CostumDist. Received SamplerType: %d", + sampler_type)); + } } PrepareSamples(context, sampler); @@ -225,9 +230,9 @@ class NCEKernel : public framework::OpKernel { weight, false, table_names, epmap, context, local_scope); #else - PADDLE_THROW( + PADDLE_THROW(platform::errors::PreconditionNotMet( "paddle is not compiled with distribute support, can not do " - "parameter prefetch!"); + "parameter prefetch!")); #endif auto weight_mat = EigenMatrix::From( @@ -347,7 +352,12 @@ class NCEGradKernel : public framework::OpKernel { alias_data, alias_probs_data, seed); break; } - default: { PADDLE_THROW("Unsupported SamplerType."); } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported SamplerType. SamplerType should be 0: Uniform, " + "1: LogUniform or 2: CostumDist. Received SamplerType: %d", + sampler_type)); + } } // T b = 1. / num_total_classes * num_neg_samples; @@ -409,9 +419,9 @@ class NCEGradKernel : public framework::OpKernel { auto *table_t = context.Input("Weight"); table_dim = table_t->value().dims(); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::InvalidArgument( "The parameter Weight of a NCE_OP " - "must be either LoDTensor or SelectedRows"); + "must be either LoDTensor or SelectedRows")); } auto d_w = context.Output(framework::GradVarName("Weight")); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index e3da79125be24..21249d2375a4a 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -71,7 +71,8 @@ class AdadeltaOp : public framework::OperatorWithKernel { auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Grad"), - "param and grad input of AdadeltaOp should have same dimension"); + platform::errors::InvalidArgument( + "Param and grad input of AdadeltaOp should have same dimension.")); PADDLE_ENFORCE_NE( framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 5373fe15f6d9a..0713237561b65 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -38,7 +38,8 @@ __global__ void AdamKernelREG(T beta1, T beta2, T epsilon, T beta1_pow_, T mom2 = moment2[id]; mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon)); + p -= lr * (mom1 / + (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); moment1_out[id] = mom1; moment2_out[id] = mom2; @@ -68,7 +69,8 @@ __global__ void AdamKernelMEM(T beta1, T beta2, T epsilon, const T* beta1_pow_, T mom2 = moment2[id]; mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon)); + p -= lr * (mom1 / + (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); moment1_out[id] = mom1; moment2_out[id] = mom2; @@ -105,7 +107,8 @@ __global__ void SparseAdamCUDAKernelREG( T g = row_idx >= 0 ? 
grad_[row_idx * row_numel + id % row_numel] : 0; mom1 = beta1 * mom1 + (1 - beta1) * g; mom2 = beta2 * mom2 + (1 - beta2) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon)); + p -= lr * (mom1 / (sqrt(mom2) + + epsilon * sqrt(static_cast(1.0) - beta2_pow))); // Write back to global memory mom1_out_[id] = mom1; diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 24e383c87122a..c8b28aed24e8c 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -36,6 +36,10 @@ static inline float GetAttrFromTensor(const framework::Tensor* tensor) { TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); tensor_data = cpu_tensor.data(); } + if (platform::is_xpu_place(tensor->place())) { + TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + tensor_data = cpu_tensor.data(); + } return tensor_data[0]; } @@ -109,7 +113,7 @@ class AdamFunctor { mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); // Write back to global memory moment1_out_[i] = mom1; @@ -181,7 +185,9 @@ class AdamFunctor { moment1_out = beta1_ * mom1 + (1 - beta1_) * g; moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g; - param_out = param - lr * (moment1_out / (moment2_out.sqrt() + epsilon_)); + param_out = param - + lr * (moment1_out / + (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow))); } }; @@ -249,7 +255,7 @@ class SparseAdamFunctor { mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); // Write back to global memory moment1_out_[i] = mom1; @@ -328,7 +334,7 @@ class SparseAdamFunctor { mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); // Write back to global memory moment1_out_[i] = mom1; diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc new file mode 100644 index 0000000000000..05b4544c02a12 --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/optimizers/adam_op.h" +#include + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +#ifdef PADDLE_WITH_XPU +template +class AdamOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + using paddle::framework::LoDTensor; + + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", + "Param", "Adam"); + // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto* grad_var = ctx.InputVar("Grad"); + auto& mom1 = GET_DATA_SAFELY(ctx.Input("Moment1"), "Input", + "Moment1", "Adam"); + auto& mom2 = GET_DATA_SAFELY(ctx.Input("Moment2"), "Input", + "Moment2", "Adam"); + auto& lr = GET_DATA_SAFELY(ctx.Input("LearningRate"), "Input", + "LearningRate", "Adam"); + auto& beta1_pow = GET_DATA_SAFELY(ctx.Input("Beta1Pow"), "Input", + "Beta1Pow", "Adam"); + auto& beta2_pow = GET_DATA_SAFELY(ctx.Input("Beta2Pow"), "Input", + "Beta2Pow", "Adam"); + + auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), + "Output", "ParamOut", "Adam"); + auto& mom1_out = GET_DATA_SAFELY(ctx.Output("Moment1Out"), + "Output", "Moment1Out", "Adam"); + auto& mom2_out = GET_DATA_SAFELY(ctx.Output("Moment2Out"), + "Output", "Moment2Out", "Adam"); + + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "Tensor holds the wrong size, Expected beta1 pow " + "output size is 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "Tensor holds the wrong size, Expected beta2 pow " + "output size is 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + T beta1 = static_cast(ctx.Attr("beta1")); + if (ctx.HasInput("Beta1Tensor")) { + auto* beta1_tensor = ctx.Input("Beta1Tensor"); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } + T beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { + auto* beta2_tensor = ctx.Input("Beta2Tensor"); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } + if (grad_var->IsType()) { + auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", + "Grad", "Adam"); + + auto& dev_ctx = ctx.template device_context(); + int r = xpu::adam( + dev_ctx.x_context(), grad.template data(), mom1.template data(), + mom2.template data(), param.template data(), + beta1_pow.template data(), beta2_pow.template data(), beta1, + beta2, epsilon, lr.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), param.numel()); + + const float* ptr0 = beta1_pow.template data(); + float* ptr1 = beta1_pow_out->mutable_data(ctx.GetPlace()); + float cpudata; + xpu_memcpy(&cpudata, ptr0, sizeof(float), XPU_DEVICE_TO_HOST); + cpudata = cpudata * beta1; + xpu_memcpy(ptr1, &cpudata, sizeof(float), XPU_HOST_TO_DEVICE); + + const float* ptr2 = beta2_pow.template data(); + float* ptr3 = beta2_pow_out->mutable_data(ctx.GetPlace()); + float cpudata1; + 
xpu_memcpy(&cpudata1, ptr2, sizeof(float), XPU_DEVICE_TO_HOST); + cpudata1 = cpudata1 * beta2; + xpu_memcpy(ptr3, &cpudata1, sizeof(float), XPU_HOST_TO_DEVICE); + + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); + } else { + PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( + "Variable type not supported by adam_op")); + } + } +}; +#endif + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +#ifdef PADDLE_WITH_XPU +REGISTER_OP_XPU_KERNEL( + adam, ops::AdamOpXPUKernel); +#endif diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h index e52a1dd9db179..688a7f1ad8435 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.h +++ b/paddle/fluid/operators/optimizers/dpsgd_op.h @@ -50,8 +50,14 @@ class DpsgdOpKernel : public framework::OpKernel { auto *param_out = ctx.Output("ParamOut"); auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz); - PADDLE_ENFORCE_EQ(grad->numel(), sz); + PADDLE_ENFORCE_EQ(param->numel(), sz, + platform::errors::InvalidArgument( + "Input parameter's number of elements is error, " + "expected %zu, but received %zu.")); + PADDLE_ENFORCE_EQ(grad->numel(), sz, + platform::errors::InvalidArgument( + "Input gradient's number of elements is error, " + "expected %zu, but received %zu.")); const T *lr = learning_rate->data(); const T *param_data = param->data(); diff --git a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc new file mode 100644 index 0000000000000..932368e810edd --- /dev/null +++ b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/operators/optimizers/sgd_op.h" +namespace paddle { +namespace operators { + +template +class MomentumOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto learning_rate = ctx.Input("LearningRate"); + auto param = ctx.Input("Param"); + auto param_out = ctx.Output("ParamOut"); + auto* velocity = ctx.Input("Velocity"); + auto velocity_out = ctx.Output("VelocityOut"); + param_out->mutable_data(ctx.GetPlace()); + velocity_out->mutable_data(ctx.GetPlace()); + auto* lr = learning_rate->data(); + + auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Param & Grad in " + "MomentumOp-XPU. 
Excepted " + "LodTensor, But received [%s] and [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); + + auto grad = ctx.Input("Grad"); + + auto& dev_ctx = ctx.template device_context(); + int r = xpu::momentum( + dev_ctx.x_context(), param->data(), velocity->data(), + grad->data(), lr, use_nesterov, mu, param_out->numel(), + param_out->data(), velocity_out->data()); + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of MomentumOp, error message: INVALID_PARAM, " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of MomentumOp, error message: RUNTIME_ERROR, " + "please check whether Baidu Kunlun card is properly installed.")); + } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of MomentumOp, error message: " + "NO_ENOUGH_WORKSPACE, XPU has no enough memory.")); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + momentum, + ops::MomentumOpXPUKernel); +#endif diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc new file mode 100644 index 0000000000000..9dabca1b66a77 --- /dev/null +++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/optimizers/sgd_op.h" +#include +namespace paddle { +namespace operators { + +template +class SGDOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *learning_rate = ctx.Input("LearningRate"); + + const auto *param_var = ctx.InputVar("Param"); + const auto *grad_var = ctx.InputVar("Grad"); + + if (param_var->IsType() && + grad_var->IsType()) { + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + // Actually, all tensors are LoDTensor except SelectedRows. + const auto *grad = ctx.Input("Grad"); + auto sz = param_out->numel(); + PADDLE_ENFORCE_EQ(param->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Param's numel of SgdOp " + "should be equal with ParamOut's numel. " + "But received Param's " + "numel = [%s], ParamOut's numel = [%s]", + param->numel(), sz)); + PADDLE_ENFORCE_EQ(grad->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Grad's numel of SgdOp " + "should be equal with ParamOut's numel. 
" + "But received Grad's " + "numel = [%s], ParamOut's numel = [%s]", + grad->numel(), sz)); + + const T *lr = learning_rate->data(); + const T *param_data = param->data(); + const T *grad_data = grad->data(); + T *out_data = param_out->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + int r = xpu::sgd(dev_ctx.x_context(), sz, grad_data, param_data, lr, + out_data); + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of SgdOp, error message: INVALID_PARAM, " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of SgdOp, error message: " + "RUNTIME_ERROR, please check whether Baidu " + "Kunlun Card is properly installed.")); + } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of SgdOp, error " + "message: NO_ENOUGH_WORKSPACE, XPU " + "has no enough memory.")); + } + } else { + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Param & Grad in " + "SgdOp-XPU. Excepted " + "LodTensor, But received [%s] and [%s]", + paddle::framework::ToTypeName(param_var->Type()))); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + sgd, ops::SGDOpXPUKernel); +#endif diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc new file mode 100644 index 0000000000000..325b73593892c --- /dev/null +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/pool_op.h" +#include + +#ifdef PADDLE_WITH_XPU +namespace paddle { +namespace operators { + +xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive, + bool is_test) { + if (pooltype == "max") { + return xpu::Pooling_t::MAX_WITHOUT_INDEX; + } else if (pooltype == "avg") { + if (exclusive) { + return xpu::Pooling_t::AVG_WITHOUT_PAD; + } else { + return xpu::Pooling_t::AVG_WITH_PAD; + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Pool op only supports 2D and 3D input.")); + } +} +template +class PoolXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + std::string pooling_type = context.Attr("pooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + bool exclusive = context.Attr("exclusive"); + bool is_test = context.Attr("is_test"); + bool adaptive = context.Attr("adaptive"); + PADDLE_ENFORCE_EQ( + !adaptive, true, + platform::errors::InvalidArgument( + "The Pool2d XPU OP does not support adaptive == true!")); + PADDLE_ENFORCE_EQ( + ksize.size(), 2, + platform::errors::InvalidArgument( + "The Pool2d XPU OP only support 2 dimension pooling!")); + int* index_data = nullptr; + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + const int c = in_x->dims()[0] * in_x->dims()[1]; + const int in_h = in_x->dims()[2]; + const int in_w = in_x->dims()[3]; + const int out_h = out->dims()[2]; + const int out_w = out->dims()[3]; + const int win_h = ksize[0]; + const int win_w = ksize[1]; + const int stride_h = strides[0]; + const int stride_w = strides[1]; + const int pad_up = paddings[0]; + const int pad_down = paddings[0]; + const int pad_left = paddings[1]; + const int pad_right = paddings[1]; + const float* input = in_x->data(); + out->mutable_data(context.GetPlace()); + float* output = out->data(); + xpu::Pooling_t pool_type = XPUPoolingType(pooling_type, exclusive, is_test); + auto& dev_ctx = context.template device_context(); + int r = xpu::pooling_forward( + dev_ctx.x_context(), input, output, index_data, pool_type, c, in_h, + in_w, pad_left, pad_right, pad_up, pad_down, win_h, win_w, stride_h, + stride_w, out_h, out_w); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The pool2d XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); + } +}; +template +class PoolGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + std::string pooling_type = context.Attr("pooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); + const int* index_data = nullptr; + PADDLE_ENFORCE_EQ( + !adaptive, true, + platform::errors::InvalidArgument( + "The Pool2d XPU OP does not support adaptive == true!")); 
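+    // As in the forward kernel, only non-adaptive 2-D pooling is supported.
+    // The input gradient buffer is zero-initialized with xpu::memset further
+    // below before xpu::pooling_backward accumulates into it.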
+ PADDLE_ENFORCE_EQ(ksize.size(), 2, platform::errors::InvalidArgument( + "The Pool2d XPU OP only support 2 " + "dimension pooling!, but received " + "%d-dimension pool kernel size", + ksize.size())); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + if (!in_x_grad) { + return; + } + const int c = in_x->dims()[0] * in_x->dims()[1]; + const int in_h = in_x->dims()[2]; + const int in_w = in_x->dims()[3]; + const int out_h = out->dims()[2]; + const int out_w = out->dims()[3]; + const int win_h = ksize[0]; + const int win_w = ksize[1]; + const int stride_h = strides[0]; + const int stride_w = strides[1]; + const int pad_up = paddings[0]; + const int pad_down = paddings[0]; + const int pad_left = paddings[1]; + const int pad_right = paddings[1]; + const float* input = in_x->data(); + const float* output = out->data(); + const float* output_grad = out_grad->data(); + in_x_grad->mutable_data(context.GetPlace()); + float* input_grad = in_x_grad->data(); + xpu::Pooling_t pool_type = XPUPoolingType(pooling_type, exclusive, false); + auto& dev_ctx = context.template device_context(); + // Need to init memory in the first place + const int zero = 0; + int r = + xpu::memset(dev_ctx.x_context(), reinterpret_cast(input_grad), + zero, in_x_grad->numel() * sizeof(float)); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The Pool2d XPU OP return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); + r = xpu::pooling_backward(dev_ctx.x_context(), input, output, index_data, + output_grad, input_grad, pool_type, c, in_h, in_w, + pad_left, pad_right, pad_up, pad_down, win_h, + win_w, stride_h, stride_w, out_h, out_w); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The Pool2d XPU OP return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + pool2d, ops::PoolXPUKernel); +REGISTER_OP_XPU_KERNEL( + pool2d_grad, + ops::PoolGradXPUKernel); + +#endif diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc index 176d20cfe1d78..5b62edda247ab 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_sparse_op.cc @@ -21,10 +21,14 @@ class PullBoxSparseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_GE(ctx->Inputs("Ids").size(), 1UL, - "Inputs(Ids) of PullBoxSparseOp should not be empty."); - PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - "Outputs(Out) of PullBoxSparseOp should not be empty."); + PADDLE_ENFORCE_GE( + ctx->Inputs("Ids").size(), 1UL, + platform::errors::InvalidArgument( + "Inputs(Ids) of PullBoxSparseOp should not be empty.")); + PADDLE_ENFORCE_GE( + ctx->Outputs("Out").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(Out) of PullBoxSparseOp should not be empty.")); auto hidden_size = static_cast(ctx->Attrs().Get("size")); auto all_ids_dim = ctx->GetInputsDim("Ids"); const size_t n_ids = all_ids_dim.size(); @@ -34,9 +38,10 @@ class PullBoxSparseOp : public framework::OperatorWithKernel { const auto ids_dims = all_ids_dim[i]; int ids_rank = ids_dims.size(); 
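+      // Each Ids tensor is expected to be a column of ids: its last dimension
+      // must be 1, and the output shape is the ids shape without that last
+      // dimension, with the "size" attribute (hidden_size) appended.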
PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, - "Shape error in %lu id, the last dimension of the " - "'Ids' tensor must be 1.", - i); + platform::errors::InvalidArgument( + "Shape error in %lu id, the last dimension of the " + "'Ids' tensor must be 1.", + i)); auto out_dim = framework::vectorize( framework::slice_ddim(ids_dims, 0, ids_rank - 1)); out_dim.push_back(hidden_size); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc new file mode 100644 index 0000000000000..b82ecbbe2fcdc --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include +#include +#include + +namespace paddle { +namespace operators { +template +class ReduceMeanXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(context.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on XPU.")); + // bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + int ndim = input->dims().size(); + std::vector idims; + for (int i = 0; i < input->dims().size(); i++) { + idims.push_back(input->dims()[i]); + } + auto dims = context.Attr>("dim"); + int rdim = dims.size(); + int r = + xpu::reduce(dev_ctx.x_context(), input->data(), output->data(), + idims.data(), ndim, dims.data(), rdim, xpu::REDUCE_MEAN); + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External("XPU kernel error!")); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP_XPU_KERNEL( + reduce_mean, + ops::ReduceMeanXPUKernel); + +#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc new file mode 100644 index 0000000000000..b751eca9ee0bc --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
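+
+// reduce_sum for the XPU place. The forward kernel either sums the whole
+// tensor (reduce_all) or reduces the listed dims via xpu::reduce with
+// xpu::REDUCE_SUM; the backward kernel only handles reduce_all or a single
+// reduce dim, broadcasting dOut back through xpu::reduce_grad with an
+// (outer, reduce, inner) shape.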
+ +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" +#include +#include +namespace paddle { +namespace operators { + +template +class ReduceSumXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(context.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on XPU.")); + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (reduce_all) { + int input_len = input->numel(); + int r = xpu::sum(dev_ctx.x_context(), input->data(), output->data(), + input_len); + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External("XPU kernel error!")); + } else { + int ndim = input->dims().size(); + std::vector idims; + for (int i = 0; i < input->dims().size(); i++) { + idims.push_back(input->dims()[i]); + } + auto dims = context.Attr>("dim"); + int rdim = dims.size(); + int r = + xpu::reduce(dev_ctx.x_context(), input->data(), output->data(), + idims.data(), ndim, dims.data(), rdim, xpu::REDUCE_SUM); + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External("XPU kernel error!")); + } + } +}; +template +class ReduceSumGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto dims = context.Attr>("dim"); + bool reduce_all = context.Attr("reduce_all"); + auto* input0 = context.Input("X"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + const auto* input2_d = input2->data(); + auto* output_d = output->data(); + auto& dev_ctx = context.template device_context(); + int r = 0; + std::vector idims; + int reduce_dim = 0; + if (reduce_all) { + idims.push_back(input0->numel()); + idims.push_back(1); + idims.push_back(1); + r = xpu::reduce_grad(dev_ctx.x_context(), input2_d, output_d, + idims.data(), idims.size(), &reduce_dim, 1, + xpu::REDUCE_SUM); + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External("XPU kernel error!")); + } else if (dims.size() == 1) { + // handle reduce by one dimension + int reduce_dim_index = dims[0]; + if (reduce_dim_index < 0) { + reduce_dim_index += input0->dims().size(); + } + auto& input_dim = input0->dims(); + int before_dim = 1; + for (int i = 0; i < reduce_dim_index; ++i) { + before_dim *= input_dim[i]; + } + int reduce_dim = input_dim[reduce_dim_index]; + int after_dim = 1; + for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) { + after_dim *= input_dim[i]; + } + idims.push_back(before_dim); + idims.push_back(input_dim[reduce_dim_index]); + idims.push_back(after_dim); + reduce_dim = 1; + r = xpu::reduce_grad(dev_ctx.x_context(), input2_d, output_d, + idims.data(), idims.size(), &reduce_dim, 1, + xpu::REDUCE_SUM); + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External("XPU kernel error!")); + } else { + PADDLE_THROW( + platform::errors::Unimplemented("unsupport reduce sum grad")); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_XPU_KERNEL( + reduce_sum, + ops::ReduceSumXPUKernel); +REGISTER_OP_XPU_KERNEL( + reduce_sum_grad, + ops::ReduceSumGradXPUKernel); + +#endif diff --git 
a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 05bb37ee421ff..aa8e39037062e 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -621,15 +621,18 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::Reshape2DoubleGradOp, REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t, ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, - ops::ReshapeKernel, int64_t, ops::ReshapeKernel); + ops::ReshapeKernel, int64_t, ops::ReshapeKernel, + bool, ops::ReshapeKernel); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, int64_t, + ops::ReshapeDoubleGradKernel, bool, ops::ReshapeDoubleGradKernel); #ifdef PADDLE_WITH_CUDA @@ -641,15 +644,17 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, + ops::ReshapeGradKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, plat::float16, - ops::ReshapeKernel); + ops::ReshapeKernel, bool, ops::ReshapeKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, + ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad_grad, float, @@ -657,6 +662,7 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, plat::float16, + ops::ReshapeDoubleGradKernel, bool, ops::ReshapeDoubleGradKernel); #endif @@ -664,10 +670,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad_grad, float, REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, plat::float16, - ops::ReshapeKernel); + ops::ReshapeKernel, bool, ops::ReshapeKernel); REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, + ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc new file mode 100644 index 0000000000000..2c3bfdbc16b4d --- /dev/null +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/roi_align_op.h" +#include +#include + +namespace paddle { +namespace operators { + +template +class XPUROIAlignOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto& dev_ctx = ctx.template device_context(); + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + const T* input_data = in->data(); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The rois_batch_size and imgs batch_size of roi_align_xpu OP must " + "be the same. But received rois_batch_size %d , batch_size %d", + rois_batch_size, batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, rois_num_with_lod, + platform::errors::InvalidArgument( + "The rois_num from input and lod of roi_align_xpu OP must be the " + "same. But received input rois_num %d , input lod %d", + rois_num, rois_num_with_lod)); + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* rois_data = rois->data(); + for (int n = 0; n < rois_batch_size; n++) { + int cur_batch_rois_num = rois_lod[n + 1] - rois_lod[n]; + if (cur_batch_rois_num != 0) { + int r = xpu::roi_align( + dev_ctx.x_context(), input_data + n * channels * height * width, + rois_data + rois_lod[n] * 4, cur_batch_rois_num, channels, height, + width, pooled_height, pooled_width, sampling_ratio, spatial_scale, + output_data + + rois_lod[n] * channels * pooled_height * pooled_width); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The roi_align XPU OP return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); + } + } + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + roi_align, + ops::XPUROIAlignOpKernel); + +#endif diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index c2a58b4199f32..f619f3d59cece 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -89,6 +89,7 @@ REGISTER_OP_CPU_KERNEL( save, ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, + ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel); diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu index 0a778a694e52f..5c8c5a7545beb 100644 --- a/paddle/fluid/operators/save_op.cu +++ b/paddle/fluid/operators/save_op.cu @@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL( save, ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, + ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel { int r = xpu::scale(dev_ctx.x_context(), in->numel(), scale, bias, bias_after_scale, in->data(), out->data()); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU kernel error!")); + platform::errors::Fatal("XPU scale kernel error!")); } }; diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu 
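For the XPU ROI Align kernel added above, everything in the per-image loop is derived from the LoD of the ROIs tensor: rois_lod[n] is the index of the first box belonging to image n, so it offsets both the box coordinates (four floats per box) and the output slice, while the feature map is offset by whole images. A hedged standalone sketch of that offset arithmetic (illustrative names, assuming an input of shape [N, C, H, W] and an output of shape [num_rois, C, PH, PW]):

    #include <cstddef>
    #include <vector>

    struct RoiBatchSlice {
      size_t num_rois;       // boxes belonging to image n
      size_t rois_offset;    // offset into rois_data, in boxes (4 floats each)
      size_t input_offset;   // offset into input_data, in elements
      size_t output_offset;  // offset into output_data, in elements
    };

    RoiBatchSlice SliceForImage(const std::vector<size_t>& rois_lod, size_t n,
                                size_t C, size_t H, size_t W, size_t PH, size_t PW) {
      RoiBatchSlice s;
      s.num_rois = rois_lod[n + 1] - rois_lod[n];
      s.rois_offset = rois_lod[n];
      s.input_offset = n * C * H * W;
      s.output_offset = rois_lod[n] * C * PH * PW;
      return s;
    }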
index dc92d7fcc3a87..379a07a26dd5c 100644 --- a/paddle/fluid/operators/segment_pool_op.cu +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/segment_pool_op.h" #include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_param_config.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc index 44fd555544e7f..86fe826c659ef 100644 --- a/paddle/fluid/operators/sign_op_xpu.cc +++ b/paddle/fluid/operators/sign_op_xpu.cc @@ -30,7 +30,7 @@ class SignXPUKernel : public framework::OpKernel { int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN, in->numel(), in->data(), out->data()); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU kernel error!")); + platform::errors::Fatal("XPU sign kernel error!")); } }; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index d147ec3e407b0..eff56046b9a01 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -29,10 +29,12 @@ class SliceOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - "Input (Input) of slice op should not be null."); + platform::errors::InvalidArgument( + "Input (Input) of slice op should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - "Output (Out) of slice op should not be null."); + platform::errors::InvalidArgument( + "Output (Out) of slice op should not be null.")); auto x_var_type = ctx->GetInputsVarType("Input")[0]; auto axes = ctx->Attrs().Get>("axes"); if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { @@ -57,7 +59,8 @@ class SliceOp : public framework::OperatorWithKernel { } auto in_dims = ctx->GetInputDim("Input"); PADDLE_ENFORCE_LT(in_dims.size(), 7, - "The rank of input should be less than 7."); + platform::errors::InvalidArgument( + "The rank of input should be less than 7.")); framework::DDim out_dims(in_dims); auto starts = ctx->Attrs().Get>("starts"); @@ -76,31 +79,37 @@ class SliceOp : public framework::OperatorWithKernel { if (ctx->HasInputs("StartsTensorList")) { auto StartsTensorList = ctx->Inputs("StartsTensorList"); PADDLE_ENFORCE_GT(StartsTensorList.size(), 0, - "StartsTensorList size can't be zero"); + platform::errors::InvalidArgument( + "StartsTensorList size can't be zero")); starts_size = StartsTensorList.size(); } if (ctx->HasInputs("EndsTensorList")) { auto EndsTensorList = ctx->Inputs("EndsTensorList"); PADDLE_ENFORCE_GT(EndsTensorList.size(), 0, - "EndsTensorList size can't be zero"); + platform::errors::InvalidArgument( + "EndsTensorList size can't be zero")); ends_size = EndsTensorList.size(); } if (ctx->HasInput("StartsTensor") == false) { PADDLE_ENFORCE_EQ( starts_size, axes.size(), - "The size of starts must be equal to the size of axes."); + platform::errors::InvalidArgument( + "The size of starts must be equal to the size of axes.")); } if (ctx->HasInput("EndsTensor") == false) { - PADDLE_ENFORCE_EQ(ends_size, axes.size(), - "The size of ends must be equal to the size of axes."); + PADDLE_ENFORCE_EQ( + ends_size, axes.size(), + platform::errors::InvalidArgument( + "The size of ends must be equal to the size of axes.")); } int dim_value, start, 
end; for (size_t i = 0; i < axes.size(); ++i) { PADDLE_ENFORCE_LT(static_cast(axes[i]), in_dims.size(), - "The index of dimension in axes must be less " - "than the size of input shape."); + platform::errors::InvalidArgument( + "The index of dimension in axes must be less " + "than the size of input shape.")); if (infer_flags[i] == -1) { out_dims[axes[i]] = -1; } else { @@ -112,7 +121,8 @@ class SliceOp : public framework::OperatorWithKernel { start = std::max(start, 0); end = std::max(end, 0); end = std::min(end, dim_value); - PADDLE_ENFORCE_GT(end, start, "end should greater than start"); + PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument( + "end should greater than start")); out_dims[axes[i]] = end - start; } } @@ -122,8 +132,9 @@ class SliceOp : public framework::OperatorWithKernel { std::vector new_out_shape; for (size_t i = 0; i < decrease_axis.size(); ++i) { if (ctx->IsRuntime() && infer_flags[i] != -1) { - PADDLE_ENFORCE_EQ(out_dims[decrease_axis[i]], 1, - "decrease dim should be 1"); + PADDLE_ENFORCE_EQ( + out_dims[decrease_axis[i]], 1, + platform::errors::InvalidArgument("decrease dim should be 1")); } out_dims[decrease_axis[i]] = 0; } @@ -284,9 +295,12 @@ class SliceOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, "Input should not be null"); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Input"), true, + platform::errors::InvalidArgument("Input should not be null")); PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - "Input(Out@GRAD) should not be null"); + platform::errors::InvalidArgument( + "Input(Out@GRAD) should not be null")); auto x_var_type = ctx->GetInputsVarType("Input")[0]; if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { // If the var type of input is LOD_TENSOR_ARRAY, diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index ee46f4d821c78..4de5c1f7508c3 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -191,8 +191,9 @@ class SliceKernel : public framework::OpKernel { if (decrease_axis.size() > 0) { std::vector new_out_shape; for (size_t i = 0; i < decrease_axis.size(); ++i) { - PADDLE_ENFORCE_EQ(out_dims[decrease_axis[i]], 1, - "decrease dim should be 1"); + PADDLE_ENFORCE_EQ( + out_dims[decrease_axis[i]], 1, + platform::errors::InvalidArgument("decrease dim should be 1")); out_dims[decrease_axis[i]] = 0; } diff --git a/paddle/fluid/operators/slice_xpu_op.cc b/paddle/fluid/operators/slice_xpu_op.cc new file mode 100644 index 0000000000000..3d6f52c7dc31f --- /dev/null +++ b/paddle/fluid/operators/slice_xpu_op.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include +#include +#include +#include +#include "paddle/fluid/operators/slice_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SliceXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto in = ctx.Input("Input"); + auto out = ctx.Output("Out"); + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + auto in_dims = in->dims(); + + // prepare starts, ends on XPU + int dim_value = 0, start = 0, end = 0; + // If a negative value is passed for any of the start or end indices, + // it represents number of elements before the end of that dimension. + // If the value passed to start or end is larger than the n + // (the number of elements in this dimension), it represents n. + for (size_t i = 0; i < axes.size(); ++i) { + dim_value = in_dims[axes[i]]; + start = starts[i]; + end = ends[i]; + start = start < 0 ? (start + dim_value) : start; + end = end < 0 ? (end + dim_value) : end; + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim_value); + PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument( + "end should greater than start")); + starts[i] = start; + ends[i] = end; + } + size_t shape_size = in_dims.size(); + // the slice XPU kernel require that the length of `start`, `end` must be + // equal + // to the dims size of input tensor, therefore, if shape_size > axes.size(), + // the `starts_extension` and `ends_extension` is necessary. + std::vector starts_extension(shape_size, 0); + std::vector ends_extension(shape_size, 0); + if (shape_size > axes.size()) { + for (size_t i = 0; i < shape_size; ++i) { + ends_extension[i] = in_dims[i]; + } + for (size_t i = 0; i < axes.size(); ++i) { + starts_extension[axes[i]] = starts[i]; + ends_extension[axes[i]] = ends[i]; + } + } else { + starts_extension = std::move(starts); + ends_extension = std::move(ends); + } + + // prepare shape on XPU + std::vector shape(shape_size, 0); + for (size_t i = 0; i < shape_size; ++i) { + shape[i] = in_dims[i]; + } + + auto& dev_ctx = ctx.template device_context(); + auto* in_data = in->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); + + int r = xpu::slice_forward(dev_ctx.x_context(), shape.data(), + starts_extension.data(), ends_extension.data(), + shape_size, in_data, out_data); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU slice kernel error!")); + } +}; + +template +class SliceGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_in = ctx.Output(framework::GradVarName("Input")); + d_in->mutable_data(ctx.GetPlace()); + + auto in_dims = d_in->dims(); + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + // prepare starts, ends on XPU + int dim_value = 0, start = 0, end = 0; + // If a negative value is passed for any of the start or end indices, + // it represents number of elements before the end of that dimension. + // If the value passed to start or end is larger than the n + // (the number of elements in this dimension), it represents n. + for (size_t i = 0; i < axes.size(); ++i) { + dim_value = in_dims[axes[i]]; + start = starts[i]; + end = ends[i]; + start = start < 0 ? (start + dim_value) : start; + end = end < 0 ? 
(end + dim_value) : end; + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim_value); + PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument( + "end should greater than start")); + starts[i] = start; + ends[i] = end; + } + size_t shape_size = in_dims.size(); + // the slice XPU kernel require that the length of `start`, `end` must be + // equal + // to the dims size of input tensor, therefore, if shape_size > axes.size(), + // the `starts_extension` and `ends_extension` is necessary. + std::vector starts_extension(shape_size, 0); + std::vector ends_extension(shape_size, 0); + if (shape_size > axes.size()) { + for (size_t i = 0; i < shape_size; ++i) { + ends_extension[i] = in_dims[i]; + } + for (size_t i = 0; i < axes.size(); ++i) { + starts_extension[axes[i]] = starts[i]; + ends_extension[axes[i]] = ends[i]; + } + } + int* starts_device = nullptr; + int* ends_device = nullptr; + int* starts_host = + shape_size > axes.size() ? starts_extension.data() : starts.data(); + int* ends_host = + shape_size > axes.size() ? ends_extension.data() : ends.data(); + PADDLE_ENFORCE_EQ( + xpu_malloc((void**)(&starts_device), shape_size * sizeof(int)), + XPU_SUCCESS, platform::errors::External("XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc((void**)(&ends_device), shape_size * sizeof(int)), + XPU_SUCCESS, platform::errors::External("XPU has no enough memory")); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + starts_device, platform::CPUPlace(), starts_host, + shape_size * sizeof(int)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + ends_device, platform::CPUPlace(), ends_host, + shape_size * sizeof(int)); + + // prepare shape on XPU + std::vector shape(shape_size, 0); + for (size_t i = 0; i < shape_size; ++i) { + shape[i] = in_dims[i]; + } + int* shape_device = nullptr; + PADDLE_ENFORCE_EQ( + xpu_malloc((void**)(&shape_device), shape_size * sizeof(int)), + XPU_SUCCESS, platform::errors::External("XPU has no enough memory")); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + shape_device, platform::CPUPlace(), shape.data(), + shape_size * sizeof(int)); + + auto& dev_ctx = ctx.template device_context(); + int r = + xpu::slice_backward(dev_ctx.x_context(), shape_device, starts_device, + ends_device, shape_size, d_out->data(), + d_in->data(), d_in->numel(), d_out->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("xpu slice kernel error")); + dev_ctx.Wait(); + // free device data + xpu_free(shape_device); + xpu_free(starts_device); + xpu_free(ends_device); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + slice, ops::SliceXPUKernel); +REGISTER_OP_XPU_KERNEL( + slice_grad, + ops::SliceGradXPUKernel); +#endif diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc new file mode 100644 index 0000000000000..f4f6eb9cdc82d --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
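The comments in the slice XPU kernels above describe one shared preprocessing step: negative starts/ends are wrapped by the dimension size, values are clamped into [0, dim], and, because the XPU primitive expects one (start, end) pair per input dimension, axes that are not sliced are padded with (0, dim). A small standalone sketch of that normalization (plain C++, assuming std::vector inputs; not the Paddle helper itself):

    #include <algorithm>
    #include <stdexcept>
    #include <vector>

    void NormalizeSlice(const std::vector<int>& shape, const std::vector<int>& axes,
                        const std::vector<int>& starts, const std::vector<int>& ends,
                        std::vector<int>* full_starts, std::vector<int>* full_ends) {
      full_starts->assign(shape.size(), 0);
      *full_ends = shape;  // untouched axes keep their whole extent
      for (size_t i = 0; i < axes.size(); ++i) {
        int dim = shape[axes[i]];
        int start = starts[i] < 0 ? starts[i] + dim : starts[i];
        int end = ends[i] < 0 ? ends[i] + dim : ends[i];
        start = std::max(start, 0);
        end = std::min(std::max(end, 0), dim);
        if (end <= start) throw std::invalid_argument("end should be greater than start");
        (*full_starts)[axes[i]] = start;
        (*full_ends)[axes[i]] = end;
      }
    }
    // e.g. shape = {4, 6}, axes = {1}, starts = {-3}, ends = {100}
    //      -> full_starts = {0, 3}, full_ends = {4, 6}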
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#ifdef PADDLE_WITH_XPU +#include +#include +#include +#include + +namespace paddle { +namespace operators { + +template +class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(context.GetPlace()), true, + platform::errors::PreconditionNotMet("This kernel only runs on XPU.")); + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Label"); + Tensor* softmax = context.Output("Softmax"); + Tensor* loss = context.Output("Loss"); + const int rank = logits->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( + "axis should == rank - 1")); + softmax->mutable_data(context.GetPlace()); + loss->mutable_data(context.GetPlace()); + const int n = SizeToAxis(axis, logits->dims()); + const int d = SizeFromAxis(axis, logits->dims()); + // softmax + auto& dev_ctx = + context.template device_context(); + int r = xpu::softmax2d_forward(dev_ctx.x_context(), logits->data(), + softmax->data(), n, d); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error. Softmax2d_forward " + "execution not succeed, error code=%d", + r)); + // cross_entropy + auto ignore_index = context.Attr("ignore_index"); + const bool soft_label = context.Attr("soft_label"); + if (soft_label) { + PADDLE_THROW(platform::errors::InvalidArgument( + "XPU only support soft_label == false for now!")); + } else { + auto* p_labels = labels->data(); + int64_t* labels_int64_host = + reinterpret_cast(std::malloc(n * sizeof(int64_t))); + int* labels_int32_host = + reinterpret_cast(std::malloc(n * sizeof(int))); + int* labels_int32_device = NULL; + int ret = xpu_malloc(reinterpret_cast(&labels_int32_device), + n * sizeof(int)); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + ret)); + dev_ctx.Wait(); + memory::Copy(platform::CPUPlace(), labels_int64_host, + BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), + p_labels, n * sizeof(int64_t)); + for (int i = 0; i < n; ++i) { + labels_int32_host[i] = labels_int64_host[i]; + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), + labels_int32_device, platform::CPUPlace(), labels_int32_host, + n * sizeof(int)); + int r = xpu::cross_entropy_forward( + dev_ctx.x_context(), n, d, softmax->data(), + labels_int32_device, loss->data(), nullptr, ignore_index); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error. 
Cross_entropy_forward " + "execution not succeed, error code=%d", + r)); + dev_ctx.Wait(); + std::free(labels_int32_host); + std::free(labels_int64_host); + xpu_free(labels_int32_device); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyXPUKernel); +#endif diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index 3d8ab9bbd8617..c9dca9b9ace7a 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -31,51 +31,76 @@ class SpaceToDepthOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SpaceToDepthOp should not be null."); + platform::errors::InvalidArgument( + "Input(X) of SpaceToDepthOp should not be null.")); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SpaceToDepthOp should not be null."); + platform::errors::InvalidArgument( + "Output(Out) of SpaceToDepthOp should not be null.")); auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, "input should be a 4D tensor"); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( + "input should be a 4D tensor")); auto blocksize = ctx->Attrs().Get("blocksize"); - PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be Greater than 1"); + PADDLE_ENFORCE_GT(blocksize, 1, + platform::errors::InvalidArgument( + "The blocksize should be Greater than 1")); if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0"); - PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0"); - PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0"); - - PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0, - "input channel should be divisible of the square of " - "SpaceToDepthOp blocksize"); + PADDLE_ENFORCE_GT(x_dims[1], 0, + platform::errors::InvalidArgument( + "input channel should be Greater than 0")); + PADDLE_ENFORCE_GT(x_dims[2], 0, + platform::errors::InvalidArgument( + "input Height should be Greater than 0")); + PADDLE_ENFORCE_GT(x_dims[3], 0, + platform::errors::InvalidArgument( + "input Width should be Greater than 0")); + + PADDLE_ENFORCE_EQ( + x_dims[1] % (blocksize * blocksize), 0, + platform::errors::InvalidArgument( + "input channel should be divisible of the square of " + "SpaceToDepthOp blocksize")); PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0, - "input Height should be divisible of the square of " - "SpaceToDepthOp blocksize"); + platform::errors::InvalidArgument( + "input Height should be divisible of the square of " + "SpaceToDepthOp blocksize")); PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0, - "input Width should be divisible of the square of " - "SpaceToDepthOp blocksize"); + platform::errors::InvalidArgument( + "input Width should be divisible of the square of " + "SpaceToDepthOp blocksize")); } else { if (x_dims[1] != -1) { PADDLE_ENFORCE_GT(x_dims[1], 0, - "input channel should be Greater than 0"); - PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0, - "input channel should be divisible of the square of " - "SpaceToDepthOp blocksize"); + platform::errors::InvalidArgument( + "input channel should be Greater than 0")); + PADDLE_ENFORCE_EQ( + x_dims[1] % (blocksize * blocksize), 0, + platform::errors::InvalidArgument( + "input channel should be divisible of the square of " + 
"SpaceToDepthOp blocksize")); } if (x_dims[2] != -1) { PADDLE_ENFORCE_GT(x_dims[2], 0, - "input Height should be Greater than 0"); - PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0, - "input Height should be divisible of the square of " - "SpaceToDepthOp blocksize"); + platform::errors::InvalidArgument( + "input Height should be Greater than 0")); + PADDLE_ENFORCE_EQ( + x_dims[2] % (blocksize), 0, + platform::errors::InvalidArgument( + "input Height should be divisible of the square of " + "SpaceToDepthOp blocksize")); } if (x_dims[3] != -1) { - PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0"); - - PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0, - "input Width should be divisible of the square of " - "SpaceToDepthOp blocksize"); + PADDLE_ENFORCE_GT(x_dims[3], 0, + platform::errors::InvalidArgument( + "input Width should be Greater than 0")); + + PADDLE_ENFORCE_EQ( + x_dims[3] % (blocksize), 0, + platform::errors::InvalidArgument( + "input Width should be divisible of the square of " + "SpaceToDepthOp blocksize")); } } @@ -156,9 +181,11 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), platform::errors::InvalidArgument( + "Input(X) shouldn't be null.")); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); + platform::errors::InvalidArgument( + "Input(Out@GRAD) shouldn't be null.")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 0157f0635b844..0151778075de0 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -25,9 +25,11 @@ class SplitOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - "Input(X) of SplitOp should not be null."); + platform::errors::InvalidArgument( + "Input(X) of SplitOp should not be null.")); PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - "Outputs(Out) of SplitOp should not be empty."); + platform::errors::InvalidArgument( + "Outputs(Out) of SplitOp should not be empty.")); auto in_dims = ctx->GetInputDim("X"); auto outs_names = ctx->Outputs("Out"); size_t axis = static_cast(ctx->Attrs().Get("axis")); @@ -37,9 +39,10 @@ class SplitOp : public framework::OperatorWithKernel { const size_t outs_number = outs_names.size(); if (sections.size() > 0) { - PADDLE_ENFORCE_EQ(sections.size(), outs_number, - "tensor split sections size " - "should be equal to output size."); + PADDLE_ENFORCE_EQ( + sections.size(), outs_number, + platform::errors::InvalidArgument("tensor split sections size " + "should be equal to output size.")); } if (ctx->HasInput("AxisTensor")) { diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index 48ad5efa2c105..ceba0dfddf0f5 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -33,12 +33,14 @@ static inline std::vector UpdateOutsDims( int64_t input_axis_dim = in_dims[axis]; if (num > 0) { if (is_runtime || input_axis_dim > 0) { - PADDLE_ENFORCE_EQ(input_axis_dim % num, 0, - "The input's size along the split dimension " - "must be evenly divisible by Attr(num_or_sections). 
" - "But received Attr(num_or_sections) " - "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", - num, in_dims, axis); + PADDLE_ENFORCE_EQ( + input_axis_dim % num, 0, + platform::errors::InvalidArgument( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). " + "But received Attr(num_or_sections) " + "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", + num, in_dims, axis)); size_t out_axis_dim = input_axis_dim / num; for (auto& out_dim : outs_dims) { @@ -64,11 +66,13 @@ static inline std::vector UpdateOutsDims( } if (each_section_is_known) { - PADDLE_ENFORCE_LE(num_of_unk, 1, - "Only one dimension value of Attr(num_or_sections) " - "in SplitOp can be -1. " - "But received Attr(num_or_sections) = [%s].", - framework::make_ddim(sections)); + PADDLE_ENFORCE_LE( + num_of_unk, 1, + platform::errors::InvalidArgument( + "Only one dimension value of Attr(num_or_sections) " + "in SplitOp can be -1. " + "But received Attr(num_or_sections) = [%s].", + framework::make_ddim(sections))); } if (unk_dim_idx != -1) { @@ -77,21 +81,25 @@ static inline std::vector UpdateOutsDims( // the following check will fail. PADDLE_ENFORCE_LT( sum_of_section, input_axis_dim, - "Sum of Attr(num_or_sections) other than unknown section " - "must be less than the input's size " - "along the split dimension. But received Attr(num_or_sections) " - "= [%s], input(X)'s shape = [%s], Attr(dim) = %d.", - framework::make_ddim(sections), in_dims, axis); + platform::errors::InvalidArgument( + "Sum of Attr(num_or_sections) other than unknown section " + "must be less than the input's " + "size " + "along the split dimension. But received Attr(num_or_sections) " + "= [%s], input(X)'s shape = [%s], Attr(dim) = %d.", + framework::make_ddim(sections), in_dims, axis)); if (each_section_is_known) { sections[unk_dim_idx] = input_axis_dim - sum_of_section; } } else { PADDLE_ENFORCE_EQ( sum_of_section, input_axis_dim, - "Sum of Attr(num_or_sections) must be equal to the input's size " - "along the split dimension. But received Attr(num_or_sections)" - " = [%s], input(X)'s shape = [%s], Attr(dim) = %d.", - framework::make_ddim(sections), in_dims, axis); + platform::errors::InvalidArgument( + "Sum of Attr(num_or_sections) must be equal to the input's " + "size " + "along the split dimension. But received Attr(num_or_sections)" + " = [%s], input(X)'s shape = [%s], Attr(dim) = %d.", + framework::make_ddim(sections), in_dims, axis)); } } for (int i = 0; i < outs_number; ++i) { diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index b3784d3c032d9..4800f5f9eb533 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -16,7 +16,7 @@ #include #include #include "paddle/fluid/operators/stack_op.h" -#include "paddle/fluid/platform/gpu_launch_param_config.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace plat = paddle::platform; namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc new file mode 100644 index 0000000000000..175bb94c70bea --- /dev/null +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/stack_op.h" +#include +#ifdef PADDLE_WITH_XPU + +namespace paddle { +namespace operators { + +using framework::Tensor; +template +class StackXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.MultiInput("X"); + auto* y = ctx.Output("Y"); + int axis = ctx.Attr("axis"); + if (axis < 0) { + axis += (x[0]->dims().size() + 1); + } + int n = static_cast(x.size()); + PADDLE_ENFORCE_LE(n, 24, + platform::errors::InvalidArgument( + "XPU only surpport at most 24 tensors for now")); + auto* y_data = y->mutable_data(ctx.GetPlace()); + int pre = 1, post = 1; + auto& dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) { + pre *= dim[i]; + } + for (auto i = axis; i < dim.size(); ++i) { + post *= dim[i]; + } + auto& dev_ctx = ctx.template device_context(); + void* x_datas_host = std::malloc(n * sizeof(void*)); + void* x_datas_device = nullptr; + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&x_datas_device), + n * sizeof(void*)), + XPU_SUCCESS, + platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU, Cannot" + "allocate %s memory on XPU. \n\nPlease " + "check whether there is any other process " + "using XPU.\n", + string::HumanReadableSize(n * sizeof(void*)))); + for (auto i = 0; i < n; ++i) { + ((const void**)x_datas_host)[i] = x[i]->data(); + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + x_datas_device, platform::CPUPlace(), x_datas_host, + n * sizeof(void*)); + int r = xpu::stack_forward(dev_ctx.x_context(), pre, post, n, + x_datas_device, y_data); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The stack XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); + dev_ctx.Wait(); + std::free(x_datas_host); + xpu_free(x_datas_device); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL(stack, + ops::StackXPUKernel); +#endif diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc index 14928061d23dd..f15910fd4f65b 100644 --- a/paddle/fluid/operators/sum_op_xpu.cc +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -51,7 +51,7 @@ class SumXPUKernel : public framework::OpKernel { int r = xpu::sum_batch(dev_ctx.x_context(), ptrs.data(), out->data(), valid_count, out->numel()); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU kernel error!")); + platform::errors::Fatal("XPU sum kernel error!")); } }; diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index d3f9754d307c6..cce5ad2631733 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -36,7 +36,9 @@ class TopkOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); const int k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); + PADDLE_ENFORCE_GE(k, 1, + platform::errors::InvalidArgument( + "Attribute k must 
be >= 1, but got k is %d.", k)); PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( "input must have >= 1d shape")); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 0a694e1ad5b01..39a56f874d950 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -96,7 +96,8 @@ class TopkOpCUDAKernel : public framework::OpKernel { output_data, k, indices_data, input_data, input_width, input_width, static_cast(k), gridx, input_height)); default: - PADDLE_THROW("Error"); + PADDLE_THROW(platform::errors::Unavailable( + "Calculation error occurred in TopK Operator's CUDA Kernel.")); } } }; diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 0e3fcced19ea8..810afc901df57 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -23,20 +23,18 @@ class TopkV2Op : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of TopkOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of TopkOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Indices"), - "Output(Indices) of TopkOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2"); + OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2"); auto input_dims = ctx->GetInputDim("X"); const int& dim_size = input_dims.size(); int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true, - "the axis of topk" - "must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), true, + paddle::platform::errors::InvalidArgument( + "the axis of topk must be [-%d, %d), but you set axis is %d", + dim_size, dim_size, axis)); if (axis < 0) axis += dim_size; @@ -47,18 +45,22 @@ class TopkV2Op : public framework::OperatorWithKernel { } else { k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_EQ(k >= 1, true, - "the attribute of k in the topk must >= 1 or be a " - "Tensor, but received %d .", - k); + paddle::platform::errors::InvalidArgument( + "the attribute of k in the topk must >= 1 or be a " + "Tensor, but received %d .", + k)); } PADDLE_ENFORCE_GE(input_dims.size(), 1, - "input of topk must have >= 1d shape"); + paddle::platform::errors::InvalidArgument( + "input of topk must have >= 1d shape")); if (ctx->IsRuntime()) { PADDLE_ENFORCE_GE( input_dims[axis], k, - "input of topk op must have >= %d columns in axis of %d", k, axis); + paddle::platform::errors::InvalidArgument( + "input of topk op must have >= %d columns in axis of %d", k, + axis)); } framework::DDim dims = input_dims; diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu index 2c94dca1e3a46..a2c97aee92a1a 100644 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ b/paddle/fluid/operators/top_k_v2_op.cu @@ -38,8 +38,10 @@ template class TopkV2OpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument( + 
"It must use CUDAPlace, you must check your device set.")); auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); @@ -194,7 +196,8 @@ class TopkV2OpGradCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ( platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); + platform::errors::InvalidArgument( + "It must use CUDAPlace, you must check your device set.")); auto* x = context.Input("X"); auto* out_grad = context.Input(framework::GradVarName("Out")); auto* indices = context.Input("Indices"); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu index 79dd29ebc691c..0679668cf1b5a 100644 --- a/paddle/fluid/operators/transpose_op.cu +++ b/paddle/fluid/operators/transpose_op.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_launch_param_config.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/transpose_op_xpu.cc b/paddle/fluid/operators/transpose_op_xpu.cc new file mode 100644 index 0000000000000..c7ecf2ebfaa8c --- /dev/null +++ b/paddle/fluid/operators/transpose_op_xpu.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/transpose_op.h" +#include +#include +#include + +namespace paddle { +namespace operators { + +using framework::Tensor; + +bool XPUSupported(int ndims, const std::vector& axis) { + /* + * XPU currently support: + * permute = {0, 2, 1}, permute = {1, 0}, + * permute = {0, 2, 1, 3}, permute = {1, 0, 2}, + * permute = {0, 2, 3, 1} + */ + bool is_supported = false; + std::vector permute_10(2, 0); + std::vector permute_102(3, 0); + std::vector permute_021(3, 0); + std::vector permute_210(3, 0); + std::vector permute_0213(4, 0); + std::vector permute_0231(4, 0); + std::vector permute_0312(4, 0); + std::vector permute_3201(4, 0); + permute_10[0] = 1; + permute_102[0] = 1; + permute_102[2] = 2; + permute_021[1] = 2; + permute_021[2] = 1; + permute_210[0] = 2; + permute_210[1] = 1; + permute_0213[1] = 2; + permute_0213[2] = 1; + permute_0213[3] = 3; + permute_0231[1] = 2; + permute_0231[2] = 3; + permute_0231[3] = 1; + permute_0312[1] = 3; + permute_0312[2] = 1; + permute_0312[3] = 2; + permute_3201[0] = 3; + permute_3201[1] = 2; + permute_3201[3] = 1; + switch (ndims) { + case 2: + if (axis == permute_10) { + is_supported = true; + } + break; + case 3: + if ((axis == permute_021) || (axis == permute_102) || + (axis == permute_210)) { + is_supported = true; + } + break; + case 4: + if ((axis == permute_0213) || (axis == permute_0231) || + (axis == permute_0312) || (axis == permute_3201)) { + is_supported = true; + } + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Tensors with rank only 2, 3 and 4 are supported on XPU")); + } + return is_supported; +} + +template +class TransposeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto x = context.Input("X"); + auto out = context.Output("Out"); + // axis is permute + auto axis = context.Attr>("axis"); + int ndims = axis.size(); + const auto x_dims = x->dims(); + + const T* x_data = x->data(); + T* y_data = out->mutable_data(context.GetPlace()); + if (!XPUSupported(ndims, axis)) { + VLOG(0) << "XPU does not support the permute, try to do on cpu"; + framework::Tensor x_cpu; + framework::Tensor out_cpu; + auto x_cpu_data = x_cpu.mutable_data(x->dims(), platform::CPUPlace()); + auto out_cpu_data = + out_cpu.mutable_data(out->dims(), platform::CPUPlace()); + memory::Copy(platform::CPUPlace(), reinterpret_cast(x_cpu_data), + BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), + (const void*)x_data, x->numel() * sizeof(T)); + + const platform::CPUDeviceContext* cpu_dev_ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CPUPlace())); + TransCompute(ndims, *cpu_dev_ctx, x_cpu, + &out_cpu, axis); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), + reinterpret_cast(y_data), platform::CPUPlace(), + (const void*)out_cpu_data, out->numel() * sizeof(T)); + return; + } + + std::vector x_shape_host(ndims, 0); + for (int i = 0; i < ndims; ++i) { + x_shape_host[i] = x_dims[i]; + } + int* permute_host = axis.data(); + auto& dev_ctx = context.template device_context(); + int r = xpu::transpose(dev_ctx.x_context(), x_data, y_data, + x_shape_host.data(), permute_host, ndims); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error! 
error code=%d", r)); + } +}; + +template +class TransposeGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* x_grad = + context.Output(framework::GradVarName("X")); + if (!x_grad) return; + + x_grad->mutable_data(context.GetPlace()); + std::vector axis = context.Attr>("axis"); + std::vector reversed_axis(axis); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + int ndims = axis.size(); + if (!XPUSupported(ndims, reversed_axis)) { + PADDLE_THROW( + platform::errors::Unimplemented("XPU does not support the permute")); + } + + std::vector out_shape_host(ndims, 0); + for (int i = 0; i < ndims; ++i) { + out_shape_host[i] = out_grad->dims()[i]; + } + int* permute_host = reversed_axis.data(); + auto& dev_ctx = context.template device_context(); + int r = xpu::transpose(dev_ctx.x_context(), out_grad->data(), + x_grad->data(), out_shape_host.data(), + permute_host, ndims); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error! error code=%d", r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + transpose, + ops::TransposeXPUKernel); +REGISTER_OP_XPU_KERNEL( + transpose_grad, + ops::TransposeGradXPUKernel); +REGISTER_OP_XPU_KERNEL( + transpose2, + ops::TransposeXPUKernel); +REGISTER_OP_XPU_KERNEL( + transpose2_grad, + ops::TransposeGradXPUKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 419f0f7a2a578..b9f7ba3092248 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -14,146 +14,15 @@ limitations under the License. 
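In the transpose XPU grad kernel above, the backward pass reuses the forward transpose with the inverse permutation, built as reversed_axis[axis[i]] = i. A tiny standalone sketch of that inversion (plain C++, illustrative name):

    #include <vector>

    // If the forward transpose used `axis`, the gradient transposes with the
    // inverse permutation: inverse[axis[i]] = i.
    std::vector<int> InversePermutation(const std::vector<int>& axis) {
      std::vector<int> inverse(axis.size());
      for (size_t i = 0; i < axis.size(); ++i) inverse[axis[i]] = static_cast<int>(i);
      return inverse;
    }
    // e.g. axis = {0, 2, 3, 1} -> inverse = {0, 3, 1, 2}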
*/ #include #include +#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { namespace operators { -// reference: https://gist.github.com/lakshayg/d80172fe5ae3c5d2c2aedb53c250320e -template -T Erfinv(T x) { - if (x < -1 || x > 1) { - return std::numeric_limits::quiet_NaN(); - } else if (x == 1.0) { - return std::numeric_limits::infinity(); - } else if (x == -1.0) { - return -std::numeric_limits::infinity(); - } - - const T LN2 = 6.931471805599453094172321214581e-1; - - const T A0 = 1.1975323115670912564578e0; - const T A1 = 4.7072688112383978012285e1; - const T A2 = 6.9706266534389598238465e2; - const T A3 = 4.8548868893843886794648e3; - const T A4 = 1.6235862515167575384252e4; - const T A5 = 2.3782041382114385731252e4; - const T A6 = 1.1819493347062294404278e4; - const T A7 = 8.8709406962545514830200e2; - - const T B0 = 1.0000000000000000000e0; - const T B1 = 4.2313330701600911252e1; - const T B2 = 6.8718700749205790830e2; - const T B3 = 5.3941960214247511077e3; - const T B4 = 2.1213794301586595867e4; - const T B5 = 3.9307895800092710610e4; - const T B6 = 2.8729085735721942674e4; - const T B7 = 5.2264952788528545610e3; - - const T C0 = 1.42343711074968357734e0; - const T C1 = 4.63033784615654529590e0; - const T C2 = 5.76949722146069140550e0; - const T C3 = 3.64784832476320460504e0; - const T C4 = 1.27045825245236838258e0; - const T C5 = 2.41780725177450611770e-1; - const T C6 = 2.27238449892691845833e-2; - const T C7 = 7.74545014278341407640e-4; - - const T D0 = 1.4142135623730950488016887e0; - const T D1 = 2.9036514445419946173133295e0; - const T D2 = 2.3707661626024532365971225e0; - const T D3 = 9.7547832001787427186894837e-1; - const T D4 = 2.0945065210512749128288442e-1; - const T D5 = 2.1494160384252876777097297e-2; - const T D6 = 7.7441459065157709165577218e-4; - const T D7 = 1.4859850019840355905497876e-9; - - const T E0 = 6.65790464350110377720e0; - const T E1 = 5.46378491116411436990e0; - const T E2 = 1.78482653991729133580e0; - const T E3 = 2.96560571828504891230e-1; - const T E4 = 2.65321895265761230930e-2; - const T E5 = 1.24266094738807843860e-3; - const T E6 = 2.71155556874348757815e-5; - const T E7 = 2.01033439929228813265e-7; - - const T F0 = 1.414213562373095048801689e0; - const T F1 = 8.482908416595164588112026e-1; - const T F2 = 1.936480946950659106176712e-1; - const T F3 = 2.103693768272068968719679e-2; - const T F4 = 1.112800997078859844711555e-3; - const T F5 = 2.611088405080593625138020e-5; - const T F6 = 2.010321207683943062279931e-7; - const T F7 = 2.891024605872965461538222e-15; - - T abs_x = abs(x); - - if (abs_x <= 0.85) { - T r = 0.180625 - 0.25 * x * x; - T num = - (((((((A7 * r + A6) * r + A5) * r + A4) * r + A3) * r + A2) * r + A1) * - r + - A0); - T den = - (((((((B7 * r + B6) * r + B5) * r + B4) * r + B3) * r + B2) * r + B1) * - r + - B0); - return x * num / den; - } - - T r = sqrt(LN2 - log(1.0 - abs_x)); - - T num, den; - if (r <= 5.0) { - r = r - 1.6; - num = - (((((((C7 * r + C6) * r + C5) * r + C4) * r + C3) * r + C2) * r + C1) * - r + - C0); - den = - (((((((D7 * r + D6) * r + D5) * r + D4) * r + D3) * r + D2) * r + D1) * - r + - D0); - } else { - r = r - 5.0; - num = - (((((((E7 * r + E6) * r + E5) * r + E4) * r + E3) * r + E2) * r + E1) * - r + - E0); - den = - (((((((F7 * r + F6) * r + F5) * r + F4) * r + F3) * r + F2) * r + F1) * - r + - F0); - } - - if (x < 0) { - return -num / den; - } else { - return num / den; 
- } -} - -template -struct TruncatedNormal { - T mean, std; - T a_normal_cdf; - T b_normal_cdf; - TruncatedNormal(T mean, T std) : mean(mean), std(std) { - auto normal_cdf = [](T x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf(-2.0); - b_normal_cdf = normal_cdf(2.0); - } - - T operator()(T value) const { - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; - } -}; - template class CPUTruncatedGaussianRandomKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.h b/paddle/fluid/operators/truncated_gaussian_random_op.h new file mode 100644 index 0000000000000..a6ff2f686cb76 --- /dev/null +++ b/paddle/fluid/operators/truncated_gaussian_random_op.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// reference: https://gist.github.com/lakshayg/d80172fe5ae3c5d2c2aedb53c250320e +template +T Erfinv(T x) { + if (x < -1 || x > 1) { + return std::numeric_limits::quiet_NaN(); + } else if (x == 1.0) { + return std::numeric_limits::infinity(); + } else if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + + const T LN2 = 6.931471805599453094172321214581e-1; + + const T A0 = 1.1975323115670912564578e0; + const T A1 = 4.7072688112383978012285e1; + const T A2 = 6.9706266534389598238465e2; + const T A3 = 4.8548868893843886794648e3; + const T A4 = 1.6235862515167575384252e4; + const T A5 = 2.3782041382114385731252e4; + const T A6 = 1.1819493347062294404278e4; + const T A7 = 8.8709406962545514830200e2; + + const T B0 = 1.0000000000000000000e0; + const T B1 = 4.2313330701600911252e1; + const T B2 = 6.8718700749205790830e2; + const T B3 = 5.3941960214247511077e3; + const T B4 = 2.1213794301586595867e4; + const T B5 = 3.9307895800092710610e4; + const T B6 = 2.8729085735721942674e4; + const T B7 = 5.2264952788528545610e3; + + const T C0 = 1.42343711074968357734e0; + const T C1 = 4.63033784615654529590e0; + const T C2 = 5.76949722146069140550e0; + const T C3 = 3.64784832476320460504e0; + const T C4 = 1.27045825245236838258e0; + const T C5 = 2.41780725177450611770e-1; + const T C6 = 2.27238449892691845833e-2; + const T C7 = 7.74545014278341407640e-4; + + const T D0 = 1.4142135623730950488016887e0; + const T D1 = 2.9036514445419946173133295e0; + const T D2 = 2.3707661626024532365971225e0; + const T D3 = 9.7547832001787427186894837e-1; + const T D4 = 2.0945065210512749128288442e-1; + const T D5 = 2.1494160384252876777097297e-2; + const T D6 = 7.7441459065157709165577218e-4; + const T D7 = 1.4859850019840355905497876e-9; + + const T E0 = 6.65790464350110377720e0; + const T E1 = 5.46378491116411436990e0; + const T E2 = 1.78482653991729133580e0; + const T E3 = 2.96560571828504891230e-1; + const T E4 = 
2.65321895265761230930e-2; + const T E5 = 1.24266094738807843860e-3; + const T E6 = 2.71155556874348757815e-5; + const T E7 = 2.01033439929228813265e-7; + + const T F0 = 1.414213562373095048801689e0; + const T F1 = 8.482908416595164588112026e-1; + const T F2 = 1.936480946950659106176712e-1; + const T F3 = 2.103693768272068968719679e-2; + const T F4 = 1.112800997078859844711555e-3; + const T F5 = 2.611088405080593625138020e-5; + const T F6 = 2.010321207683943062279931e-7; + const T F7 = 2.891024605872965461538222e-15; + + T abs_x = abs(x); + + if (abs_x <= 0.85) { + T r = 0.180625 - 0.25 * x * x; + T num = + (((((((A7 * r + A6) * r + A5) * r + A4) * r + A3) * r + A2) * r + A1) * + r + + A0); + T den = + (((((((B7 * r + B6) * r + B5) * r + B4) * r + B3) * r + B2) * r + B1) * + r + + B0); + return x * num / den; + } + + T r = sqrt(LN2 - log(1.0 - abs_x)); + + T num, den; + if (r <= 5.0) { + r = r - 1.6; + num = + (((((((C7 * r + C6) * r + C5) * r + C4) * r + C3) * r + C2) * r + C1) * + r + + C0); + den = + (((((((D7 * r + D6) * r + D5) * r + D4) * r + D3) * r + D2) * r + D1) * + r + + D0); + } else { + r = r - 5.0; + num = + (((((((E7 * r + E6) * r + E5) * r + E4) * r + E3) * r + E2) * r + E1) * + r + + E0); + den = + (((((((F7 * r + F6) * r + F5) * r + F4) * r + F3) * r + F2) * r + F1) * + r + + F0); + } + + if (x < 0) { + return -num / den; + } else { + return num / den; + } +} + +template +struct TruncatedNormal { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + + T operator()(T value) const { + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc new file mode 100644 index 0000000000000..b2ff91a37451e --- /dev/null +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
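The TruncatedNormal functor moved into the header above is a standard inverse-CDF sampler: a uniform value u in (0, 1) is rescaled into the CDF interval [normal_cdf(-2), normal_cdf(2)] and mapped back through the inverse normal CDF, x = mean + std * sqrt(2) * erfinv(2*p - 1) with p = normal_cdf(-2) + (normal_cdf(2) - normal_cdf(-2)) * u, which keeps every sample within two standard deviations of the mean. A hedged standalone sketch of the same transform that replaces the Erfinv polynomial with a simple bisection on std::erf (slower, but dependency-free):

    #include <cmath>
    #include <random>

    double SampleTruncatedNormal(double mean, double stddev, std::mt19937& gen) {
      auto normal_cdf = [](double x) { return 0.5 * (1.0 + std::erf(x / std::sqrt(2.0))); };
      const double lo_cdf = normal_cdf(-2.0), hi_cdf = normal_cdf(2.0);
      std::uniform_real_distribution<double> uniform(0.0, 1.0);
      const double p = lo_cdf + (hi_cdf - lo_cdf) * uniform(gen);
      double lo = -2.0, hi = 2.0;      // the target quantile lies in [-2, 2]
      for (int i = 0; i < 60; ++i) {   // bisection on the standard normal CDF
        const double mid = 0.5 * (lo + hi);
        (normal_cdf(mid) < p ? lo : hi) = mid;
      }
      return mean + stddev * 0.5 * (lo + hi);
    }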
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); + TruncatedNormal truncated_normal(mean, std); + int64_t size = tensor->numel(); + + unsigned int seed = static_cast(context.Attr("seed")); + // TODO(pangyoki): implement GetXPURandomEngine to set different seeds on + // corresponding XPU device. + auto engine = framework::GetCPURandomEngine(seed); + + std::unique_ptr data_cpu(new T[size]); + + for (int64_t i = 0; i < size; ++i) { + data_cpu[i] = truncated_normal(dist(*engine)); + } + + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), data, + platform::CPUPlace(), reinterpret_cast(data_cpu.get()), + size * sizeof(T)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(truncated_gaussian_random, + ops::XPUTruncatedGaussianRandomKernel< + paddle::platform::XPUDeviceContext, float>); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc new file mode 100644 index 0000000000000..507bd7e9ea96e --- /dev/null +++ b/paddle/fluid/operators/uniform_random_op_xpu.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/uniform_random_op.h" +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +class XPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + framework::Tensor *tensor = nullptr; + auto out_var = ctx.OutputVar("Out"); + if (out_var->IsType()) { + tensor = out_var->GetMutable(); + } else if (out_var->IsType()) { + auto shape = ctx.Attr>("shape"); + auto *selected_rows = out_var->GetMutable(); + tensor = selected_rows->mutable_value(); + tensor->Resize(framework::make_ddim(shape)); + selected_rows->mutable_rows()->reserve(shape[0]); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Output(out) in uniform_random_op must be " + "LoDTensor, " + "SelectedRows. 
But got unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); + } + T *data = tensor->mutable_data(ctx.GetPlace()); + + int64_t size = tensor->numel(); + std::uniform_real_distribution dist( + static_cast(ctx.Attr("min")), + static_cast(ctx.Attr("max"))); + unsigned int seed = static_cast(ctx.Attr("seed")); + // TODO(pangyoki): implement GetXPURandomEngine to set different seeds on + // corresponding XPU device. + auto engine = framework::GetCPURandomEngine(seed); + + std::unique_ptr data_cpu(new T[size]); + for (int64_t i = 0; i < size; ++i) { + data_cpu[i] = dist(*engine); + } + + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), data, + platform::CPUPlace(), reinterpret_cast(data_cpu.get()), + size * sizeof(T)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_XPU_KERNEL(uniform_random, + paddle::operators::XPUUniformRandomKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index 05d077b173a13..985c35127617b 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -26,7 +26,7 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { if (x->type() == framework::proto::VarType::INT32) { auto* data = x->data(); framework::Tensor cpu_attr_tensor; - if (platform::is_gpu_place(x->place())) { + if (!platform::is_cpu_place(x->place())) { TensorCopySync(*x, platform::CPUPlace(), &cpu_attr_tensor); data = cpu_attr_tensor.data(); } @@ -34,7 +34,7 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { } else if (x->type() == framework::proto::VarType::INT64) { auto* data = x->data(); framework::Tensor cpu_attr_tensor; - if (platform::is_gpu_place(x->place())) { + if (!platform::is_cpu_place(x->place())) { TensorCopySync(*x, platform::CPUPlace(), &cpu_attr_tensor); data = cpu_attr_tensor.data(); } @@ -62,7 +62,7 @@ inline std::vector GetDataFromTensorList( tensor->dims())); if (tensor->type() == framework::proto::VarType::INT32) { - if (platform::is_gpu_place(tensor->place())) { + if (!platform::is_cpu_place(tensor->place())) { framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_data.push_back(static_cast(*temp.data())); @@ -70,7 +70,7 @@ inline std::vector GetDataFromTensorList( vec_new_data.push_back(static_cast(*tensor->data())); } } else if (tensor->type() == framework::proto::VarType::INT64) { - if (platform::is_gpu_place(tensor->place())) { + if (!platform::is_cpu_place(tensor->place())) { framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); // NOTE: Converting int64 to int32 may cause data overflow. diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu index 39983e7de03b5..721c6e5390e85 100644 --- a/paddle/fluid/operators/where_op.cu +++ b/paddle/fluid/operators/where_op.cu @@ -13,7 +13,7 @@ // limitations under the License. 
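Both new XPU kernels follow the same pattern: draw every random value with the host-side CPU engine, then ship the finished buffer to the XPU with memory::Copy. A rough NumPy sketch of that flow (the device copy itself is only indicated by a comment, since it happens through the C++ memory API):

    import numpy as np

    def host_generated_uniform(shape, low, high, seed):
        # Seeded host RNG, mirroring framework::GetCPURandomEngine(seed).
        rng = np.random.default_rng(seed)
        host_buf = rng.uniform(low, high, size=shape).astype(np.float32)
        # The kernel then calls
        #   memory::Copy(XPUPlace, device_ptr, CPUPlace, host_buf, nbytes)
        # to move the host buffer onto the XPU.
        return host_buf

The utils.h change just above is the same idea in the other direction: any tensor that is not already on the CPU (GPU or XPU alike) is synchronously copied to the host before its integer contents are read.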
#include "paddle/fluid/operators/where_op.h" -#include "paddle/fluid/platform/gpu_launch_param_config.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace platform = paddle::platform; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 88b545b48e532..db84b8731f9ca 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -95,6 +95,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnGetVersion); \ __macro(cudnnFindConvolutionForwardAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ __macro(cudnnGetErrorString); \ __macro(cudnnCreateDropoutDescriptor); \ @@ -177,7 +178,8 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(cudnnCTCLoss); \ __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ - __macro(cudnnGetConvolutionForwardAlgorithm_v7); + __macro(cudnnGetConvolutionForwardAlgorithm_v7); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index c9c3a9456b736..b7b8a749d2ac0 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -45,25 +45,9 @@ void* GetTensorRtHandle() { if (nullptr == dso_handle) { auto error_msg = - "TensorRT dynamic library (%s) that Paddle depends on is not " - "configured correctly. (error code is %s)\n" - " Suggestions:\n" - " 1. Check if TensorRT " - "is installed correctly and its version is matched with paddlepaddle " - "you installed.\n" - " 2. Configure TensorRT dynamic library environment variables as " - "follows:\n" - " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" - " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" - " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " - "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " - "impossible unless System Integrity Protection (SIP) is disabled.]"; -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - std::cerr << string::Sprintf(error_msg, dso_name, errorno); + "You are using Paddle compiled with TensorRT, but TensorRT dynamic " + "library is not found. 
Ignore this if TensorRT is not needed."; + std::cerr << error_msg; } return dso_handle; diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index cb751071062c1..c6650c0b041e2 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -50,6 +50,7 @@ extern void* tensorrt_dso_handle; }; \ extern DynLoad__##__name __name +#ifdef NV_TENSORRT_MAJOR #if (NV_TENSORRT_MAJOR >= 6) #define TENSORRT_RAND_ROUTINE_EACH(__macro) \ __macro(createInferBuilder_INTERNAL); \ @@ -62,6 +63,7 @@ extern void* tensorrt_dso_handle; #endif TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP) +#endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h old mode 100644 new mode 100755 index fd6e80527caf6..d2d57995b728e --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -1,49 +1,103 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Used for compute gpu launch parameter #pragma once -#include +#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_primitives.h" +#include +#include +#include +#include +#include namespace paddle { namespace platform { -struct GpuLaunchConfig { - // Number of threads per block. - int threads; - // Number of blocks for GPU kernel launch. 
- int blocks; +inline int DivUp(int a, int b) { return (a + b - 1) / b; } - GpuLaunchConfig(int threads, int blocks) : threads(threads), blocks(blocks) {} +struct GpuLaunchConfig { + dim3 theory_thread_count = dim3(1, 1, 1); + dim3 thread_per_block = dim3(1, 1, 1); + dim3 block_per_grid = dim3(1, 1, 1); }; -inline GpuLaunchConfig getGpuLaunchConfig( - const int N, const framework::ExecutionContext& ctx, - int max_threads = 1024) { - int threads = - std::min(max_threads, ctx.cuda_device_context().GetMaxThreadsPerBlock()); - int physical_thread_count = - std::min(ctx.cuda_device_context().GetMaxPhysicalThreadCount(), N); - int blocks = std::min((physical_thread_count + threads - 1) / threads, - ctx.cuda_device_context().GetSMCount()); +inline GpuLaunchConfig GetGpuLaunchConfig1D( + const platform::CUDADeviceContext& context, int element_count) { + PADDLE_ENFORCE_GT(element_count, 0, platform::errors::InvalidArgument( + "element count should greater than 0," + " but received value is %d.", + element_count)); + + const int theory_thread_count = element_count; + // Get Max threads in all SM + int max_pyhsical_threads = context.GetMaxPhysicalThreadCount(); + int sm = context.GetSMCount(); + + // Compute pyhsical threads we need, should small than max sm threads + const int physical_thread_count = + std::min(max_pyhsical_threads, theory_thread_count); + + // Need get from device + const int thread_per_block = std::min(1024, context.GetMaxThreadsPerBlock()); + // Suppose block count small than factor * sm, factor is a experiments value. + int factor = 4; + const int block_count = + std::min(DivUp(physical_thread_count, thread_per_block), factor * sm); - GpuLaunchConfig config(threads, blocks); + GpuLaunchConfig config; + config.theory_thread_count.x = theory_thread_count; + config.thread_per_block.x = thread_per_block; + config.block_per_grid.x = block_count; + return config; +} + +inline GpuLaunchConfig GetGpuLaunchConfig2D( + const platform::CUDADeviceContext& context, int xdim, int ydim) { + PADDLE_ENFORCE_GT(xdim, 0, platform::errors::InvalidArgument( + "x dim number should greater than 0," + " but received value is:%d", + xdim)); + PADDLE_ENFORCE_GT(ydim, 0, platform::errors::InvalidArgument( + "y dim number should greater than 0," + " but received value is:%d", + ydim)); + + const int kThreadsPerBlock = 256; + int block_cols = std::min(xdim, kThreadsPerBlock); + int block_rows = std::max(kThreadsPerBlock / block_cols, 1); + + int max_physical_threads = context.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_physical_threads / kThreadsPerBlock, 1); + GpuLaunchConfig config; + // Noticed, block size is not align to 32, if needed do it yourself. + config.theory_thread_count = dim3(xdim, ydim, 1); + config.thread_per_block = dim3(block_cols, block_rows, 1); + + int grid_x = std::min(DivUp(xdim, block_cols), max_blocks); + int grid_y = std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)); + + config.block_per_grid = dim3(grid_x, grid_y, 1); return config; } +// TODO(wangchaochaohu): 3D will add later + } // namespace platform } // namespace paddle + +#endif diff --git a/paddle/fluid/platform/gpu_launch_param_config.h b/paddle/fluid/platform/gpu_launch_param_config.h deleted file mode 100755 index 40f4ef975e76c..0000000000000 --- a/paddle/fluid/platform/gpu_launch_param_config.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
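GetGpuLaunchConfig1D reduces to a handful of integer formulas: cap the thread count by what the device can physically run, use at most 1024 threads per block, and bound the grid by a small multiple of the SM count. A pure-Python sketch of that arithmetic, with the device numbers passed in explicitly instead of being read from the CUDADeviceContext:

    def div_up(a, b):
        return (a + b - 1) // b

    def gpu_launch_config_1d(element_count, max_physical_threads, sm_count,
                             max_threads_per_block=1024, factor=4):
        assert element_count > 0
        physical_threads = min(max_physical_threads, element_count)
        thread_per_block = min(1024, max_threads_per_block)
        # Keep the grid below factor * SM count (an empirical bound, as noted above).
        block_per_grid = min(div_up(physical_threads, thread_per_block),
                             factor * sm_count)
        return thread_per_block, block_per_grid

    # e.g. one million elements on a device with 80 SMs and 163840 resident threads:
    # gpu_launch_config_1d(1_000_000, 163840, 80) -> (1024, 160)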
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Used for compute gpu launch parameter - -#pragma once - -#ifdef PADDLE_WITH_CUDA - -#include -#include -#include -#include -#include - -namespace paddle { -namespace platform { - -inline int DivUp(int a, int b) { return (a + b - 1) / b; } - -struct GpuLaunchParamConfig { - dim3 theory_thread_count = dim3(0, 0, 0); - dim3 thread_per_block = dim3(0, 0, 0); - dim3 block_per_grid = dim3(0, 0, 0); -}; - -inline GpuLaunchParamConfig GetGpuLaunchConfig1D( - const platform::CUDADeviceContext& context, int element_count) { - PADDLE_ENFORCE_GT(element_count, 0, platform::errors::InvalidArgument( - "element count should greater than 0," - " but received value is %d.", - element_count)); - - const int theory_thread_count = element_count; - // Get Max threads in all SM - int max_pyhsical_threads = context.GetMaxPhysicalThreadCount(); - int sm = context.GetSMCount(); - - // Compute pyhsical threads we need, should small than max sm threads - const int physical_thread_count = - std::min(max_pyhsical_threads, theory_thread_count); - - // Need get from device - const int thread_per_block = std::min(1024, context.GetMaxThreadsPerBlock()); - // Suppose block count small than factor * sm, factor is a experiments value. - int factor = 4; - const int block_count = - std::min(DivUp(physical_thread_count, thread_per_block), factor * sm); - - GpuLaunchParamConfig config; - config.theory_thread_count.x = theory_thread_count; - config.thread_per_block.x = thread_per_block; - config.block_per_grid.x = block_count; - return config; -} - -inline GpuLaunchParamConfig GetGpuLaunchConfig2D( - const platform::CUDADeviceContext& context, int xdim, int ydim) { - PADDLE_ENFORCE_GT(xdim, 0, platform::errors::InvalidArgument( - "x dim number should greater than 0," - " but received value is:%d", - xdim)); - PADDLE_ENFORCE_GT(ydim, 0, platform::errors::InvalidArgument( - "y dim number should greater than 0," - " but received value is:%d", - ydim)); - - const int kThreadsPerBlock = 256; - int block_cols = std::min(xdim, kThreadsPerBlock); - int block_rows = std::max(kThreadsPerBlock / block_cols, 1); - - int max_physical_threads = context.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_physical_threads / kThreadsPerBlock, 1); - - GpuLaunchParamConfig config; - // Noticed, block size is not align to 32, if needed do it yourself. 
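The 2D helper (deleted here under its old GpuLaunchParamConfig name and re-added above as GpuLaunchConfig) splits a fixed 256-thread block between the x and y dimensions and then bounds the grid by the device's physical thread budget. A Python sketch of that block/grid split, under the same assumption that the device numbers are supplied by the caller:

    def gpu_launch_config_2d(xdim, ydim, max_physical_threads,
                             threads_per_block=256):
        assert xdim > 0 and ydim > 0
        block_cols = min(xdim, threads_per_block)
        block_rows = max(threads_per_block // block_cols, 1)
        max_blocks = max(max_physical_threads // threads_per_block, 1)
        # Grid x is bounded by max_blocks; grid y reuses whatever budget is left.
        grid_x = min((xdim + block_cols - 1) // block_cols, max_blocks)
        grid_y = min(max_blocks // grid_x, max(ydim // block_rows, 1))
        return (block_cols, block_rows), (grid_x, grid_y)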
- config.theory_thread_count = dim3(xdim, ydim, 1); - config.thread_per_block = dim3(block_cols, block_rows, 1); - - int grid_x = std::min(DivUp(xdim, block_cols), max_blocks); - int grid_y = std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)); - - config.block_per_grid = dim3(grid_x, grid_y, 1); - return config; -} - -// 3D will add later - -} // namespace platform -} // namespace paddle - -#endif diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 785627a09fb27..740ac1d81f8f9 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1330,6 +1330,7 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, boost::optional bias, const mkldnn::memory::desc& dst, const std::vector& strides, + const std::vector& dilations, const std::vector& paddings, const mkldnn::engine& engine, const std::string& fuse_activation, float fuse_alpha, float fuse_beta, const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind, @@ -1352,18 +1353,18 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { dev_ctx_.GetBlob(key_conv_pd)); if (conv_pd_ == nullptr) { mkldnn::memory::dims stride_dims = strides; - + mkldnn::memory::dims dilations_dims = dilations; auto mkldnn_paddings = ToMkldnnPadding(paddings); auto conv_desc = bias ? typename forward_t::desc( fwd_prop_kind, convolutional_algorithm::T, - src, weights, *bias, dst, stride_dims, + src, weights, *bias, dst, stride_dims, dilations_dims, mkldnn_paddings[0], mkldnn_paddings[1]) : typename forward_t::desc( fwd_prop_kind, convolutional_algorithm::T, - src, weights, dst, stride_dims, mkldnn_paddings[0], - mkldnn_paddings[1]); + src, weights, dst, stride_dims, dilations_dims, + mkldnn_paddings[0], mkldnn_paddings[1]); mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, diff --git a/paddle/fluid/pybind/communicator_py.cc b/paddle/fluid/pybind/communicator_py.cc index 6ac37a85c2822..07ba7061678d9 100644 --- a/paddle/fluid/pybind/communicator_py.cc +++ b/paddle/fluid/pybind/communicator_py.cc @@ -109,10 +109,15 @@ void BindLargeScaleKV(py::module* m) { auto* sparse_variable = self.Get(table_name); sparse_variable->Load(dir); }) - .def("save", [](LargeScaleKV& self, const std::string& table_name, - const std::string& dir) { + .def("save", + [](LargeScaleKV& self, const std::string& table_name, + const std::string& dir) { + auto* sparse_variable = self.Get(table_name); + sparse_variable->Save(dir); + }) + .def("size", [](LargeScaleKV& self, const std::string& table_name) { auto* sparse_variable = self.Get(table_name); - sparse_variable->Save(dir); + return sparse_variable->Size(); }); } } // namespace pybind diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 289540c8049a9..4c46af3199e29 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -712,12 +712,60 @@ void BindImperative(py::module *m_ptr) { tmp.stop_gradient=False inputs.append(tmp) ret = paddle.sums(inputs2) - loss = paddle.reduce_sum(ret) + loss = paddle.sum(ret) loss.backward() print("Before clear_gradient {}".format(loss.grad)) loss.clear_gradient() print("After clear_gradient {}".format(loss.grad)) )DOC") + .def("clone", + [](std::shared_ptr &self) { + const auto &tensor = self->Var().Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "%s has not been initialized", 
self->Name())); + auto tracer = imperative::GetCurrentTracer(); + auto new_var = std::make_shared( + true, tracer->GenerateUniqueName(self->Name() + "_clone")); + framework::AttributeMap attrs; + imperative::NameVarBaseMap ins = {{"X", {self}}}; + imperative::NameVarBaseMap outs = {{"Out", {new_var}}}; + tracer->TraceOp("assign", ins, outs, attrs); + return new_var; + }, + py::return_value_policy::copy, R"DOC( + + Returns a new Tensor, which is clone of origin Tensor, and it remains in the current graph. + It will always have a Tensor copy. + Tn addition, the cloned Tensor provides gradient propagation. + + Returns: The cloned Tensor. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor(1.0, stop_gradient=False) + clone_x = x.clone() + y = clone_x**2 + y.backward() + print(clone_x.stop_gradient) # False + print(clone_x.grad) # [2.0], support gradient propagation + print(x.stop_gradient) # False + print(x.grad) # [2.0], clone_x support gradient propagation for x + + x = paddle.to_tensor(1.0) + clone_x = x.clone() + clone_x.stop_gradient = False + z = clone_x**3 + z.backward() + print(clone_x.stop_gradient) # False + print(clone_x.grad) # [3.0], support gradient propagation + print(x.stop_gradient) # True + print(x.grad) # None + )DOC") .def("_run_backward", [](imperative::VarBase &self, const imperative::Tracer &tracer, bool retain_graph) { @@ -833,6 +881,12 @@ void BindImperative(py::module *m_ptr) { .def_property_readonly( "place", [](imperative::VarBase &self) { return self.Place(); }, py::return_value_policy::copy) + .def_property_readonly("_place_str", + [](imperative::VarBase &self) { + std::stringstream ostr; + ostr << self.Place(); + return ostr.str(); + }) .def_property_readonly("type", &imperative::VarBase::Type) .def_property_readonly("dtype", &imperative::VarBase::DataType); @@ -890,7 +944,7 @@ void BindImperative(py::module *m_ptr) { &imperative::Tracer::GetProgramDescTracer, py::return_value_policy::reference) .def("_generate_unique_name", &imperative::Tracer::GenerateUniqueName, - py::arg("key") = "eager_tmp") + py::arg("key") = "dygraph_tmp") .def( "_set_amp_op_list", [](imperative::Tracer &self, diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index c8e5048421cca..ac615a2320daa 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -502,6 +502,7 @@ void BindAnalysisConfig(py::module *m) { py::return_value_policy::reference) .def("set_mkldnn_cache_capacity", &AnalysisConfig::SetMkldnnCacheCapacity, py::arg("capacity") = 0) + .def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op) #endif .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index ee6e541c9e6c6..8288f1852c27b 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -51,6 +51,7 @@ std::map> op_ins_map = { {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, {"hierarchical_sigmoid", {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, + {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, }; // NOTE(zhiqiu): Like op_ins_map. 
@@ -75,6 +76,7 @@ std::map> op_outs_map = { {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, {"distribute_fpn_proposals", {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, + {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -118,6 +120,7 @@ std::map> op_passing_outs_map = { {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, {"update_loss_scaling", {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, + {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, }; // clang-format off diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0ee725c302215..3d9d204991f79 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -45,6 +46,7 @@ limitations under the License. */ #include "paddle/fluid/framework/save_load_util.h" #include "paddle/fluid/framework/scope_pool.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/version.h" @@ -440,6 +442,31 @@ PYBIND11_MODULE(core_noavx, m) { &pb_vmap); }); + m.def("set_printoptions", [](const py::kwargs &kwargs) { + auto &print_opt = framework::PrintOptions::Instance(); + if (kwargs.contains("precision")) { + print_opt.precision = kwargs["precision"].cast(); + } + if (kwargs.contains("threshold")) { + print_opt.threshold = kwargs["threshold"].cast(); + } + if (kwargs.contains("edgeitems")) { + print_opt.edgeitems = kwargs["edgeitems"].cast(); + } + if (kwargs.contains("linewidth")) { + print_opt.linewidth = kwargs["linewidth"].cast(); + } + if (kwargs.contains("sci_mode")) { + print_opt.sci_mode = kwargs["sci_mode"].cast(); + } + + VLOG(4) << "Set printoptions: precision=" << print_opt.precision + << ", threshold=" << print_opt.threshold + << ", edgeitems=" << print_opt.edgeitems + << ", linewidth=" << print_opt.linewidth + << ", sci_mode=" << print_opt.sci_mode; + }); + m.def( "_append_python_callable_object_and_return_id", [](py::object py_obj) -> size_t { @@ -629,6 +656,8 @@ PYBIND11_MODULE(core_noavx, m) { .def("_get_double_element", TensorGetElement) .def("_place", [](Tensor &self) { return self.place(); }) .def("_dtype", [](Tensor &self) { return self.type(); }) + .def("_layout", + [](Tensor &self) { return DataLayoutToString(self.layout()); }) .def("_share_data_with", &Tensor::ShareDataWith) .def("__getitem__", PySliceTensor, py::return_value_policy::reference) .def("__str__", [](const Tensor &self) { @@ -1439,6 +1468,7 @@ All parameter, weight, gradient are variables in Paddle. std::exit(-1); #endif }) +#ifdef PADDLE_WITH_XPU .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) @@ -1446,6 +1476,9 @@ All parameter, weight, gradient are variables in Paddle. 
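The set_printoptions binding above only stores the options; they take effect whenever a Tensor is printed. Assuming the usual paddle.set_printoptions Python wrapper sits on top of this core call (as in the 2.0 API), a typical use looks like:

    import paddle

    x = paddle.rand([10, 10])

    # The keyword names map one-to-one onto the kwargs parsed by the binding above.
    paddle.set_printoptions(precision=4, threshold=20, edgeitems=2, sci_mode=False)
    print(x)  # at most `threshold` elements shown, 4 decimal digits, no sci notation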
.def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("get_device_id", + [](const platform::XPUPlace &self) { return self.GetDeviceId(); }) +#endif .def("__str__", string::to_string); py::class_(m, "CPUPlace", R"DOC( diff --git a/paddle/fluid/train/demo/clean.sh b/paddle/fluid/train/demo/clean.sh index 0297b9781da42..a2064492c08b8 100755 --- a/paddle/fluid/train/demo/clean.sh +++ b/paddle/fluid/train/demo/clean.sh @@ -1,4 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -x -cd `dirname $0` +cd "$(dirname "$0")" rm -rf build/ set +x diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 2f7afcf17df89..afb3f360a9abd 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -293,10 +293,19 @@ dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin +pip install requests +python %work_dir%\tools\get_quick_disable_lt.py > Output +if %errorlevel%==0 ( + set /p disable_ut_quickly= 20 else 1 - log_index = 0 - for data in r.iter_content(chunk_size=chunk_size): - if six.PY2: - data = six.b(data) - f.write(data) - log_index += 1 - if log_index % log_interval == 0: - sys.stderr.write(".") - sys.stdout.flush() + try: + r = requests.get(url, stream=True) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(filename, 'wb') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(filename, 'wb') as f: + chunk_size = 4096 + total_length = int(total_length) + total_iter = total_length / chunk_size + 1 + log_interval = total_iter / 20 if total_iter > 20 else 1 + log_index = 0 + for data in r.iter_content(chunk_size=chunk_size): + if six.PY2: + data = six.b(data) + f.write(data) + log_index += 1 + if log_index % log_interval == 0: + sys.stderr.write(".") + sys.stdout.flush() + except Exception as e: + # re-try + continue sys.stderr.write("\nDownload finished\n") sys.stdout.flush() return filename diff --git a/python/paddle/device.py b/python/paddle/device.py index c2f331caa8ab3..16bb1123e63c6 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -103,15 +103,15 @@ def get_cudnn_version(): def set_device(device): """ - Paddle supports running calculations on various types of devices, including CPU and GPU. + Paddle supports running calculations on various types of devices, including CPU, GPU and XPU. They are represented by string identifiers. This function can specify the global device which the OP will run. Parameters: device(str): This parameter determines the specific running device. - It can be ``cpu`` or ``gpu:0``. When ``device`` is ``cpu``, the - program is running on the cpu. When ``device`` is ``gpu``, the - program is running ont the gpu. + It can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the + index of the GPUs or XPUs. + Examples: .. 
code-block:: python @@ -132,20 +132,37 @@ def set_device(device): "The device should not be 'gpu', " \ "since PaddlePaddle is not compiled with CUDA") place = core.CUDAPlace(ParallelEnv().dev_id) + elif lower_device == 'xpu': + if not core.is_compiled_with_xpu(): + raise ValueError( + "The device should not be 'xpu', " \ + "since PaddlePaddle is not compiled with XPU") + place = core.XPUPlace(ParallelEnv().dev_id) else: - avaliable_device = re.match(r'gpu:\d+', lower_device) - if not avaliable_device: + avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) + avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) + if not avaliable_gpu_device and not avaliable_xpu_device: raise ValueError( - "The device must be a string which is like 'cpu', 'gpu' or 'gpu:0'" + "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu' or 'xpu:x'" ) - if not core.is_compiled_with_cuda(): - raise ValueError( - "The device should not be {}, since PaddlePaddle is " \ - "not compiled with CUDA".format(avaliable_device)) - device_info_list = device.split(':', 1) - device_id = device_info_list[1] - device_id = int(device_id) - place = core.CUDAPlace(device_id) + if avaliable_gpu_device: + if not core.is_compiled_with_cuda(): + raise ValueError( + "The device should not be {}, since PaddlePaddle is " \ + "not compiled with CUDA".format(avaliable_gpu_device)) + device_info_list = device.split(':', 1) + device_id = device_info_list[1] + device_id = int(device_id) + place = core.CUDAPlace(device_id) + if avaliable_xpu_device: + if not core.is_compiled_with_xpu(): + raise ValueError( + "The device should not be {}, since PaddlePaddle is " \ + "not compiled with XPU".format(avaliable_xpu_device)) + device_info_list = device.split(':', 1) + device_id = device_info_list[1] + device_id = int(device_id) + place = core.XPUPlace(device_id) framework._set_expected_place(place) return place @@ -153,8 +170,8 @@ def set_device(device): def get_device(): """ This funciton can get the current global device of the program is running. - It's a string which is like 'cpu' and 'gpu:0'. if the global device is not - set, it will return a string which is 'gpu:0' when cuda is avaliable or it + It's a string which is like 'cpu', 'gpu:x' and 'xpu:x'. if the global device is not + set, it will return a string which is 'gpu:x' when cuda is avaliable or it will return a string which is 'cpu' when cuda is not avaliable. Examples: @@ -173,5 +190,8 @@ def get_device(): elif isinstance(place, core.CUDAPlace): device_id = place.get_device_id() device = 'gpu:' + str(device_id) + elif isinstance(place, core.XPUPlace): + device_id = place.get_device_id() + device = 'xpu:' + str(device_id) return device diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 19df0ca91e103..b631f7bbe9d11 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -36,7 +36,37 @@ class ReduceOp: - """Reduce Operation""" + """ + Specify the type of operation used for element-wise reductions. + It should be one of the following values: + + ReduceOp.SUM + + ReduceOp.MAX + + ReduceOp.MIN + + ReduceOp.PROD + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + from paddle.distributed import ReduceOp + from paddle.distributed import init_parallel_env + + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data = np.array([[4, 5, 6], [4, 5, 6]]) + else: + np_data = np.array([[1, 2, 3], [1, 2, 3]]) + data = paddle.to_tensor(np_data) + paddle.distributed.all_reduce(data, op=ReduceOp.SUM) + out = data.numpy() + # [[5, 7, 9], [5, 7, 9]] + """ SUM = 0 MAX = 1 MIN = 2 @@ -439,7 +469,7 @@ def barrier(group=0): paddle.distributed.barrier() """ op_type = 'barrier' - temp = paddle.fill_constant([1], dtype="int32", value="1") + temp = fill_constant([1], dtype="int32", value="1") if in_dygraph_mode(): return core.ops.barrier(temp, temp, 'ring_id', group) if not isinstance(group, int): diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index a86e1234e4a35..6282b9021b411 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -34,6 +34,8 @@ fleet = Fleet() _final_strategy = fleet._final_strategy +_get_applied_meta_list = fleet._get_applied_meta_list +_get_applied_graph_list = fleet._get_applied_graph_list init = fleet.init is_first_worker = fleet.is_first_worker worker_index = fleet.worker_index diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 7eb3a5659654a..3d26841876b41 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -186,6 +186,15 @@ def init(self, role_maker=None, is_collective=False): fleet.util._set_role_maker(self._role_maker) self.strategy_compiler = StrategyCompiler() + + if self._role_maker._is_non_distributed() and self._is_collective: + if paddle.fluid.core.is_compiled_with_cuda(): + gpus_num = paddle.fluid.core.get_cuda_device_count() + if gpus_num != 1: + raise ValueError( + "CUDA_VISIBLE_DEVICES shoule be set only 1 card if you use `python` to launch fleet program." 
+ ) + if paddle.fluid.framework.in_dygraph_mode(): if self.worker_num() == 1: return @@ -498,7 +507,7 @@ def save_inference_model(self, executor, dirname, feeded_var_names, target_vars, main_program, export_for_deployment) - def save_persistables(self, executor, dirname, main_program=None): + def save_persistables(self, executor, dirname, main_program=None, mode=1): """ saves all persistable variables from :code:`main_program` to @@ -539,7 +548,8 @@ def save_persistables(self, executor, dirname, main_program=None): """ - self._runtime_handle._save_persistables(executor, dirname, main_program) + self._runtime_handle._save_persistables(executor, dirname, main_program, + mode) def distributed_optimizer(self, optimizer, strategy=None): """ @@ -568,8 +578,6 @@ def distributed_optimizer(self, optimizer, strategy=None): """ self.user_defined_optimizer = optimizer - if paddle.fluid.framework.in_dygraph_mode(): - return self if strategy == None: strategy = DistributedStrategy() @@ -630,9 +638,7 @@ def forward(self, x): print("loss:", loss.numpy()) - loss = dp_layer.scale_loss(loss) loss.backward() - dp_layer.apply_collective_grads() adam.step() adam.clear_grad() @@ -842,9 +848,7 @@ def forward(self, x): print("loss:", loss.numpy()) - loss = dp_layer.scale_loss(loss) loss.backward() - dp_layer.apply_collective_grads() adam.step() adam.clear_grad() @@ -903,9 +907,7 @@ def forward(self, x): print("loss:", loss.numpy()) - loss = dp_layer.scale_loss(loss) loss.backward() - dp_layer.apply_collective_grads() adam.step() adam.clear_grad() @@ -923,6 +925,24 @@ def _final_strategy(self): else: return self._context["valid_strategy"] + def _get_applied_meta_list(self): + if "applied_meta_list" not in self._context: + print( + "WARNING: You may need to call minimize function before _get_applied_meta_list called" + ) + return [] + else: + return self._context["applied_meta_list"] + + def _get_applied_graph_list(self): + if "applied_graph_list" not in self._context: + print( + "WARNING: You may need to call minimize function before _get_applied_graph_list called" + ) + return [] + else: + return self._context["applied_graph_list"] + def minimize(self, loss, startup_program=None, @@ -1041,6 +1061,12 @@ def minimize(self, context["valid_strategy"] = copy.deepcopy(valid_strategy) + applied_meta_list = self.strategy_compiler._get_applied_meta_list() + applied_graph_list = self.strategy_compiler._get_applied_graph_list() + + context['applied_meta_list'] = applied_meta_list + context['applied_graph_list'] = applied_graph_list + self._context = context self.valid_strategy = valid_strategy @@ -1058,17 +1084,11 @@ def minimize(self, loss_name=loss.name, share_vars_from=None) loss.block.program._graph = compiled_program return self.user_defined_optimizer.minimize( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set) + loss, startup_program, parameter_list, no_grad_set=no_grad_set) if meta_optimizer: optimize_ops, params_grads = meta_optimizer.minimize( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set) + loss, startup_program, parameter_list, no_grad_set=no_grad_set) default_program = paddle.static.default_main_program() @@ -1077,20 +1097,14 @@ def minimize(self, else: optimize_ops, params_grads = self.user_defined_optimizer.minimize( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set) + loss, startup_program, parameter_list, no_grad_set=no_grad_set) 
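With the context bookkeeping above, the meta and graph optimizers that minimize() actually applied can be inspected afterwards through the two new helpers re-exported from fleet/__init__.py. A static-graph sketch (single-process collective setup assumed, network kept minimal for illustration):

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    fleet.init(is_collective=True)

    x = paddle.static.data(name='x', shape=[None, 10], dtype='float32')
    y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
    pred = paddle.static.nn.fc(x, size=1)
    loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

    strategy = fleet.DistributedStrategy()
    optimizer = paddle.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)

    # Names of the meta/graph optimizers that were applied; both return []
    # (with a warning) if minimize() has not been called yet.
    print(fleet._get_applied_meta_list())
    print(fleet._get_applied_graph_list())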
context["program_optimize_ops"] = optimize_ops context["program_params_grads"] = params_grads if graph_optimizer: optimize_ops, params_grads = graph_optimizer.minimize( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set) + loss, startup_program, parameter_list, no_grad_set=no_grad_set) # since we do not encourage users to use graph operations # if a graph optimizer takes effect, mostly # optimizers_ops and params_grads are None diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index ce9826d7e59ae..b6be992ad1e92 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -825,8 +825,8 @@ def _gloo_init(self): if self._is_first_worker(): start_http_server = True else: - ep_rank_0 = self._server_endpoints[0] - if self._server_index() == 0: + ep_rank_0 = os.getenv("PADDLE_GLOO_HTTP_ENDPOINT", "") + if self._is_server() and self._server_index() == 0: start_http_server = True ip, port = ep_rank_0.split(':') kwargs = { diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py index 29e10661888f8..1d6fcee544294 100644 --- a/python/paddle/distributed/fleet/base/strategy_compiler.py +++ b/python/paddle/distributed/fleet/base/strategy_compiler.py @@ -13,24 +13,95 @@ # limitations under the License. -def maximum_path_len_algo(optimizer_list): - max_idx = 0 - max_len = 0 - candidates = [] - for idx, opt in enumerate(optimizer_list): - local_buffer = [opt] - for opt_inner in optimizer_list: +def create_graph(optimizer_list): + nsize = len(optimizer_list) + + edge = [[0] * nsize for _ in range(nsize)] # adjacency matrix + indegree = [0] * nsize + for i, opt in enumerate(optimizer_list): + for j, opt_inner in enumerate(optimizer_list): if opt._can_update(opt_inner): - local_buffer.append(opt_inner) - if len(local_buffer) > max_len: - max_idx = idx - max_len = len(local_buffer) - candidates.append(local_buffer) - if len(candidates) == 0: + edge[i][j] = 1 # weight + indegree[j] += 1 + + return edge, indegree + + +def topo_sort(edge, indegree): + nsize = len(indegree) + + topo = [-1] * nsize + for i in range(nsize): + j = 0 + while j < nsize and indegree[j] != 0: + j += 1 + assert j < nsize, 'The combination of meta optimizers contains ring' + + topo[i] = j + indegree[j] = -1 + for k in range(nsize): + if edge[j][k] != 0: + indegree[k] -= 1 + + return topo + + +def floyd(edge): + nsize = len(edge) + max_len = -1 + max_edge = [-1, -1] + + max_path = [[[] for _ in range(nsize)] for _ in range(nsize)] + for i in range(nsize): + for j in range(nsize): + if edge[i][j] > 0: + max_path[i][j] = [j] + + if edge[i][j] > max_len: + max_len = edge[i][j] + max_edge = [i, j] + + # use floyd algorithm to find max_path + for k in range(nsize): + for i in range(nsize): + for j in range(nsize): + # if a-->b-->c, but a-/->c, can only apply a-->b or b-->c, + # however if a-->b-->c, and a-->c, can apply a->b->c + if edge[i][j] == 0: + continue + + if edge[i][k] == 0 or edge[k][j] == 0: + continue + + if edge[i][j] < edge[i][k] + edge[k][j]: + edge[i][j] = edge[i][k] + edge[k][j] + max_path[i][j] = max_path[i][k] + max_path[k][j] + + max_len = edge[i][j] + max_edge = [i, j] + + if max_len == -1: + return [0] + + return [max_edge[0]] + max_path[max_edge[0]][max_edge[1]] + + +def maximum_path_len_algo(optimizer_list): + if len(optimizer_list) == 0: return None - for idx, opt in 
enumerate(candidates[max_idx][:-1]): - opt._update_inner_optimizer(candidates[max_idx][idx + 1]) - return candidates[max_idx] + + edge, indegree = create_graph(optimizer_list) + topo_sort(edge, indegree) + max_path = floyd(edge) + + candidate = [] + for idx in max_path: + candidate.append(optimizer_list[idx]) + + for idx, opt in enumerate(candidate[:-1]): + opt._update_inner_optimizer(candidate[idx + 1]) + + return candidate class StrategyCompilerBase(object): @@ -51,13 +122,19 @@ class StrategyCompiler(StrategyCompilerBase): def __init__(self): super(StrategyCompiler, self).__init__() - self._meta_optimizer = None - self._graph_optimizer = None + self._meta_optimizers = [] + self._graph_optimizers = [] self._valid_optimizer_list = None self._user_defined_strategy = None self._meta_optimizer_candidates = [] self._graph_optimizer_candidates = [] + def _get_applied_meta_list(self): + return [type(opt).__name__ for opt in self._meta_optimizers] + + def _get_applied_graph_list(self): + return [type(opt).__name__ for opt in self._graph_optimizers] + def _get_valid_strategy(self, dist_strategy, can_not_apply_optimizer_list): import copy valid_strategy = copy.deepcopy(dist_strategy) @@ -107,8 +184,8 @@ def generate_optimizer(self, loss, role_maker, optimizer, # and graph_optimizer, the corresponding distributed strategy # should be updated. - self._meta_optimizers = meta_optimizers - self._graph_optimizers = graph_optimizers + self._meta_optimizers = [] if meta_optimizers is None else meta_optimizers + self._graph_optimizers = [] if graph_optimizers is None else graph_optimizers return_meta = None if meta_optimizers == None else meta_optimizers[ 0] diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 2e23a915454fa..00bec671d4b86 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -141,6 +141,7 @@ def _parse_args(): ps_group.add_argument("--server_num", type=int, help="number of servers") ps_group.add_argument( "--heter_worker_num", type=int, help="number of heter_workers") + ps_group.add_argument("--http_port", type=int, help="Gloo http Port") return parser.parse_args() @@ -249,12 +250,8 @@ def launch_ps(args, distribute_mode): def which_distributed_mode(args): ps_args = [ - '--worker_num', - '--server_num', - '--heter_worker_num', - '--servers', - '--workers', - '--heter_workers', + '--worker_num', '--server_num', '--heter_worker_num', '--servers', + '--workers', '--heter_workers', '--http_port' ] collective_args = ['--ips'] @@ -292,9 +289,16 @@ def which_distributed_mode(args): format(has_collective_args, cuda_device_num)) return DistributeMode.COLLECTIVE else: - logger.warning( - "Not found distinct arguments. Default use gpu collective mode") - return DistributeMode.COLLECTIVE + if not fluid.core.is_compiled_with_cuda(): + logger.warning( + "Not found distinct arguments and not compiled with cuda. Default use ps mode" + ) + return DistributeMode.PS + else: + logger.warning( + "Not found distinct arguments and compiled with cuda. 
Default use collective mode" + ) + return DistributeMode.COLLECTIVE def launch(): diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 35782e0b04c5a..ec4b0342f2414 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -713,6 +713,14 @@ def get_role_endpoints(self, args): else: self.worker_endpoints = args.workers + # get http_port + if args.http_port: + self.http_port = args.http_port + else: + http_port = get_ports(1, self.server_num + self.worker_num) + http_ip = self.server_endpoints.split(",")[0].split(":")[0] + self.http_port = http_ip + ":" + str(http_port[0]) + # get heter worker envs if self.distribute_mode == DistributeMode.PS_HETER: if args.heter_worker_num: @@ -827,7 +835,8 @@ def start_ps(self): self.start_pod_server(self.args, pod) self.start_pod_worker(self.args, pod) - self.start_pod_heter_worker(self.args, pod) + if self.distribute_mode == DistributeMode.PS_HETER: + self.start_pod_heter_worker(self.args, pod) logger.info( "Please check servers, workers and heter_worker logs in {}/workerlog.*, {}/serverlog.* and {}/heterlog.*". @@ -886,8 +895,9 @@ def start_pod_server(self, args, pod): "PADDLE_TRAINERS_NUM": str(self.worker_num), "POD_IP": cur_server.endpoint.split(":")[0], "PADDLE_WITH_GLOO": "1", - "PADDLE_GLOO_RENDEZVOUS": "2", - "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir + "PADDLE_GLOO_RENDEZVOUS": "3", + "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, + "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port } current_env.update(proc_env) @@ -938,7 +948,8 @@ def start_pod_worker(self, args, pod): device_list = [str(x) for x in range(0, heter_device_num)] for idx, cur_worker in enumerate(pod.workers): - device_id = str(device_list[idx % heter_device_num]) + device_id = "0" if heter_device_num == 0 else str(device_list[ + idx % heter_device_num]) proc_env = { "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints, "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints, @@ -948,12 +959,13 @@ def start_pod_worker(self, args, pod): "TRAINING_ROLE": "TRAINER", "PADDLE_TRAINER_ID": str(cur_worker.rank), "PADDLE_WITH_GLOO": "1", - "PADDLE_GLOO_RENDEZVOUS": "2", + "PADDLE_GLOO_RENDEZVOUS": "3", "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, "FLAGS_selected_gpus": "0", "FLAGS_selected_xpus": "0", "CUDA_VISIBLE_DEVICES": device_id, "XPU_VISIBLE_DEVICES": device_id, + "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port } current_env.update(proc_env) @@ -1016,12 +1028,13 @@ def start_pod_heter_worker(self, args, pod): "PADDLE_TRAINERS_NUM": str(self.worker_num), "POD_IP": cur_heter_worker.endpoint.split(":")[0], "PADDLE_WITH_GLOO": "1", - "PADDLE_GLOO_RENDEZVOUS": "2", + "PADDLE_GLOO_RENDEZVOUS": "3", "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, "FLAGS_selected_gpus": "0", "FLAGS_selected_xpus": "0", "CUDA_VISIBLE_DEVICES": device_id, "XPU_VISIBLE_DEVICES": device_id, + "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port } current_env.update(proc_env) diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index 283589c5f3320..d861aa7579f46 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -72,7 +72,7 @@ def _enable_strategy(self, dist_strategy, context): "incr_every_n_steps": 1000, "decr_every_n_nan_or_inf": 2, "incr_ratio": 2.0, - "decr_ratio": 8.0, + "decr_ratio": 0.8, 
"use_dynamic_loss_scaling": True } diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 9990021c8506a..be614a0514738 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -31,6 +31,10 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, loss, role_maker, user_defined_optimizer, user_defined_strategy) opt = self.inner_opt + + if not self.role_maker._is_collective: + return + if not isinstance(opt, Momentum): return diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py index f1b3680976541..6315fbf5a0d63 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py @@ -19,11 +19,12 @@ class GradientMergeOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(GradientMergeOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.wrapped_opt = GM(optimizer) + self.wrapped_opt = None self.meta_optimizers_white_list = [ "LarsOptimizer", "LambOptimizer", "GraphExecutionOptimizer", + "RecomputeOptimizer", ] self.meta_optimizers_black_list = [] @@ -31,6 +32,10 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, user_defined_strategy): super(GradientMergeOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) + + def _init_wrapped_opt(self): + config = self.user_defined_strategy.gradient_merge_configs + self.wrapped_opt = GM(self.inner_opt) self.wrapped_opt._set_k_steps( self.user_defined_strategy.gradient_merge_configs["k_steps"]) self.wrapped_opt._set_avg( @@ -49,7 +54,7 @@ def _disable_strategy(self, dist_strategy): dist_strategy.gradient_merge_configs = {} def _enable_strategy(self, dist_strategy, context): - # we currently do not support auto-enable gradient merge + # we currently do not support auto-enable GradientMerge return def minimize_impl(self, @@ -57,6 +62,7 @@ def minimize_impl(self, startup_program=None, parameter_list=None, no_grad_set=None): + self._init_wrapped_opt() optimize_ops, params_grads = \ self.wrapped_opt.minimize(loss, startup_program, parameter_list, no_grad_set) diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 266c7d0f405bf..887209d9de2f4 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -21,6 +21,7 @@ from paddle.fluid.compiler import CompiledProgram from paddle.fluid.executor import Executor from paddle.fluid.parallel_executor import ParallelExecutor +from paddle.fluid.framework import Variable, Parameter from .runtime_base import RuntimeBase from ..base.private_helper_function import wait_server_ready @@ -69,7 +70,52 @@ def build_compiled_startegy(self): self.async_strategy, self.role_maker) return compiled_config - def _load_sparse_params(self, dirname, varnames): + def _load_sparse_params(self, + executor, + dirname, + varnames, + main_program=None): + assert vars != None + check_vars = [] + load_prog = Program() + load_block = load_prog.global_block() + + def _in_varnames(var): + return var.name in varnames + + load_vars = list( + 
filter(_in_varnames, fluid.default_main_program().list_vars())) + if main_program is None: + main_program = self.origin_main_program + + from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts + for each_var in load_vars: + assert isinstance(each_var, Variable) + + origin_varname, _, _ = _get_varname_parts(each_var.name) + + new_var = fluid.io._clone_var_in_block_(load_block, each_var) + var_path = os.path.join(dirname, origin_varname) + if not os.path.exists(var_path): + raise ValueError("SelectedRows var {} can not find at {}". + format(new_var.name, var_path)) + + if os.path.isfile(var_path): + load_block.append_op( + type='sparse_tensor_load', + inputs={}, + outputs={'Out': [new_var]}, + attrs={ + 'file_path': os.path.join(dirname, origin_varname), + 'node_index': self.role_maker._server_index(), + 'node_num': self.role_maker._server_num(), + 'shape': each_var.shape + }) + check_vars.append(each_var) + + executor.run(load_prog) + + def _load_distributed_params(self, dirname, varnames): from paddle.fluid.communicator import LargeScaleKV from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_varname_parts @@ -248,34 +294,54 @@ def _init_server(self, *args, **kwargs): self._init_worker() return - if not model_dirname: - return - - if not os.path.isdir(model_dirname): - raise ValueError("There is no directory named '%s'", model_dirname) - - sparse_varnames = self.compiled_strategy.get_sparse_varname_on_ps(True) - + sparse_varnames = self.compiled_strategy.get_sparse_varname_on_ps(False) + sparse_related_optimize_varnames = [] + for var_name in sparse_varnames: + sparse_related_optimize_varnames += self.compiled_strategy.get_optimize_varname_on_ps( + var_name) + sparse_related_optimize_varnames = list( + set(sparse_related_optimize_varnames)) distribtued_varnames = self.compiled_strategy.get_sparse_varname_on_ps( - False) + True) + distributed_related_optimize_varnames = [] + for var_name in distribtued_varnames: + distributed_related_optimize_varnames += self.compiled_strategy.get_optimize_varname_on_ps( + var_name) + distributed_related_optimize_varnames = list( + set(distributed_related_optimize_varnames)) remaining_vars = list( filter( - ParameterServerRuntime.__exclude_vars(sparse_varnames + - distribtued_varnames), + ParameterServerRuntime.__exclude_vars( + sparse_varnames + distribtued_varnames + + sparse_related_optimize_varnames + + distributed_related_optimize_varnames), fluid.default_main_program().list_vars())) + if not model_dirname: + return + + if not os.path.isdir(model_dirname): + raise ValueError("There is no directory named '%s'", model_dirname) + + # load dense fluid.io.load_vars( executor, main_program=fluid.default_main_program(), dirname=model_dirname, vars=remaining_vars) + # load sparse self._load_sparse_params( - dirname=model_dirname, varnames=sparse_varnames) + executor=executor, + dirname=model_dirname, + varnames=sparse_varnames + sparse_related_optimize_varnames) - # todo(tangwei12) load distributed vars - # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames) + # load large scale + self._load_distributed_params( + dirname=model_dirname, + varnames=distribtued_varnames + + distributed_related_optimize_varnames) def _run_server(self): executor = self._get_executor() @@ -455,8 +521,7 @@ def _save_sparse_params(self, executor, dirname, context, main_program): executor.run(prog) return context.keys() - def _save_distributed_params(self, executor, dirname, context, - main_program): + def 
_save_distributed_params(self, executor, dirname, context, mode): prog = Program() block = prog.global_block() @@ -465,7 +530,7 @@ def _save_distributed_params(self, executor, dirname, context, type='checkpoint_notify', attrs={ "varname": name, - "is_slice": True, + "mode": mode, "slice_varnames": var_ctx.split_varnames(), "remote_varnames": var_ctx.split_varnames(), "endpoints": var_ctx.split_endpoints(), @@ -475,7 +540,8 @@ def _save_distributed_params(self, executor, dirname, context, executor.run(prog) return context.keys() - def _save_distributed_persistables(self, executor, dirname, main_program): + def _save_distributed_persistables(self, executor, dirname, main_program, + mode): dense_ctx = self.compiled_strategy.get_communicator_recv_context( recv_type=1, use_origin_program=True) @@ -492,7 +558,7 @@ def _save_distributed_persistables(self, executor, dirname, main_program): executor, dirname, sparse_ctx, main_program) recv_distributed_varnames = self._save_distributed_params( - executor, dirname, distributed_ctx, main_program) + executor, dirname, distributed_ctx, mode) saved_varnames = recv_dense_varnames + list( recv_sparse_varnames) + list(recv_distributed_varnames) @@ -512,6 +578,7 @@ def _ps_inference_save_persistables(self, executor, dirname, main_program=None, + mode=0, **kwargs): """ This function filters out all variables with `persistable==True` from the @@ -542,7 +609,8 @@ def _ps_inference_save_persistables(self, "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed" ) - self._save_distributed_persistables(executor, dirname, main_program) + self._save_distributed_persistables(executor, dirname, main_program, + mode) def _ps_inference_save_inference_model(self, executor, @@ -588,7 +656,8 @@ def _ps_inference_save_inference_model(self, program = Program.parse_from_string(program_desc_str) program._copy_dist_param_info_from(fluid.default_main_program()) - self._ps_inference_save_persistables(executor, dirname, program) + self._ps_inference_save_persistables( + executor, dirname, program, mode=0) def _save_inference_model(self, *args, **kwargs): self._ps_inference_save_inference_model(*args, **kwargs) diff --git a/python/paddle/distributed/fleet/utils/http_server.py b/python/paddle/distributed/fleet/utils/http_server.py index 4f42ffc0b3d24..d053040882d26 100644 --- a/python/paddle/distributed/fleet/utils/http_server.py +++ b/python/paddle/distributed/fleet/utils/http_server.py @@ -36,6 +36,7 @@ def get_logger(name, level, fmt): formatter = logging.Formatter(fmt=fmt) handler.setFormatter(formatter) logger.addHandler(handler) + logger.propagate = False return logger diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index fab391e9fdf69..16b031e116acd 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -92,9 +92,7 @@ def train(): labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) - loss = dp_layer.scale_loss(loss) loss.backward() - dp_layer.apply_collective_grads() adam.step() adam.clear_grad() diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index ca35a3c92594a..fda898799f4fc 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -314,9 +314,7 @@ def train(print_result=False): if print_result is True: print("loss:", loss.numpy()) - loss = dp_layer.scale_loss(loss) loss.backward() - dp_layer.apply_collective_grads() adam.step() 
adam.clear_grad() diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index ff3e882229ae8..9133751a5309f 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -25,9 +25,9 @@ from .fluid.layers import tensor from .fluid.layers import ops from .fluid.layers import nn +from .fluid.layers import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub from .fluid import core from .fluid.framework import in_dygraph_mode -from .tensor.math import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub from .tensor import arange, gather_nd, concat, multinomial import math import numpy as np @@ -662,48 +662,54 @@ class Categorical(Distribution): Args: logits(list|numpy.ndarray|Tensor): The logits input of categorical distribution. The data type is float32 or float64. + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Examples: .. code-block:: python - import paddle - from paddle.distribution import Categorical + import paddle + from paddle.distribution import Categorical - x = paddle.rand([6]) - print(x.numpy()) - # [0.32564053, 0.99334985, 0.99034804, - # 0.09053693, 0.30820143, 0.19095989] - y = paddle.rand([6]) - print(y.numpy()) - # [0.6365463 , 0.7278677 , 0.90260243, - # 0.5226815 , 0.35837543, 0.13981032] + paddle.manual_seed(100) # on CPU device + x = paddle.rand([6]) + print(x.numpy()) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] - cat = Categorical(x) - cat2 = Categorical(y) + paddle.manual_seed(200) # on CPU device + y = paddle.rand([6]) + print(y.numpy()) + # [0.77663314 0.90824795 0.15685187 + # 0.04279523 0.34468332 0.7955718 ] - cat.sample([2,3]) - # [[5, 1, 1], - # [0, 1, 2]] + cat = Categorical(x) + cat2 = Categorical(y) - cat.entropy() - # [1.71887] + paddle.manual_seed(1000) # on CPU device + cat.sample([2,3]) + # [[0, 0, 5], + # [3, 4, 5]] - cat.kl_divergence(cat2) - # [0.0278455] + cat.entropy() + # [1.77528] - value = paddle.to_tensor([2,1,3]) - cat.probs(value) - # [0.341613 0.342648 0.03123] + cat.kl_divergence(cat2) + # [0.071952] - cat.log_prob(value) - # [-1.07408 -1.07105 -3.46638] + value = paddle.to_tensor([2,1,3]) + cat.probs(value) + # [0.00608027 0.108298 0.269656] + + cat.log_prob(value) + # [-5.10271 -2.22287 -1.31061] """ def __init__(self, logits, name=None): """ Args: - logits(list|numpy.ndarray|Variable): The logits input of categorical distribution. The data type is float32 or float64. + logits(list|numpy.ndarray|Tensor): The logits input of categorical distribution. The data type is float32 or float64. + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. """ if not in_dygraph_mode(): check_type(logits, 'logits', (np.ndarray, tensor.Variable, list), @@ -727,27 +733,29 @@ def sample(self, shape): """Generate samples of the specified shape. Args: - shape (list): Shape of the generated samples. + shape (list): Shape of the generated samples. Returns: - Tensor: A tensor with prepended dimensions shape. + Tensor: A tensor with prepended dimensions shape. Examples: - .. code-block:: python + .. 
code-block:: python - import paddle - from paddle.distribution import Categorical + import paddle + from paddle.distribution import Categorical - x = paddle.rand([6]) - print(x.numpy()) - # [0.32564053, 0.99334985, 0.99034804, - # 0.09053693, 0.30820143, 0.19095989] + paddle.manual_seed(100) # on CPU device + x = paddle.rand([6]) + print(x.numpy()) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] - cat = Categorical(x) + cat = Categorical(x) - cat.sample([2,3]) - # [[5, 1, 1], - # [0, 1, 2]] + paddle.manual_seed(1000) # on CPU device + cat.sample([2,3]) + # [[0, 0, 5], + # [3, 4, 5]] """ name = self.name + '_sample' @@ -775,28 +783,31 @@ def kl_divergence(self, other): other (Categorical): instance of Categorical. The data type is float32. Returns: - Variable: kl-divergence between two Categorical distributions. + Tensor: kl-divergence between two Categorical distributions. Examples: - .. code-block:: python + .. code-block:: python - import paddle - from paddle.distribution import Categorical + import paddle + from paddle.distribution import Categorical + + paddle.manual_seed(100) # on CPU device + x = paddle.rand([6]) + print(x.numpy()) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] - x = paddle.rand([6]) - print(x.numpy()) - # [0.32564053, 0.99334985, 0.99034804, - # 0.09053693, 0.30820143, 0.19095989] - y = paddle.rand([6]) - print(y.numpy()) - # [0.6365463 , 0.7278677 , 0.90260243, - # 0.5226815 , 0.35837543, 0.13981032] + paddle.manual_seed(200) # on CPU device + y = paddle.rand([6]) + print(y.numpy()) + # [0.77663314 0.90824795 0.15685187 + # 0.04279523 0.34468332 0.7955718 ] - cat = Categorical(x) - cat2 = Categorical(y) + cat = Categorical(x) + cat2 = Categorical(y) - cat.kl_divergence(cat2) - # [0.0278455] + cat.kl_divergence(cat2) + # [0.071952] """ name = self.name + '_kl_divergence' @@ -823,23 +834,24 @@ def entropy(self): """Shannon entropy in nats. Returns: - Variable: Shannon entropy of Categorical distribution. The data type is float32. + Tensor: Shannon entropy of Categorical distribution. The data type is float32. Examples: - .. code-block:: python + .. code-block:: python - import paddle - from paddle.distribution import Categorical + import paddle + from paddle.distribution import Categorical - x = paddle.rand([6]) - print(x.numpy()) - # [0.32564053, 0.99334985, 0.99034804, - # 0.09053693, 0.30820143, 0.19095989] + paddle.manual_seed(100) # on CPU device + x = paddle.rand([6]) + print(x.numpy()) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] - cat = Categorical(x) + cat = Categorical(x) - cat.entropy() - # [1.71887] + cat.entropy() + # [1.77528] """ name = self.name + '_entropy' @@ -864,27 +876,28 @@ def probs(self, value): with ``logits. That is, ``value[:-1] = logits[:-1]``. Args: - value (Tensor): The input tensor represents the selected category index. + value (Tensor): The input tensor represents the selected category index. Returns: - Tensor: probability according to the category index. + Tensor: probability according to the category index. Examples: - .. code-block:: python + .. 
code-block:: python - import paddle - from paddle.distribution import Categorical + import paddle + from paddle.distribution import Categorical - x = paddle.rand([6]) - print(x.numpy()) - # [0.32564053, 0.99334985, 0.99034804, - # 0.09053693, 0.30820143, 0.19095989] + paddle.manual_seed(100) # on CPU device + x = paddle.rand([6]) + print(x.numpy()) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] - cat = Categorical(x) + cat = Categorical(x) - value = paddle.to_tensor([2,1,3]) - cat.probs(value) - # [0.341613 0.342648 0.03123] + value = paddle.to_tensor([2,1,3]) + cat.probs(value) + # [0.00608027 0.108298 0.269656] """ name = self.name + '_probs' @@ -929,28 +942,28 @@ def log_prob(self, value): """Log probabilities of the given category. Refer to ``probs`` method. Args: - value (Tensor): The input tensor represents the selected category index. + value (Tensor): The input tensor represents the selected category index. Returns: - Tensor: Log probability. + Tensor: Log probability. Examples: - .. code-block:: python - - import paddle - from paddle.distribution import Categorical + .. code-block:: python - x = paddle.rand([6]) - print(x.numpy()) - # [0.32564053, 0.99334985, 0.99034804, - # 0.09053693, 0.30820143, 0.19095989] + import paddle + from paddle.distribution import Categorical - cat = Categorical(x) + paddle.manual_seed(100) # on CPU device + x = paddle.rand([6]) + print(x.numpy()) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] - value = paddle.to_tensor([2,1,3]) + cat = Categorical(x) - cat.log_prob(value) - # [-1.07408 -1.07105 -3.46638] + value = paddle.to_tensor([2,1,3]) + cat.log_prob(value) + # [-5.10271 -2.22287 -1.31061] """ name = self.name + '_log_prob' diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 590d76ae1708f..c40b8db6948cf 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1345,7 +1345,7 @@ def append_backward(loss, x = paddle.static.data(name='x', shape=[None, 13], dtype='int64') y = paddle.static.data(name='y', shape=[None, 1], dtype='float32') x_emb = paddle.static.nn.embedding(x, size=[100, 256]) - y_predict = paddle.static.nn.fc(input=x_emb, size=1, act=None, name='my_fc') + y_predict = paddle.static.nn.fc(x=x_emb, size=1, activation=None, name='my_fc') loss = F.square_error_cost(input=y_predict, label=y) avg_loss = paddle.mean(loss) diff --git a/python/paddle/fluid/communicator.py b/python/paddle/fluid/communicator.py index 814a70a10e06c..b203e2a80bda4 100644 --- a/python/paddle/fluid/communicator.py +++ b/python/paddle/fluid/communicator.py @@ -162,3 +162,6 @@ def save(self, varname, dirname): def load(self, varname, dirname): self.scale_kv.load(varname, dirname) + + def size(self, varname): + return self.scale_kv.size(varname) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 31cacf075b7ef..0b980c7ebab58 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -120,7 +120,7 @@ class CompiledProgram(object): exe = static.Executor(place) data = static.data(name='X', shape=[None, 1], dtype='float32') - hidden = static.nn.fc(input=data, size=10) + hidden = static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) @@ -243,7 +243,7 @@ def with_data_parallel(self, exe = static.Executor(place) data = static.data(name='X', shape=[None, 1], dtype='float32') - hidden = static.nn.fc(input=data, size=10) + hidden = 
static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) test_program = static.default_main_program().clone(for_test=True) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 8d7ebcf4caa53..7fc177e7ad765 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -15,18 +15,37 @@ import logging import numpy as np import sys +import os import paddle -from paddle.fluid import dygraph -from paddle.fluid.dygraph.nn import Conv2D -from paddle.fluid.dygraph.nn import Linear +from paddle.fluid import dygraph, core, framework +from paddle.fluid.executor import Executor +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX +from paddle.fluid.dygraph.nn import Conv2D, Linear, BatchNorm, Pool2D, Conv2DTranspose +from paddle.fluid.io import load_inference_model, save_inference_model +from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6, Tanh, Softmax, PReLU from paddle.fluid.log_helper import get_logger from . import quant_nn -__all__ = ['ImperativeQuantAware'] +__all__ = ['ImperativeQuantAware', 'ImperativeCalcOutScale'] _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +_op_real_in_out_name = { + "conv2d": [["Input", "Filter"], ["Output"]], + "conv2d_transpose": [["Input", "Filter"], ["Output"]], + "pool2d": [["X"], ["Out"]], + "elementwise_add": [["X", "Y"], ["Out"]], + "softmax": [["X"], ["Out"]], + "relu": [["X"], ["Out"]], + "relu6": [["X"], ["Out"]], + "leaky_relu": [["X"], ["Out"]], + "prelu": [["X"], ["Out"]], + "tanh": [["X"], ["Out"]], + "batch_norm": [["X"], ["Y"]], + "sigmoid": [["X"], ["Out"]], +} + class ImperativeQuantAware(object): """ @@ -141,7 +160,6 @@ def quantize(self, model): for name, layer in model.named_sublayers(): if not isinstance(layer, self._quantizable_layer_type): continue - scopes = name.split('.') target = scopes[-1] obj = model @@ -173,3 +191,204 @@ def _get_quantized_counterpart(self, layer): layer, self._weight_bits, self._activation_bits, self._moving_rate, self._weight_quantize_type, self._activation_quantize_type) return quantized_layer + + +class ImperativeCalcOutScale(object): + def __init__(self, + moving_rate=0.9, + target_layer_types=[ + 'BatchNorm', 'Conv2D', 'Conv2DTranspose', 'LeakyReLU', + 'Linear', 'PReLU', 'Pool2D', 'ReLU', 'ReLU6', 'Sigmoid', + 'Softmax', 'Tanh' + ]): + """ + Add the logic of calculating and setting output quantization scales of some layers. + These output quantization scales may be used by tensorRT or some other inference engines. + + Args: + moving_rate(float): The decay coefficient of moving average. The default value is 0.9. + quantizable_op_type(list[str]): List the type of layers that will be calculated out_scale. 
+ Default is ['Conv2D', 'ReLU', 'PReLU', 'LeakyReLU', 'Linear', 'Sigmoid', 'BatchNorm', 'ReLU6', 'Tanh', 'Softmax', 'Conv2DTranspose'] + """ + super(ImperativeCalcOutScale, self).__init__() + self._moving_rate = moving_rate + self._out_scale_layers_map = { + 'BatchNorm': BatchNorm, + 'Conv2D': Conv2D, + 'Conv2DTranspose': Conv2DTranspose, + 'LeakyReLU': LeakyReLU, + 'Linear': Linear, + 'PReLU': PReLU, + 'Pool2D': Pool2D, + 'ReLU': ReLU, + 'ReLU6': ReLU6, + 'Sigmoid': Sigmoid, + 'Softmax': Softmax, + 'Tanh': Tanh + } + self._out_scale_layer_type = tuple( + self._out_scale_layers_map[layer] + if layer in self._out_scale_layers_map else layer + for layer in target_layer_types) + for layer in self._out_scale_layer_type: + assert not isinstance( + layer, str), "{} is unspported to be out_scaled.".format(layer) + self._register_hook_handle_list = [] + self._out_scale_dict = {} + + def calc_out_scale(self, model): + """ + Insert the `moving_average_abs_max_scale` op to calculate output scale of Specific layers in model. + + Args: + model(fluid.dygraph.Layer): The target model which would be calculate the output quantization scale. + + Returns: + None + """ + assert isinstance( + model, dygraph.Layer), "model must be the instance of dygraph.Layer" + for _, layer in model.named_sublayers(): + if not isinstance(layer, self._out_scale_layer_type): + continue + forward_post_hook_handle = layer.register_forward_post_hook( + self._forward_post_hook) + self._register_hook_handle_list.append(forward_post_hook_handle) + + # Get the output var name of the op + def _get_op_output_names(self, op): + assert isinstance( + op, framework.Operator), "The input op should be Operator." + var_names = [] + name_list = _op_real_in_out_name[op.type][1] + for name in name_list: + var_name = op.output(name) + if isinstance(var_name, list): + var_names.extend(var_name) + else: + var_names.append(var_name) + return var_names + + def save_quantized_model(self, layer, path, input_spec=None, **config): + """ + Save the quantized model for the inference. + + Args: + layer (Layer): The Layer to be saved. + path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. + input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward + method, which can be described by InputSpec or example Tensor. If None, all input variables of + the original Layer's forward method would be the inputs of the saved model. Default None. + **configs (dict, optional): Other save configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) output_spec (list[Tensor]): Selects the output targets of the saved model. + By default, all return variables of original Layer's forward method are kept as the + output of the saved model. If the provided ``output_spec`` list is not all output variables, + the saved model will be pruned according to the given ``output_spec`` list. 
+ + Returns: + None + """ + + assert isinstance( + layer, dygraph.Layer), "model must be the instance of dygraph.Layer" + with dygraph.guard(): + layer.eval() + for handle in self._register_hook_handle_list: + handle.remove() + for key in self._out_scale_dict: + self._out_scale_dict[key] = float(self._out_scale_dict[key] + .numpy()) + + paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config) + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = Executor(place) + + file_prefix = os.path.basename(path) + dirname = os.path.dirname(path) + model_filename = file_prefix + INFER_MODEL_SUFFIX + params_filename = file_prefix + INFER_PARAMS_SUFFIX + + [inference_program, feed_target_names, fetch_targets] = ( + load_inference_model( + dirname=dirname, + executor=exe, + model_filename=model_filename, + params_filename=params_filename)) + + # Traverse all ops in the program and find out the op matching + # the Layer in the dynamic graph. + layer_var_dict = {} + for block in inference_program.blocks: + for op in block.ops: + if op.type in _op_real_in_out_name: + output_var_names = self._get_op_output_names(op) + for output_var_name in output_var_names: + output_var_tensor = block.var(output_var_name) + if output_var_tensor.dtype not in [ + core.VarDesc.VarType.FP64, + core.VarDesc.VarType.FP32 + ]: + continue + # Because the Layer in dygraph may correspond to multiple ops + # in static program after being saved. To ensure correctness, + # the outscale collected for output of dygraph Layer can only + # be set to the last op in the corresponding ops in static program. + # + # We can judge the execution order of the ops which corresponding + # to dygraph Layer by the name of output. And use dict to save + # the corresponding relationship between the dygraph Layer and the + # static graph op that needs to set the outscale attribute. + dynamic_layer_name, var_name_suffix = output_var_name.split( + ".") + if dynamic_layer_name in layer_var_dict: + if layer_var_dict[dynamic_layer_name][ + 0] < var_name_suffix: + layer_var_dict[dynamic_layer_name] = [ + var_name_suffix, op + ] + else: + layer_var_dict[ + dynamic_layer_name] = [var_name_suffix, op] + + # Because the naming styles of static and dynamic graph are different, + # in order to avoid mistakes, we unify the name here. + for (layer_name, var_name_op_list) in layer_var_dict.items(): + if 'prelu' in layer_name: + layer_name = layer_name.replace('prelu', 'p_re_lu') + if 'relu' in layer_name: + layer_name = layer_name.replace('relu', 're_lu') + if layer_name not in self._out_scale_dict: + continue + var_name_op_list[1]._set_attr('out_threshold', + self._out_scale_dict[layer_name]) + + # Save the processed program. + save_inference_model( + dirname=dirname, + feeded_var_names=feed_target_names, + target_vars=fetch_targets, + executor=exe, + main_program=inference_program.clone(), + model_filename=model_filename, + params_filename=params_filename) + + def _forward_post_hook(self, layer, input, output): + assert isinstance( + output, core.VarBase + ), "Multiple outputs are not currently supported in ImperativeOutScale." 
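[Editor's note] For reference, a minimal usage sketch of the ImperativeCalcOutScale API introduced above. The toy Sequential model, the loop of forward passes, and the output path are illustrative stand-ins only; the overall flow (calc_out_scale to register forward post hooks, run the model so the hooks observe outputs, then save_quantized_model to emit an inference program with out_threshold attributes) mirrors the unit test added later in this change.

    import numpy as np
    import paddle
    import paddle.fluid as fluid
    from paddle.fluid.dygraph.container import Sequential
    from paddle.fluid.dygraph.nn import Linear
    from paddle.fluid.contrib.slim.quantization import ImperativeCalcOutScale

    with fluid.dygraph.guard():
        # toy stand-in for a real network; any Layer built from the target layer types works
        model = Sequential(
            Linear(input_dim=784, output_dim=120),
            Linear(input_dim=120, output_dim=10))

        out_scale = ImperativeCalcOutScale(moving_rate=0.9)
        out_scale.calc_out_scale(model)  # registers forward post hooks on matching sublayers

        # a few forward passes stand in for the usual training loop
        for _ in range(10):
            x = fluid.dygraph.to_variable(np.random.rand(32, 784).astype('float32'))
            model(x)

        # writes <path>.pdmodel / .pdiparams and sets out_threshold on the matched static ops
        out_scale.save_quantized_model(
            layer=model,
            path="./outscale_infer_model/linear",
            input_spec=[paddle.static.InputSpec(shape=[None, 784], dtype='float32')])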
+ if output.dtype not in [ + core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64 + ]: + return + if not hasattr(layer, "_out_scale"): + layer._out_scale = quant_nn.MovingAverageAbsMaxScale( + output.name, self._moving_rate, output.dtype) + scale_out = layer._out_scale(output) + self._out_scale_dict[layer.full_name()] = scale_out diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 2e35ac288c715..bbaae56439eb6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -24,7 +24,8 @@ __all__ = [ 'FakeQuantMovingAverage', 'FakeQuantAbsMax', 'QuantizedConv2D', - 'QuantizedLinear', 'FakeChannelWiseQuantDequantAbsMax' + 'QuantizedLinear', 'FakeChannelWiseQuantDequantAbsMax', + 'MovingAverageAbsMaxScale' ] @@ -494,3 +495,78 @@ def forward(self, input): else: pre_activation = mul_out return self._helper.append_activation(pre_activation, act=self._act) + + +class MovingAverageAbsMaxScale(layers.Layer): + def __init__(self, name=None, moving_rate=0.9, dtype='float32'): + """ + MovingAverageMaxScale layer is used to calculating the output quantization scale of Layer. + Its computational formula is described as below: + + :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)` + :math:`Out = X` + """ + super(MovingAverageAbsMaxScale, self).__init__() + self._moving_rate = moving_rate + self._dtype = dtype + + scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' + name = unique_name.generate(scale_prefix) + scale_attr = ParamAttr( + name=name, initializer=Constant(1), trainable=False) + self._scale = self.create_parameter( + shape=[1], attr=scale_attr, dtype=self._dtype) + self._scale.stop_gradient = True + + state_prefix = "{}.state".format(name) if name else 'outscale.state' + state_attr = ParamAttr( + name=unique_name.generate(state_prefix), + initializer=Constant(1), + trainable=False) + self._state = self.create_parameter( + shape=[1], attr=state_attr, dtype=self._dtype) + self._state.stop_gradient = True + + accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' + accum_attr = ParamAttr( + name=unique_name.generate(accum_prefix), + initializer=Constant(1), + trainable=False) + self._accum = self.create_parameter( + shape=[1], attr=accum_attr, dtype=self._dtype) + self._accum.stop_gradient = True + MovingAverageAbsMaxScale._has_create = True + + def forward(self, input): + if in_dygraph_mode(): + attrs = ('moving_rate', self._moving_rate, 'is_test', + not self.training) + state = self._state if self.training else None + accum = self._accum if self.training else None + + out_scale, _, _ = core.ops.moving_average_abs_max_scale( + input, accum, state, self._scale, state, accum, *attrs) + return out_scale + + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'MovingAverageAbsMaxScale') + + scale_out = self._scale + attrs = {'moving_rate': self._moving_rate, 'is_test': not self.training} + + inputs = {"X": [input]} + outputs = {"OutScale": [scale_out]} + + if self.training: + inputs['InState'] = [self._state] + inputs['InAccum'] = [self._accum] + outputs['OutState'] = [self._state] + outputs['OutAccum'] = [self._accum] + + self._helper.append_op( + type="moving_average_abs_max_scale", + inputs=inputs, + outputs=outputs, + attrs=attrs) + + return scale_out diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py 
b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py new file mode 100644 index 0000000000000..3fc8352493d93 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -0,0 +1,461 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from __future__ import print_function + +import os +import numpy as np +import random +import unittest +import logging +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid import core +from paddle.fluid.optimizer import AdamOptimizer +from paddle.fluid.framework import IrGraph +from paddle.fluid.contrib.slim.quantization import ImperativeCalcOutScale +from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass, OutScaleForInferencePass +from paddle.fluid.dygraph.container import Sequential +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX +from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 +from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D +from paddle.fluid.log_helper import get_logger + +paddle.enable_static() + +os.environ["CPU_NUM"] = "1" +if core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') + + +def StaticLenet(data, num_classes=10, classifier_activation='softmax'): + conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") + fc_w1_attr = fluid.ParamAttr(name="fc_w_1") + fc_w2_attr = fluid.ParamAttr(name="fc_w_2") + fc_w3_attr = fluid.ParamAttr(name="fc_w_3") + conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") + conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") + fc_b1_attr = fluid.ParamAttr(name="fc_b_1") + fc_b2_attr = fluid.ParamAttr(name="fc_b_2") + fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + conv1 = fluid.layers.conv2d( + data, + num_filters=6, + filter_size=3, + stride=1, + padding=1, + param_attr=conv2d_w1_attr, + bias_attr=conv2d_b1_attr) + batch_norm1 = layers.batch_norm(conv1) + relu1 = layers.relu(batch_norm1) + pool1 = fluid.layers.pool2d( + relu1, pool_size=2, pool_type='max', pool_stride=2) + conv2 = fluid.layers.conv2d( + pool1, + num_filters=16, + filter_size=5, + stride=1, + padding=0, + param_attr=conv2d_w2_attr, + bias_attr=conv2d_b2_attr) + batch_norm2 = layers.batch_norm(conv2) + relu6_1 = layers.relu6(batch_norm2) + pool2 = fluid.layers.pool2d( + relu6_1, pool_size=2, pool_type='max', pool_stride=2) + + fc1 = fluid.layers.fc(input=pool2, + size=120, + param_attr=fc_w1_attr, + bias_attr=fc_b1_attr) + leaky_relu1 = layers.leaky_relu(fc1, alpha=0.01) + fc2 = fluid.layers.fc(input=leaky_relu1, + size=84, + param_attr=fc_w2_attr, + bias_attr=fc_b2_attr) + sigmoid1 = layers.sigmoid(fc2) + fc3 = fluid.layers.fc(input=sigmoid1, + size=num_classes, + act=classifier_activation, + param_attr=fc_w3_attr, + bias_attr=fc_b3_attr) 
+ return fc3 + + +class ImperativeLenet(fluid.dygraph.Layer): + def __init__(self, num_classes=10, classifier_activation='softmax'): + super(ImperativeLenet, self).__init__() + conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") + fc_w1_attr = fluid.ParamAttr(name="fc_w_1") + fc_w2_attr = fluid.ParamAttr(name="fc_w_2") + fc_w3_attr = fluid.ParamAttr(name="fc_w_3") + conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") + conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") + fc_b1_attr = fluid.ParamAttr(name="fc_b_1") + fc_b2_attr = fluid.ParamAttr(name="fc_b_2") + fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + self.features = Sequential( + Conv2D( + num_channels=1, + num_filters=6, + filter_size=3, + stride=1, + padding=1, + param_attr=conv2d_w1_attr, + bias_attr=conv2d_b1_attr), + BatchNorm(6), + ReLU(), + Pool2D( + pool_size=2, pool_type='max', pool_stride=2), + Conv2D( + num_channels=6, + num_filters=16, + filter_size=5, + stride=1, + padding=0, + param_attr=conv2d_w2_attr, + bias_attr=conv2d_b2_attr), + BatchNorm(16), + ReLU6(), + Pool2D( + pool_size=2, pool_type='max', pool_stride=2)) + + self.fc = Sequential( + Linear( + input_dim=400, + output_dim=120, + param_attr=fc_w1_attr, + bias_attr=fc_b1_attr), + LeakyReLU(), + Linear( + input_dim=120, + output_dim=84, + param_attr=fc_w2_attr, + bias_attr=fc_b2_attr), + Sigmoid(), + Linear( + input_dim=84, + act=classifier_activation, + output_dim=num_classes, + param_attr=fc_w3_attr, + bias_attr=fc_b3_attr)) + + def forward(self, inputs): + x = self.features(inputs) + + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + +class TestImperativeOutSclae(unittest.TestCase): + def test_calc_out_scale_save(self): + imperative_out_scale = ImperativeCalcOutScale() + + with fluid.dygraph.guard(): + lenet = ImperativeLenet() + adam = AdamOptimizer( + learning_rate=0.001, parameter_list=lenet.parameters()) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=32, drop_last=True) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=32) + imperative_out_scale.calc_out_scale(lenet) + epoch_num = 1 + for epoch in range(epoch_num): + lenet.train() + for batch_id, data in enumerate(train_reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = fluid.dygraph.to_variable(x_data) + label = fluid.dygraph.to_variable(y_data) + out = lenet(img) + acc = fluid.layers.accuracy(out, label) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.mean(loss) + avg_loss.backward() + adam.minimize(avg_loss) + lenet.clear_gradients() + if batch_id % 100 == 0: + _logger.info( + "Train | At epoch {} step {}: loss = {:}, acc= {:}". + format(epoch, batch_id, + avg_loss.numpy(), acc.numpy())) + lenet.eval() + for batch_id, data in enumerate(test_reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = fluid.dygraph.to_variable(x_data) + label = fluid.dygraph.to_variable(y_data) + + out = lenet(img) + acc_top1 = fluid.layers.accuracy( + input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy( + input=out, label=label, k=5) + + if batch_id % 100 == 0: + _logger.info( + "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}". 
+ format(epoch, batch_id, + acc_top1.numpy(), acc_top5.numpy())) + + # save weights + model_dict = lenet.state_dict() + fluid.save_dygraph(model_dict, "save_temp") + + # test the correctness of `save_quantized_model` + data = next(test_reader()) + test_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + test_img = fluid.dygraph.to_variable(test_data) + lenet.eval() + before_save = lenet(test_img) + + # save inference quantized model + path = "./outscale_infer_model/lenet" + save_dir = "./outscale_infer_model" + imperative_out_scale.save_quantized_model( + layer=lenet, + path=path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + [inference_program, feed_target_names, fetch_targets] = ( + fluid.io.load_inference_model( + dirname=save_dir, + executor=exe, + model_filename="lenet" + INFER_MODEL_SUFFIX, + params_filename="lenet" + INFER_PARAMS_SUFFIX)) + after_save, = exe.run(inference_program, + feed={feed_target_names[0]: test_data}, + fetch_list=fetch_targets) + + self.assertTrue( + np.allclose(after_save, before_save.numpy()), + msg='Failed to save the inference quantized model.') + + def test_out_scale_acc(self): + def _build_static_lenet(main, startup, is_test=False, seed=1000): + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + main.random_seed = seed + startup.random_seed = seed + img = fluid.layers.data( + name='image', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + prediction = StaticLenet(img) + if not is_test: + loss = fluid.layers.cross_entropy( + input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + else: + avg_loss = prediction + return img, label, avg_loss + + reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=32, drop_last=True) + param_init_map = {} + seed = 1000 + lr = 0.1 + dynamic_out_scale_list = [] + static_out_scale_list = [] + + # imperative train + _logger.info( + "--------------------------dynamic graph qat--------------------------" + ) + imperative_out_scale = ImperativeCalcOutScale() + + with fluid.dygraph.guard(): + np.random.seed(seed) + fluid.default_main_program().random_seed = seed + fluid.default_startup_program().random_seed = seed + lenet = ImperativeLenet() + fixed_state = {} + for name, param in lenet.named_parameters(): + p_shape = param.numpy().shape + p_value = param.numpy() + if name.endswith("bias"): + value = np.zeros_like(p_value).astype('float32') + else: + value = np.random.normal( + loc=0.0, scale=0.01, size=np.product(p_shape)).reshape( + p_shape).astype('float32') + fixed_state[name] = value + param_init_map[param.name] = value + lenet.set_dict(fixed_state) + imperative_out_scale.calc_out_scale(lenet) + adam = AdamOptimizer( + learning_rate=lr, parameter_list=lenet.parameters()) + dynamic_loss_rec = [] + lenet.train() + for batch_id, data in enumerate(reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = fluid.dygraph.to_variable(x_data) + label = fluid.dygraph.to_variable(y_data) + + out = lenet(img) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.mean(loss) + avg_loss.backward() + adam.minimize(avg_loss) + lenet.clear_gradients() + dynamic_loss_rec.append(avg_loss.numpy()[0]) + 
if batch_id % 100 == 0: + _logger.info('{}: {}'.format('loss', avg_loss.numpy())) + + lenet.eval() + op_object_list = (Conv2D, ReLU, ReLU6, LeakyReLU, Sigmoid, Pool2D, + BatchNorm) + + path = "./dynamic_outscale_infer_model/lenet" + save_dir = "./dynamic_outscale_infer_model" + + imperative_out_scale.save_quantized_model( + layer=lenet, + path=path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) + + _logger.info( + "--------------------------static graph qat--------------------------" + ) + static_loss_rec = [] + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + + main = fluid.Program() + infer = fluid.Program() + startup = fluid.Program() + static_img, static_label, static_loss = _build_static_lenet( + main, startup, False, seed) + infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, + seed) + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + opt = AdamOptimizer(learning_rate=lr) + opt.minimize(static_loss) + + scope = core.Scope() + with fluid.scope_guard(scope): + exe.run(startup) + for param in main.all_parameters(): + param_tensor = scope.var(param.name).get_tensor() + param_tensor.set(param_init_map[param.name], place) + main_graph = IrGraph(core.Graph(main.desc), for_test=False) + infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) + transform_pass = OutScaleForTrainingPass(scope=scope, place=place) + transform_pass.apply(main_graph) + build_strategy = fluid.BuildStrategy() + build_strategy.fuse_all_reduce_ops = False + binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( + loss_name=static_loss.name, build_strategy=build_strategy) + + feeder = fluid.DataFeeder( + feed_list=[static_img, static_label], place=place) + with fluid.scope_guard(scope): + for batch_id, data in enumerate(reader()): + loss_v, = exe.run(binary, + feed=feeder.feed(data), + fetch_list=[static_loss]) + static_loss_rec.append(loss_v[0]) + if batch_id % 100 == 0: + _logger.info('{}: {}'.format('loss', loss_v)) + scale_inference_pass = OutScaleForInferencePass(scope=scope) + scale_inference_pass.apply(infer_graph) + + out_scale_op_list = [ + "batch_norm", "conv2d", "leaky_relu", "pool2d", "relu6", "relu", + "sigmoid", "tanh", "relu6", "softmax", "conv2d_transpose", + "elementwise_add" + ] + op_nodes = infer_graph.all_op_nodes() + for op_node in op_nodes: + if op_node.name() in out_scale_op_list: + static_out_scale_list.append(op_node.op().attr("out_threshold")) + + save_program = infer_graph.to_program() + with fluid.scope_guard(scope): + fluid.io.save_inference_model("./static_mnist", [infer_img.name], + [infer_pre], exe, save_program) + rtol = 1e-05 + atol = 1e-08 + for i, (loss_d, + loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)): + diff = np.abs(loss_d - loss_s) + if diff > (atol + rtol * np.abs(loss_s)): + _logger.info( + "diff({}) at {}, dynamic loss = {}, static loss = {}". 
+ format(diff, i, loss_d, loss_s)) + break + + self.assertTrue( + np.allclose( + np.array(dynamic_loss_rec), + np.array(static_loss_rec), + rtol=rtol, + atol=atol, + equal_nan=True), + msg='Failed to do the imperative qat.') + # load dynamic model + [inference_program, feed_target_names, fetch_targets] = ( + fluid.io.load_inference_model( + dirname=save_dir, + executor=exe, + model_filename="lenet" + INFER_MODEL_SUFFIX, + params_filename="lenet" + INFER_PARAMS_SUFFIX)) + + global_block = inference_program.global_block() + for op in global_block.ops: + if op.has_attr('out_threshold'): + dynamic_out_scale_list.append(op.attr('out_threshold')) + + check_list = [ + False for item in dynamic_out_scale_list + if item not in static_out_scale_list + ] + self.assertTrue(len(check_list) == 0) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py b/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py new file mode 100644 index 0000000000000..c947eeb31fc19 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.contrib.slim.quantization.imperative import quant_nn + +paddle.enable_static() + + +def init_data(batch_size=32, img_shape=[784], label_range=9): + np.random.seed(5) + assert isinstance(img_shape, list) + input_shape = [batch_size] + img_shape + img = np.random.random(size=input_shape).astype(np.float32) + label = np.array( + [np.random.randint(0, label_range) for _ in range(batch_size)]).reshape( + (-1, 1)).astype("int64") + return img, label + + +class TestMovingAverageAbsMaxScaleOp(unittest.TestCase): + def check_backward(self, use_cuda): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + image = fluid.layers.data( + name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + fc_tmp = fluid.layers.fc(image, size=10, act='softmax') + out_scale = quant_nn.MovingAverageAbsMaxScale( + name=fc_tmp.name, dtype=fc_tmp.dtype) + fc_tmp_1 = out_scale(fc_tmp) + cross_entropy = fluid.layers.softmax_with_cross_entropy(fc_tmp, + label) + loss = fluid.layers.reduce_mean(cross_entropy) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + moving_average_abs_max_scale_ops = [ + op for op in main_program.blocks[0].ops + if op.type == u'moving_average_abs_max_scale' + ] + assert len( + moving_average_abs_max_scale_ops + ) == 1, "The number of moving_average_abs_max_scale_ops should be 1." 
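[Editor's note] As a side note on the moving_average_abs_max_scale op exercised by this test, its training-mode update follows the formula quoted in the MovingAverageAbsMaxScale docstring earlier in this change, scale = (moving_rate*accum + max(abs(x))) / (moving_rate*state + 1). A rough NumPy sketch, assuming accum and state are the running buffers the op reads and writes while is_test is False:

    import numpy as np

    def moving_average_abs_max_scale(x, accum, state, moving_rate=0.9):
        # decayed running sum of per-batch max(|x|) and a decayed step counter
        accum = moving_rate * accum + np.max(np.abs(x))
        state = moving_rate * state + 1.0
        # scale = (moving_rate*accum_old + max(|x|)) / (moving_rate*state_old + 1)
        scale = accum / state
        return scale, accum, state

    # usage: carry accum/state across batches, starting from the Constant(1) initialization
    scale, accum, state = 1.0, 1.0, 1.0
    for _ in range(5):
        batch = np.random.randn(32, 10).astype('float32')
        scale, accum, state = moving_average_abs_max_scale(batch, accum, state)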
+ + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_program) + + binary = fluid.compiler.CompiledProgram( + main_program).with_data_parallel(loss_name=loss.name) + + img, label = init_data() + feed_dict = {"image": img, "label": label} + res = exe.run(binary, feed_dict) + + def test_fw_bw(self): + if core.is_compiled_with_cuda(): + self.check_backward(use_cuda=True) + self.check_backward(use_cuda=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 1ef0d494e0725..7d203b349a130 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -17,6 +17,7 @@ import sys import time import signal +import numbers import logging import itertools import threading @@ -81,12 +82,17 @@ def default_collate_fn(batch): else: slots[i].append(item) - if isinstance(slots[0][0], np.ndarray): - return [np.stack(slot, axis=0) for slot in slots] - elif isinstance(slots[0][0], paddle.Tensor): - return [layers.stack(slot, axis=0) for slot in slots] - else: - raise RuntimeError("Unknown data type {}".format(type(slots[0][0]))) + outputs = [] + for slot in slots: + if isinstance(slot[0], (np.ndarray, np.bool, numbers.Number)): + tmp = np.stack(slot, axis=0) + outputs.append(tmp) + elif isinstance(slot[0], paddle.Tensor): + tmp = layers.stack(slot, axis=0) + outputs.append(tmp) + else: + raise RuntimeError("Unknown data type {}".format(type(slot[0]))) + return outputs class _DatasetKind(object): @@ -346,6 +352,12 @@ def __next__(self): def next(self): return self.__next__() + def __del__(self): + # _blocking_queue in keep order mode holds sub-threads + # need to release thread resources on unexpected exit + if self._blocking_queue: + self._blocking_queue.close() + # NOTE(chenweihang): _worker_loop must be top level method to be pickled def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index 4796cd5ada420..ec91417a0f2ee 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -221,6 +221,13 @@ def _gen_worker_desc(self, trainer_desc): for i in program_configs[program_id]["pull_dense"]: pc.pull_dense_table_id.extend([i]) dense_table_set.add(i) + # code for partial push dense table such as multitask + if "cond2denseid" in program_configs[program_id]: + cond2denseid = program_configs[program_id]["cond2denseid"] + for key, value in cond2denseid.items(): + mc_map = pc.partial_pushdense_condtable_map.add() + mc_map.key = key + mc_map.value = value break trainer_desc.device_worker_name = opt_info.get("worker_class", diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 9b540c378e9d3..db1a705167cb9 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -480,7 +480,7 @@ def test_dygraph_grad(create_graph): paddle.disable_static() def test_dygraph_grad(grad_outputs=None): - x = paddle.fill_constant(shape=[1], value=2.0, dtype='float32') + x = paddle.fluid.layers.fill_constant(shape=[1], value=2.0, dtype='float32') x.stop_gradient = False y1 = x * x @@ -503,7 +503,7 @@ def test_dygraph_grad(grad_outputs=None): return dx.numpy() - grad_value = paddle.fill_constant(shape=[1], value=4.0, dtype='float32') + grad_value = 
paddle.fluid.layers.fill_constant(shape=[1], value=4.0, dtype='float32') # dy1 = [1], dy2 = [1] print(test_dygraph_grad(None)) # [7.] @@ -515,7 +515,7 @@ def test_dygraph_grad(grad_outputs=None): print(test_dygraph_grad([grad_value, None])) # [19.] # dy1 = [3], dy2 = [4] - grad_y1 = paddle.fill_constant(shape=[1], value=3.0, dtype='float32') + grad_y1 = paddle.fluid.layers.fill_constant(shape=[1], value=3.0, dtype='float32') print(test_dygraph_grad([grad_y1, grad_value])) # [24.] ''' diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index fb87ea4455d34..a18a6ed5c3985 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -140,12 +140,12 @@ def load_dygraph(model_path, **configs): Args: model_path(str) : The file prefix store the state_dict. (The path should Not contain suffix '.pdparams') - **configs (dict, optional): other save configuration options for compatibility. We do not + **configs (dict, optional): Other load configuration options for compatibility. We do not recommend using these configurations, if not necessary, DO NOT use them. Default None. The following options are currently supported: - (1) model_filename (string): The inference model file name of the paddle 1.x ``save_inference_model`` + (1) model_filename (str): The inference model file name of the paddle 1.x ``save_inference_model`` save format. Default file name is :code:`__model__` . - (2) params_filename (string): The persistable variables file name of the paddle 1.x ``save_inference_model`` + (2) params_filename (str): The persistable variables file name of the paddle 1.x ``save_inference_model`` save format. No default file name, save variables separately by default. Returns: @@ -164,7 +164,7 @@ def load_dygraph(model_path, **configs): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 1004665ca15fb..feb8b0f9c9a16 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -82,6 +82,29 @@ def __getitem__(self, item): return self.tolist()[item] +class LazyInitialized(object): + """ + Descriptor to implement lazy initialization of property. 
+ """ + + def __init__(self, function): + self.function = function + + def __get__(self, instance, cls): + val = self.function(instance) + setattr(instance, self.function.__name__, val) + return val + + +def _change_is_test_status(program, is_test): + # change all `is_test` attributes + for block in program.blocks: + for op in block.ops: + if op.has_attr('is_test'): + op._set_attr('is_test', is_test) + return program + + class PartialProgramLayer(layers.Layer): """ PartialProgramLayer wraps all the ops from layers decorated by `@declarative` @@ -109,15 +132,30 @@ def __init__(self, main_program, inputs, outputs, parameters=None): self._outputs = NestSequence(outputs, need_check=True) self._params = parameters if parameters is not None else [] - main_program = self._verify_program(main_program) - self._infer_program = self._clone_for_test(main_program) - self._train_program = self._append_backward_desc(main_program) - - self._set_grad_type(self._params) + self._origin_main_program = self._verify_program(main_program) self._inner_scope = core.Scope() # Set default mode to train self.training = True + @LazyInitialized + def _infer_program(self): + """ + Lazy initialized property of infer_program. + """ + return self._clone_for_test(self._origin_main_program) + + @LazyInitialized + def _train_program(self): + """ + Lazy initialized property of train_program. + """ + train_program = self._append_backward_desc(self._origin_main_program) + # Note: Only set grad type once after initializing train program. So we + # put it here. + self._set_grad_type(self._params, train_program) + + return train_program + def _verify_program(self, main_program): """ Verify that the program parameter is initialized, prune some unused params, @@ -132,7 +170,8 @@ def _verify_program(self, main_program): @switch_to_static_graph def _append_backward_desc(self, main_program): - program = main_program.clone() + # make sure all status of is_test are False in train mode. + program = _change_is_test_status(main_program.clone(), is_test=False) targets = [] for out in self._outputs.tolist(): if isinstance(out, framework.Variable): @@ -280,7 +319,7 @@ def _remove_no_value(self, out_vars): return out_vars - def _set_grad_type(self, params): + def _set_grad_type(self, params, train_program): # NOTE: if user set sparse gradient mode, the param's gradient # will be SelectedRows, not LoDTensor. But tracer will just # set param grad VarBase by forward VarBase(LoDTensor) @@ -289,7 +328,7 @@ def _set_grad_type(self, params): # be user wanted result. for param in params: grad_name = param.name + core.grad_var_suffix() - grad_var = self._train_program.desc.block(0).find_var( + grad_var = train_program.desc.block(0).find_var( cpt.to_bytes(grad_name)) # NOTE: cannot find var desc maybe no problem, such as in batch_norm if grad_var is None: diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index ddf44d805d1bf..2ff3fe833d66d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -174,7 +174,7 @@ def from_func_and_args(cls, function_spec, args, kwargs, class_instance): # 1. filter `self` in args if args and isinstance(args[0], layers.Layer): args = args[1:] - # 2. convert tensor and numpy array into InputSpec + # 2. 
convert tensor and numpy array into InputSpec _args, _kwargs = function_spec.unified_args_and_kwargs(args, kwargs) input_with_spec = function_spec.args_to_input_spec(_args, _kwargs) @@ -592,9 +592,8 @@ def from_func_spec(func_spec, input_spec, class_instance): inputs = tuple([class_instance] + list(inputs)) # 2. Gets all ParamBases and buffered VarBases in the function - all_parameters_and_buffers = list( - get_parameters(class_instance).values()) + list( - get_buffers(class_instance).values()) + all_parameters_and_buffers = _extract_indeed_params_buffers( + class_instance) # 3. Builds program only once and returns the output Variables. with param_guard(get_parameters( @@ -622,6 +621,17 @@ def from_func_spec(func_spec, input_spec, class_instance): startup_program=startup_program) +def _extract_indeed_params_buffers(class_instance): + """ + To filter not initialzed buffers. + """ + params = list(get_parameters(class_instance).values()) + buffers = list(get_buffers(class_instance).values()) + buffers = [buffer for buffer in buffers if len(buffer.shape) != 0] + + return params + buffers + + class ProgramCache(object): """ Wrapper class for the program functions defined by dygraph function. @@ -702,11 +712,11 @@ class ProgramTranslator(object): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - # Two methods get same object because ProgramTranslator is a singleton - fluid.dygraph.ProgramTranslator() - fluid.dygraph.ProgramTranslator.get_instance() + # Two methods get same object because ProgramTranslator is a singleton + paddle.jit.ProgramTranslator() + paddle.jit.ProgramTranslator.get_instance() """ @@ -743,11 +753,11 @@ def __init__(self): def enable(self, enable_to_static): """ - Enable or disable the converting from imperative to declarative by + Enable or disable the converting from imperative to static graph by ProgramTranslator globally. Args: - enable_to_static (bool): True or False to enable or disable declarative. + enable_to_static (bool): True or False to enable or disable converting to static. Returns: None. @@ -755,25 +765,24 @@ def enable(self, enable_to_static): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle - @fluid.dygraph.jit.declarative - def func(x): - x = fluid.dygraph.to_variable(x) - if fluid.layers.mean(x) > 0: - x_v = x - 1 - else: - x_v = x + 1 - return x_v - prog_trans = fluid.dygraph.ProgramTranslator() - prog_trans.enable(False) + @paddle.jit.to_static + def func(x): + if paddle.mean(x) > 0: + x_v = x - 1 + else: + x_v = x + 1 + return x_v - x = np.ones([1, 2]) - # The declarative is disabled so the func is run in dygraph - with fluid.dygraph.guard(): - print(func(x).numpy()) # [[2. 2.]] + + prog_trans = paddle.jit.ProgramTranslator() + prog_trans.enable(False) + + x = paddle.ones([1, 2]) + # ProgramTranslator is disabled so the func is run in dygraph + print(func(x).numpy()) # [[0. 0.]] """ check_type(enable_to_static, "enable_to_static", bool, @@ -782,38 +791,37 @@ def func(x): def get_output(self, dygraph_func, *args, **kwargs): """ - Returns the output dygraph VarBase for dygraph function. The dygraph + Returns the output dygraph Tensor for dygraph function. The dygraph function will be translated into static graph function so the under - beneath numerical result will be calculated by declarative mode. + beneath numerical result will be calculated by static graph mode. Args: dygraph_func (callable): the dygraph function. - *args, **kwargs : the input argument of dygraph_func. 
+ *args (tuple): the input argument of dygraph_func. + **kwargs (dict): the input argument of dygraph_func. Returns: - VarBase or tuple of VarBase: the dygraph VarBase containing digital - result. + Tensor or tuple of Tensors: the dygraph Tensor containing digital result. Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle + def func(x): - x = fluid.dygraph.to_variable(x) - if fluid.layers.mean(x) > 0: + if paddle.mean(x) > 0: x_v = x - 1 else: x_v = x + 1 return x_v - prog_trans = fluid.dygraph.ProgramTranslator() - with fluid.dygraph.guard(): - x = np.ones([1, 2]) - x_v = prog_trans.get_output(func, x) - print(x_v.numpy()) # [[0. 0.]] + prog_trans = paddle.jit.ProgramTranslator() + + x = paddle.ones([1, 2]) + x_v = prog_trans.get_output(func, x) + print(x_v.numpy()) # [[0. 0.]] """ assert callable( @@ -875,19 +883,18 @@ def get_func(self, dygraph_func): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle + def func(x): - x = fluid.dygraph.to_variable(x) - if fluid.layers.mean(x) > 0: + if paddle.mean(x) > 0: x_v = x - 1 else: x_v = x + 1 return x_v - prog_trans = fluid.dygraph.ProgramTranslator() + prog_trans = paddle.jit.ProgramTranslator() static_func = prog_trans.get_func(func) print(callable(static_func)) # True @@ -908,43 +915,43 @@ def func(x): def get_program(self, dygraph_func, *args, **kwargs): """ - Returns the translated static program and input/output variables from + Returns the translated static program and input/output Tensors from dygraph function. The users can use the program to run by executor. Args: dygraph_func (callable): the dygraph function. - *args, **kwargs : the input argument of dygraph_func. + *args (tuple): the input argument of dygraph_func. + **kwargs (dict): the input argument of dygraph_func. Returns: tuple of (main_program, startup_program, inputs, outputs) whose - types are (Program, Program, list of Variable, list of Variable). + types are (Program, Program, list of Tensors, list of Tensors). main_program: the converted main program. startup_program: the converted startup program. - inputs: list of input Variables which need to be fed. - outputs: list of output Variables which users can fetch. + inputs: list of input Tensors which need to be fed. + outputs: list of output Tensors which users can fetch. Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle + def func(x): - x = fluid.dygraph.to_variable(x) - if fluid.layers.mean(x) > 0: + if paddle.mean(x) > 0: x_v = x - 1 else: x_v = x + 1 return x_v - prog_trans = fluid.dygraph.ProgramTranslator() - x = np.ones([1, 2]) + prog_trans = paddle.jit.ProgramTranslator() + x = paddle.ones([1, 2]) main_prog, start_prog, inputs, outputs = prog_trans.get_program(func, x) print([i.name for i in inputs]) - # ['feed_0'] the feed input variable name representing x + # [u'generated_tensor_0'] the feed input Tensor name representing x print([o.name for o in outputs]) - # ['_generated_var_4'] the fetch output variable name representing x_v + # [u'_generated_var_4'] the fetch output Tensor name representing x_v """ assert callable( @@ -993,21 +1000,21 @@ def get_code(self, dygraph_func): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle - def func(x): - x = fluid.dygraph.to_variable(x) - if fluid.layers.mean(x) > 0: - x_v = x - 1 - else: - x_v = x + 1 - return x_v - prog_trans = fluid.dygraph.ProgramTranslator() + def func(x): + if paddle.mean(x) > 0: + x_v = x - 1 + else: + x_v = x + 1 + return x_v - code = prog_trans.get_code(func) - print(type(code)) # + + prog_trans = paddle.jit.ProgramTranslator() + + code = prog_trans.get_code(func) + print(type(code)) # """ assert callable( @@ -1040,9 +1047,9 @@ def get_program_cache(self): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - prog_trans = fluid.dygraph.ProgramTranslator() + prog_trans = paddle.jit.ProgramTranslator() prog_cache = prog_trans.get_program_cache() """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 8da7b40db4c6b..b7ebd3800c4c3 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -87,7 +87,7 @@ def create_static_variable_gast_node(name): def create_fill_constant_node(name, value): - func_code = "{} = paddle.fill_constant(shape=[1], ".format(name) + func_code = "{} = paddle.fluid.layers.fill_constant(shape=[1], ".format(name) if isinstance(value, bool): func_code += "dtype='bool', value={})".format(value) return gast.parse(func_code).body[0] diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 6cdd13fba82ac..9eea6d659f7b1 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -470,10 +470,10 @@ def save(layer, path, input_spec=None, **configs): format model, which can be used for inference or fine-tuning after loading. It will save the translated program and all related persistable - variables of input Layer to given ``path``. + variables of input Layer to given ``path`` . ``path`` is the prefix of saved objects, and the saved translated program file - suffix is ``.pdmodel``, the saved persistable variables file suffix is ``.pdiparams``, + suffix is ``.pdmodel`` , the saved persistable variables file suffix is ``.pdiparams`` , and here also saved some additional variable description information to a file, its suffix is ``.pdiparams.info``, these additional information is used in fine-tuning. @@ -483,18 +483,17 @@ def save(layer, path, input_spec=None, **configs): - Other C++ inference APIs Args: - layer (Layer): the Layer to be saved. The Layer should be decorated by `@paddle.jit.to_static`. + layer (Layer): The Layer to be saved. path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. - input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model. - It is the example inputs that will be passed to saved TranslatedLayer's forward - function. If None, all input variables of the original Layer's forward function - would be the inputs of the saved model. Default None. - **configs (dict, optional): other save configuration options for compatibility. We do not + input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward + method, which can be described by InputSpec or example Tensor. If None, all input variables of + the original Layer's forward method would be the inputs of the saved model. Default None. 
+ **configs (dict, optional): Other save configuration options for compatibility. We do not recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: (1) output_spec (list[Tensor]): Selects the output targets of the saved model. - By default, all return variables of original Layer's forward function are kept as the + By default, all return variables of original Layer's forward method are kept as the output of the saved model. If the provided ``output_spec`` list is not all output variables, the saved model will be pruned according to the given ``output_spec`` list. @@ -735,14 +734,14 @@ def load(path, **configs): 4. The parameter's ``trainable`` information is lost and can not be recovered. Args: - path (str): The path prefix to load model. The format is ``dirname/file_prefix`` or ``file_prefix``. - **configs (dict, optional): other load configuration options for compatibility. We do not + path (str): The path prefix to load model. The format is ``dirname/file_prefix`` or ``file_prefix`` . + **configs (dict, optional): Other load configuration options for compatibility. We do not recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: - (1) model_filename (string): The inference model file name of the paddle 1.x + (1) model_filename (str): The inference model file name of the paddle 1.x ``save_inference_model`` save format. Default file name is :code:`__model__` . - (2) params_filename (string): The persistable variables file name of the paddle 1.x + (2) params_filename (str): The persistable variables file name of the paddle 1.x ``save_inference_model`` save format. No default file name, save variables separately by default. @@ -844,7 +843,6 @@ def train(layer, loader, loss_fn, opt): import numpy as np import paddle - import paddle.fluid as fluid import paddle.static as static import paddle.nn as nn import paddle.optimizer as opt @@ -870,9 +868,11 @@ def __getitem__(self, idx): def __len__(self): return self.num_samples + paddle.enable_static() + image = static.data(name='image', shape=[None, 784], dtype='float32') label = static.data(name='label', shape=[None, 1], dtype='int64') - pred = static.nn.fc(input=image, size=10, act='softmax') + pred = static.nn.fc(x=image, size=10, activation='softmax') loss = F.cross_entropy(input=pred, label=label) avg_loss = paddle.mean(loss) @@ -901,7 +901,7 @@ def __len__(self): fetch_list=[avg_loss]) model_path = "fc.example.model" - fluid.io.save_inference_model( + paddle.fluid.io.save_inference_model( model_path, ["image"], [pred], exe) # 2. load model diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 1a488844dec21..9a23e11b8a8bc 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -702,9 +702,6 @@ def forward(self, input): class Pool2D(layers.Layer): """ - :alias_main: paddle.nn.Pool2D - :alias: paddle.nn.Pool2D,paddle.nn.layer.Pool2D,paddle.nn.layer.common.Pool2D - :old_api: paddle.fluid.dygraph.Pool2D This interface is used to construct a callable object of the ``Pool2D`` class. For more details, refer to code examples. 
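For context, the ``paddle.jit.save`` / ``paddle.jit.load`` behaviour documented in the jit.py hunks above can be exercised with a minimal sketch like the one below. The layer, sizes, and path prefix are illustrative assumptions and are not part of this patch; only the path-prefix and ``input_spec`` semantics come from the docstring.

.. code-block:: python

    import paddle
    from paddle.static import InputSpec

    class SimpleNet(paddle.nn.Layer):
        def __init__(self):
            super(SimpleNet, self).__init__()
            self.fc = paddle.nn.Linear(784, 10)

        @paddle.jit.to_static
        def forward(self, x):
            return self.fc(x)

    net = SimpleNet()
    # ``input_spec`` describes the inputs of the saved forward method; saving
    # produces example_model.pdmodel, .pdiparams and .pdiparams.info files.
    paddle.jit.save(net, path="example_model",
                    input_spec=[InputSpec(shape=[None, 784], dtype='float32')])

    # Reload the translated program and parameters for inference or fine-tuning.
    loaded = paddle.jit.load("example_model")
    out = loaded(paddle.randn([2, 784], dtype='float32'))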
@@ -2354,9 +2351,6 @@ def forward(self, input): class BilinearTensorProduct(layers.Layer): """ - :alias_main: paddle.nn.BilinearTensorProduct - :alias: paddle.nn.BilinearTensorProduct,paddle.nn.layer.BilinearTensorProduct,paddle.nn.layer.common.BilinearTensorProduct - :old_api: paddle.fluid.dygraph.BilinearTensorProduct **Add Bilinear Tensor Product Layer** diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index d810709e670c4..28670aa1b038b 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -397,9 +397,7 @@ def train(): labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) - loss = dp_layer.scale_loss(loss) loss.backward() - dp_layer.apply_collective_grads() adam.step() adam.clear_grad() diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 6ac13923a2a7f..0f65bdd6f2100 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -162,8 +162,8 @@ def backward(self, retain_graph=False): # there is no one need gradient on it. tmp.stop_gradient=False inputs.append(tmp) - ret = paddle.sums(inputs) - loss = paddle.reduce_sum(ret) + ret = paddle.add_n(inputs) + loss = paddle.sum(ret) loss.backward() """ @@ -236,22 +236,15 @@ def __str__(self): .. code-block:: python import paddle - paddle.disable_static() - x = paddle.rand([1, 5]) + x = paddle.rand([2, 5]) print(x) - # Variable: eager_tmp_0 - # - place: CUDAPlace(0) - # - shape: [1, 5] - # - layout: NCHW - # - dtype: float - # - data: [0.645307 0.597973 0.732793 0.646921 0.540328] - paddle.enable_static() + + # Tensor(shape=[2, 5], dtype=float32, place=CPUPlace, + # [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436], + # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) """ - tensor = self.value().get_tensor() - if tensor._is_initialized(): - return 'Tensor: %s\n%s' % (self.name, str(tensor)) - else: - return 'Tensor: %s, not initialized' % (self.name) + from paddle.tensor.to_string import to_string + return to_string(self) @property def block(self): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f5660c3fc91a1..be72b4158c317 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -480,7 +480,7 @@ class Executor(object): and single/multiple-CPU running. Args: - place(fluid.CPUPlace()|fluid.CUDAPlace(n)|None): This parameter represents + place(paddle.CPUPlace()|paddle.CUDAPlace(n)|None): This parameter represents which device the executor runs on. When this parameter is None, PaddlePaddle will set the default device according to its installation version. If Paddle is CPU version, the default device would be set to `CPUPlace()` . If Paddle is @@ -492,60 +492,57 @@ class Executor(object): Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.compiler as compiler - import numpy - import os - - # Set place explicitly. - # use_cuda = True - # place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - # exe = fluid.Executor(place) - - # If you don't set place, PaddlePaddle sets the default device. 
- exe = fluid.Executor() - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) - - # Run the startup program once and only once. - # Not need to optimize/compile the startup program. - startup_program.random_seed=1 - exe.run(startup_program) - - # Run the main program directly without compile. - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_data, = exe.run(train_program, - feed={"X": x}, - fetch_list=[loss.name]) - - # Or, compiled the program and run. See `CompiledProgram` - # for more detail. - # NOTE: If you use CPU to run the program or Paddle is - # CPU version, you need to specify the CPU_NUM, otherwise, - # fluid will use all the number of the logic core as - # the CPU_NUM, in that case, the batch size of the input - # should be greater than CPU_NUM, if not, the process will be - # failed by an exception. - - # Set place explicitly. - # if not use_cuda: - # os.environ['CPU_NUM'] = str(2) - - # If you don't set place and PaddlePaddle is CPU version - os.environ['CPU_NUM'] = str(2) - - compiled_prog = compiler.CompiledProgram( - train_program).with_data_parallel( - loss_name=loss.name) - loss_data, = exe.run(compiled_prog, - feed={"X": x}, - fetch_list=[loss.name]) + import paddle + import numpy + import os + + # Executor is only used in static graph mode + paddle.enable_static() + + # Set place explicitly. + # use_cuda = True + # place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + # exe = paddle.static.Executor(place) + + # If you don't set place, PaddlePaddle sets the default device. + exe = paddle.static.Executor() + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + hidden = paddle.static.nn.fc(data, 10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + # Run the startup program once and only once. + # Not need to optimize/compile the startup program. + exe.run(startup_program) + + # Run the main program directly without compile. + x = numpy.random.random(size=(10, 1)).astype('float32') + loss_data, = exe.run(train_program, feed={"X": x}, fetch_list=[loss.name]) + + # Or, compiled the program and run. See `CompiledProgram` + # for more details. + # NOTE: If you use CPU to run the program or Paddle is + # CPU version, you need to specify the CPU_NUM, otherwise, + # PaddlePaddle will use all the number of the logic core as + # the CPU_NUM, in that case, the batch size of the input + # should be greater than CPU_NUM, if not, the process will be + # failed by an exception. + + # Set place explicitly. + # if not use_cuda: + # os.environ['CPU_NUM'] = str(2) + + # If you don't set place and PaddlePaddle is CPU version + os.environ['CPU_NUM'] = str(2) + + compiled_prog = paddle.static.CompiledProgram( + train_program).with_data_parallel(loss_name=loss.name) + loss_data, = exe.run(compiled_prog, feed={"X": x}, fetch_list=[loss.name]) + """ def __init__(self, place=None): @@ -842,10 +839,10 @@ def close(self): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle - cpu = fluid.CPUPlace() - exe = fluid.Executor(cpu) + cpu = paddle.CPUPlace() + exe = paddle.static.Executor(cpu) # execute training or testing exe.close() """ @@ -855,7 +852,7 @@ def close(self): def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, return_numpy, return_merged): - from paddle.optimizer.lr_scheduler import _LRScheduler + from paddle.optimizer.lr import LRScheduler exe = program._executor # TODO(zhenghuihuang): quantization uses Graph in CompiledProgram # instead of program. We will add support for checking Vars in Graph @@ -901,7 +898,7 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, if hasattr(program._program, 'lr_sheduler'): lr_sheduler = program._program.lr_sheduler - assert isinstance(lr_sheduler, _LRScheduler), "must be _LRScheduler" + assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" lr_value = lr_sheduler() lr_var = program._program.global_block().vars[lr_sheduler._var_name] lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype) @@ -928,17 +925,17 @@ def run(self, Run the specified :code:`Program` or :code:`CompiledProgram`. It should be noted that the executor will execute all the operators in :code:`Program` or :code:`CompiledProgram` without pruning some operators of the :code:`Program` or :code:`CompiledProgram` according to fetch_list. And you could - specify the scope to store the :code:`Variables` during the executor running if the scope - is not set, the executor will use the global scope, i.e. :code:`fluid.global_scope()`. + specify the scope to store the :code:`Tensor` during the executor running if the scope + is not set, the executor will use the global scope, i.e. :code:`paddle.static.global_scope()`. Args: program(Program|CompiledProgram): This parameter represents the :code:`Program` or :code:`CompiledProgram` to be executed. If this parameter is not provided, that - parameter is None, the program will be set to :code:`fluid.default_main_program()`. + parameter is None, the program will be set to :code:`paddle.static.default_main_program()`. The default is None. - feed(list|dict): This parameter represents the input variables of the model. + feed(list|dict): This parameter represents the input Tensors of the model. If it is single card training, the feed is dict type, and if it is multi-card - training, the parameter feed can be dict or list type variable. If the + training, the parameter feed can be dict or list of Tensors. If the parameter type is dict, the data in the feed will be split and sent to multiple devices (CPU/GPU), that is to say, the input data will be evenly sent to different devices, so you should make sure the number of samples of @@ -946,23 +943,23 @@ def run(self, if the parameter type is list, those data are copied directly to each device, so the length of this list should be equal to the number of places. The default is None. - fetch_list(list): This parameter represents the variables that need to be returned + fetch_list(list): This parameter represents the Tensors that need to be returned after the model runs. The default is None. - feed_var_name(str): This parameter represents the name of the input variable of + feed_var_name(str): This parameter represents the name of the input Tensor of the feed operator. The default is "feed". 
- fetch_var_name(str): This parameter represents the name of the output variable of + fetch_var_name(str): This parameter represents the name of the output Tensor of the fetch operator. The default is "fetch". scope(Scope): the scope used to run this program, you can switch - it to different scope. default is :code:`fluid.global_scope()` - return_numpy(bool): This parameter indicates whether convert the fetched variables - (the variable specified in the fetch list) to numpy.ndarray. if it is False, + it to different scope. default is :code:`paddle.static.global_scope()` + return_numpy(bool): This parameter indicates whether convert the fetched Tensors + (the Tensor specified in the fetch list) to numpy.ndarray. if it is False, the type of the return value is a list of :code:`LoDTensor`. The default is True. use_program_cache(bool): This parameter indicates whether the input :code:`Program` is cached. If the parameter is True, the model may run faster in the following cases: - the input program is :code:`fluid.Program`, and the parameters(program, feed variable name - and fetch_list variable) of this interface remains unchanged during running. + the input program is :code:`paddle.static.Program`, and the parameters(program, feed Tensor name + and fetch_list Tensor) of this interface remains unchanged during running. The default is False. - return_merged(bool): This parameter indicates whether fetched variables (the variables + return_merged(bool): This parameter indicates whether fetched Tensors (the Tensors specified in the fetch list) should be merged according to the execution device dimension. If :code:`return_merged` is False, the type of the return value is a two-dimensional list of :code:`Tensor` / :code:`LoDTensorArray` ( :code:`return_numpy` is False) or a two-dimensional @@ -996,81 +993,88 @@ def run(self, number of CPU cores or GPU cards, if it is less than, it is recommended that the batch be discarded. 2. If the number of CPU cores or GPU cards available is greater than 1, the fetch - results are spliced together in dimension 0 for the same variable values - (variables in fetch_list) on different devices. + results are spliced together in dimension 0 for the same Tensor values + (Tensors in fetch_list) on different devices. Examples 1: .. code-block:: python - import paddle.fluid as fluid - import numpy - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) + import paddle + import numpy - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - adam = fluid.optimizer.Adam() - adam.minimize(loss) - i = fluid.layers.zeros(shape=[1], dtype='int64') - array = fluid.layers.array_write(x=loss, i=i) + # First create the Executor. + paddle.enable_static() + place = paddle.CPUPlace() # paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + + data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + hidden = paddle.static.nn.fc(data, 10) + loss = paddle.mean(hidden) + adam = paddle.optimizer.Adam() + adam.minimize(loss) + i = paddle.zeros(shape=[1], dtype='int64') + array = paddle.fluid.layers.array_write(x=loss, i=i) - # Run the startup program once and only once. - exe.run(fluid.default_startup_program()) + # Run the startup program once and only once. 
+ exe.run(paddle.static.default_startup_program()) - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_val, array_val = exe.run(feed={'X': x}, - fetch_list=[loss.name, array.name]) - print(array_val) - # [array([0.02153828], dtype=float32)] + x = numpy.random.random(size=(10, 1)).astype('float32') + loss_val, array_val = exe.run(feed={'X': x}, + fetch_list=[loss.name, array.name]) + print(array_val) + # [array([0.02153828], dtype=float32)] Examples 2: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np # First create the Executor. - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) + paddle.enable_static() + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) - data = fluid.data(name='X', shape=[None, 1], dtype='float32') + data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') class_dim = 2 - prediction = fluid.layers.fc(input=data, size=class_dim) - loss = fluid.layers.mean(prediction) - adam = fluid.optimizer.Adam() + prediction = paddle.static.nn.fc(data, class_dim) + loss = paddle.mean(prediction) + adam = paddle.optimizer.Adam() adam.minimize(loss) # Run the startup program once and only once. - exe.run(fluid.default_startup_program()) - build_strategy = fluid.BuildStrategy() - binary = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy) + exe.run(paddle.static.default_startup_program()) + build_strategy = paddle.static.BuildStrategy() + binary = paddle.static.CompiledProgram( + paddle.static.default_main_program()).with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy) batch_size = 6 x = np.random.random(size=(batch_size, 1)).astype('float32') # Set return_merged as False to fetch unmerged results: - unmerged_prediction, = exe.run(binary, feed={'X': x}, - fetch_list=[prediction.name], - return_merged=False) + unmerged_prediction, = exe.run(binary, + feed={'X': x}, + fetch_list=[prediction.name], + return_merged=False) # If the user uses two GPU cards to run this python code, the printed result will be # (2, 3, class_dim). The first dimension value of the printed result is the number of used # GPU cards, and the second dimension value is the quotient of batch_size and the # number of used GPU cards. - print("The unmerged prediction shape: {}".format(np.array(unmerged_prediction).shape)) + print("The unmerged prediction shape: {}".format( + np.array(unmerged_prediction).shape)) print(unmerged_prediction) # Set return_merged as True to fetch merged results: - merged_prediction, = exe.run(binary, feed={'X': x}, - fetch_list=[prediction.name], - return_merged=True) + merged_prediction, = exe.run(binary, + feed={'X': x}, + fetch_list=[prediction.name], + return_merged=True) # If the user uses two GPU cards to run this python code, the printed result will be # (6, class_dim). The first dimension value of the printed result is the batch_size. 
- print("The merged prediction shape: {}".format(np.array(merged_prediction).shape)) + print("The merged prediction shape: {}".format( + np.array(merged_prediction).shape)) print(merged_prediction) - + # Out: # The unmerged prediction shape: (2, 3, 2) # [array([[-0.37620035, -0.19752218], @@ -1085,6 +1089,7 @@ def run(self, # [-0.24635398 -0.13003758] # [-0.49232286 -0.25939852] # [-0.44514108 -0.2345845 ]] + """ try: return self._run_impl( @@ -1238,7 +1243,7 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, def _run_program(self, program, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache): - from paddle.optimizer.lr_scheduler import _LRScheduler + from paddle.optimizer.lr import LRScheduler if feed is None: feed = {} elif isinstance(feed, (list, tuple)): @@ -1296,7 +1301,7 @@ def _run_program(self, program, feed, fetch_list, feed_var_name, self._feed_data(program, feed, feed_var_name, scope) if hasattr(program, 'lr_sheduler'): assert isinstance(program.lr_sheduler, - _LRScheduler), "must be _LRScheduler" + LRScheduler), "must be LRScheduler" lr_sheduler = program.lr_sheduler lr_value = lr_sheduler() lr_var = program.global_block().vars[lr_sheduler._var_name] @@ -1508,9 +1513,9 @@ def infer_from_dataset(self, thread(int): number of thread a user wants to run in this function. Default is 0, which means using thread num of dataset debug(bool): whether a user wants to run infer_from_dataset, default is False - fetch_list(Variable List): fetch variable list, each variable will be printed during + fetch_list(Tensor List): fetch Tensor list, each Tensor will be printed during training, default is None - fetch_info(String List): print information for each variable, default is None + fetch_info(String List): print information for each Tensor, default is None print_period(int): the number of mini-batches for each print, default is 100 fetch_handler(FetchHandler): a user define class for fetch output. @@ -1521,20 +1526,22 @@ def infer_from_dataset(self, .. code-block:: python - import paddle.fluid as fluid + import paddle - place = fluid.CPUPlace() # you can set place = fluid.CUDAPlace(0) to use gpu - exe = fluid.Executor(place) - x = fluid.data(name="x", shape=[None, 10, 10], dtype="int64") - y = fluid.data(name="y", shape=[None, 1], dtype="int64", lod_level=1) - dataset = fluid.DatasetFactory().create_dataset() + paddle.enable_static() + place = paddle.CPUPlace() # you can set place = paddle.CUDAPlace(0) to use gpu + exe = paddle.static.Executor(place) + x = paddle.static.data(name="x", shape=[None, 10, 10], dtype="int64") + y = paddle.static.data(name="y", shape=[None, 1], dtype="int64", lod_level=1) + dataset = paddle.fluid.DatasetFactory().create_dataset() dataset.set_use_var([x, y]) dataset.set_thread(1) - filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"] + # you should set your own filelist, e.g. filelist = ["dataA.txt"] + filelist = [] dataset.set_filelist(filelist) - exe.run(fluid.default_startup_program()) - exe.infer_from_dataset(program=fluid.default_main_program(), - dataset=dataset) + exe.run(paddle.static.default_startup_program()) + exe.infer_from_dataset(program=paddle.static.default_main_program(), + dataset=dataset) """ return self._run_from_dataset(program, dataset, scope, thread, True, @@ -1627,9 +1634,9 @@ def train_from_dataset(self, thread(int): number of thread a user wants to run in this function. 
Default is 0, which means using thread num of dataset debug(bool): whether a user wants to run train_from_dataset - fetch_list(Variable List): fetch variable list, each variable will be printed + fetch_list(Tensor List): fetch Tensor list, each variable will be printed during training - fetch_info(String List): print information for each variable, its length should be equal + fetch_info(String List): print information for each Tensor, its length should be equal to fetch_list print_period(int): the number of mini-batches for each print, default is 100 fetch_handler(FetchHandler): a user define class for fetch output. @@ -1641,19 +1648,21 @@ def train_from_dataset(self, .. code-block:: python - import paddle.fluid as fluid + import paddle - place = fluid.CPUPlace() # you can set place = fluid.CUDAPlace(0) to use gpu - exe = fluid.Executor(place) - x = fluid.data(name="x", shape=[None, 10, 10], dtype="int64") - y = fluid.data(name="y", shape=[None, 1], dtype="int64", lod_level=1) - dataset = fluid.DatasetFactory().create_dataset() + paddle.enable_static() + place = paddle.CPUPlace() # you can set place = paddle.CUDAPlace(0) to use gpu + exe = paddle.static.Executor(place) + x = paddle.static.data(name="x", shape=[None, 10, 10], dtype="int64") + y = paddle.static.data(name="y", shape=[None, 1], dtype="int64", lod_level=1) + dataset = paddle.fluid.DatasetFactory().create_dataset() dataset.set_use_var([x, y]) dataset.set_thread(1) - filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"] + # you should set your own filelist, e.g. filelist = ["dataA.txt"] + filelist = [] dataset.set_filelist(filelist) - exe.run(fluid.default_startup_program()) - exe.train_from_dataset(program=fluid.default_main_program(), + exe.run(paddle.static.default_startup_program()) + exe.train_from_dataset(program=paddle.static.default_main_program(), dataset=dataset) """ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 52c1e5d5e16c1..aaceb22b98dff 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -284,7 +284,17 @@ def _current_expected_place(): global _global_expected_place_ if _global_expected_place_ is None: if core.is_compiled_with_cuda(): - _global_expected_place_ = core.CUDAPlace(0) + try: + device_count = core.get_cuda_device_count() + except Exception as e: + device_count = 0 + if device_count > 0: + _global_expected_place_ = core.CUDAPlace(0) + else: + warnings.warn( + "You are using GPU version Paddle, but your CUDA device is not set properly. CPU device will be used by default." + ) + _global_expected_place_ = core.CPUPlace() else: _global_expected_place_ = core.CPUPlace() @@ -533,7 +543,7 @@ def name_scope(prefix=None): import paddle paddle.enable_static() with paddle.static.name_scope("s1"): - a = paddle.data(name='data', shape=[None, 1], dtype='int32') + a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') b = a + 1 with paddle.static.name_scope("s2"): c = b * 1 @@ -1182,8 +1192,8 @@ def backward(self, retain_graph=False): # there is no one need gradient on it. tmp.stop_gradient=False inputs.append(tmp) - ret = paddle.sums(inputs) - loss = paddle.reduce_sum(ret) + ret = paddle.add_n(inputs) + loss = paddle.sum(ret) loss.backward() """ @@ -1333,7 +1343,9 @@ def to_string(self, throw_on_error, with_details=False): .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() cur_program = fluid.Program() cur_block = cur_program.current_block() new_variable = cur_block.create_var(name="X", @@ -3996,7 +4008,7 @@ class Program(object): with static.program_guard(main_program=main_program, startup_program=startup_program): x = static.data(name="x", shape=[-1, 784], dtype='float32') y = static.data(name="y", shape=[-1, 1], dtype='int32') - z = static.nn.fc(name="fc", input=x, size=10, act="relu") + z = static.nn.fc(name="fc", x=x, size=10, activation="relu") print("main program is: {}".format(main_program)) print("start up program is: {}".format(startup_program)) @@ -4344,7 +4356,7 @@ def clone(self, for_test=False): paddle.enable_static() img = static.data(name='image', shape=[None, 784]) - pred = static.nn.fc(input=img, size=10, act='relu') + pred = static.nn.fc(x=img, size=10, activation='relu') loss = paddle.mean(pred) # Here we use clone before Momentum test_program = static.default_main_program().clone(for_test=True) @@ -4415,10 +4427,10 @@ def print_prog(prog): with static.program_guard(train_program, startup_program): with utils.unique_name.guard(): img = static.data(name='image', shape=[None, 784]) - hidden = static.nn.fc(input=img, size=200, act='relu') + hidden = static.nn.fc(x=img, size=200, activation='relu') hidden = F.dropout(hidden, p=0.5) loss = F.cross_entropy( - input=static.nn.fc(hidden, size=10, act='softmax'), + input=static.nn.fc(x=hidden, size=10, activation='softmax'), label=static.data(name='label', shape=[1], dtype='int64')) avg_loss = paddle.mean(loss) test_program = train_program.clone(for_test=True) @@ -4462,10 +4474,10 @@ def print_prog(prog): def network(): img = static.data(name='image', shape=[None, 784]) - hidden = static.nn.fc(input=img, size=200, act='relu') + hidden = static.nn.fc(x=img, size=200, activation='relu') hidden = F.dropout(hidden, p=0.5) loss = F.cross_entropy( - input=static.nn.fc(hidden, size=10, act='softmax'), + input=static.nn.fc(x=hidden, size=10, activation='softmax'), label=static.data(name='label', shape=[1], dtype='int64')) avg_loss = paddle.mean(loss) return avg_loss @@ -5079,7 +5091,7 @@ def all_parameters(self): program = static.default_main_program() data = static.data(name='x', shape=[None, 13], dtype='float32') - hidden = static.nn.fc(input=data, size=10) + hidden = static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) @@ -5310,8 +5322,8 @@ def __str__(self): # - data: [...] 
paddle.enable_static() """ - return "Parameter containing:\n {}\n - stop_gradient: {}".format( - super(ParamBase, self).__str__(), self.stop_gradient) + return "Parameter containing:\n{tensor}".format( + tensor=super(ParamBase, self).__str__()) __repr__ = __str__ @@ -5345,9 +5357,9 @@ def default_startup_program(): main_program = paddle.static.Program() startup_program = paddle.static.Program() with paddle.static.program_guard(main_program=main_program, startup_program=startup_program): - x = paddle.data(name="x", shape=[-1, 784], dtype='float32') - y = paddle.data(name="y", shape=[-1, 1], dtype='int32') - z = paddle.static.nn.fc(name="fc", input=x, size=10, act="relu") + x = paddle.static.data(name="x", shape=[-1, 784], dtype='float32') + y = paddle.static.data(name="y", shape=[-1, 1], dtype='int32') + z = paddle.static.nn.fc(name="fc", x=x, size=10, activation="relu") print("main program is: {}".format(paddle.static.default_main_program())) print("start up program is: {}".format(paddle.static.default_startup_program())) @@ -5360,7 +5372,7 @@ def default_main_program(): This API can be used to get ``default main program`` which store the descriptions of Ops and tensors. - For example ``z = paddle.elementwise_add(x, y)`` will create a new ``elementwise_add`` + For example ``z = paddle.fluid.layers.elementwise_add(x, y)`` will create a new ``elementwise_add`` Op and a new ``z`` tensor, and they will be recorded in ``default main program`` . The ``default main program`` is the default value for ``Program`` parameter in @@ -5379,18 +5391,18 @@ def default_main_program(): paddle.enable_static() # Sample Network: - data = paddle.data(name='image', shape=[None, 3, 224, 224], dtype='float32') - label = paddle.data(name='label', shape=[None, 1], dtype='int64') + data = paddle.static.data(name='image', shape=[None, 3, 224, 224], dtype='float32') + label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') conv1 = paddle.static.nn.conv2d(data, 4, 5, 1, act=None) bn1 = paddle.static.nn.batch_norm(conv1, act='relu') - pool1 = paddle.nn.functional.pool2d(bn1, 2, 'max', 2) + pool1 = paddle.fluid.layers.pool2d(bn1, 2, 'max', 2) conv2 = paddle.static.nn.conv2d(pool1, 16, 5, 1, act=None) bn2 = paddle.static.nn.batch_norm(conv2, act='relu') - pool2 = paddle.nn.functional.pool2d(bn2, 2, 'max', 2) + pool2 = paddle.fluid.layers.pool2d(bn2, 2, 'max', 2) - fc1 = paddle.static.nn.fc(pool2, size=50, act='relu') - fc2 = paddle.static.nn.fc(fc1, size=102, act='softmax') + fc1 = paddle.static.nn.fc(x=pool2, size=50, activation='relu') + fc2 = paddle.static.nn.fc(x=fc1, size=102, activation='softmax') loss = paddle.nn.functional.loss.cross_entropy(input=fc2, label=label) loss = paddle.mean(loss) @@ -5467,7 +5479,7 @@ def program_guard(main_program, startup_program=None): startup_program = paddle.static.Program() with paddle.static.program_guard(main_program, startup_program): data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32') - hidden = paddle.static.nn.fc(input=data, size=10, act='relu') + hidden = paddle.static.nn.fc(x=data, size=10, activation='relu') Notes: The temporary :code:`Program` can be used if the user does not need to construct either of startup program or main program. 
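The Executor and Program hunks above migrate the docstring examples from the ``fluid`` spelling to ``paddle.static`` and from ``fc(input=..., act=...)`` to ``fc(x=..., activation=...)``. A compact end-to-end sketch in the new style is shown below; the network size, learning rate, and batch shape are illustrative assumptions rather than values taken from this patch.

.. code-block:: python

    import numpy as np
    import paddle

    # Executor and paddle.static.nn only work in static graph mode.
    paddle.enable_static()

    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[None, 13], dtype='float32')
        # Keyword style used in the updated docstrings: x= / activation=.
        hidden = paddle.static.nn.fc(x=x, size=10, activation='relu')
        loss = paddle.mean(hidden)
        paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    loss_val, = exe.run(main_prog,
                        feed={'x': np.random.rand(4, 13).astype('float32')},
                        fetch_list=[loss.name])
    print(loss_val)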
diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py index 40cc2d2dd4e38..77a202317912f 100644 --- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py +++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py @@ -19,6 +19,7 @@ import paddle.fluid as fluid from paddle.fluid.executor import Executor from paddle.fluid.optimizer import SGD +from paddle.optimizer import SGD as SGD_v2 from paddle.fluid.incubate.fleet.base.mode import Mode from paddle.distributed.fleet.base.role_maker import RoleMakerBase @@ -291,7 +292,8 @@ class DistributedOptimizer(object): def __init__(self, optimizer, strategy=None): if not isinstance(optimizer, SGD.__bases__) \ - and not isinstance(optimizer, OptimizerWithMixedPrecision): + and not isinstance(optimizer, OptimizerWithMixedPrecision) \ + and not isinstance(optimizer, SGD_v2.__base__): raise TypeError("optimizer must be an instance of Optimizer") self._optimizer = optimizer diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py index 6e5aae82517d1..6466ce4b42e6e 100644 --- a/python/paddle/fluid/incubate/fleet/collective/__init__.py +++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py @@ -28,6 +28,8 @@ from paddle.fluid import compiler from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel, CheckpointSaver +import paddle + import os import sys import six @@ -505,10 +507,7 @@ def minimize(self, self._strategy) optimize_ops, param_grads = self._optimizer.minimize( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set) + loss, startup_program, parameter_list, no_grad_set=no_grad_set) fleet._origin_program = main_program.clone(for_test=False) fleet._transpiled_program = main_program diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index 90847382c86e1..fe2ba38ee00b6 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -236,9 +236,9 @@ def get_origin_ps_startup_program(self): def get_sparse_varname_on_ps(self, is_distributed, endpoint=None): if not endpoint: endpoint = self.get_ps_endpoint() - varnames = get_sparse_tablenames(self.get_origin_main_program(), is_distributed) + ps_sparse_varnames = [] for varname in varnames: tables = self.get_var_distributed(varname, True) @@ -248,6 +248,55 @@ def get_sparse_varname_on_ps(self, is_distributed, endpoint=None): ps_sparse_varnames.append(table) return ps_sparse_varnames + def get_optimize_varname_on_ps(self, param_name): + origin_param_name, _, _ = _get_varname_parts(param_name) + optimize_var_names = [] + for op in self.get_origin_main_program().global_block().ops: + # check all optimizer op + if int(op.all_attrs()["op_role"]) == 2: + # check param name + if op.input("Param")[0] != origin_param_name: + continue + # check all input + for key in op.input_names: + if key in [ + "Param", "Grad", "LearningRate", "Beta1Tensor", + "Beta2Tensor" + ]: + continue + # check varibale shape related param, e.g: Moment1 + optimize_var_names += self._get_optimizer_param_related_var_name( + op, op.type, key) + return optimize_var_names + + def _get_optimizer_param_related_var_name(self, op, op_type, varkey): + """ + Returns the names for optimizer inputs that need to be load + """ + related_var_names = [] + if op_type == 
"adam": + if varkey in ["Moment1", "Moment2"]: + related_var_names.append(op.input(varkey)[0]) + elif op_type == "adagrad": + if varkey == "Moment": + related_var_names.append(op.input(varkey)[0]) + elif op_type in ["momentum", "lars_momentum"]: + if varkey == "Velocity": + related_var_names.append(op.input(varkey)[0]) + elif op_type == "rmsprop": + if varkey in ["Moment", "MeanSquare"]: + related_var_names.append(op.input(varkey)[0]) + elif op_type == "ftrl": + if varkey in ["SquaredAccumulator", "LinearAccumulator"]: + related_var_names.append(op.input(varkey)[0]) + elif op_type == "sgd": + pass + else: + raise ValueError( + "Not supported optimizer for distributed training: %s" % + op_type) + return related_var_names + def build_ctx(self, vars, mapping, diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 90bcdee50730f..3f826da3ae2be 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -169,7 +169,7 @@ def append_send_ops_pass(program, config): trainer_id = config.get_role_id() pserver_endpoints = config.get_ps_endpoints() - def _append_grad_send_op(union_vars, queue): + def _append_send_op(union_vars, queue): if queue == STEP_COUNTER: send_input_vars = [] @@ -198,43 +198,6 @@ def _append_grad_send_op(union_vars, queue): return dummy_output - def _append_sparse_ids_send_op(): - sparse_var = [] - sparse_tables = [] - unique_sparse_var = {} - for op in program.global_block().ops: - if "is_sparse" in op.all_attrs(): - if op.type == "lookup_table": - op._set_attr('remote_prefetch', False) - for input_var_name, sparse_var_name in zip( - op.input("Ids"), op.input("W")): - if input_var_name in unique_sparse_var: - if unique_sparse_var[input_var_name] == sparse_var_name: - continue - input_var = program.global_block().var(input_var_name) - sparse_var.append(input_var) - sparse_tables.append(sparse_var_name) - unique_sparse_var[input_var_name] = sparse_var_name - - dummy_output = [] - if mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]: - dummy_output = program.global_block().create_var( - name=framework.generate_control_dev_var_name()) - - program.global_block().append_op( - type="send", - inputs={"X": sparse_var}, - outputs={"Out": dummy_output}, - attrs={ - "send_varnames": sparse_tables, - "merge_add": True, - "use_send_handler": False, - "endpoints": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE - }) - - return dummy_output - def _append_barrier_op(dummys): program.global_block().append_op( type="send_barrier", @@ -251,12 +214,8 @@ def _append_barrier_op(dummys): sends = config.get_trainer_send_context() - if mode == DistributedMode.GEO: - dummys.append(_append_sparse_ids_send_op()) - else: - for merged_name, send in sends.items(): - dummys.append( - _append_grad_send_op(send.origin_varnames(), merged_name)) + for merged_name, send in sends.items(): + dummys.append(_append_send_op(send.origin_varnames(), merged_name)) if mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]: _append_barrier_op(dummys) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 5cd1aa884a928..0189bc2bd7407 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ 
b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -167,6 +167,113 @@ def _find_multi_distributed_lookup_table(self, losses): ret_list.append(x[0]) return ret_list + def _if_last_block(self, op, _equal_dict): + # for conditional_block op + cond_str = op.input('Cond')[0] + bool_test = False + if cond_str.startswith('equal'): + bool_test = True + vars_ = op.input('Input') + equal_keys = _equal_dict.keys() + for var_cond in vars_: + if var_cond in equal_keys: + if bool_test: + print("the conditional block is error") + return False + return True + + def _generte_cond_para_map(self, op, _fill_value_dict, _equal_fill_dict, + _now_program, _all_params): + # generate cond value to parameter map recursively + cond_str = op.input('Cond')[0] + vars_ = op.input('Input') + + if self._if_last_block(op, _equal_fill_dict): + vars_ = op.input('Input') + cond_key = "" + if cond_str.startswith('equal'): + cond_key = int(_fill_value_dict[_equal_fill_dict[cond_str]]) + else: + cond_key = -1 + p_list = [] + for var_cond in vars_: + if var_cond in _all_params: + p_list.append(var_cond) + + self._cond_params[cond_key] = p_list + self._other_params.extend(p_list) + else: + ops_cond = _now_program.block(int(op.attr('sub_block').id)).ops + for op in ops_cond: + if op.type == 'conditional_block': + self._generte_cond_para_map(op, _fill_value_dict, + _equal_fill_dict, _now_program, + _all_params) + + def _has_conditional_block(self, loss): + now_program = loss.block.program + root_block = now_program.block(0) + ops_ = root_block.ops + for op in ops_: + if op.type == 'conditional_block': + return True + return False + + def _check_params_grads(self, params, grads): + if len(params) != len(grads): + raise ValueError("params size != grads size, %s vs %s" % + (len(params), len(grads))) + + pname2grad = dict() + for i in range(len(params)): + pname = params[i].name + gname = grads[i].name + if pname != gname[:-5]: + raise ValueError(" params != grads , %s vs %s" % (pname, gname)) + pname2grad[pname] = grads[i] + + return pname2grad + + def _generate_multi_dense_table(self, + params, + grads, + cond_params, + other_params, + sparse_table_names, + dense_table_id=0): + # generate multi dense table by cond value + pname2grad = self._check_params_grads(params, grads) + root_params_list = [] + root_grads_list = [] + dense_tables = [] + for i, p in enumerate(params): + if p.name not in other_params and p.name not in sparse_table_names: + root_params_list.append(p) + root_grads_list.append(grads[i]) + if len(root_params_list) > 0: + dense_tables.append(dense_table_id) + dense_table_id += 1 + lists_params = [[] for i in range(len(cond_params.keys()))] + lists_grads = [[] for i in range(len(cond_params.keys()))] + + key_id = 0 + name2key = dict() + cond2denseid = dict() + for key, value in cond_params.items(): + cond2denseid[key] = dense_table_id + dense_tables.append(dense_table_id) + dense_table_id += 1 + for v in value: + name2key[v] = key_id + key_id += 1 + + for p in params: + if p.name in other_params: + lists_params[name2key[p.name]].append(p) + lists_grads[name2key[p.name]].append(pname2grad[p.name]) + + return dense_tables, cond2denseid, lists_params, lists_grads, root_params_list, root_grads_list + def _minimize(self, losses, startup_program=None, @@ -215,6 +322,31 @@ def _minimize(self, no_grad_set), key=lambda x: x[0].name) + # has condition_block op means multi-task + flag_multi_task = self._has_conditional_block(loss) + if flag_multi_task: + self._cond_params = dict() + 
self._other_params = [] + now_program = loss.block.program + root_block = now_program.block(0) + all_params = [] + for par in root_block.all_parameters(): + all_params.append(par.name) + + ops_ = root_block.ops + fill_value_dict = dict() + equal_fill_dict = dict() + for op in ops_: + # conditional_block op must has fill_constant and equal op + if op.type == 'fill_constant': + fill_value_dict[op.output('Out')[0]] = op.attr('value') + if op.type == 'equal': + equal_fill_dict[op.output('Out')[0]] = op.input('Y')[0] + if op.type == 'conditional_block': + self._generte_cond_para_map(op, fill_value_dict, + equal_fill_dict, + now_program, all_params) + if prog_id not in program_id_set: program_id_set.add(prog_id) sparse_table = self._find_multi_distributed_lookup_table([loss]) @@ -402,17 +534,65 @@ def _minimize(self, data_norm_grads.append(i[1]) if not is_data_norm_data: grads.append(i[1]) + # for new dense table + multi_task_dense_tables_push = [] + multi_task_dense_tables_pull = [] + if flag_multi_task: + dense_tables, cond2denseid, lists_params, lists_grads, root_params_list, root_grads_list = self._generate_multi_dense_table( + params, grads, self._cond_params, + self._other_params, sparse_table_names, + dense_table_index) + program_configs[program_id][ + 'cond2denseid'] = cond2denseid + multi_task_dense_tables_push = dense_tables + multi_task_dense_tables_pull = dense_tables[:] if strategy.get('dense_table') is not None: - server.add_dense_table(dense_table_index, params, grads, - strategy['dense_table'], - sparse_table_names) + if flag_multi_task: + server_dense_table_index = dense_table_index + if len(root_params_list) > 0: + server.add_dense_table( + server_dense_table_index, root_params_list, + root_grads_list, strategy['dense_table'], + sparse_table_names) + server_dense_table_index += 1 + + for i in range(len(lists_params)): + server.add_dense_table( + server_dense_table_index, lists_params[i], + lists_grads[i], strategy['dense_table'], + sparse_table_names) + server_dense_table_index += 1 + else: + server.add_dense_table( + dense_table_index, params, grads, + strategy['dense_table'], sparse_table_names) + else: server.add_dense_table(dense_table_index, params, grads, None, sparse_table_names) - worker.add_dense_table( - dense_table_index, self._learning_rate, params, grads, - dense_start_table_id, sparse_table_names) + + if flag_multi_task: + + if len(root_params_list) > 0: + worker.add_dense_table( + dense_table_index, self._learning_rate, + root_params_list, root_grads_list, + dense_start_table_id, sparse_table_names) + dense_table_index += 1 + + for i in range(len(lists_params)): + worker.add_dense_table( + dense_table_index, self._learning_rate, + lists_params[i], lists_grads[i], + dense_start_table_id, sparse_table_names) + dense_table_index += 1 + + dense_table_index -= 1 + else: + worker.add_dense_table( + dense_table_index, self._learning_rate, params, + grads, dense_start_table_id, sparse_table_names) if FLEET_GLOBAL_DICT["enable"]: cur_prog = losses[loss_index].block.program @@ -430,15 +610,28 @@ def _minimize(self, program_id] and "push_dense" in program_configs[ program_id] and len(program_configs[program_id][ "pull_dense"]) > 0: - program_configs[program_id]["pull_dense"].extend( - [dense_table_index]) - program_configs[program_id]["push_dense"].extend( - [dense_table_index]) + if flag_multi_task: + program_configs[program_id]["pull_dense"].extend( + multi_task_dense_tables_pull) + program_configs[program_id]["push_dense"].extend( + multi_task_dense_tables_push) + else: 
+ program_configs[program_id]["pull_dense"].extend( + [dense_table_index]) + program_configs[program_id]["push_dense"].extend( + [dense_table_index]) else: - program_configs[program_id][ - "pull_dense"] = [dense_table_index] - program_configs[program_id][ - "push_dense"] = [dense_table_index] + if flag_multi_task: + program_configs[program_id][ + "pull_dense"] = multi_task_dense_tables_pull + program_configs[program_id][ + "push_dense"] = multi_task_dense_tables_push + else: + program_configs[program_id][ + "pull_dense"] = [dense_table_index] + program_configs[program_id][ + "push_dense"] = [dense_table_index] + if len(data_norm_params) != 0 and len(data_norm_grads) != 0: dense_table_index += 1 if strategy.get('datanorm_table') is not None: diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 67c572d4988ce..c21a96cb0108e 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -105,10 +105,14 @@ class ConstantInitializer(Initializer): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle + import paddle.fluid as fluid + paddle.enable_static() x = fluid.data(name="data", shape=[8, 32, 32], dtype="float32") - fc = fluid.layers.fc(input=x, size=10, - param_attr=fluid.initializer.Constant(value=2.0)) + fc = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.Constant(value=2.0)) """ @@ -623,7 +627,9 @@ class MSRAInitializer(Initializer): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() x = fluid.data(name="data", shape=[8, 32, 32], dtype="float32") fc = fluid.layers.fc(input=x, size=10, param_attr=fluid.initializer.MSRA(uniform=False)) diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 51fa1677b868e..111f33e613a16 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -49,7 +49,7 @@ def run_check(): This func should not be called only if you need to verify installation Examples: - .. code-block: python + .. code-block:: python import paddle.fluid as fluid fluid.install_check.run_check() diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 0c77917c78190..65ca5a211e3c8 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1110,9 +1110,6 @@ def assign_skip_lod_tensor_array(input, output): def while_loop(cond, body, loop_vars, is_test=False, name=None): """ :api_attr: Static Graph - :alias_main: paddle.nn.while_loop - :alias: paddle.nn.while_loop,paddle.nn.control_flow.while_loop - :old_api: paddle.fluid.layers.while_loop while_loop is one of the control flows. Repeats while_loop `body` until `cond` returns False. 
@@ -1151,6 +1148,9 @@ def while_loop(cond, body, loop_vars, is_test=False, name=None): import paddle.fluid as fluid import paddle.fluid.layers as layers + import paddle + paddle.enable_static() + def cond(i, ten): return i < ten @@ -2506,21 +2506,21 @@ def case(pred_fn_pairs, default=None, name=None): paddle.enable_static() def fn_1(): - return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1) + return paddle.full(shape=[1, 2], dtype='float32', fill_value=1) def fn_2(): - return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2) + return paddle.full(shape=[2, 2], dtype='int32', fill_value=2) def fn_3(): - return paddle.fill_constant(shape=[3], dtype='int32', value=3) + return paddle.full(shape=[3], dtype='int32', fill_value=3) main_program = paddle.static.default_startup_program() startup_program = paddle.static.default_main_program() with paddle.static.program_guard(main_program, startup_program): - x = paddle.fill_constant(shape=[1], dtype='float32', value=0.3) - y = paddle.fill_constant(shape=[1], dtype='float32', value=0.1) - z = paddle.fill_constant(shape=[1], dtype='float32', value=0.2) + x = paddle.full(shape=[1], dtype='float32', fill_value=0.3) + y = paddle.full(shape=[1], dtype='float32', fill_value=0.1) + z = paddle.full(shape=[1], dtype='float32', fill_value=0.2) pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3 pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1 @@ -3626,19 +3626,19 @@ def switch_case(branch_index, branch_fns, default=None, name=None): paddle.enable_static() def fn_1(): - return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1) + return paddle.full(shape=[1, 2], dtype='float32', fill_value=1) def fn_2(): - return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2) + return paddle.full(shape=[2, 2], dtype='int32', fill_value=2) def fn_3(): - return paddle.fill_constant(shape=[3], dtype='int32', value=3) + return paddle.full(shape=[3], dtype='int32', fill_value=3) main_program = paddle.static.default_startup_program() startup_program = paddle.static.default_main_program() with paddle.static.program_guard(main_program, startup_program): - index_1 = paddle.fill_constant(shape=[1], dtype='int32', value=1) - index_2 = paddle.fill_constant(shape=[1], dtype='int32', value=2) + index_1 = paddle.full(shape=[1], dtype='int32', fill_value=1) + index_2 = paddle.full(shape=[1], dtype='int32', fill_value=2) out_1 = paddle.static.nn.switch_case( branch_index=index_1, diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index bf87d1fc5a947..951817db015d5 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -629,9 +629,6 @@ def detection_output(loc, nms_eta=1.0, return_index=False): """ - :alias_main: paddle.nn.functional.detection_output - :alias: paddle.nn.functional.detection_output,paddle.nn.functional.vision.detection_output - :old_api: paddle.fluid.layers.detection_output Given the regression locations, classification confidences and prior boxes, calculate the detection outputs by performing following steps: @@ -700,6 +697,9 @@ class number, M is number of bounding boxes. .. 
code-block:: python import paddle.fluid as fluid + import paddle + + paddle.enable_static() pb = fluid.data(name='prior_box', shape=[10, 4], dtype='float32') pbv = fluid.data(name='prior_box_var', shape=[10, 4], dtype='float32') @@ -822,9 +822,6 @@ def box_coder(prior_box, name=None, axis=0): """ - :alias_main: paddle.nn.functional.box_coder - :alias: paddle.nn.functional.box_coder,paddle.nn.functional.vision.box_coder - :old_api: paddle.fluid.layers.box_coder **Box Coder Layer** @@ -911,6 +908,8 @@ def box_coder(prior_box, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() # For encode prior_box_encode = fluid.data(name='prior_box_encode', shape=[512, 4], @@ -1013,9 +1012,6 @@ def yolov3_loss(x, name=None, scale_x_y=1.): """ - :alias_main: paddle.nn.functional.yolov3_loss - :alias: paddle.nn.functional.yolov3_loss,paddle.nn.functional.vision.yolov3_loss - :old_api: paddle.fluid.layers.yolov3_loss ${comment} @@ -1060,6 +1056,8 @@ def yolov3_loss(x, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() x = fluid.data(name='x', shape=[None, 255, 13, 13], dtype='float32') gt_box = fluid.data(name='gt_box', shape=[None, 6, 4], dtype='float32') gt_label = fluid.data(name='gt_label', shape=[None, 6], dtype='int32') @@ -1140,9 +1138,6 @@ def yolo_box(x, name=None, scale_x_y=1.): """ - :alias_main: paddle.nn.functional.yolo_box - :alias: paddle.nn.functional.yolo_box,paddle.nn.functional.vision.yolo_box - :old_api: paddle.fluid.layers.yolo_box ${comment} @@ -1175,6 +1170,8 @@ def yolo_box(x, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() x = fluid.data(name='x', shape=[None, 255, 13, 13], dtype='float32') img_size = fluid.data(name='img_size',shape=[None, 2],dtype='int64') anchors = [10, 13, 16, 30, 33, 23] @@ -1319,9 +1316,6 @@ def bipartite_match(dist_matrix, dist_threshold=None, name=None): """ - :alias_main: paddle.nn.functional.bipartite_match - :alias: paddle.nn.functional.bipartite_match,paddle.nn.functional.vision.bipartite_match - :old_api: paddle.fluid.layers.bipartite_match This operator implements a greedy bipartite matching algorithm, which is used to obtain the matching with the maximum distance based on the input @@ -1413,9 +1407,6 @@ def target_assign(input, mismatch_value=None, name=None): """ - :alias_main: paddle.nn.functional.target_assign - :alias: paddle.nn.functional.target_assign,paddle.nn.functional.extension.target_assign - :old_api: paddle.fluid.layers.target_assign This operator can be, for given the target bounding boxes or labels, to assign classification and regression targets to each prediction as well as @@ -1484,6 +1475,8 @@ def target_assign(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() x = fluid.data( name='x', shape=[4, 20, 4], @@ -1778,9 +1771,6 @@ def prior_box(input, name=None, min_max_aspect_ratios_order=False): """ - :alias_main: paddle.nn.functional.prior_box - :alias: paddle.nn.functional.prior_box,paddle.nn.functional.vision.prior_box - :old_api: paddle.fluid.layers.prior_box This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. 
Each position of the input produce N prior boxes, N is determined by @@ -1832,6 +1822,8 @@ def prior_box(input, #declarative mode import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() input = fluid.data(name="input", shape=[None,3,6,9]) image = fluid.data(name="image", shape=[None,3,9,12]) box, var = fluid.layers.prior_box( @@ -1939,10 +1931,6 @@ def density_prior_box(input, flatten_to_2d=False, name=None): """ - :alias_main: paddle.nn.functional.density_prior_box - :alias: paddle.nn.functional.density_prior_box,paddle.nn.functional.vision.density_prior_box - :old_api: paddle.fluid.layers.density_prior_box - This op generates density prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Each position of the input produce N prior boxes, N is @@ -2008,6 +1996,8 @@ def density_prior_box(input, import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() input = fluid.data(name="input", shape=[None,3,6,9]) image = fluid.data(name="image", shape=[None,3,9,12]) @@ -2408,9 +2398,6 @@ def anchor_generator(input, offset=0.5, name=None): """ - :alias_main: paddle.nn.functional.anchor_generator - :alias: paddle.nn.functional.anchor_generator,paddle.nn.functional.vision.anchor_generator - :old_api: paddle.fluid.layers.anchor_generator **Anchor generator operator** @@ -2457,6 +2444,9 @@ def anchor_generator(input, .. code-block:: python import paddle.fluid as fluid + import paddle + + paddle.enable_static() conv1 = fluid.data(name='conv1', shape=[None, 48, 16, 16], dtype='float32') anchor, var = fluid.layers.anchor_generator( input=conv1, @@ -2613,9 +2603,6 @@ def generate_proposal_labels(rpn_rois, is_cls_agnostic=False, is_cascade_rcnn=False): """ - :alias_main: paddle.nn.functional.generate_proposal_labels - :alias: paddle.nn.functional.generate_proposal_labels,paddle.nn.functional.vision.generate_proposal_labels - :old_api: paddle.fluid.layers.generate_proposal_labels **Generate Proposal Labels of Faster-RCNN** @@ -2738,9 +2725,6 @@ def generate_proposal_labels(rpn_rois, def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, labels_int32, num_classes, resolution): """ - :alias_main: paddle.nn.functional.generate_mask_labels - :alias: paddle.nn.functional.generate_mask_labels,paddle.nn.functional.vision.generate_mask_labels - :old_api: paddle.fluid.layers.generate_mask_labels **Generate Mask Labels for Mask-RCNN** @@ -2897,9 +2881,6 @@ def generate_proposals(scores, return_rois_num=False, name=None): """ - :alias_main: paddle.nn.functional.generate_proposals - :alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals - :old_api: paddle.fluid.layers.generate_proposals **Generate proposal Faster-RCNN** @@ -2965,6 +2946,8 @@ def generate_proposals(scores, .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() scores = fluid.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') bbox_deltas = fluid.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') im_info = fluid.data(name='im_info', shape=[None, 3], dtype='float32') @@ -3036,9 +3019,6 @@ def generate_proposals(scores, def box_clip(input, im_info, name=None): """ - :alias_main: paddle.nn.functional.box_clip - :alias: paddle.nn.functional.box_clip,paddle.nn.functional.vision.box_clip - :old_api: paddle.fluid.layers.box_clip Clip the box into the size given by im_info For each input box, The formula is given as follows: @@ -3079,6 +3059,8 @@ def box_clip(input, im_info, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() boxes = fluid.data( name='boxes', shape=[None, 8, 4], dtype='float32', lod_level=1) im_info = fluid.data(name='im_info', shape=[-1 ,3]) @@ -3265,9 +3247,6 @@ def multiclass_nms(bboxes, background_label=0, name=None): """ - :alias_main: paddle.nn.functional.multiclass_nms - :alias: paddle.nn.functional.multiclass_nms,paddle.nn.functional.extension.multiclass_nms - :old_api: paddle.fluid.layers.multiclass_nms **Multiclass NMS** @@ -3363,6 +3342,8 @@ class number. The data type is float32 or float64. import paddle.fluid as fluid + import paddle + paddle.enable_static() boxes = fluid.data(name='bboxes', shape=[None,81, 4], dtype='float32', lod_level=1) scores = fluid.data(name='scores', shape=[None,81], @@ -3674,9 +3655,6 @@ def distribute_fpn_proposals(fpn_rois, rois_num=None, name=None): """ - :alias_main: paddle.nn.functional.distribute_fpn_proposals - :alias: paddle.nn.functional.distribute_fpn_proposals,paddle.nn.functional.vision.distribute_fpn_proposals - :old_api: paddle.fluid.layers.distribute_fpn_proposals **This op only takes LoDTensor as input.** In Feature Pyramid Networks (FPN) models, it is needed to distribute all proposals into different FPN @@ -3732,6 +3710,8 @@ def distribute_fpn_proposals(fpn_rois, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() fpn_rois = fluid.data( name='data', shape=[None, 4], dtype='float32', lod_level=1) multi_rois, restore_ind = fluid.layers.distribute_fpn_proposals( @@ -3798,9 +3778,6 @@ def box_decoder_and_assign(prior_box, box_clip, name=None): """ - :alias_main: paddle.nn.functional.box_decoder_and_assign - :alias: paddle.nn.functional.box_decoder_and_assign,paddle.nn.functional.vision.box_decoder_and_assign - :old_api: paddle.fluid.layers.box_decoder_and_assign ${comment} Args: @@ -3825,6 +3802,8 @@ def box_decoder_and_assign(prior_box, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() pb = fluid.data( name='prior_box', shape=[None, 4], dtype='float32') pbv = fluid.data( @@ -3874,9 +3853,6 @@ def collect_fpn_proposals(multi_rois, rois_num_per_level=None, name=None): """ - :alias_main: paddle.nn.functional.collect_fpn_proposals - :alias: paddle.nn.functional.collect_fpn_proposals,paddle.nn.functional.vision.collect_fpn_proposals - :old_api: paddle.fluid.layers.collect_fpn_proposals **This OP only supports LoDTensor as input**. Concat multi-level RoIs (Region of Interest) and select N RoIs with respect to multi_scores. @@ -3922,6 +3898,8 @@ def collect_fpn_proposals(multi_rois, .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() multi_rois = [] multi_scores = [] for i in range(4): diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 47e62016a20d7..2710ab12cd3da 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -52,9 +52,6 @@ def _decay_step_counter(begin=0): def noam_decay(d_model, warmup_steps, learning_rate=1.0): """ - :alias_main: paddle.nn.functional.noam_decay - :alias: paddle.nn.functional.noam_decay,paddle.nn.functional.learning_rate.noam_decay - :old_api: paddle.fluid.layers.noam_decay Noam decay method. The numpy implementation of noam decay as follows. @@ -115,9 +112,6 @@ def noam_decay(d_model, warmup_steps, learning_rate=1.0): def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ - :alias_main: paddle.nn.functional.exponential_decay - :alias: paddle.nn.functional.exponential_decay,paddle.nn.functional.learning_rate.exponential_decay - :old_api: paddle.fluid.layers.exponential_decay Applies exponential decay to the learning rate. @@ -149,6 +143,9 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): .. code-block:: python import paddle.fluid as fluid + import paddle + + paddle.enable_static() base_lr = 0.1 sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( @@ -176,9 +173,6 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ - :alias_main: paddle.nn.functional.natural_exp_decay - :alias: paddle.nn.functional.natural_exp_decay,paddle.nn.functional.learning_rate.natural_exp_decay - :old_api: paddle.fluid.layers.natural_exp_decay Applies natural exponential decay to the initial learning rate. @@ -210,6 +204,9 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): .. code-block:: python import paddle.fluid as fluid + import paddle + + paddle.enable_static() base_lr = 0.1 sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.natural_exp_decay( @@ -237,9 +234,6 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ - :alias_main: paddle.nn.functional.inverse_time_decay - :alias: paddle.nn.functional.inverse_time_decay,paddle.nn.functional.learning_rate.inverse_time_decay - :old_api: paddle.fluid.layers.inverse_time_decay Applies inverse time decay to the initial learning rate. @@ -271,6 +265,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() base_lr = 0.1 sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.inverse_time_decay( @@ -302,10 +298,6 @@ def polynomial_decay(learning_rate, power=1.0, cycle=False): """ - :alias_main: paddle.nn.functional.polynomial_decay - :alias: paddle.nn.functional.polynomial_decay,paddle.nn.functional.learning_rate.polynomial_decay - :old_api: paddle.fluid.layers.polynomial_decay -2 Applies polynomial decay to the initial learning rate. .. 
code-block:: text @@ -371,9 +363,6 @@ def polynomial_decay(learning_rate, def piecewise_decay(boundaries, values): """ - :alias_main: paddle.nn.functional.piecewise_decay - :alias: paddle.nn.functional.piecewise_decay,paddle.nn.functional.learning_rate.piecewise_decay - :old_api: paddle.fluid.layers.piecewise_decay Applies piecewise decay to the initial learning rate. @@ -401,6 +390,8 @@ def piecewise_decay(boundaries, values): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() boundaries = [10000, 20000] values = [1.0, 0.5, 0.1] optimizer = fluid.optimizer.Momentum( @@ -450,9 +441,6 @@ def piecewise_decay(boundaries, values): def cosine_decay(learning_rate, step_each_epoch, epochs): """ - :alias_main: paddle.nn.functional.cosine_decay - :alias: paddle.nn.functional.cosine_decay,paddle.nn.functional.learning_rate.cosine_decay - :old_api: paddle.fluid.layers.cosine_decay Applies cosine decay to the learning rate. @@ -499,9 +487,6 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): """ - :alias_main: paddle.nn.functional.linear_lr_warmup - :alias: paddle.nn.functional.linear_lr_warmup,paddle.nn.functional.learning_rate.linear_lr_warmup - :old_api: paddle.fluid.layers.linear_lr_warmup This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 2b1449a94e6e5..b363c37f64b87 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -59,9 +59,6 @@ def center_loss(input, update_center=True): """ :api_attr: Static Graph - :alias_main: paddle.nn.functional.center_loss - :alias: paddle.nn.functional.center_loss,paddle.nn.functional.loss.center_loss - :old_api: paddle.fluid.layers.center_loss **Center loss Cost layer** @@ -92,6 +89,8 @@ def center_loss(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() input = fluid.data(name='x',shape=[20,30],dtype='float32') label = fluid.data(name='y',shape=[20,1],dtype='int64') @@ -153,9 +152,6 @@ def center_loss(input, def bpr_loss(input, label, name=None): """ - :alias_main: paddle.nn.functional.bpr_loss - :alias: paddle.nn.functional.bpr_loss,paddle.nn.functional.loss.bpr_loss - :old_api: paddle.fluid.layers.bpr_loss **Bayesian Personalized Ranking Loss Operator** @@ -183,6 +179,9 @@ def bpr_loss(input, label, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle + + paddle.enable_static() neg_size = 10 label = fluid.data( @@ -1309,9 +1308,6 @@ def softmax_with_cross_entropy(logits, def rank_loss(label, left, right, name=None): """ - :alias_main: paddle.nn.functional.rank_loss - :alias: paddle.nn.functional.rank_loss,paddle.nn.functional.loss.rank_loss - :old_api: paddle.fluid.layers.rank_loss This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model with a training sample consisting of a pair of documents (A and B), The label (P) @@ -1349,6 +1345,8 @@ def rank_loss(label, left, right, name=None): .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() label = fluid.data(name="label", shape=[-1, 1], dtype="float32") left = fluid.data(name="left", shape=[-1, 1], dtype="float32") right = fluid.data(name="right", shape=[-1, 1], dtype="float32") @@ -1491,9 +1489,6 @@ def teacher_student_sigmoid_loss(input, soft_max_up_bound=15.0, soft_max_lower_bound=-15.0): """ - :alias_main: paddle.nn.functional.teacher_student_sigmoid_loss - :alias: paddle.nn.functional.teacher_student_sigmoid_loss,paddle.nn.functional.loss.teacher_student_sigmoid_loss - :old_api: paddle.fluid.layers.teacher_student_sigmoid_loss **Teacher Student Log Loss Layer** @@ -1521,7 +1516,8 @@ def teacher_student_sigmoid_loss(input, .. code-block:: python import paddle.fluid as fluid - + import paddle + paddle.enable_static() batch_size = 64 label = fluid.data( name="label", shape=[batch_size, 1], dtype="int64") diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a6402a2852c2a..c2bb96ead2bf9 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1859,7 +1859,6 @@ def _get_default_param_initializer(): return helper.append_activation(pre_act) -@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool2d") @templatedoc() def pool2d(input, pool_size=-1, @@ -1873,9 +1872,6 @@ def pool2d(input, exclusive=True, data_format="NCHW"): """ - :alias_main: paddle.nn.functional.pool2d - :alias: paddle.nn.functional.pool2d,paddle.nn.functional.pooling.pool2d - :old_api: paddle.fluid.layers.pool2d ${comment} @@ -1934,6 +1930,9 @@ def pool2d(input, .. code-block:: python import paddle.fluid as fluid + import paddle + + paddle.enable_static() data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32') @@ -2077,7 +2076,6 @@ def is_list_or_tuple(ele): return pool_out -@deprecated(since="2.0.0", update_to="paddle.nn.functional.pool3d") @templatedoc() def pool3d(input, pool_size=-1, @@ -2091,9 +2089,6 @@ def pool3d(input, exclusive=True, data_format="NCDHW"): """ - :alias_main: paddle.nn.functional.pool3d - :alias: paddle.nn.functional.pool3d,paddle.nn.functional.pooling.pool3d - :old_api: paddle.fluid.layers.pool3d ${comment} @@ -2153,6 +2148,9 @@ def pool3d(input, .. code-block:: python import paddle.fluid as fluid + import paddle + + paddle.enable_static() data = fluid.data(name='data', shape=[None, 3, 32, 32, 32], dtype='float32') @@ -3674,10 +3672,11 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - weight = fluid.data(name='weight', shape=[2, 8, 32, 32], dtype='float32') - x = fluid.layers.spectral_norm(weight=weight, dim=1, power_iters=2) + paddle.enable_static() + weight = paddle.data(name='weight', shape=[2, 8, 32, 32], dtype='float32') + x = paddle.static.nn.spectral_norm(weight=weight, dim=1, power_iters=2) """ helper = LayerHelper('spectral_norm', **locals()) check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], @@ -4317,9 +4316,6 @@ def is_list_or_tuple(ele): def reduce_sum(input, dim=None, keep_dim=False, name=None): """ - :alias_main: paddle.reduce_sum - :alias: paddle.reduce_sum,paddle.tensor.reduce_sum,paddle.tensor.math.reduce_sum - :old_api: paddle.fluid.layers.reduce_sum Computes the sum of tensor elements over the given dimension. @@ -4349,6 +4345,8 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] @@ -4451,9 +4449,6 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): def reduce_max(input, dim=None, keep_dim=False, name=None): """ - :alias_main: paddle.reduce_max - :alias: paddle.reduce_max,paddle.tensor.reduce_max,paddle.tensor.math.reduce_max - :old_api: paddle.fluid.layers.reduce_max Computes the maximum of tensor elements over the given dimension. @@ -4480,6 +4475,8 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] @@ -4517,9 +4514,6 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): def reduce_min(input, dim=None, keep_dim=False, name=None): """ - :alias_main: paddle.reduce_min - :alias: paddle.reduce_min,paddle.tensor.reduce_min,paddle.tensor.math.reduce_min - :old_api: paddle.fluid.layers.reduce_min Computes the minimum of tensor elements over the given dimension. @@ -4546,6 +4540,9 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] @@ -4583,9 +4580,6 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): def reduce_prod(input, dim=None, keep_dim=False, name=None): """ - :alias_main: paddle.reduce_prod - :alias: paddle.reduce_prod,paddle.tensor.reduce_prod,paddle.tensor.math.reduce_prod - :old_api: paddle.fluid.layers.reduce_prod Computes the product of tensor elements over the given dimension. @@ -4612,6 +4606,8 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] @@ -4659,9 +4655,6 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): def reduce_all(input, dim=None, keep_dim=False, name=None): """ - :alias_main: paddle.reduce_all - :alias: paddle.reduce_all,paddle.tensor.reduce_all,paddle.tensor.logic.reduce_all - :old_api: paddle.fluid.layers.reduce_all This OP computes the ``logical and`` of tensor elements over the given dimension, and output the result. @@ -4723,10 +4716,6 @@ def reduce_all(input, dim=None, keep_dim=False, name=None): def reduce_any(input, dim=None, keep_dim=False, name=None): """ - :alias_main: paddle.reduce_any - :alias: paddle.reduce_any,paddle.tensor.reduce_any,paddle.tensor.logic.reduce_any - :old_api: paddle.fluid.layers.reduce_any - This OP computes the ``logical or`` of tensor elements over the given dimension, and output the result. Args: @@ -4939,9 +4928,6 @@ def _get_SectionsTensorList(one_list): def l2_normalize(x, axis, epsilon=1e-12, name=None): """ - :alias_main: paddle.nn.functional.l2_normalize - :alias: paddle.nn.functional.l2_normalize,paddle.nn.functional.norm.l2_normalize - :old_api: paddle.fluid.layers.l2_normalize This op normalizes `x` along dimension `axis` using an L2 norm. 
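A small sketch (with illustrative 2x3 values) that checks `fluid.layers.l2_normalize` against a direct NumPy implementation of this normalization, ``y = x / sqrt(max(sum(x**2), epsilon))`` taken along the chosen axis:

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    x = fluid.data(name="x", shape=[2, 3], dtype="float32")
    y = fluid.layers.l2_normalize(x=x, axis=0, epsilon=1e-12)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    x_np = np.array([[3.0, 0.0, 1.0], [4.0, 0.0, 2.0]], dtype="float32")
    y_np, = exe.run(fluid.default_main_program(),
                    feed={"x": x_np}, fetch_list=[y])

    # NumPy reference of the same formula; epsilon keeps the all-zero
    # column from dividing by zero.
    ref = x_np / np.sqrt(
        np.maximum(np.sum(x_np ** 2, axis=0, keepdims=True), 1e-12))
    print(np.allclose(y_np, ref, atol=1e-6))  # expected: True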
For a 1-D tensor (`dim` is fixed to 0), this layer computes @@ -4972,6 +4958,8 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): # declarative mode import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() input = fluid.data(name="input", shape=[2,3]) output = fluid.layers.l2_normalize(x=input,axis=0) place = fluid.CPUPlace() @@ -5785,9 +5773,6 @@ def multiplex(inputs, index): def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): """ - :alias_main: paddle.nn.functional.smooth_l1 - :alias: paddle.nn.functional.smooth_l1,paddle.nn.functional.loss.smooth_l1 - :old_api: paddle.fluid.layers.smooth_l1 This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`. It takes the first dimension of :attr:`x` and :attr:`y` as batch size. @@ -5823,6 +5808,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() data = fluid.data(name="x", shape=[-1, 3], dtype="float32") label = fluid.data(name="y", shape=[-1, 3], dtype="float32") result = fluid.layers.smooth_l1(data,label) @@ -6132,7 +6119,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): return dygraph_utils._append_activation_in_dygraph(out, act) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'reshape') + x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', + 'bool'], 'reshape') check_type(shape, 'shape', (list, tuple, Variable), 'reshape') check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape') @@ -6857,9 +6845,6 @@ def roi_pool(input, rois_num=None, name=None): """ - :alias_main: paddle.nn.functional.roi_pool - :alias: paddle.nn.functional.roi_pool,paddle.nn.functional.vision.roi_pool - :old_api: paddle.fluid.layers.roi_pool This operator implements the roi_pooling layer. Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). @@ -6894,6 +6879,8 @@ def roi_pool(input, import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() DATATYPE='float32' @@ -6964,9 +6951,6 @@ def roi_align(input, rois_num=None, name=None): """ - :alias_main: paddle.nn.functional.roi_align - :alias: paddle.nn.functional.roi_align,paddle.nn.functional.vision.roi_align - :old_api: paddle.fluid.layers.roi_align ${comment} @@ -6996,6 +6980,9 @@ def roi_align(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + x = fluid.data( name='data', shape=[None, 256, 32, 32], dtype='float32') rois = fluid.data( @@ -7108,9 +7095,6 @@ def image_resize(input, align_mode=1, data_format='NCHW'): """ - :alias_main: paddle.nn.functional.image_resize - :alias: paddle.nn.functional.image_resize,paddle.nn.functional.vision.image_resize - :old_api: paddle.fluid.layers.image_resize This op resizes a batch of images. @@ -7350,8 +7334,10 @@ def image_resize(input, .. 
code-block:: python #declarative mode + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() input = fluid.data(name="input", shape=[None,3,6,10]) #1 @@ -7708,9 +7694,6 @@ def resize_bilinear(input, align_mode=1, data_format='NCHW'): """ - :alias_main: paddle.nn.functional.resize_bilinear - :alias: paddle.nn.functional.resize_bilinear,paddle.nn.functional.vision.resize_bilinear - :old_api: paddle.fluid.layers.resize_bilinear This op resizes the input by performing bilinear interpolation based on given output shape which specified by actual_shape, out_shape and scale @@ -7804,6 +7787,8 @@ def resize_bilinear(input, #declarative mode import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() input = fluid.data(name="input", shape=[None,3,6,10]) #1 @@ -7875,9 +7860,6 @@ def resize_trilinear(input, align_mode=1, data_format='NCDHW'): """ - :alias_main: paddle.nn.functional.resize_trilinear - :alias: paddle.nn.functional.resize_trilinear,paddle.nn.functional.vision.resize_trilinear - :old_api: paddle.fluid.layers.resize_trilinear This op resizes the input by performing trilinear interpolation based on given output shape which specified by actual_shape, out_shape and scale @@ -7970,7 +7952,9 @@ def resize_trilinear(input, #declarative mode import paddle.fluid as fluid + import paddle import numpy as np + paddle.enable_static() input = fluid.data(name="input", shape=[None,3,6,8,10]) #1 @@ -8043,9 +8027,6 @@ def resize_nearest(input, align_corners=True, data_format='NCHW'): """ - :alias_main: paddle.nn.functional.resize_nearest - :alias: paddle.nn.functional.resize_nearest,paddle.nn.functional.vision.resize_nearest - :old_api: paddle.fluid.layers.resize_nearest This op resizes the input by performing nearest neighbor interpolation in both the height direction and the width direction based on given output shape @@ -8128,6 +8109,9 @@ def resize_nearest(input, #declarative mode import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() + input = fluid.data(name="input", shape=[None,3,6,10]) #1 @@ -8670,10 +8654,6 @@ def random_crop(x, shape, seed=None): def log(x, name=None): """ - :alias_main: paddle.log - :alias: paddle.log,paddle.tensor.log,paddle.tensor.math.log - :old_api: paddle.fluid.layers.log - Calculates the natural log of the given input tensor, element-wise. .. math:: @@ -8681,31 +8661,23 @@ def log(x, name=None): Out = \\ln(x) Args: - x (Variable): Input LoDTensor or Tensor. Must be one of the following types: float32, float64. + x (Tensor): Input Tensor. Must be one of the following types: float32, float64. name (str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: The natural log of the input LoDTensor or Tensor computed element-wise. + Tensor: The natural log of the input Tensor computed element-wise. Examples: .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - - # Graph Organizing - x = fluid.layers.data(name="x", shape=[1], dtype="float32") - res = fluid.layers.log(x) - - # Create an executor using CPU as an example - exe = fluid.Executor(fluid.CPUPlace()) + import paddle - # Execute - x_i = np.array([[1], [2]]).astype(np.float32) - res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res]) - print(res_val) # [[0.], [0.6931472]] + x = [[2,3,4], [7,8,9]] + x = paddle.to_tensor(x, dtype='float32') + res = paddle.log(x) + # [[0.693147, 1.09861, 1.38629], [1.94591, 2.07944, 2.19722]] """ if in_dygraph_mode(): return core.ops.log(x) @@ -8846,33 +8818,36 @@ def mean_iou(input, label, num_classes): Parameters: - input (Variable): A n-D Tensor of prediction results for semantic labels with type int32 or int64. - label (Variable): A Tensor of ground truth labels with type int32 or int64. + input (Tensor): A n-D Tensor of prediction results for semantic labels with type int32 or int64. + label (Tensor): A Tensor of ground truth labels with type int32 or int64. Its shape should be the same as input. num_classes (int32): The possible number of labels. Returns: - Three Variables. + Three Tensors. - - mean_iou(Variable) : A 1-D Tensor representing the mean intersection-over-union with shape [1]. \ + - mean_iou(Tensor) : A 1-D Tensor representing the mean intersection-over-union with shape [1]. \ Data type is float32. - - out_wrong(Variable) : A 1-D Tensor with shape [num_classes]. Data type is int32. \ + - out_wrong(Tensor) : A 1-D Tensor with shape [num_classes]. Data type is int32. \ The wrong numbers of each class. - - out_correct(Variable): A 1-D Tensor with shape [num_classes]. Data type is int32. The correct numbers of each class. + - out_correct(Tensor): A 1-D Tensor with shape [num_classes]. Data type is int32. The correct numbers of each class. Examples: .. code-block:: python - import paddle.fluid as fluid - iou_shape = [None, 32, 32] + import paddle + + iou_shape = [64, 32, 32] num_classes = 5 - predict = fluid.data(name='predict', shape=iou_shape, dtype='int64') - label = fluid.data(name='label', shape=iou_shape, dtype='int64') - mean_iou, out_wrong, out_correct = fluid.layers.mean_iou(predict, label, - num_classes) + predict = paddle.randint(low=0, high=255, shape=iou_shape, dtype='int64') + label = paddle.randint(low=0, high=255, shape=iou_shape, dtype='int64') + mean_iou, out_wrong, out_correct = paddle.metric.mean_iou(predict, label, num_classes) """ + if in_dygraph_mode(): + return core.ops.mean_iou(input, label, 'num_classes', num_classes) + helper = LayerHelper('mean_iou', **locals()) check_variable_and_dtype(input, 'Predictions', ['int32', 'int64'], 'mean_iou') @@ -8962,6 +8937,9 @@ def crop(x, shape=None, offsets=None, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle.fluid as fluid + import paddle + paddle.enable_static() x = fluid.data(name="x", shape=[3, 3, 5], dtype="float32") y = fluid.data(name="y", shape=[2, 2, 3], dtype="float32") crop = fluid.layers.crop(x, shape=y) @@ -9000,10 +8978,6 @@ def crop(x, shape=None, offsets=None, name=None): def crop_tensor(x, shape=None, offsets=None, name=None): """ - :alias_main: paddle.crop_tensor - :alias: paddle.crop_tensor,paddle.tensor.crop_tensor,paddle.tensor.creation.crop_tensor - :old_api: paddle.fluid.layers.crop_tensor - Crop input into output, as specified by offsets and shape. .. 
code-block:: text @@ -9073,6 +9047,9 @@ def crop_tensor(x, shape=None, offsets=None, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle.fluid as fluid + import paddle + paddle.enable_static() x = fluid.data(name="x", shape=[None, 3, 5], dtype="float32") # x.shape = [-1, 3, 5], where -1 indicates batch size, and it will get the exact value in runtime. @@ -9280,9 +9257,6 @@ def pad2d(input, data_format="NCHW", name=None): """ - :alias_main: paddle.nn.functional.pad2d - :alias: paddle.nn.functional.pad2d,paddle.nn.functional.common.pad2d - :old_api: paddle.fluid.layers.pad2d Pad 2-d images according to 'paddings' and 'mode'. If mode is 'reflect', paddings[0] and paddings[1] must be no greater @@ -9347,7 +9321,7 @@ def pad2d(input, x_shape = (1, 1, 3, 4) x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 tensor_x = paddle.to_tensor(x) - y = F.pad2d(tensor_x, paddings=[1, 2, 2, 1], pad_value=1, mode='constant') + y = paddle.fluid.layers.pad2d(tensor_x, paddings=[1, 2, 2, 1], pad_value=1, mode='constant') print(y.numpy()) # [[[[ 1. 1. 1. 1. 1. 1. 1.] # [ 1. 1. 1. 2. 3. 4. 1.] @@ -9360,7 +9334,7 @@ def pad2d(input, x_shape = (1, 1, 2, 3) x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 tensor_x = paddle.to_tensor(x) - y = F.pad2d(tensor_x, paddings=[1, 1, 1, 1], mode='reflect') + y = paddle.fluid.layers.pad2d(tensor_x, paddings=[1, 1, 1, 1], mode='reflect') print(y.numpy()) # [[[[5. 4. 5. 6. 5.] # [2. 1. 2. 3. 2.] @@ -9884,9 +9858,6 @@ def leaky_relu(x, alpha=0.02, name=None): def soft_relu(x, threshold=40.0, name=None): """ - :alias_main: paddle.nn.functional.soft_relu - :alias: paddle.nn.functional.soft_relu,paddle.nn.functional.activation.soft_relu - :old_api: paddle.fluid.layers.soft_relu SoftRelu Activation Operator. @@ -9906,7 +9877,10 @@ def soft_relu(x, threshold=40.0, name=None): import paddle.fluid as fluid import numpy as np + import numpy as np + import paddle + paddle.enable_static() inputs = fluid.layers.data(name="x", shape=[2, 2], dtype="float32") output = fluid.layers.soft_relu(inputs, threshold=20.0) @@ -10246,6 +10220,11 @@ def unstack(x, axis=0, num=None): y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5] """ + if in_dygraph_mode(): + if num == None: + num = x.shape[axis] + return core.ops.unstack(x, num, 'axis', int(axis), 'num', num) + helper = LayerHelper('unstack', **locals()) if num is None: if axis is None or x.shape[axis] <= 0: @@ -10861,16 +10840,12 @@ def sum(x): # and '__int64' on Windows. They both represent 64-bit integer variables. """ - return paddle.elementwise_sum(x) + return paddle.add_n(x) @templatedoc() def slice(input, axes, starts, ends): """ - :alias_main: paddle.slice - :alias: paddle.slice,paddle.tensor.slice,paddle.tensor.manipulation.slice - :old_api: paddle.fluid.layers.slice - This operator produces a slice of ``input`` along multiple axes. Similar to numpy: https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html Slice uses ``axes``, ``starts`` and ``ends`` attributes to specify the start and @@ -10903,43 +10878,42 @@ def slice(input, axes, starts, ends): ends = [-1, 1000] # -1 denotes the reverse 0th position of dimension 0. Then: result = [ [2, 3, 4], ] # result = data[0:1, 1:4] + Args: - input (Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. + input (Tensor): A ``Tensor`` . 
The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . - starts (list|tuple|Variable): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of - it should be integers or Tensors with shape [1]. If ``starts`` is an Variable, it should be an 1-D Tensor. + starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of + it should be integers or Tensors with shape [1]. If ``starts`` is an Tensor, it should be an 1-D Tensor. It represents starting indices of corresponding axis in ``axes``. - ends (list|tuple|Variable): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of - it should be integers or Tensors with shape [1]. If ``ends`` is an Variable, it should be an 1-D Tensor . + ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of + it should be integers or Tensors with shape [1]. If ``ends`` is an Tensor, it should be an 1-D Tensor . It represents ending indices of corresponding axis in ``axes``. Returns: - Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``input``. + Tensor: A ``Tensor``. The data type is same as ``input``. Raises: - TypeError: The type of ``starts`` must be list, tuple or Variable. - TypeError: The type of ``ends`` must be list, tuple or Variable. + TypeError: The type of ``starts`` must be list, tuple or Tensor. + TypeError: The type of ``ends`` must be list, tuple or Tensor. Examples: .. code-block:: python - import paddle.fluid as fluid - - input = fluid.data( - name="input", shape=[4, 5, 6], dtype='float32') + import paddle + input = paddle.rand(shape=[4, 5, 6], dtype='float32') # example 1: - # attr starts is a list which doesn't contain tensor Variable. + # attr starts is a list which doesn't contain tensor. axes = [0, 1, 2] starts = [-3, 0, 2] ends = [3, 2, 4] - sliced_1 = fluid.layers.slice(input, axes=axes, starts=starts, ends=ends) + sliced_1 = paddle.slice(input, axes=axes, starts=starts, ends=ends) # sliced_1 is input[0:3, 0:2, 2:4]. # example 2: - # attr starts is a list which contain tensor Variable. - minus_3 = fluid.layers.fill_constant([1], "int32", -3) - sliced_2 = fluid.layers.slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends) + # attr starts is a list which contain tensor. + minus_3 = paddle.full([1], -3, "int32") + sliced_2 = paddle.slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends) # sliced_2 is input[0:3, 0:2, 2:4]. """ if in_dygraph_mode(): @@ -11387,10 +11361,6 @@ def _elementwise_op(helper): def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ - :alias_main: paddle.scale - :alias: paddle.scale,paddle.tensor.scale,paddle.tensor.math.scale - :old_api: paddle.fluid.layers.scale - Scale operator. Putting scale and bias to the input Tensor as following: @@ -11406,52 +11376,33 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Out=scale*(X+bias) Args: - x(Variable): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8. - scale(float|Variable): The scale factor of the input, it should be a float number or a Variable with shape [1] and data type as float32. + x(Tensor): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8. 
+ scale(float|Tensor): The scale factor of the input, it should be a float number or a Tensor with shape [1] and data type as float32. bias(float): The bias to be put on the input. bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances. act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable(Tensor|LoDTensor): Output tensor of scale operator, with shape and data type same as input. + Tensor: Output tensor of scale operator, with shape and data type same as input. Examples: .. code-block:: python + + # scale as a float32 number + import paddle - import paddle.fluid as fluid - import numpy as np - - inputs = fluid.layers.data(name="x", shape=[2, 3], dtype='float32') - output = fluid.layers.scale(inputs, scale = 2.0, bias = 1.0) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) - - res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) - print(res) # [array([[ 3., 5., 7.], [ 9., 11., 13.]], dtype=float32)] + data = paddle.randn(shape=[2,3], dtype='float32') + res = paddle.scale(data, scale=2.0, bias=1.0) .. code-block:: python - # scale with parameter scale as Variable - import paddle.fluid as fluid - import numpy as np - - inputs = fluid.layers.data(name="x", shape=[2, 3], dtype='float32') - scale = fluid.layers.data(name="scale", shape=[1], dtype='float32', - append_batch_size=False) - output = fluid.layers.scale(inputs, scale = scale, bias = 1.0) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) - scale_np = np.array([2.]).astype(np.float32) + # scale with parameter scale as a Tensor + import paddle - res = exe.run(fluid.default_main_program(), feed={'x':img, 'scale':scale_np}, fetch_list=[output]) - print(res) # [array([[ 3., 5., 7.], [ 9., 11., 13.]], dtype=float32)] + data = paddle.randn(shape=[2, 3], dtype='float32') + factor = paddle.to_tensor([2], dtype='float32') + res = paddle.scale(data, scale=factor, bias=1.0) """ @@ -11485,9 +11436,6 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): def elementwise_add(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_add - :alias: paddle.elementwise_add,paddle.tensor.elementwise_add,paddle.tensor.math.elementwise_add - :old_api: paddle.fluid.layers.elementwise_add Examples: @@ -11579,9 +11527,6 @@ def gen_data(): @deprecated(since="2.0.0", update_to="paddle.divide") def elementwise_div(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_div - :alias: paddle.elementwise_div,paddle.tensor.elementwise_div,paddle.tensor.math.elementwise_div - :old_api: paddle.fluid.layers.elementwise_div Examples: @@ -11667,9 +11612,6 @@ def gen_data(): def elementwise_sub(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_sub - :alias: paddle.elementwise_sub,paddle.tensor.elementwise_sub,paddle.tensor.math.elementwise_sub - :old_api: paddle.fluid.layers.elementwise_sub Examples: @@ -11756,9 +11698,6 @@ def gen_data(): @deprecated(since="2.0.0", update_to="paddle.multiply") def elementwise_mul(x, y, axis=-1, act=None, name=None): """ - :alias_main: 
paddle.elementwise_mul - :alias: paddle.elementwise_mul,paddle.tensor.elementwise_mul,paddle.tensor.math.elementwise_mul - :old_api: paddle.fluid.layers.elementwise_mul Examples: @@ -11966,9 +11905,6 @@ def gen_data(): def elementwise_pow(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_pow - :alias: paddle.elementwise_pow,paddle.tensor.elementwise_pow,paddle.tensor.math.elementwise_pow - :old_api: paddle.fluid.layers.elementwise_pow Examples: @@ -12003,9 +11939,6 @@ def gen_data(): @deprecated(since="2.0.0", update_to="paddle.remainder") def elementwise_mod(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_mod - :alias: paddle.elementwise_mod,paddle.tensor.elementwise_mod,paddle.tensor.math.elementwise_mod - :old_api: paddle.fluid.layers.elementwise_mod Examples: @@ -12041,9 +11974,6 @@ def gen_data(): @deprecated(since="2.0.0", update_to="paddle.floor_divide") def elementwise_floordiv(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_floordiv - :alias: paddle.elementwise_floordiv,paddle.tensor.elementwise_floordiv,paddle.tensor.math.elementwise_floordiv - :old_api: paddle.fluid.layers.elementwise_floordiv Examples: @@ -12544,6 +12474,8 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() dataX = fluid.layers.data(name="dataX", append_batch_size = False, shape=[2, 5], dtype="float32") dataY = fluid.layers.data(name="dataY", append_batch_size = False, shape=[5, 3], dtype="float32") output = fluid.layers.mul(dataX, dataY, @@ -12608,9 +12540,6 @@ def maxout(x, groups, name=None, axis=1): def space_to_depth(x, blocksize, name=None): """ - :alias_main: paddle.nn.functional.space_to_depth - :alias: paddle.nn.functional.space_to_depth,paddle.nn.functional.vision.space_to_depth - :old_api: paddle.fluid.layers.space_to_depth Gives a blocksize to space_to_depth the input LoDtensor with Layout: [batch, channel, height, width] @@ -12667,7 +12596,10 @@ def space_to_depth(x, blocksize, name=None): import paddle.fluid as fluid import numpy as np + import numpy as np + import paddle + paddle.enable_static() data = fluid.data( name='data', shape=[1, 4, 2, 2], dtype='float32') space_to_depthed = fluid.layers.space_to_depth( @@ -12719,9 +12651,6 @@ def affine_channel(x, name=None, act=None): """ - :alias_main: paddle.nn.functional.affine_channel - :alias: paddle.nn.functional.affine_channel,paddle.nn.functional.vision.affine_channel - :old_api: paddle.fluid.layers.affine_channel Applies a separate affine transformation to each channel of the input. Useful for replacing spatial batch norm with its equivalent fixed @@ -12755,7 +12684,10 @@ def affine_channel(x, import numpy as np import paddle.fluid as fluid + import paddle.fluid as fluid + import paddle + paddle.enable_static() use_gpu = False place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) @@ -12881,6 +12813,8 @@ def similarity_focus(input, axis, indexes, name=None): .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() data = fluid.data( name='data', shape=[-1, 3, 2, 2], dtype='float32') fluid.layers.similarity_focus(input=data, axis=1, indexes=[0]) @@ -12908,9 +12842,6 @@ def similarity_focus(input, axis, indexes, name=None): def hash(input, hash_size, num_hash=1, name=None): """ - :alias_main: paddle.nn.functional.hash - :alias: paddle.nn.functional.hash,paddle.nn.functional.lod.hash - :old_api: paddle.fluid.layers.hash This OP hash the input to an integer less than the hash_size. The hash algorithm we used was xxHash - Extremely fast hash algorithm @@ -12932,6 +12863,8 @@ def hash(input, hash_size, num_hash=1, name=None): import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() place = fluid.core.CPUPlace() @@ -12972,9 +12905,6 @@ def hash(input, hash_size, num_hash=1, name=None): @templatedoc() def grid_sampler(x, grid, name=None): """ - :alias_main: paddle.nn.functional.grid_sampler - :alias: paddle.nn.functional.grid_sampler,paddle.nn.functional.vision.grid_sampler - :old_api: paddle.fluid.layers.grid_sampler This operation samples input X by using bilinear interpolation based on flow field grid, which is usually generated by :code:`affine_grid` . The grid of @@ -13048,7 +12978,10 @@ def grid_sampler(x, grid, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle.fluid as fluid + import paddle + paddle.enable_static() # use with affine_grid x = fluid.data(name='x', shape=[None, 10, 32, 32], dtype='float32') theta = fluid.layers.data(name='theta', shape=[2, 3], dtype='float32') @@ -13132,9 +13065,6 @@ def log_loss(input, label, epsilon=1e-4, name=None): def add_position_encoding(input, alpha, beta, name=None): """ - :alias_main: paddle.nn.functional.add_position_encoding - :alias: paddle.nn.functional.add_position_encoding,paddle.nn.functional.extension.add_position_encoding - :old_api: paddle.fluid.layers.add_position_encoding This operator performs weighted sum of input feature at each position (position in the sequence) and the corresponding position encoding. @@ -13175,10 +13105,9 @@ def add_position_encoding(input, alpha, beta, name=None): .. 
code-block:: python import paddle - import paddle.nn.functional as F tensor = paddle.randn([16, 32, 64]) - position_tensor = F.add_position_encoding( + position_tensor = paddle.fluid.layers.add_position_encoding( input=tensor, alpha=1.0, beta=1.0) """ @@ -13403,9 +13332,6 @@ def shuffle_channel(x, group, name=None): @templatedoc() def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): """ - :alias_main: paddle.nn.functional.temporal_shift - :alias: paddle.nn.functional.temporal_shift,paddle.nn.functional.extension.temporal_shift - :old_api: paddle.fluid.layers.temporal_shift **Temporal Shift Operator** @@ -13433,7 +13359,7 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): import paddle.nn.functional as F input = paddle.randn([6, 4, 2, 2]) - out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) + out = paddle.fluid.layers.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ helper = LayerHelper("temporal_shift", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') @@ -13630,7 +13556,7 @@ def simple_net(img, label): # User-defined debug functions that print out the input Tensor paddle.static.nn.py_func(func=debug_func, x=hidden, out=None) - prediction = paddle.static.nn.fc(hidden, size=10, act='softmax') + prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') loss = paddle.static.nn.cross_entropy(input=prediction, label=label) return paddle.mean(loss) @@ -13768,9 +13694,6 @@ def psroi_pool(input, pooled_width, name=None): """ - :alias_main: paddle.nn.functional.psroi_pool - :alias: paddle.nn.functional.psroi_pool,paddle.nn.functional.vision.psroi_pool - :old_api: paddle.fluid.layers.psroi_pool ${comment} @@ -13799,6 +13722,8 @@ def psroi_pool(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() x = fluid.data(name='x', shape=[100, 490, 28, 28], dtype='float32') rois = fluid.data(name='rois', shape=[None, 4], lod_level=1, dtype='float32') pool_out = fluid.layers.psroi_pool(x, rois, 10, 1.0, 7, 7) @@ -13838,9 +13763,6 @@ def prroi_pool(input, batch_roi_nums=None, name=None): """ - :alias_main: paddle.nn.functional.prroi_pool - :alias: paddle.nn.functional.prroi_pool,paddle.nn.functional.vision.prroi_pool - :old_api: paddle.fluid.layers.prroi_pool The precise roi pooling implementation for paddle. Reference: https://arxiv.org/pdf/1807.11590.pdf @@ -14630,9 +14552,6 @@ def deformable_roi_pooling(input, position_sensitive=False, name=None): """ - :alias_main: paddle.nn.functional.deformable_roi_pooling - :alias: paddle.nn.functional.deformable_roi_pooling,paddle.nn.functional.vision.deformable_roi_pooling - :old_api: paddle.fluid.layers.deformable_roi_pooling Deformable ROI Pooling Layer diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 6cdc617a0dc17..de0fbb16f6241 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -190,11 +190,9 @@ .. code-block:: python import paddle - paddle.disable_static() x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) out = paddle.rsqrt(x) - print(out.numpy()) # [3.16227766 2.23606798 1.82574186 1.58113883] """) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 57c2489194337..079187e09c916 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -17,6 +17,7 @@ import sys from functools import partial, reduce +import paddle from . import nn from . import tensor from . 
import control_flow @@ -488,7 +489,7 @@ def rnn(cell, inputs = paddle.rand((4, 23, 16)) prev_h = paddle.randn((4, 32)) - outputs, final_states = paddle.nn.functional.rnn(cell, inputs, prev_h) + outputs, final_states = paddle.fluid.layers.rnn(cell, inputs, prev_h) """ if in_dygraph_mode(): @@ -507,6 +508,9 @@ def append(self, x): self.array.append(x) return self + def __getitem__(self, item): + return self.array.__getitem__(item) + def _maybe_copy(state, new_state, step_mask): """update rnn state or just pass the old state through""" @@ -711,7 +715,7 @@ def birnn(cell_fw, hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32)) hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32)) initial_states = ((hf, cf), (hb, cb)) - outputs, final_states = paddle.nn.functional.birnn( + outputs, final_states = paddle.fluid.layers.birnn( cell_fw, cell_bw, inputs, initial_states) """ @@ -859,8 +863,6 @@ def tracks_own_finished(self): class BeamSearchDecoder(Decoder): """ - :api_attr: Static Graph - Decoder with beam search decoding strategy. It wraps a cell to get probabilities, and follows a beam search step to calculate scores and select candidate token ids for each decoding step. @@ -881,24 +883,20 @@ class BeamSearchDecoder(Decoder): .. code-block:: python - import paddle.fluid as fluid - from paddle.fluid.layers import GRUCell, BeamSearchDecoder - - trg_embeder = lambda x: fluid.embedding( - x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding")) - output_layer = lambda x: layers.fc(x, - size=10000, - num_flatten_dims=len(x.shape) - 1, - param_attr=fluid.ParamAttr(name= - "output_w"), - bias_attr=False) - decoder_cell = GRUCell(hidden_size=128) + import numpy as np + import paddle + from paddle.nn import BeamSearchDecoder, dynamic_decode + from paddle.nn import GRUCell, Linear, Embedding + trg_embeder = Embedding(100, 32) + output_layer = Linear(32, 32) + decoder_cell = GRUCell(input_size=32, hidden_size=32) decoder = BeamSearchDecoder(decoder_cell, start_token=0, end_token=1, beam_size=4, embedding_fn=trg_embeder, output_fn=output_layer) + """ def __init__(self, @@ -912,16 +910,13 @@ def __init__(self, Constructor of BeamSearchDecoder. Parameters: - cell(RNNCell): An instance of `RNNCell` or object with the same interface. + cell(RNNCellBase): An instance of `RNNCellBase` or object with the same interface. start_token(int): The start token id. end_token(int): The end token id. beam_size(int): The beam width used in beam search. embedding_fn(optional): A callable to apply to selected candidate ids. Mostly it is an embedding layer to transform ids to embeddings, and the returned value acts as the `input` argument for `cell.call`. - **Note that fluid.embedding should be used here rather than - fluid.layers.embedding, since shape of ids is [batch_size, beam_size]. - when using fluid.layers.embedding, must unsqueeze in embedding_fn.** If not provided, the id to embedding transformation must be built into `cell.call`. Default None. output_fn(optional): A callable to apply to the cell's output prior to @@ -1150,6 +1145,8 @@ def initialize(self, initial_cell_states): np.array( [[0.] 
+ [-self.kinf] * (self.beam_size - 1)], dtype="float32")), [self.batch_size, 1]) + if paddle.get_default_dtype() == "float64": + log_probs = tensor.cast(log_probs, "float64") # TODO: remove the restriction of force_cpu init_finished = tensor.fill_constant_batch_size_like( input=state, @@ -1197,7 +1194,11 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state): shape=[1], dtype="int64", value=self.vocab_size) noend_array = [-self.kinf] * self.vocab_size noend_array[self.end_token] = 0 + self.noend_mask_tensor = tensor.assign(np.array(noend_array, "float32")) + if paddle.get_default_dtype() == "float64": + self.noend_mask_tensor = tensor.cast(self.noend_mask_tensor, + "float64") step_log_probs = nn.log(nn.softmax(logits)) step_log_probs = self._mask_probs(step_log_probs, beam_state.finished) @@ -1328,98 +1329,103 @@ def tracks_own_finished(self): return True -def dynamic_decode(decoder, - inits=None, - max_step_num=None, - output_time_major=False, - impute_finished=False, - is_test=False, - return_length=False, - **kwargs): - """ - :api_attr: Static Graph +def _dynamic_decode_imperative(decoder, + inits=None, + max_step_num=None, + output_time_major=False, + impute_finished=False, + is_test=False, + return_length=False, + **kwargs): + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + state_dtype = state.dtype + if convert_dtype(state_dtype) in ["bool"]: + state = tensor.cast(state, dtype="float32") + new_state = tensor.cast(new_state, dtype="float32") + if step_mask.dtype != state.dtype: + step_mask = tensor.cast(step_mask, dtype=state.dtype) + # otherwise, renamed bool gradients of would be summed up leading + # to sum(bool) error. + step_mask.stop_gradient = True + new_state = nn.elementwise_mul( + state, step_mask, axis=0) - nn.elementwise_mul( + new_state, (step_mask - 1), axis=0) + if convert_dtype(state_dtype) in ["bool"]: + new_state = tensor.cast(new_state, dtype=state_dtype) + return new_state - Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned - Tensor indicating finished status contains all True values or the number of - decoding step reaches to :attr:`max_step_num`. + initial_inputs, initial_states, initial_finished = decoder.initialize(inits) + inputs, states, finished = (initial_inputs, initial_states, + initial_finished) + cond = control_flow.logical_not((nn.reduce_all(initial_finished))) + sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished), "int64") + outputs = None + + step_idx = 0 + step_idx_tensor = tensor.fill_constant( + shape=[1], dtype="int64", value=step_idx) + while cond.numpy(): + (step_outputs, next_states, next_inputs, next_finished) = decoder.step( + step_idx_tensor, inputs, states, **kwargs) + if not decoder.tracks_own_finished: + # BeamSearchDecoder would track it own finished, since + # beams would be reordered and the finished status of each + # entry might change. Otherwise, perform logical OR which + # would not change the already finished. + next_finished = control_flow.logical_or(next_finished, finished) + # To confirm states.finished/finished be consistent with + # next_finished. + tensor.assign(next_finished, finished) + next_sequence_lengths = nn.elementwise_add( + sequence_lengths, + tensor.cast( + control_flow.logical_not(finished), sequence_lengths.dtype)) - :code:`decoder.initialize()` would be called once before the decoding loop. - If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` - would be called once after the decoding loop. 
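The `_maybe_copy` helper above keeps the states of already-finished entries unchanged by blending old and new states with a floating-point finished mask, and it is applied under `impute_finished` in the imperative loop that follows. A plain NumPy sketch of the same arithmetic; the function name and values here are illustrative only.

.. code-block:: python

    import numpy as np

    def maybe_copy(state, new_state, step_mask):
        # step_mask is 1.0 for finished entries (keep the old state) and
        # 0.0 for running entries (take the new state):
        #   state * mask - new_state * (mask - 1)
        # == state * mask + new_state * (1 - mask)
        step_mask = step_mask.reshape(-1, *([1] * (state.ndim - 1)))
        return state * step_mask - new_state * (step_mask - 1.0)

    state = np.array([[1.0, 1.0], [2.0, 2.0]])
    new_state = np.array([[9.0, 9.0], [8.0, 8.0]])
    finished = np.array([1.0, 0.0])  # entry 0 finished, entry 1 still decoding

    print(maybe_copy(state, new_state, finished))
    # [[1. 1.]    finished entry keeps its old state
    #  [8. 8.]]   running entry takes the new state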
+ if impute_finished: # rectify the states for the finished. + next_states = map_structure( + lambda x, y: _maybe_copy(x, y, finished), states, next_states) + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if step_idx == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, outputs) + inputs, states, finished, sequence_lengths = ( + next_inputs, next_states, next_finished, next_sequence_lengths) - Parameters: - decoder(Decoder): An instance of `Decoder`. - inits(object, optional): Argument passed to `decoder.initialize`. - Default `None`. - max_step_num(int, optional): The maximum number of steps. If not provided, - decode until the decoder is fully done, or in other words, the returned - Tensor by :code:`decoder.step()` indicating finished status contains - all True. Default `None`. - output_time_major(bool, optional): Indicate the data layout of Tensor included - in the final outputs(the first returned value of this method). If - attr:`False`, the data layout would be batch major with shape - `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would - be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. - impute_finished(bool, optional): If `True`, then states get copied through - for batch entries which are marked as finished, which differs with the - unfinished using the new states returned by :code:`decoder.step()` and - ensures that the final states have the correct values. Otherwise, states - wouldn't be copied through when finished. If the returned `final_states` - is needed, it should be set as True, which causes some slowdown. - Default `False`. - is_test(bool, optional): A flag indicating whether to use test mode. In - test mode, it is more memory saving. Default `False`. - return_length(bool, optional): A flag indicating whether to return an - extra Tensor variable in the output tuple, which stores the actual - lengths of all decoded sequences. Default `False`. - **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. + control_flow.increment(x=step_idx_tensor, value=1.0, in_place=True) + step_idx += 1 - Returns: - tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ - when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \ - The final outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as the :code:`outputs` \ - returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \ - is the stacked of all decoding steps' outputs, which might be revised \ - by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \ - `final_states` is the counterpart at last time step of initial states \ - returned by :code:`decoder.initialize()` , thus has the same structure \ - with it and has tensors with same shapes and data types. `sequence_lengths` \ - is an `int64` tensor with the same shape as `finished` returned \ - by :code:`decoder.initialize()` , and it stores the actual lengths of \ - all decoded sequences. - + control_flow.logical_not(nn.reduce_all(finished), cond) + if max_step_num is not None and step_idx > max_step_num: + break - Examples: + final_outputs = map_structure(lambda x: nn.stack(x.array, axis=0), outputs) + final_states = states - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.layers as layers - from paddle.fluid.layers import GRUCell, BeamSearchDecoder, dynamic_decode + try: + final_outputs, final_states = decoder.finalize( + final_outputs, final_states, sequence_lengths) + except NotImplementedError: + pass - encoder_output = fluid.data(name="encoder_output", - shape=[-1, 32, 128], - dtype="float32") - trg_embeder = lambda x: fluid.embedding( - x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding")) - output_layer = lambda x: layers.fc(x, - size=10000, - num_flatten_dims=len(x.shape) - 1, - param_attr=fluid.ParamAttr(name= - "output_w"), - bias_attr=False) - decoder_cell = GRUCell(hidden_size=128) - decoder = BeamSearchDecoder(decoder_cell, - start_token=0, - end_token=1, - beam_size=4, - embedding_fn=trg_embeder, - output_fn=output_layer) + if not output_time_major: + final_outputs = map_structure( + lambda x: nn.transpose(x, [1, 0] + list(range(2, len(x.shape)))), + final_outputs) - outputs = dynamic_decode( - decoder=decoder, inits=decoder_cell.get_initial_states(encoder_output)) - """ + return (final_outputs, final_states, + sequence_lengths) if return_length else (final_outputs, + final_states) + + +def _dynamic_decode_declarative(decoder, + inits=None, + max_step_num=None, + output_time_major=False, + impute_finished=False, + is_test=False, + return_length=False, + **kwargs): initial_inputs, initial_states, initial_finished = decoder.initialize(inits) global_inputs, global_states, global_finished = ( initial_inputs, initial_states, initial_finished) @@ -1558,6 +1564,98 @@ def _create_array_out_of_while(dtype): final_states) +def dynamic_decode(decoder, + inits=None, + max_step_num=None, + output_time_major=False, + impute_finished=False, + is_test=False, + return_length=False, + **kwargs): + """ + Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned + Tensor indicating finished status contains all True values or the number of + decoding step reaches to :attr:`max_step_num`. + + :code:`decoder.initialize()` would be called once before the decoding loop. + If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` + would be called once after the decoding loop. + + Parameters: + decoder(Decoder): An instance of `Decoder`. + inits(object, optional): Argument passed to `decoder.initialize`. + Default `None`. + max_step_num(int, optional): The maximum number of steps. If not provided, + decode until the decoder is fully done, or in other words, the returned + Tensor by :code:`decoder.step()` indicating finished status contains + all True. Default `None`. + output_time_major(bool, optional): Indicate the data layout of Tensor included + in the final outputs(the first returned value of this method). If + attr:`False`, the data layout would be batch major with shape + `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would + be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. + impute_finished(bool, optional): If `True`, then states get copied through + for batch entries which are marked as finished, which differs with the + unfinished using the new states returned by :code:`decoder.step()` and + ensures that the final states have the correct values. Otherwise, states + wouldn't be copied through when finished. If the returned `final_states` + is needed, it should be set as True, which causes some slowdown. + Default `False`. + is_test(bool, optional): A flag indicating whether to use test mode. 
In + test mode, it is more memory saving. Default `False`. + return_length(bool, optional): A flag indicating whether to return an + extra Tensor variable in the output tuple, which stores the actual + lengths of all decoded sequences. Default `False`. + **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. + + Returns: + tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ + when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \ + The final outputs and states, both are Tensor or nested structure of Tensor. \ + `final_outputs` has the same structure and data types as the :code:`outputs` \ + returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \ + is the stacked of all decoding steps' outputs, which might be revised \ + by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \ + `final_states` is the counterpart at last time step of initial states \ + returned by :code:`decoder.initialize()` , thus has the same structure \ + with it and has tensors with same shapes and data types. `sequence_lengths` \ + is an `int64` tensor with the same shape as `finished` returned \ + by :code:`decoder.initialize()` , and it stores the actual lengths of \ + all decoded sequences. + + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + from paddle.nn import BeamSearchDecoder, dynamic_decode + from paddle.nn import GRUCell, Linear, Embedding + trg_embeder = Embedding(100, 32) + output_layer = Linear(32, 32) + decoder_cell = GRUCell(input_size=32, hidden_size=32) + decoder = BeamSearchDecoder(decoder_cell, + start_token=0, + end_token=1, + beam_size=4, + embedding_fn=trg_embeder, + output_fn=output_layer) + encoder_output = paddle.ones((4, 8, 32), dtype=paddle.get_default_dtype()) + outputs = dynamic_decode(decoder=decoder, + inits=decoder_cell.get_initial_states(encoder_output), + max_step_num=10) + """ + if in_dygraph_mode(): + return _dynamic_decode_imperative(decoder, inits, max_step_num, + output_time_major, impute_finished, + is_test, return_length, **kwargs) + else: + return _dynamic_decode_declarative(decoder, inits, max_step_num, + output_time_major, impute_finished, + is_test, return_length, **kwargs) + + class DecodeHelper(object): """ DecodeHelper is the base class for any helper instance used in `BasicDecoder`. @@ -3046,9 +3144,6 @@ def beam_search(pre_ids, name=None, return_parent_idx=False): """ - :alias_main: paddle.nn.beam_search - :alias: paddle.nn.beam_search,paddle.nn.decode.beam_search - :old_api: paddle.fluid.layers.beam_search Beam search is a classical algorithm for selecting candidate words in a machine translation task. @@ -3126,6 +3221,8 @@ def beam_search(pre_ids, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() # Suppose `probs` contains predicted results from the computation # cell and `pre_ids` and `pre_scores` is the output of beam_search @@ -3197,9 +3294,6 @@ def beam_search(pre_ids, def beam_search_decode(ids, scores, beam_size, end_id, name=None): """ - :alias_main: paddle.nn.beam_search_decode - :alias: paddle.nn.beam_search_decode,paddle.nn.decode.beam_search_decode - :old_api: paddle.fluid.layers.beam_search_decode This operator is used after beam search has completed. It constructs the full predicted sequences for each sample by walking back along the search @@ -3246,7 +3340,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): .. 
code-block:: python import paddle.fluid as fluid - + import paddle + paddle.enable_static() # Suppose `ids` and `scores` are LodTensorArray variables reserving # the selected ids and scores of all steps ids = fluid.layers.create_array(dtype='int64') diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index c633f7022d75e..fe3970ce1c10c 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -199,49 +199,27 @@ def create_global_var(shape, def cast(x, dtype): """ - :alias_main: paddle.cast - :alias: paddle.cast,paddle.tensor.cast,paddle.tensor.manipulation.cast - :old_api: paddle.fluid.layers.cast This OP takes in the Variable :attr:`x` with :attr:`x.dtype` and casts it to the output with :attr:`dtype`. It's meaningless if the output dtype equals the input dtype, but it's fine if you do so. Args: - x(Variable): An input N-D Tensor with data type bool, float16, + x(Tensor): An input N-D Tensor with data type bool, float16, float32, float64, int32, int64, uint8. dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output: bool, float16, float32, float64, int8, int32, int64, uint8. Returns: - Variable: A Tensor with the same shape as input's. + Tensor: A Tensor with the same shape as input's. Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle - place = fluid.core.CPUPlace() - - x_lod = fluid.data(name="x", shape=[2,2], lod_level=0) - cast_res1 = fluid.layers.cast(x=x_lod, dtype="uint8") - cast_res2 = fluid.layers.cast(x=x_lod, dtype=np.int32) - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - x_i_lod = fluid.core.LoDTensor() - x_i_lod.set(np.array([[1.3,-2.4],[0,4]]).astype("float32"), place) - x_i_lod.set_recursive_sequence_lengths([[0,2]]) - res1 = exe.run(fluid.default_main_program(), feed={'x':x_i_lod}, fetch_list=[cast_res1], return_numpy=False) - res2 = exe.run(fluid.default_main_program(), feed={'x':x_i_lod}, fetch_list=[cast_res2], return_numpy=False) - print(np.array(res1[0]), np.array(res1[0]).dtype) - # [[ 1 254] - # [ 0 4]] uint8 - print(np.array(res2[0]), np.array(res2[0]).dtype) - # [[ 1 -2] - # [ 0 4]] int32 + x = paddle.to_tensor([2, 3, 4], 'float64') + y = paddle.cast(x, 'uint8') """ check_variable_and_dtype( x, 'x', @@ -550,9 +528,6 @@ def sums(input, out=None): def assign(input, output=None): """ - :alias_main: paddle.nn.functional.assign - :alias: paddle.nn.functional.assign,paddle.nn.functional.common.assign - :old_api: paddle.fluid.layers.assign The OP copies the :attr:`input` to the :attr:`output`. @@ -568,13 +543,16 @@ def assign(input, output=None): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - data = fluid.layers.fill_constant(shape=[3, 2], value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] - result1 = fluid.layers.create_tensor(dtype='float64') - fluid.layers.assign(data, result1) # result1 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] - result2 = fluid.layers.assign(data) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] - result3 = fluid.layers.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + data = paddle.fill_constant(shape=[3, 2], value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + array = np.array([[1, 1], + [3, 4], + [1, 3]]).astype(np.int64) + result1 = paddle.zeros(shape=[3, 3], dtype='float32') + paddle.nn.functional.assign(array, result1) # result1 = [[1, 1], [3 4], [1, 3]] + result2 = paddle.nn.functional.assign(data) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + result3 = paddle.nn.functional.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] """ helper = LayerHelper('assign', **locals()) check_type(input, 'input', (Variable, numpy.ndarray), 'assign') @@ -627,8 +605,6 @@ def assign(input, output=None): def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): """ - :alias_main: paddle.fill_constant - :alias: paddle.tensor.fill_constant, paddle.tensor.creation.fill_constant This OP creates a Tensor with specified `shape` and `dtype`, and initializes it with a constant specified by `value`. @@ -737,7 +713,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): return out -@deprecated(since='1.8.0', update_to="paddle.fill_constant") +@deprecated(since='1.8.0', update_to="paddle.fluid.layers.fill_constant") @templatedoc() def fill_constant_batch_size_like(input, shape, @@ -1237,26 +1213,26 @@ def load_combine(out, file_path): def has_inf(x): """ - :alias_main: paddle.has_inf - :alias: paddle.has_inf,paddle.tensor.has_inf,paddle.tensor.search.has_inf - :old_api: paddle.fluid.layers.has_inf - Test if any of x contains an infinity number Args: - x (Variable): The Tensor/LoDTensor to be checked. + x (Tensor): The Tensor to be checked. Returns: - Variable: The tensor variable storing the output, only a bool value, indicating that whether there is infinity number in x or not. + Tensor: The tensor storing the output, only a bool value, indicating that whether there is infinity number in x or not. Examples: .. code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32") - res = fluid.layers.has_inf(data) + import paddle + data = paddle.randn(shape=[4, 32, 32], dtype="float32") + res = paddle.fluid.layers.has_inf(data) + # [False] """ + if in_dygraph_mode(): + return core.ops.isinf(x) + check_type(x, 'x', (Variable), 'has_inf') helper = LayerHelper("isinf", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -1266,26 +1242,26 @@ def has_inf(x): def has_nan(x): """ - :alias_main: paddle.has_nan - :alias: paddle.has_nan,paddle.tensor.has_nan,paddle.tensor.search.has_nan - :old_api: paddle.fluid.layers.has_nan - Test if any of x contains a NAN Args: - x (Variable): The Tensor/LoDTensor to be checked. + x (Tensor): The Tensor to be checked. Returns: - Variable: The tensor variable storing the output, only a bool value, indicating that whether there is NAN in x or not. 
+ Tensor: The tensor variable storing the output, only a bool value, indicating that whether there is NAN in x or not. Examples: .. code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32") - res = fluid.layers.has_nan(data) + import paddle + data = paddle.randn(shape=[2,3], dtype="float32") + res = paddle.fluid.layers.has_nan(data) + # [False] """ + if in_dygraph_mode(): + return core.ops.isnan(x) + check_type(x, 'x', (Variable), 'has_nan') helper = LayerHelper("isnan", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -1438,9 +1414,9 @@ def linspace(start, stop, num, dtype=None, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - data = fluid.layers.linspace(0, 10, 5, 'float32') # [0.0, 2.5, 5.0, 7.5, 10.0] - data = fluid.layers.linspace(0, 10, 1, 'float32') # [0.0] + import paddle + data = paddle.linspace(0, 10, 5, 'float32') # [0.0, 2.5, 5.0, 7.5, 10.0] + data = paddle.linspace(0, 10, 1, 'float32') # [0.0] """ if dtype is None: diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index cab3daa29a171..0c3f6e1673287 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -851,6 +851,9 @@ class DetectionMAP(object): import paddle.fluid as fluid + import paddle + paddle.enable_static() + batch_size = None # can be any size image_boxs_num = 10 bounding_bboxes_num = 21 diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 367be181f4725..cf49268a657e4 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -70,15 +70,15 @@ def __init__(self, grad_clip=None, name=None): # Because of the loop import, so place it in the function body - from paddle.optimizer.lr_scheduler import _LRScheduler + from paddle.optimizer.lr import LRScheduler self._parameter_list = list( parameter_list) if parameter_list is not None else None self._name = name if framework.in_dygraph_mode(): if not isinstance(learning_rate, - (float, LearningRateDecay, _LRScheduler)): + (float, LearningRateDecay, LRScheduler)): raise TypeError( - "learning rate should be float or _LRScheduler, got %s here" + "learning rate should be float or LRScheduler, got %s here" % type(learning_rate)) if self._parameter_list is None: raise AttributeError( @@ -94,9 +94,9 @@ def __init__(self, break else: if not isinstance(learning_rate, - (float, framework.Variable, _LRScheduler)): + (float, framework.Variable, LRScheduler)): raise TypeError( - "learning rate should be float or _LRScheduler, got %s here" + "learning rate should be float or LRScheduler, got %s here" % type(learning_rate)) if grad_clip is not None: @@ -147,13 +147,13 @@ def state_dict(self): state_dict = adam.state_dict() ''' - from paddle.optimizer.lr_scheduler import _LRScheduler + from paddle.optimizer.lr import LRScheduler state_dict = {} for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): state_dict[var_tmp.name] = var_tmp # global step if use lr decay - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): state_dict["LR_Scheduler"] = self._learning_rate.state_dict() return state_dict if isinstance(self._learning_rate, LearningRateDecay): @@ -193,7 +193,7 @@ def set_state_dict(self, state_dict): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr.NoamDecay( 
d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, @@ -203,8 +203,8 @@ def set_state_dict(self, state_dict): para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy") ''' - from paddle.optimizer.lr_scheduler import _LRScheduler - if isinstance(self._learning_rate, _LRScheduler): + from paddle.optimizer.lr import LRScheduler + if isinstance(self._learning_rate, LRScheduler): self._learning_rate.set_dict(state_dict["LR_Scheduler"]) if isinstance(self._learning_rate, LearningRateDecay): @@ -269,8 +269,8 @@ def get_opti_var_name_list(self): return self._opti_name_list def _create_global_learning_rate(self): - from paddle.optimizer.lr_scheduler import _LRScheduler - if isinstance(self._learning_rate, _LRScheduler): + from paddle.optimizer.lr import LRScheduler + if isinstance(self._learning_rate, LRScheduler): lr_var = self._global_learning_rate() # only create global lr_var once if not isinstance(lr_var, framework.Variable): diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index f757d8815f538..a9904d6f98239 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -30,7 +30,7 @@ class ParallelExecutor(object): """ :api_attr: Static Graph - The ParallelExecutor is an upgraded version of :code:`fluid.Executor` that supports multi-node model + The ParallelExecutor is an upgraded version of :code:`paddle.static.Executor` that supports multi-node model training and testing based on the data-parallel mode. In data-parallel mode, ParallelExecutor will broadcast the parameters from Node0 to other nodes during construction and copy the input Program to other nodes from Node0 to make sure @@ -50,12 +50,12 @@ class ParallelExecutor(object): Args: use_cuda (bool): Whether to use CUDA or not. - loss_name (str): This parameter is the name of the loss variable of the + loss_name (str): This parameter is the name of the loss Tensor of the model. **Note: If it is data-parallel model training, you must set loss_name, otherwise, the results may be wrong**. The default is None. main_program (Program): This parameter represents the Program to be executed. If this parameter is not provided, that parameter is None, the program will - be set to :code:`fluid.default_main_program()`. The default is None. + be set to :code:`paddle.static.default_main_program()`. The default is None. share_vars_from(ParallelExecutor): If share_vars_from is set, the current ParallelExecutor will share the parameters with the ParallelExecutor specified by share_vars_from. This parameter needs to be set when model testing @@ -66,13 +66,13 @@ class ParallelExecutor(object): The default is None. exec_strategy(ExecutionStrategy): exec_strategy specifies the options that can be changed when running the current model, such as the thread pool size. - For more information about exec_strategy, please refer to :code:`fluid.ExecutionStrategy`. + For more information about exec_strategy, please refer to :code:`paddle.static.ExecutionStrategy`. The default is None. build_strategy(BuildStrategy): By configuring build_strategy, we can optimize the computational graph, such as operators' fusion in the computational graph and memory optimization during the execution of the computational graph. For more information about build_strategy, - please refer to :code:`fluid.BuildStrategy`. The default is None. + please refer to :code:`paddle.static.BuildStrategy`. The default is None. 
num_trainers(int): This parameter needs to be set in GPU distributed training. If the parameter value is greater than 1, NCCL will be initialized by multi-level nodes. Each node should have the same number of GPUs. The default is 1. @@ -81,7 +81,7 @@ class ParallelExecutor(object): Trainer_id indicates the "rank" of the current node. The trainer_id starts counting from 0. The default is 0. scope(Scope): Specifies the scope in which the program is executed. - The default is fluid.global_scope(). + The default is paddle.static.global_scope(). Returns: ParallelExecutor: The initialized ParallelExecutor object. @@ -101,15 +101,16 @@ class ParallelExecutor(object): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy import os use_cuda = True - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + paddle.enable_static() + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() # NOTE: If you use CPU to run the program, you need - # to specify the CPU_NUM, otherwise, fluid will use + # to specify the CPU_NUM, otherwise, PaddlePaddle will use # all the number of the logic core as the CPU_NUM, # in that case, the batch size of the input should be # greater than CPU_NUM, if not, the process will be @@ -117,26 +118,26 @@ class ParallelExecutor(object): if not use_cuda: os.environ['CPU_NUM'] = str(2) - exe = fluid.Executor(place) + exe = paddle.static.Executor(place) - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - test_program = fluid.default_main_program().clone(for_test=True) - fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + hidden = paddle.static.nn.fc(data, 10) + loss = paddle.mean(hidden) + test_program = paddle.static.default_main_program().clone(for_test=True) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - startup_program.random_seed=1 exe.run(startup_program) - train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, - main_program=train_program, - loss_name=loss.name) - test_exe = fluid.ParallelExecutor(use_cuda=use_cuda, - main_program=test_program, - share_vars_from=train_exe) + train_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda, + main_program=train_program, + loss_name=loss.name) + # Note: if share_vars_from is not set here, the test parameter is different to the train one + test_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda, + main_program=test_program, + share_vars_from=train_exe) x = numpy.random.random(size=(10, 1)).astype('float32') loss_data, = train_exe.run(feed={"X": x}, @@ -205,11 +206,11 @@ def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): fetch_list. Args: - fetch_list(list): This parameter represents the variables that need to be returned + fetch_list(list): This parameter represents the Tensors that need to be returned after the model runs. The default is None. - feed(list|dict): This parameter represents the input variables of the model. + feed(list|dict): This parameter represents the input Tensors of the model. 
If it is single card training, the feed is dict type, and if it is multi-card - training, the parameter feed can be dict or list type variable. If the + training, the parameter feed can be dict or list of Tensor. If the parameter type is dict, the data in the feed will be split and sent to multiple devices (CPU/GPU), that is to say, the input data will be evenly sent to different devices, so you should make sure the number of samples of @@ -219,8 +220,8 @@ def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): The default is None. feed_dict: Alias for feed parameter, for backward compatibility. This parameter has been deprecated. Default None. - return_numpy(bool): This parameter indicates whether convert the fetched variables - (the variable specified in the fetch list) to numpy.ndarray. if it is False, + return_numpy(bool): This parameter indicates whether convert the fetched Tensors + (the Tensor specified in the fetch list) to numpy.ndarray. if it is False, the type of the return value is a list of :code:`LoDTensor`. The default is True. Returns: @@ -241,22 +242,23 @@ def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): number of CPU cores or GPU cards, if it is less than, it is recommended that the batch be discarded. 2. If the number of CPU cores or GPU cards available is greater than 1, the fetch - results are spliced together in dimension 0 for the same variable values - (variables in fetch_list) on different devices. + results are spliced together in dimension 0 for the same Tensor values + (Tensors in fetch_list) on different devices. Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy import os use_cuda = True - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + paddle.enable_static() + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() # NOTE: If you use CPU to run the program, you need - # to specify the CPU_NUM, otherwise, fluid will use + # to specify the CPU_NUM, otherwise, PaddlePaddle will use # all the number of the logic core as the CPU_NUM, # in that case, the batch size of the input should be # greater than CPU_NUM, if not, the process will be @@ -264,21 +266,21 @@ def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): if not use_cuda: os.environ['CPU_NUM'] = str(2) - exe = fluid.Executor(place) + exe = paddle.static.Executor(place) - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + hidden = paddle.static.nn.fc(data, 10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) exe.run(startup_program) - train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, - main_program=train_program, - loss_name=loss.name) + train_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda, + main_program=train_program, + loss_name=loss.name) # If the feed is a dict: # the image will be split into devices. 
If there is two devices @@ -314,7 +316,7 @@ def drop_local_exe_scopes(self): application and release of temporary variables, the strategy adopted by ParallelExecutor is to drop the local execution scopes after several iterations. ParallelExecutor provides the num_iteration_per_drop_scope option in - :code:`fluid.ExecutionStrategy`, which indicates how many iterations are intervened to + :code:`paddle.static.ExecutionStrategy`, which indicates how many iterations are intervened to drop the local execution scopes. If the num_iteration_per_drop_scope value is 100, but you want to drop the local execution scopes after 50 iterations, you can call the interface manually. @@ -325,13 +327,13 @@ def drop_local_exe_scopes(self): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy import os use_cuda = True # NOTE: If you use CPU to run the program, you need - # to specify the CPU_NUM, otherwise, fluid will use + # to specify the CPU_NUM, otherwise, PaddlePaddle will use # all the number of the logic core as the CPU_NUM, # in that case, the batch size of the input should be # greater than CPU_NUM, if not, the process will be @@ -339,26 +341,28 @@ def drop_local_exe_scopes(self): if not use_cuda: os.environ['CPU_NUM'] = str(2) - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.data(name='X', shape=[None, 1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + paddle.enable_static() + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') + hidden = paddle.static.nn.fc(data, 10) + loss = paddle.mean(hidden) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_program) - parallel_exe = fluid.ParallelExecutor(use_cuda=use_cuda, - main_program=train_program, - loss_name=loss.name) + parallel_exe = paddle.static.ParallelExecutor(use_cuda=use_cuda, + main_program=train_program, + loss_name=loss.name) x = numpy.random.random(size=(10, 1)).astype('float32') loss_data, = parallel_exe.run(feed={"X": x}, - fetch_list=[loss.name]) + fetch_list=[loss.name]) parallel_exe.drop_local_exe_scopes() + """ check_type(self._compiled_program._executor, "the Executor of compiled program", core.ParallelExecutor, diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index bf04239370693..4105d5c1a4e49 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -264,17 +264,17 @@ class WeightNormParamAttr(ParamAttr): data = paddle.static.data(name="data", shape=[3, 32, 32], dtype="float32") - fc = paddle.static.nn.fc(input=data, + fc = paddle.static.nn.fc(x=data, size=1000, - param_attr=paddle.static.WeightNormParamAttr( - dim=None, - name='weight_norm_param', - initializer=paddle.nn.initializer.Constant(1.0), - learning_rate=1.0, - regularizer=paddle.regularizer.L2Decay(0.1), - trainable=True, - do_model_average=False, - need_clip=True)) + weight_attr=paddle.static.WeightNormParamAttr( + dim=None, + name='weight_norm_param', + initializer=paddle.nn.initializer.Constant(1.0), + learning_rate=1.0, + regularizer=paddle.regularizer.L2Decay(0.1), + trainable=True, + 
do_model_average=False, + need_clip=True)) """ # List to record the parameters reparameterized by weight normalization. diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8f1b963ac8a5f..ba4f5ecf90323 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -129,6 +129,8 @@ if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) + LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) + elseif(${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index 6612450b7cff8..ec57057164f61 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -41,7 +41,8 @@ def ops(self): is_sparse=True, padding_idx=self.padding_idx, param_attr=paddle.ParamAttr( - name=self.name, initializer=paddle.nn.initializer.Xavier())) + name=self.name, + initializer=paddle.nn.initializer.XavierUniform())) return emb @@ -104,7 +105,7 @@ def ops(self, input): """ operation """ - mean = paddle.reduce_mean(input) + mean = paddle.mean(input) return mean @@ -180,7 +181,7 @@ def ops(self, x, y): """ operation """ - sub = paddle.elementwise_sub(x, y) + sub = paddle.fluid.layers.elementwise_sub(x, y) return sub @@ -202,7 +203,7 @@ def ops(self, input, shape, dtype, value): shape = list(shape) input_shape = paddle.shape(input) shape[0] = input_shape[0] - constant = paddle.fill_constant(shape, dtype, value) + constant = paddle.fluid.layers.fill_constant(shape, dtype, value) return constant @@ -472,8 +473,8 @@ def forward(self, left, right): right_emb = paddle.reshape( right_emb, shape=[-1, self.seq_len, self.bow_dim]) - bow_left = paddle.reduce_sum(left_emb, dim=1) - bow_right = paddle.reduce_sum(right_emb, dim=1) + bow_left = paddle.fluid.layers.reduce_sum(left_emb, dim=1) + bow_right = paddle.fluid.layers.reduce_sum(right_emb, dim=1) softsign_layer = SoftsignLayer() left_soft = softsign_layer.ops(bow_left) right_soft = softsign_layer.ops(bow_right) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index f54f70e4b854b..c4f5cc9e2bcbc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -25,7 +25,7 @@ from predictor_utils import PredictorTools -SEED = 2020 +SEED = 2000 DATATYPE = 'float32' program_translator = ProgramTranslator() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py index e83128f045d8b..279c44d3245ea 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py @@ -24,7 +24,6 @@ def __init__(self, in_channels, hidden_size): self.lstm = nn.LSTM( in_channels, hidden_size, direction='bidirectional', num_layers=2) - @paddle.jit.to_static def forward(self, x): x, _ = self.lstm(x) return x @@ -39,6 +38,7 @@ def run_lstm(self, 
to_static): paddle.static.default_startup_program().random_seed = 1001 net = Net(12, 2) + net = paddle.jit.to_static(net) x = paddle.zeros((2, 10, 12)) y = net(paddle.to_tensor(x)) return y.numpy() @@ -51,6 +51,69 @@ def test_lstm_to_static(self): msg='dygraph_out is {}\n static_out is \n{}'.format(dygraph_out, static_out)) + def test_save_in_eval(self): + paddle.jit.ProgramTranslator().enable(True) + net = Net(12, 2) + x = paddle.randn((2, 10, 12)) + dygraph_out = net(x) + # switch eval mode firstly + net.eval() + + net = paddle.jit.to_static( + net, input_spec=[paddle.static.InputSpec(shape=[-1, 10, 12])]) + paddle.jit.save(net, 'simple_lstm') + # load saved model + load_net = paddle.jit.load('simple_lstm') + + static_out = load_net(x) + self.assertTrue( + np.allclose(dygraph_out.numpy(), static_out.numpy()), + msg='dygraph_out is {}\n static_out is \n{}'.format(dygraph_out, + static_out)) + # switch back into train mode. + net.train() + train_out = net(x) + self.assertTrue( + np.allclose(dygraph_out.numpy(), train_out.numpy()), + msg='dygraph_out is {}\n static_out is \n{}'.format(dygraph_out, + train_out)) + + +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self.fc = nn.Linear(10, 12) + self.dropout = nn.Dropout(0.5) + + @paddle.jit.to_static + def forward(self, x): + y = self.fc(x) + y = self.dropout(y) + return y + + +class TestSaveInEvalMode(unittest.TestCase): + def test_save_in_eval(self): + paddle.jit.ProgramTranslator().enable(True) + net = LinearNet() + # switch eval mode firstly + net.eval() + # save directly + net = paddle.jit.to_static( + net, input_spec=[paddle.static.InputSpec(shape=[-1, 10])]) + paddle.jit.save(net, 'linear_net') + # load saved model + load_net = paddle.jit.load('linear_net') + + x = paddle.randn((2, 10)) + eval_out = net(x) + + infer_out = load_net(x) + self.assertTrue( + np.allclose(eval_out.numpy(), infer_out.numpy()), + msg='eval_out is {}\n infer_out is \n{}'.format(eval_out, + infer_out)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index b308854dc09a1..00b2d8dd1acc7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -64,9 +64,9 @@ def get_source_code(func): class StaticCode1(): # TODO: Transform return statement def dyfunc_with_if_else(x_v, label=None): - __return_1 = paddle.fill_constant(shape=[1], dtype='bool', value=False) - __return_0 = paddle.fill_constant(shape=[1], dtype='bool', value=False) - __return_value_init_0 = paddle.fill_constant( + __return_1 = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=False) + __return_0 = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=False) + __return_value_init_0 = paddle.fluid.layers.fill_constant( shape=[1], dtype='float64', value=0.0) __return_value_0 = __return_value_init_0 @@ -84,7 +84,7 @@ def false_fn_0(x_v): def true_fn_1(__return_0, __return_value_0, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) - __return_0 = paddle.fill_constant( + __return_0 = paddle.fluid.layers.fill_constant( shape=[1], dtype='bool', value=True) __return_value_0 = loss return __return_0, __return_value_0 @@ -98,7 +98,7 @@ def false_fn_1(__return_0, __return_value_0): (__return_0, __return_value_0), (__return_0, __return_value_0))) def 
true_fn_2(__return_1, __return_value_0, x_v): - __return_1 = paddle.fill_constant( + __return_1 = paddle.fluid.layers.fill_constant( shape=[1], dtype='bool', value=True) __return_value_0 = x_v return __return_1, __return_value_0 @@ -116,9 +116,9 @@ def false_fn_2(__return_1, __return_value_0): class StaticCode2(): # TODO: Transform return statement def dyfunc_with_if_else(x_v, label=None): - __return_3 = paddle.fill_constant(shape=[1], dtype='bool', value=False) - __return_2 = paddle.fill_constant(shape=[1], dtype='bool', value=False) - __return_value_init_1 = paddle.fill_constant( + __return_3 = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=False) + __return_2 = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=False) + __return_value_init_1 = paddle.fluid.layers.fill_constant( shape=[1], dtype='float64', value=0.0) __return_value_1 = __return_value_init_1 @@ -136,7 +136,7 @@ def false_fn_3(x_v): def true_fn_4(__return_2, __return_value_1, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) - __return_2 = paddle.fill_constant( + __return_2 = paddle.fluid.layers.fill_constant( shape=[1], dtype='bool', value=True) __return_value_1 = loss return __return_2, __return_value_1 @@ -150,7 +150,7 @@ def false_fn_4(__return_2, __return_value_1): (__return_2, __return_value_1), (__return_2, __return_value_1))) def true_fn_5(__return_3, __return_value_1, x_v): - __return_3 = paddle.fill_constant( + __return_3 = paddle.fluid.layers.fill_constant( shape=[1], dtype='bool', value=True) __return_value_1 = x_v return __return_3, __return_value_1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py index bdd5131db9da1..2c74e5b221f7e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py @@ -187,8 +187,8 @@ def forward(self, input, label, init_hidden, init_cell): loss = paddle.nn.functional.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False) loss = paddle.reshape(loss, shape=[-1, self.num_steps]) - loss = paddle.reduce_mean(loss, dim=[0]) - loss = paddle.reduce_sum(loss) + loss = paddle.mean(loss, axis=[0]) + loss = paddle.fluid.layers.reduce_sum(loss) return loss, last_hidden, last_cell diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index a8cfeb90bd814..88c55f190768d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -153,7 +153,7 @@ def __init__(self, layers=50, class_dim=102): self.conv = ConvBNLayer( num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') - self.pool2d_max = paddle.nn.Pool2D( + self.pool2d_max = paddle.fluid.dygraph.Pool2D( pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') self.bottleneck_block_list = [] @@ -171,7 +171,7 @@ def __init__(self, layers=50, class_dim=102): self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.nn.Pool2D( + self.pool2d_avg = paddle.fluid.dygraph.Pool2D( pool_size=7, pool_type='avg', global_pooling=True) self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py index e79209cb538c0..403b8f56a18d0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py @@ -51,24 +51,24 @@ def test_feed_mismatch_shape(self): class TestVariableTransFunc(unittest.TestCase): def test_create_fill_constant_node(self): node = create_fill_constant_node("a", 1.0) - source = "a = paddle.fill_constant(shape=[1], dtype='float64', value=1.0)" + source = "a = paddle.fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.0)" self.assertEqual(ast_to_source_code(node).strip(), source) node = create_fill_constant_node("b", True) - source = "b = paddle.fill_constant(shape=[1], dtype='bool', value=True)" + source = "b = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=True)" self.assertEqual(ast_to_source_code(node).strip(), source) if six.PY2: node = create_fill_constant_node("c", 214) - source = "c = paddle.fill_constant(shape=[1], dtype='int32', value=214)" + source = "c = paddle.fluid.layers.fill_constant(shape=[1], dtype='int32', value=214)" self.assertEqual(ast_to_source_code(node).strip(), source) node = create_fill_constant_node("d", long(10086)) - source = "d = paddle.fill_constant(shape=[1], dtype='int64', value=10086)" + source = "d = paddle.fluid.layers.fill_constant(shape=[1], dtype='int64', value=10086)" self.assertEqual(ast_to_source_code(node).strip(), source) else: node = create_fill_constant_node("c", 4293) - source = "c = paddle.fill_constant(shape=[1], dtype='int64', value=4293)" + source = "c = paddle.fluid.layers.fill_constant(shape=[1], dtype='int64', value=4293)" self.assertEqual(ast_to_source_code(node).strip(), source) self.assertIsNone(create_fill_constant_node("e", None)) diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index e7cdd49a32c26..48df06cddd934 100755 --- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -118,5 +118,8 @@ def set_strategy(self, strategy, name): 'init_k_steps': 1, 'begin_step': 1, } + elif name == "gradient_merge": + strategy.gradient_merge = True + strategy.gradient_merge_configs = {"k_steps": 2, "avg": True} else: raise NotImplementedError() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py index 5fa242df4e412..95cff4de6f6b0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py @@ -40,9 +40,9 @@ def setUp(self): matmul_ab_square = paddle.square(matmul_ab) matmul_square_ab = paddle.matmul(data_a_square, data_b_square) - scale = paddle.fill_constant(shape=[1], value=0.5, dtype='float32') + scale = paddle.fluid.layers.fill_constant(shape=[1], value=0.5, dtype='float32') - sub_val = paddle.elementwise_sub(matmul_ab_square, matmul_square_ab) + sub_val = paddle.fluid.layers.elementwise_sub(matmul_ab_square, matmul_square_ab) squared_mat_sub_out = fluid.layers.elementwise_mul(sub_val, scale) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py 
index 6f0b4f9076ec4..0311eb887adf3 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -23,12 +23,6 @@ from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp -def conv2d_forward_refer(input, filter, group, conv_param): - out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, - conv_param) - return out - - def conv2d_residual_naive(out, residual): assert out.shape == residual.shape out = np.add(out, residual) @@ -176,6 +170,21 @@ def init_data_type(self): self.input_type = np.uint16 +class TestWithDilations(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [2, 2] + self.input_size = [2, 3, 10, 10] + self.input_residual_size = [2, 6, 8, 8] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_data_type(self): + self.input_type = np.uint16 + + class TestWith1x1ForceFP32Output(TestConv2dBf16Op): def init_test_case(self): self.pad = [0, 0] diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index 9731efced69d4..388eb38fc6e67 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -228,6 +228,22 @@ def init_test_case(self): self.scale_in_eltwise = 0.5 +class TestWithDilations(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [2, 2] + self.input_size = [2, 3, 10, 10] + self.input_residual_size = [2, 6, 8, 8] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.8 + self.scale_weights = [10.0] + self.scale_in_eltwise = 0.5 + + class TestWith1x1(TestConv2dInt8Op): def init_test_case(self): self.pad = [0, 0] @@ -343,6 +359,7 @@ def init_data_type(self): create_test_int8_class(TestConv2dInt8Op) create_test_int8_class(TestWithPad) create_test_int8_class(TestWithStride) +create_test_int8_class(TestWithDilations) create_test_int8_class(TestWithGroup) create_test_int8_class(TestWith1x1) create_test_int8_class(TestWithInput1x1Filter1x1) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 6600d1456d787..6fad98874e077 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -215,5 +215,22 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" +class TestMKLDNNDilations(TestConv2dMKLDNNOp): + def init_test_case(self): + TestConv2dMKLDNNOp.init_test_case(self) + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py 
index b98610760e6c7..1f68c35ec2b03 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py @@ -136,3 +136,17 @@ def init_test_case(self): self.data_format = "NHWC" N, C, H, W = self.input_size self.input_size = [N, H, W, C] + + +class TestConv2dTransposeMKLDNNWithDilationsExplicitPad( + TestConv2dTransposeMKLDNNOp): + def init_test_case(self): + TestConv2dTransposeMKLDNNOp.init_test_case(self) + self.stride = [2, 1] + self.dilations = [1, 2] + self.groups = 1 + self.input_size = [4, 3, 8, 7] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 4, 3] + self.pad = [1, 3, 2, 1] + self.padding_algorithm = "EXPLICIT" diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 96efc36ed0a50..649c12ea50f88 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -26,6 +26,7 @@ import collections from collections import defaultdict +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.backward import append_backward @@ -1133,8 +1134,10 @@ def find_actual(target_name, fetch_list): ) # Check inplace for given op, its grad op, its grad_grad op, etc. # No effect on original OpTest - self.check_inplace_output_with_place( - place, no_check_set=no_check_set, inplace_atol=inplace_atol) + # Currently not support ParallelExecutor on XPUPlace. + if not paddle.is_compiled_with_xpu(): + self.check_inplace_output_with_place( + place, no_check_set=no_check_set, inplace_atol=inplace_atol) if check_dygraph: return outs, dygraph_outs, fetch_list diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index 1320623f8f842..b7ef54a5c2a48 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -26,7 +26,7 @@ import paddle.fluid.dygraph as dygraph from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer -from paddle.nn import Conv2d, Pool2D, Linear, SyncBatchNorm +from paddle.nn import Conv2d, Linear, SyncBatchNorm from paddle.fluid.dygraph.base import to_variable from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py index 41c252c2aa0a7..bb15b2713496d 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py @@ -70,10 +70,10 @@ def test_with_initial_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, 16], dtype=paddle.framework.get_default_dtype()) - init_h = paddle.data( + init_h = paddle.fluid.data( "init_h", [-1, 32], dtype=paddle.framework.get_default_dtype()) y, h = rnn2(x_data, init_h) @@ -98,7 +98,7 @@ def test_with_zero_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, 16], dtype=paddle.framework.get_default_dtype()) y, h = rnn2(x_data) @@ -166,10 +166,10 @@ def test_with_initial_state(self): with paddle.fluid.unique_name.guard(): with 
paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, 16], dtype=paddle.framework.get_default_dtype()) - init_h = paddle.data( + init_h = paddle.fluid.data( "init_h", [-1, 32], dtype=paddle.framework.get_default_dtype()) y, h = rnn2(x_data, init_h) @@ -194,7 +194,7 @@ def test_with_zero_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, 16], dtype=paddle.framework.get_default_dtype()) y, h = rnn2(x_data) @@ -263,13 +263,13 @@ def test_with_initial_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, 16], dtype=paddle.framework.get_default_dtype()) - init_h = paddle.data( + init_h = paddle.fluid.data( "init_h", [-1, 32], dtype=paddle.framework.get_default_dtype()) - init_c = paddle.data( + init_c = paddle.fluid.data( "init_c", [-1, 32], dtype=paddle.framework.get_default_dtype()) y, (h, c) = rnn2(x_data, (init_h, init_c)) @@ -295,7 +295,7 @@ def test_with_zero_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, 16], dtype=paddle.framework.get_default_dtype()) y, (h, c) = rnn2(x_data) diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py index 7c03b51837ef6..f40065cf8a3d0 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -29,11 +29,13 @@ def __init__(self, time_major=True, direction="forward", place="cpu"): self.time_major = time_major self.direction = direction self.num_directions = 2 if direction == "bidirectional" else 1 - self.place = paddle.CPUPlace() if place == "cpu" \ - else paddle.CUDAPlace(0) + self.place = place def setUp(self): - paddle.disable_static(self.place) + # Since `set_device` is global, set `set_device` in `setUp` rather than + # `__init__` to avoid using an error device set by another test case. + place = paddle.set_device(self.place) + paddle.disable_static(place) rnn1 = SimpleRNN( 16, 32, 2, time_major=self.time_major, direction=self.direction) rnn2 = paddle.nn.SimpleRNN( @@ -103,11 +105,13 @@ def __init__(self, time_major=True, direction="forward", place="cpu"): self.time_major = time_major self.direction = direction self.num_directions = 2 if direction == "bidirectional" else 1 - self.place = paddle.CPUPlace() if place == "cpu" \ - else paddle.CUDAPlace(0) + self.place = place def setUp(self): - paddle.disable_static(self.place) + # Since `set_device` is global, set `set_device` in `setUp` rather than + # `__init__` to avoid using an error device set by another test case. + place = paddle.set_device(self.place) + paddle.disable_static(place) rnn1 = GRU(16, 32, 2, @@ -183,11 +187,13 @@ def __init__(self, time_major=True, direction="forward", place="cpu"): self.time_major = time_major self.direction = direction self.num_directions = 2 if direction == "bidirectional" else 1 - self.place = paddle.CPUPlace() if place == "cpu" \ - else paddle.CUDAPlace(0) + self.place = place def setUp(self): - paddle.disable_static(self.place) + # Since `set_device` is global, set `set_device` in `setUp` rather than + # `__init__` to avoid using an error device set by another test case. 
+ place = paddle.set_device(self.place) + paddle.disable_static(place) rnn1 = LSTM( 16, 32, 2, time_major=self.time_major, direction=self.direction) rnn2 = paddle.nn.LSTM( @@ -251,10 +257,68 @@ def test_with_input_lengths(self): np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + def test_predict(self): + place = paddle.set_device(self.place) + paddle.manual_seed(123) + np.random.seed(123) + + class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + self.rnn1 = paddle.nn.LSTM( + 16, 32, 2, direction="bidirectional", dropout=0.1) + + def forward(self, input): + return self.rnn1(input) + + x = paddle.randn((4, 10, 16)) + x.stop_gradient = False + seq_len = paddle.to_tensor(np.array([10, 6, 8, 5])) + mask = sequence_mask(seq_len, maxlen=10, dtype=x.dtype) + mask = paddle.unsqueeze(mask, [2]) + rnn = Net() + y, (h, c) = rnn(x) + y = y * mask + loss = paddle.mean(y) + loss.backward() + optimizer = paddle.optimizer.Adam( + learning_rate=0.1, parameters=rnn.parameters()) + optimizer.step() + rnn.eval() + y, (h, c) = rnn(x) + # `jit.to_static` would include a train_program, eval mode might cause + # some errors currently, such as dropout grad op gets `is_test == True`. + rnn.train() + + rnn = paddle.jit.to_static( + rnn, + [paddle.static.InputSpec( + shape=[None, None, 16], dtype=x.dtype)]) + paddle.jit.save(rnn, "./inference/lstm_infer") + + paddle.enable_static() + + new_scope = paddle.static.Scope() + with paddle.static.scope_guard(new_scope): + exe = paddle.static.Executor(place) + [inference_program, feed_target_names, + fetch_targets] = paddle.static.load_inference_model( + dirname="./inference", + executor=exe, + model_filename="lstm_infer.pdmodel", + params_filename="lstm_infer.pdiparams") + results = exe.run(inference_program, + feed={feed_target_names[0]: x.numpy()}, + fetch_list=fetch_targets) + np.testing.assert_equal( + y.numpy(), results[0]) # eval results equal predict results + paddle.disable_static() + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() self.test_with_input_lengths() + self.test_predict() def load_tests(loader, tests, pattern): diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py index 71a0b5b7bcb34..f2a3da3ff6efe 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py @@ -30,10 +30,12 @@ def __init__(self, time_major=True, direction="forward", place="cpu"): self.time_major = time_major self.direction = direction self.num_directions = 2 if direction == "bidirectional" else 1 - self.place = paddle.CPUPlace() if place == "cpu" \ - else paddle.CUDAPlace(0) + self.place = place def setUp(self): + # Since `set_device` is global, set `set_device` in `setUp` rather than + # `__init__` to avoid using an error device set by another test case. 
+ place = paddle.set_device(self.place) rnn1 = SimpleRNN( 16, 32, 2, time_major=self.time_major, direction=self.direction) @@ -48,7 +50,6 @@ def setUp(self): time_major=self.time_major, direction=self.direction) - place = self.place exe = paddle.static.Executor(place) scope = paddle.fluid.Scope() with paddle.static.scope_guard(scope): @@ -81,10 +82,10 @@ def test_with_initial_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) - init_h = paddle.data( + init_h = paddle.fluid.data( "init_h", [2 * self.num_directions, -1, 32], dtype=paddle.framework.get_default_dtype()) y, h = rnn2(x_data, init_h) @@ -112,7 +113,7 @@ def test_with_zero_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) y, h = rnn2(x_data) @@ -142,10 +143,10 @@ def test_with_input_lengths(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) - seq_len = paddle.data("seq_len", [-1], dtype="int64") + seq_len = paddle.fluid.data("seq_len", [-1], dtype="int64") mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) if self.time_major: mask = paddle.transpose(mask, [1, 0]) @@ -172,10 +173,12 @@ def __init__(self, time_major=True, direction="forward", place="cpu"): self.time_major = time_major self.direction = direction self.num_directions = 2 if direction == "bidirectional" else 1 - self.place = paddle.CPUPlace() if place == "cpu" \ - else paddle.CUDAPlace(0) + self.place = place def setUp(self): + # Since `set_device` is global, set `set_device` in `setUp` rather than + # `__init__` to avoid using an error device set by another test case. 
+ place = paddle.set_device(self.place) rnn1 = GRU(16, 32, 2, @@ -192,7 +195,6 @@ def setUp(self): time_major=self.time_major, direction=self.direction) - place = self.place exe = paddle.static.Executor(place) scope = paddle.fluid.Scope() with paddle.static.scope_guard(scope): @@ -226,10 +228,10 @@ def test_with_initial_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) - init_h = paddle.data( + init_h = paddle.fluid.data( "init_h", [2 * self.num_directions, -1, 32], dtype=paddle.framework.get_default_dtype()) y, h = rnn2(x_data, init_h) @@ -257,7 +259,7 @@ def test_with_zero_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) y, h = rnn2(x_data) @@ -287,10 +289,10 @@ def test_with_input_lengths(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) - seq_len = paddle.data("seq_len", [-1], dtype="int64") + seq_len = paddle.fluid.data("seq_len", [-1], dtype="int64") mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) if self.time_major: mask = paddle.transpose(mask, [1, 0]) @@ -316,10 +318,12 @@ def __init__(self, time_major=True, direction="forward", place="cpu"): self.time_major = time_major self.direction = direction self.num_directions = 2 if direction == "bidirectional" else 1 - self.place = paddle.CPUPlace() if place == "cpu" \ - else paddle.CUDAPlace(0) + self.place = place def setUp(self): + # Since `set_device` is global, set `set_device` in `setUp` rather than + # `__init__` to avoid using an error device set by another test case. 
+ place = paddle.set_device(self.place) rnn1 = LSTM( 16, 32, 2, time_major=self.time_major, direction=self.direction) @@ -334,7 +338,6 @@ def setUp(self): time_major=self.time_major, direction=self.direction) - place = self.place exe = paddle.static.Executor(place) scope = paddle.fluid.Scope() with paddle.static.scope_guard(scope): @@ -368,13 +371,13 @@ def test_with_initial_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) - init_h = paddle.data( + init_h = paddle.fluid.data( "init_h", [2 * self.num_directions, -1, 32], dtype=paddle.framework.get_default_dtype()) - init_c = paddle.data( + init_c = paddle.fluid.data( "init_c", [2 * self.num_directions, -1, 32], dtype=paddle.framework.get_default_dtype()) y, (h, c) = rnn2(x_data, (init_h, init_c)) @@ -403,7 +406,7 @@ def test_with_zero_state(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) y, (h, c) = rnn2(x_data) @@ -434,10 +437,10 @@ def test_with_input_lengths(self): with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): - x_data = paddle.data( + x_data = paddle.fluid.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) - seq_len = paddle.data("seq_len", [-1], dtype="int64") + seq_len = paddle.fluid.data("seq_len", [-1], dtype="int64") mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) if self.time_major: mask = paddle.transpose(mask, [1, 0]) diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index e4412b1b24ee6..00cf7d5e9877b 100755 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -67,11 +68,27 @@ def test_errors(self): label = fluid.layers.data( name='label', shape=[-1, 1], dtype="int32") self.assertRaises(TypeError, fluid.layers.accuracy, x1, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x1, label) # The input dtype of accuracy_op must be float32 or float64. 
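            # (x2 below is declared as int32, so both fluid.layers.accuracy and the
            # new paddle.metric.accuracy entry point are expected to reject it.)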
x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32") self.assertRaises(TypeError, fluid.layers.accuracy, x2, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x2, label) x3 = fluid.layers.data(name='input', shape=[-1, 2], dtype="float16") fluid.layers.accuracy(input=x3, label=label) + paddle.metric.accuracy(input=x3, label=label) + + +class TestAccuracyAPI(unittest.TestCase): + def test_api(self): + with fluid.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype='float32') + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.metric.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype='float32') + + self.assertEqual((result.numpy() == expect_value).all(), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 4fed0c8552b44..8d9056f0ee37e 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -143,7 +143,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [11, 17]) + x = paddle.fluid.data('X', [11, 17]) out1 = F.log_sigmoid(x) m = paddle.nn.LogSigmoid() out2 = m(x) @@ -167,7 +167,7 @@ def test_dygraph_api(self): def test_fluid_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [11, 17]) + x = paddle.fluid.data('X', [11, 17]) out = paddle.fluid.layers.logsigmoid(x) exe = paddle.static.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) @@ -180,10 +180,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.log_sigmoid, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[11, 17], dtype='int32') self.assertRaises(TypeError, F.log_sigmoid, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[11, 17], dtype='float16') F.log_sigmoid(x_fp16) @@ -222,7 +222,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12], self.dtype) + x = paddle.fluid.data('X', [10, 12], self.dtype) out1 = F.tanh(x) th = paddle.nn.Tanh() out2 = th(x) @@ -260,10 +260,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.tanh, 1) # The input dtype must be float16, float32. 
- x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.tanh, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.tanh(x_fp16) @@ -482,7 +482,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.tanhshrink(x) tanhshrink = paddle.nn.Tanhshrink() out2 = tanhshrink(x) @@ -519,10 +519,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.tanhshrink, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.tanhshrink, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.tanhshrink(x_fp16) @@ -572,7 +572,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12]) + x = paddle.fluid.data('X', [10, 12]) out1 = F.hardshrink(x) hd = paddle.nn.Hardshrink() out2 = hd(x) @@ -616,10 +616,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.hardshrink, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.hardshrink, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.hardshrink(x_fp16) @@ -642,7 +642,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12]) + x = paddle.fluid.data('X', [10, 12]) out1 = F.hardtanh(x) m = paddle.nn.Hardtanh() out2 = m(x) @@ -676,10 +676,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.hardtanh, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.hardtanh, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.hardtanh(x_fp16) @@ -722,7 +722,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.softshrink(x, self.threshold) softshrink = paddle.nn.Softshrink(self.threshold) out2 = softshrink(x) @@ -759,13 +759,13 @@ def test_errors(self): # The input type must be Variable. 
self.assertRaises(TypeError, F.softshrink, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.softshrink, x_int32) # The threshold must be no less than zero - x_fp32 = paddle.data(name='x_fp32', shape=[12, 10], dtype='float32') + x_fp32 = paddle.fluid.data(name='x_fp32', shape=[12, 10], dtype='float32') self.assertRaises(ValueError, F.softshrink, x_fp32, -1.0) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.softshrink(x_fp16) @@ -983,7 +983,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12]) + x = paddle.fluid.data('X', [10, 12]) out1 = F.relu(x) m = paddle.nn.ReLU() out2 = m(x) @@ -1010,10 +1010,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[10, 12], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[10, 12], dtype='int32') self.assertRaises(TypeError, F.relu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[10, 12], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[10, 12], dtype='float16') F.relu(x_fp16) @@ -1075,7 +1075,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12]) + x = paddle.fluid.data('X', [10, 12]) out1 = F.leaky_relu(x) m = paddle.nn.LeakyReLU() out2 = m(x) @@ -1119,10 +1119,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.leaky_relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.leaky_relu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.leaky_relu(x_fp16) @@ -1184,7 +1184,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [11, 17]) + x = paddle.fluid.data('X', [11, 17]) out1 = F.gelu(x) m = paddle.nn.GELU() out2 = m(x) @@ -1218,10 +1218,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.gelu, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[11, 17], dtype='int32') self.assertRaises(TypeError, F.gelu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[11, 17], dtype='float16') F.gelu(x_fp16) @@ -1331,7 +1331,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.relu6(x) relu6 = paddle.nn.ReLU6() out2 = relu6(x) @@ -1368,10 +1368,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.relu6, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.relu6, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.relu6(x_fp16) @@ -1414,7 +1414,7 @@ def setUp(self): def test_static_api(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.hardswish(x) m = paddle.nn.Hardswish() out2 = m(x) @@ -1455,10 +1455,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.hardswish, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.hardswish, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.hardswish(x_fp16) @@ -1538,7 +1538,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12]) + x = paddle.fluid.data('X', [10, 12]) out1 = F.elu(x) m = paddle.nn.ELU() out2 = m(x) @@ -1572,10 +1572,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.elu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[10, 12], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[10, 12], dtype='int32') self.assertRaises(TypeError, F.elu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[10, 12], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[10, 12], dtype='float16') F.elu(x_fp16) @@ -1858,7 +1858,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.softplus(x, self.beta, self.threshold) softplus = paddle.nn.Softplus(self.beta, self.threshold) out2 = softplus(x) @@ -1895,10 +1895,10 @@ def test_errors(self): # The input type must be Variable. 
self.assertRaises(TypeError, F.softplus, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.softplus, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.softplus(x_fp16) @@ -1935,7 +1935,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.softsign(x) softsign = paddle.nn.Softsign() out2 = softsign(x) @@ -1972,10 +1972,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.softsign, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.softsign, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.softsign(x_fp16) @@ -2018,7 +2018,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.thresholded_relu(x, self.threshold) thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold) out2 = thresholded_relu(x) @@ -2055,10 +2055,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.thresholded_relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.thresholded_relu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.thresholded_relu(x_fp16) @@ -2113,7 +2113,7 @@ def setUp(self): def test_static_api(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.hardsigmoid(x) m = paddle.nn.Hardsigmoid() out2 = m(x) @@ -2154,10 +2154,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.hardsigmoid, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.hardsigmoid, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.hardsigmoid(x_fp16) @@ -2195,7 +2195,7 @@ def setUp(self): def test_static_api(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.swish(x) swish = paddle.nn.Swish() out2 = swish(x) @@ -2232,10 +2232,10 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.swish, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.swish, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.swish(x_fp16) diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py b/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py new file mode 100644 index 0000000000000..0ccd42aa674dd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py @@ -0,0 +1,41 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
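# The test module below is a smoke test of the 2.0 dygraph Adagrad flow: build a
# Linear layer, run forward and backward, then step() and clear_grad().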
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest +import math + + +class TestAdagradOpV2(unittest.TestCase): + def test_v20_coverage(self): + paddle.disable_static() + inp = paddle.rand(shape=[10, 10]) + linear = paddle.nn.Linear(10, 10) + out = linear(inp) + loss = paddle.mean(out) + adagrad = paddle.optimizer.Adagrad( + learning_rate=0.1, parameters=linear.parameters()) + out.backward() + adagrad.step() + adagrad.clear_grad() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 47bf8f49e39b6..f337e0079e7d9 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -455,8 +455,8 @@ def test_adam_op_with_state_dict(self): state_dict = adam.state_dict() adam.set_state_dict(state_dict) - #learning_rate is _LRScheduler - learning_rate = paddle.optimizer.CosineAnnealingLR( + #learning_rate is LRScheduler + learning_rate = paddle.optimizer.lr.CosineAnnealingDecay( learning_rate=0.1, T_max=10) adam = paddle.optimizer.Adam( learning_rate=learning_rate, @@ -499,7 +499,7 @@ def test_adam_op_with_set_lr(self): cur_lr = adam.get_lr() assert (lr == cur_lr) with self.assertRaises(TypeError): - lr_var = paddle.create_global_var( + lr_var = paddle.fluid.layers.create_global_var( shape=[1], value=lr, dtype='float32') adam.set_lr(lr_var) diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py index b8c5bd2949124..25692808d090b 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py @@ -110,7 +110,7 @@ def test_static_graph(self): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + x = paddle.fluid.data(name="x", shape=[2, 3, 7, 7], dtype="float32") out_1 = paddle.nn.functional.adaptive_avg_pool2d( x=x, output_size=[3, 3]) @@ -205,7 +205,7 @@ def test_static_graph(self): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + x = paddle.fluid.data(name="x", shape=[2, 3, 7, 7], dtype="float32") adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3]) out_1 = adaptive_avg_pool(x=x) diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py index bb36aaebf0842..ce85f6bf9fbed 100755 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py @@ -125,7 +125,7 @@ def test_static_graph(self): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + x = paddle.fluid.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") out_1 = paddle.nn.functional.adaptive_avg_pool3d( x=x, output_size=[3, 3, 3]) @@ -220,7 +220,7 @@ def test_static_graph(self): if core.is_compiled_with_cuda() else [False]): place = 
paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + x = paddle.fluid.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d( output_size=[3, 3, 3]) diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py index dfa6f3226c8ce..14de5aa53a5f5 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py @@ -110,7 +110,7 @@ def test_static_graph(self): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + x = paddle.fluid.data(name="x", shape=[2, 3, 7, 7], dtype="float32") out_1 = paddle.nn.functional.adaptive_max_pool2d( x=x, output_size=[3, 3]) @@ -200,7 +200,7 @@ def test_static_graph(self): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.data(name="x", shape=[2, 3, 7, 7], dtype="float32") + x = paddle.fluid.data(name="x", shape=[2, 3, 7, 7], dtype="float32") adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3]) out_1 = adaptive_max_pool(x=x) diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py index 1fa703688cdd9..0aa97bdf1caf9 100755 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py @@ -125,7 +125,7 @@ def test_static_graph(self): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + x = paddle.fluid.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") out_1 = paddle.nn.functional.adaptive_max_pool3d( x=x, output_size=[3, 3, 3]) @@ -215,7 +215,7 @@ def test_static_graph(self): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x = paddle.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") + x = paddle.fluid.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32") adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( output_size=[3, 3, 3]) diff --git a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py index 4dc1ed99df628..5424a1447b862 100644 --- a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py +++ b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py @@ -18,7 +18,6 @@ from op_test import OpTest import paddle.fluid as fluid import paddle -import paddle.nn.functional as F from paddle.fluid import Program, program_guard @@ -157,7 +156,7 @@ class TestAddPositionEncodingOpDygraph(unittest.TestCase): def test_dygraph(self): paddle.disable_static() tensor = np.random.randn(16, 32, 64) - position_tensor = F.add_position_encoding( + position_tensor = paddle.fluid.layers.add_position_encoding( input=paddle.to_tensor(tensor), alpha=1.0, beta=1.0).numpy() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_addcmul.py b/python/paddle/fluid/tests/unittests/test_addcmul.py index 
6657ebe77ad57..ed466cda3864d 100644 --- a/python/paddle/fluid/tests/unittests/test_addcmul.py +++ b/python/paddle/fluid/tests/unittests/test_addcmul.py @@ -37,7 +37,7 @@ def static(self, value=1.0): tensor1 = fluid.data(name="tensor1", dtype=self._dtype, shape=[100]) tensor2 = fluid.data( name="tensor2", dtype=self._dtype, shape=[3, 100]) - out = paddle.addcmul(input, tensor1, tensor2, value) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2, value) exe = fluid.Executor(self._place) return exe.run(feed={ @@ -53,7 +53,7 @@ def dynamic(self, value=1.0): input = fluid.dygraph.to_variable(self.input) tensor1 = fluid.dygraph.to_variable(self.tensor1) tensor2 = fluid.dygraph.to_variable(self.tensor2) - out = paddle.addcmul(input, tensor1, tensor2, value) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2, value) return out.numpy() def numpy(self, value=1.0): @@ -85,7 +85,7 @@ def test_addcmul(self): tensor1 = fluid.data(name='t1', shape=data_shape, dtype='float32') tensor2 = fluid.data(name='t2', shape=data_shape, dtype='float32') - out = paddle.addcmul(input, tensor1, tensor2) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2) self.assertEqual(out.shape, input.shape) def test_addcmul_with_broadcast0(self): @@ -95,7 +95,7 @@ def test_addcmul_with_broadcast0(self): tensor1 = fluid.data(name='t1', shape=[3, 100], dtype='float32') tensor2 = fluid.data(name='t2', shape=[100], dtype='float32') - out = paddle.addcmul(input, tensor1, tensor2) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2) self.assertEqual(out.shape, input.shape) def test_addcmul_with_broadcast1(self): @@ -105,7 +105,7 @@ def test_addcmul_with_broadcast1(self): tensor1 = fluid.data(name='t1', shape=[100], dtype='float32') tensor2 = fluid.data(name='t2', shape=[4, 100], dtype='float32') - out = paddle.addcmul(input, tensor1, tensor2) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2) self.assertEqual(out.shape, input.shape) def test_addcmul_with_broadcast2(self): @@ -115,7 +115,7 @@ def test_addcmul_with_broadcast2(self): tensor1 = fluid.data(name='t1', shape=[100], dtype='float32') tensor2 = fluid.data(name='t2', shape=[100], dtype='float32') - out = paddle.addcmul(input, tensor1, tensor2) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2) self.assertEqual(out.shape, input.shape) @@ -129,7 +129,7 @@ def test_invalid_input(): name='tensor1', shape=[20, 20], dtype='float32') tensor2 = fluid.data( name='tensor2', shape=[20, 20], dtype='float32') - out = paddle.addcmul(input, tensor1, tensor2) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2) self.assertRaises(TypeError, test_invalid_input) @@ -141,7 +141,7 @@ def test_invalid_tensor1(): tensor1 = [20, 20] tensor2 = fluid.data( name='tensor2', shape=[20, 20], dtype='float32') - out = paddle.addcmul(input, tensor1, tensor2) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2) self.assertRaises(TypeError, test_invalid_tensor1) @@ -153,7 +153,7 @@ def test_invalid_tensor2(): tensor1 = fluid.data( name='tensor1', shape=[20, 20], dtype='float32') tensor2 = [20, 20] - out = paddle.addcmul(input, tensor1, tensor2) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2) self.assertRaises(TypeError, test_invalid_tensor2) @@ -166,7 +166,7 @@ def test_invalid_value_int(): name='tensor1', shape=[20, 20], dtype='float32') tensor2 = fluid.data( name='tensor2', shape=[20, 20], dtype='float32') - out = paddle.addcmul(input, tensor1, tensor2, value=1) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2, value=1) 
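                # With float32 inputs, `value` must be a Python float; the int literal
                # above is therefore expected to trigger the TypeError asserted below
                # (the mirror case with int32 inputs and a float `value` follows).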
self.assertRaises(TypeError, test_invalid_value_int) @@ -178,7 +178,7 @@ def test_invalid_value_float(): name='tensor1', shape=[20, 20], dtype='int32') tensor2 = fluid.data( name='tensor2', shape=[20, 20], dtype='int32') - out = paddle.addcmul(input, tensor1, tensor2, value=1.0) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2, value=1.0) self.assertRaises(TypeError, test_invalid_value_float) diff --git a/python/paddle/fluid/tests/unittests/test_allclose_op.py b/python/paddle/fluid/tests/unittests/test_allclose_op.py index dc50e569f8043..6441a789f1d68 100644 --- a/python/paddle/fluid/tests/unittests/test_allclose_op.py +++ b/python/paddle/fluid/tests/unittests/test_allclose_op.py @@ -22,19 +22,20 @@ class TestAllcloseOp(OpTest): def set_args(self): self.input = np.array([10000., 1e-07]).astype("float32") self.other = np.array([10000.1, 1e-08]).astype("float32") - self.rtol = 1e-05 - self.atol = 1e-08 + self.rtol = np.array([1e-05]).astype("float64") + self.atol = np.array([1e-08]).astype("float64") self.equal_nan = False def setUp(self): self.set_args() self.op_type = "allclose" - self.inputs = {'Input': self.input, 'Other': self.other} - self.attrs = { - 'rtol': self.rtol, - 'atol': self.atol, - 'equal_nan': self.equal_nan + self.inputs = { + 'Input': self.input, + 'Other': self.other, + "Rtol": self.rtol, + "Atol": self.atol } + self.attrs = {'equal_nan': self.equal_nan} self.outputs = { 'Out': np.array([ np.allclose( @@ -54,8 +55,8 @@ class TestAllcloseOpSmallNum(TestAllcloseOp): def set_args(self): self.input = np.array([10000., 1e-08]).astype("float32") self.other = np.array([10000.1, 1e-09]).astype("float32") - self.rtol = 1e-05 - self.atol = 1e-08 + self.rtol = np.array([1e-05]).astype("float64") + self.atol = np.array([1e-08]).astype("float64") self.equal_nan = False @@ -63,8 +64,8 @@ class TestAllcloseOpNanFalse(TestAllcloseOp): def set_args(self): self.input = np.array([1.0, float('nan')]).astype("float32") self.other = np.array([1.0, float('nan')]).astype("float32") - self.rtol = 1e-05 - self.atol = 1e-08 + self.rtol = np.array([1e-05]).astype("float64") + self.atol = np.array([1e-08]).astype("float64") self.equal_nan = False @@ -72,8 +73,8 @@ class TestAllcloseOpNanTrue(TestAllcloseOp): def set_args(self): self.input = np.array([1.0, float('nan')]).astype("float32") self.other = np.array([1.0, float('nan')]).astype("float32") - self.rtol = 1e-05 - self.atol = 1e-08 + self.rtol = np.array([1e-05]).astype("float64") + self.atol = np.array([1e-08]).astype("float64") self.equal_nan = True @@ -95,8 +96,8 @@ def test_input_dtype(self): def test_x_dtype(): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - x = paddle.data(name='x', shape=[10, 10], dtype='float16') - y = paddle.data(name='y', shape=[10, 10], dtype='float64') + x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float16') + y = paddle.fluid.data(name='y', shape=[10, 10], dtype='float64') result = paddle.allclose(x, y) self.assertRaises(TypeError, test_x_dtype) @@ -104,15 +105,15 @@ def test_x_dtype(): def test_y_dtype(): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - x = paddle.data(name='x', shape=[10, 10], dtype='float64') - y = paddle.data(name='y', shape=[10, 10], dtype='int32') + x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float64') + y = paddle.fluid.data(name='y', shape=[10, 10], dtype='int32') result = paddle.allclose(x, y) self.assertRaises(TypeError, test_y_dtype) def test_attr(self): - x = paddle.data(name='x', 
shape=[10, 10], dtype='float64') - y = paddle.data(name='y', shape=[10, 10], dtype='float64') + x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float64') + y = paddle.fluid.data(name='y', shape=[10, 10], dtype='float64') def test_rtol(): result = paddle.allclose(x, y, rtol=True) @@ -130,5 +131,33 @@ def test_equal_nan(): self.assertRaises(TypeError, test_equal_nan) +class TestAllcloseOpFloat32(TestAllcloseOp): + def set_args(self): + self.input = np.array([10.1]).astype("float32") + self.other = np.array([10]).astype("float32") + self.rtol = np.array([0.01]).astype("float64") + self.atol = np.array([0]).astype("float64") + self.equal_nan = False + + +class TestAllcloseOpFloat64(TestAllcloseOp): + def set_args(self): + self.input = np.array([10.1]).astype("float64") + self.other = np.array([10]).astype("float64") + self.rtol = np.array([0.01]).astype("float64") + self.atol = np.array([0]).astype("float64") + self.equal_nan = False + + +class TestAllcloseOpLargeDimInput(TestAllcloseOp): + def set_args(self): + self.input = np.array(np.zeros([2048, 1024])).astype("float64") + self.other = np.array(np.zeros([2048, 1024])).astype("float64") + self.input[-1][-1] = 100 + self.rtol = np.array([1e-05]).astype("float64") + self.atol = np.array([1e-08]).astype("float64") + self.equal_nan = False + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index 49c4182305550..82ddafb8f956f 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -17,6 +17,7 @@ import op_test import numpy as np import unittest +import paddle import paddle.fluid.core as core from paddle.fluid.op import Operator import paddle.fluid as fluid @@ -99,5 +100,81 @@ def test_errors(self): self.assertRaises(TypeError, fluid.layers.assign, x5) +class TestAssignOApi(unittest.TestCase): + def test_assign_LoDTensorArray(self): + main_program = Program() + startup_program = Program() + with program_guard(main_program): + x = fluid.data(name='x', shape=[100, 10], dtype='float32') + x.stop_gradient = False + y = fluid.layers.fill_constant( + shape=[100, 10], dtype='float32', value=1) + z = fluid.layers.elementwise_add(x=x, y=y) + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) + init_array = fluid.layers.array_write(x=z, i=i) + array = paddle.assign(init_array) + sums = fluid.layers.array_read(array=init_array, i=i) + mean = fluid.layers.mean(sums) + append_backward(mean) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + feed_x = np.random.random(size=(100, 10)).astype('float32') + ones = np.ones((100, 10)).astype('float32') + feed_add = feed_x + ones + res = exe.run(main_program, + feed={'x': feed_x}, + fetch_list=[sums.name, x.grad_name]) + self.assertTrue(np.allclose(res[0], feed_add)) + self.assertTrue(np.allclose(res[1], ones / 1000.0)) + + def test_assign_NumpyArray(self): + with fluid.dygraph.guard(): + array = np.random.random(size=(100, 10)).astype(np.bool) + result1 = paddle.zeros(shape=[3, 3], dtype='float32') + paddle.assign(array, result1) + self.assertTrue(np.allclose(result1.numpy(), array)) + + def test_assign_NumpyArray1(self): + with fluid.dygraph.guard(): + array = np.random.random(size=(100, 10)).astype(np.float32) + result1 = paddle.zeros(shape=[3, 3], dtype='float32') + paddle.assign(array, result1) + 
self.assertTrue(np.allclose(result1.numpy(), array)) + + def test_assign_NumpyArray2(self): + with fluid.dygraph.guard(): + array = np.random.random(size=(100, 10)).astype(np.int32) + result1 = paddle.zeros(shape=[3, 3], dtype='float32') + paddle.assign(array, result1) + self.assertTrue(np.allclose(result1.numpy(), array)) + + def test_assign_NumpyArray3(self): + with fluid.dygraph.guard(): + array = np.random.random(size=(100, 10)).astype(np.int64) + result1 = paddle.zeros(shape=[3, 3], dtype='float32') + paddle.assign(array, result1) + self.assertTrue(np.allclose(result1.numpy(), array)) + + +class TestAssignOpErrorApi(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The type of input must be Variable or numpy.ndarray. + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + self.assertRaises(TypeError, paddle.assign, x1) + # When the type of input is Variable, the dtype of input must be float16, float32, float64, int32, int64, bool. + x3 = fluid.layers.data(name='x3', shape=[4], dtype="uint8") + self.assertRaises(TypeError, paddle.assign, x3) + # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. + x4 = np.array([[2.5, 2.5]], dtype='float64') + self.assertRaises(TypeError, paddle.assign, x4) + x5 = np.array([[2.5, 2.5]], dtype='uint8') + self.assertRaises(TypeError, paddle.assign, x5) + + if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 2af0b31d6fc26..324d4cf711036 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -168,5 +168,59 @@ def compute_v2(x_np): self.assertTrue(np.allclose(y1, y2)) +class TestBatchNormChannelLast(unittest.TestCase): + def setUp(self): + self.original_dtyep = paddle.get_default_dtype() + paddle.set_default_dtype("float64") + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + self.places.append(fluid.CUDAPlace(0)) + + def tearDown(self): + paddle.set_default_dtype(self.original_dtyep) + + def test_1d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 4]) + net1 = paddle.nn.BatchNorm1d(4, data_format="NLC") + net2 = paddle.nn.BatchNorm1d(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 2, 1]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 1]) + self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True) + + def test_2d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 4]) + net1 = paddle.nn.BatchNorm2d(4, data_format="NHWC") + net2 = paddle.nn.BatchNorm2d(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 3, 1, 2]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 3, 1]) + self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True) + + def test_3d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 6, 4]) + net1 = paddle.nn.BatchNorm3d(4, data_format="NDHWC") + net2 = paddle.nn.BatchNorm3d(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 4, 1, 2, 3]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 3, 4, 1]) + 
self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py index a8054295b41c1..4b39436842b89 100644 --- a/python/paddle/fluid/tests/unittests/test_bce_loss.py +++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py @@ -27,10 +27,10 @@ def test_static_layer(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - input = paddle.data(name='input', shape=input_np.shape, dtype='float64') - label = paddle.data(name='label', shape=label_np.shape, dtype='float64') + input = paddle.fluid.data(name='input', shape=input_np.shape, dtype='float64') + label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') if weight_np is not None: - weight = paddle.data( + weight = paddle.fluid.data( name='weight', shape=weight_np.shape, dtype='float64') bce_loss = paddle.nn.loss.BCELoss( weight=weight, reduction=reduction) @@ -58,10 +58,10 @@ def test_static_functional(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - input = paddle.data(name='input', shape=input_np.shape, dtype='float64') - label = paddle.data(name='label', shape=label_np.shape, dtype='float64') + input = paddle.fluid.data(name='input', shape=input_np.shape, dtype='float64') + label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') if weight_np is not None: - weight = paddle.data( + weight = paddle.fluid.data( name='weight', shape=weight_np.shape, dtype='float64') res = paddle.nn.functional.binary_cross_entropy( input, label, weight=weight, reduction=reduction) diff --git a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py index 5ba13a6da01c7..a6175aa471d69 100644 --- a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py +++ b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py @@ -48,18 +48,18 @@ def test_static(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - logit = paddle.data(name='logit', shape=logit_np.shape, dtype='float64') - label = paddle.data(name='label', shape=label_np.shape, dtype='float64') + logit = paddle.fluid.data(name='logit', shape=logit_np.shape, dtype='float64') + label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') feed_dict = {"logit": logit_np, "label": label_np} pos_weight = None weight = None if pos_weight_np is not None: - pos_weight = paddle.data( + pos_weight = paddle.fluid.data( name='pos_weight', shape=pos_weight_np.shape, dtype='float64') feed_dict["pos_weight"] = pos_weight_np if weight_np is not None: - weight = paddle.data( + weight = paddle.fluid.data( name='weight', shape=weight_np.shape, dtype='float64') feed_dict["weight"] = weight_np if functional: diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py index 9fc4971fec239..58312979c523b 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py @@ -606,20 +606,6 @@ def test_case(self): self.assertTrue(np.allclose(res, expect_res)) -class 
TestUpsampleBilinear2dInterpOpAPI2_0(unittest.TestCase): - def test_case(self): - - # dygraph - x_data = np.random.random((1, 3, 6, 6)).astype("float32") - upsample = paddle.nn.UpsamplingBilinear2d(scale_factor=[2, 2]) - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(x_data) - interp = upsample(x) - expect = bilinear_interp_np( - x_data, out_h=12, out_w=12, align_corners=True) - self.assertTrue(np.allclose(interp.numpy(), expect)) - - class TestBilinearInterpOpAPI_dy(unittest.TestCase): def test_case(self): import paddle diff --git a/python/paddle/fluid/tests/unittests/test_boxps.py b/python/paddle/fluid/tests/unittests/test_boxps.py index 0eba0e8f26ef8..d1340bb1ce7d6 100644 --- a/python/paddle/fluid/tests/unittests/test_boxps.py +++ b/python/paddle/fluid/tests/unittests/test_boxps.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import numpy as np @@ -87,5 +88,19 @@ def test_run_cmd(self): self.assertTrue(ret2 == 0) +class TestPullBoxSparseOP(unittest.TestCase): + """ TestCases for _pull_box_sparse op""" + + def test_pull_box_sparse_op(self): + paddle.enable_static() + program = fluid.Program() + with fluid.program_guard(program): + x = fluid.layers.data( + name='x', shape=[1], dtype='int64', lod_level=0) + y = fluid.layers.data( + name='y', shape=[1], dtype='int64', lod_level=0) + emb_x, emb_y = _pull_box_sparse([x, y], size=1) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_chunk_op.py b/python/paddle/fluid/tests/unittests/test_chunk_op.py index 043b326fbd987..8488bfe773f83 100644 --- a/python/paddle/fluid/tests/unittests/test_chunk_op.py +++ b/python/paddle/fluid/tests/unittests/test_chunk_op.py @@ -27,28 +27,28 @@ def test_errors(self): with program_guard(Program(), Program()): # The type of axis in chunk_op should be int or Variable. def test_axis_type(): - x1 = paddle.data(shape=[4], dtype='float16', name='x3') + x1 = paddle.fluid.data(shape=[4], dtype='float16', name='x3') paddle.chunk(x=x1, chunks=2, axis=3.2) self.assertRaises(TypeError, test_axis_type) # The type of axis in chunk op should be int or Variable. def test_axis_variable_type(): - x2 = paddle.data(shape=[4], dtype='float16', name='x9') - x3 = paddle.data(shape=[1], dtype='float16', name='x10') + x2 = paddle.fluid.data(shape=[4], dtype='float16', name='x9') + x3 = paddle.fluid.data(shape=[1], dtype='float16', name='x10') paddle.chunk(input=x2, chunks=2, axis=x3) self.assertRaises(TypeError, test_axis_variable_type) # The type of num_or_sections in chunk_op should be int, tuple or list. 
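            # (chunks=2.1 below is a float, so paddle.chunk is expected to raise TypeError.)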
def test_chunks_type(): - x4 = paddle.data(shape=[4], dtype='float16', name='x4') + x4 = paddle.fluid.data(shape=[4], dtype='float16', name='x4') paddle.chunk(input=x4, chunks=2.1, axis=3) self.assertRaises(TypeError, test_chunks_type) def test_axis_type_tensor(): - x5 = paddle.data(shape=[4], dtype='float16', name='x6') + x5 = paddle.fluid.data(shape=[4], dtype='float16', name='x6') paddle.chunk(input=x5, chunks=2, axis=3.2) self.assertRaises(TypeError, test_axis_type_tensor) @@ -57,8 +57,8 @@ def test_axis_type_tensor(): class API_TestChunk(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program(), fluid.Program()): - data1 = paddle.data('data1', shape=[4, 6, 6], dtype='float64') - data2 = paddle.data('data2', shape=[1], dtype='int32') + data1 = paddle.fluid.data('data1', shape=[4, 6, 6], dtype='float64') + data2 = paddle.fluid.data('data2', shape=[1], dtype='int32') x0, x1, x2 = paddle.chunk(data1, chunks=3, axis=data2) place = paddle.CPUPlace() exe = paddle.static.Executor(place) @@ -76,7 +76,7 @@ def test_out(self): class API_TestChunk1(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program(), fluid.Program()): - data1 = paddle.data('data1', shape=[4, 6, 6], dtype='float64') + data1 = paddle.fluid.data('data1', shape=[4, 6, 6], dtype='float64') x0, x1, x2 = paddle.chunk(data1, chunks=3, axis=2) place = paddle.CPUPlace() exe = paddle.static.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 14c10e7aa2022..24a80ed2ed6ff 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -253,16 +253,17 @@ def test_fluid_api(self): assert np.array_equal(res_3, np.concatenate((input_2, input_3), axis=1)) def test_api(self): - x_1 = paddle.data(shape=[None, 1, 4, 5], dtype='int32', name='x_1') + x_1 = paddle.fluid.data( + shape=[None, 1, 4, 5], dtype='int32', name='x_1') paddle.concat([x_1, x_1], 0) input_2 = np.random.random([2, 1, 4, 5]).astype("int32") input_3 = np.random.random([2, 2, 4, 5]).astype("int32") x_2 = fluid.data(shape=[2, 1, 4, 5], dtype='int32', name='x_2') x_3 = fluid.data(shape=[2, 2, 4, 5], dtype='int32', name='x_3') - positive_1_int32 = paddle.fill_constant([1], "int32", 1) - positive_1_int64 = paddle.fill_constant([1], "int64", 1) - negative_int64 = paddle.fill_constant([1], "int64", -3) + positive_1_int32 = paddle.fluid.layers.fill_constant([1], "int32", 1) + positive_1_int64 = paddle.fluid.layers.fill_constant([1], "int64", 1) + negative_int64 = paddle.fluid.layers.fill_constant([1], "int64", -3) out_1 = paddle.concat(x=[x_2, x_3], axis=1) out_2 = paddle.concat(x=[x_2, x_3], axis=positive_1_int32) out_3 = paddle.concat(x=[x_2, x_3], axis=positive_1_int64) @@ -305,8 +306,8 @@ def test_errors(self): np.array([[-1]]), [[1]], fluid.CPUPlace()) self.assertRaises(TypeError, paddle.concat, [x2]) # The input dtype of concat_op must be float16, float32, float64, int32, int64. - x4 = paddle.data(shape=[4], dtype='uint8', name='x4') - x5 = paddle.data(shape=[4], dtype='uint8', name='x5') + x4 = paddle.fluid.data(shape=[4], dtype='uint8', name='x4') + x5 = paddle.fluid.data(shape=[4], dtype='uint8', name='x5') self.assertRaises(TypeError, fluid.layers.concat, [x4, x5]) # The type of axis in concat_op should be int or Variable. 
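# A minimal sketch of the tensor-axis concat pattern exercised above
# (illustrative only; variable names are not from the diff):

import paddle
paddle.enable_static()
x = paddle.fluid.data(name="x", shape=[2, 1, 4, 5], dtype="int32")
axis = paddle.fluid.layers.fill_constant([1], "int64", 1)  # axis supplied as a 1-element tensor
out = paddle.concat([x, x], axis=axis)                     # joins the two inputs along dim 1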
diff --git a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py index a8899d9f022c0..0b6e5b444caf7 100644 --- a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py +++ b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py @@ -48,8 +48,8 @@ def check_static_result(self, place): np_x1 = np.random.rand(*shape).astype(np.float32) np_x2 = np.random.rand(*shape).astype(np.float32) - x1 = paddle.data(name="x1", shape=shape) - x2 = paddle.data(name="x2", shape=shape) + x1 = paddle.fluid.data(name="x1", shape=shape) + x2 = paddle.fluid.data(name="x2", shape=shape) result = F.cosine_similarity(x1, x2, axis=axis, eps=eps) exe = Executor(place) fetches = exe.run(default_main_program(), diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 276d91fa025b6..7facc99a0736e 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -559,8 +559,10 @@ def test_queue_dataset_run(self): except Exception as e: self.assertTrue(False) - os.remove("./test_queue_dataset_run_a.txt") - os.remove("./test_queue_dataset_run_b.txt") + if os.path.exists("./test_queue_dataset_run_a.txt"): + os.remove("./test_queue_dataset_run_a.txt") + if os.path.exists("./test_queue_dataset_run_b.txt"): + os.remove("./test_queue_dataset_run_b.txt") def test_queue_dataset_run_2(self): """ @@ -611,8 +613,10 @@ def test_queue_dataset_run_2(self): except Exception as e: self.assertTrue(False) - os.remove("./test_queue_dataset_run_a.txt") - os.remove("./test_queue_dataset_run_b.txt") + if os.path.exists("./test_queue_dataset_run_a.txt"): + os.remove("./test_queue_dataset_run_a.txt") + if os.path.exists("./test_queue_dataset_run_b.txt"): + os.remove("./test_queue_dataset_run_b.txt") def test_queue_dataset_run_3(self): """ @@ -669,8 +673,10 @@ def test_queue_dataset_run_3(self): except Exception as e: self.assertTrue(False) - os.remove("./test_queue_dataset_run_a.txt") - os.remove("./test_queue_dataset_run_b.txt") + if os.path.exists("./test_queue_dataset_run_a.txt"): + os.remove("./test_queue_dataset_run_a.txt") + if os.path.exists("./test_queue_dataset_run_b.txt"): + os.remove("./test_queue_dataset_run_b.txt") class TestDatasetWithDataLoader(TestDataset): diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py index 0ab56f9244f93..195337e80defa 100644 --- a/python/paddle/fluid/tests/unittests/test_device.py +++ b/python/paddle/fluid/tests/unittests/test_device.py @@ -51,6 +51,19 @@ def test_gpu_device(self): self.assertEqual(isinstance(exe.place, core.CUDAPlace), True) self.assertEqual(device, "gpu:0") + def test_xpu_device(self): + if core.is_compiled_with_xpu(): + out1 = paddle.zeros(shape=[1, 3], dtype='float32') + out2 = paddle.ones(shape=[1, 3], dtype='float32') + out3 = paddle.concat(x=[out1, out2], axis=0) + paddle.set_device('xpu:0') + exe = paddle.fluid.Executor() + exe.run(paddle.fluid.default_startup_program()) + res = exe.run(fetch_list=[out3]) + device = paddle.get_device() + self.assertEqual(isinstance(exe.place, core.XPUPlace), True) + self.assertEqual(device, "xpu:0") + class TestImperativeDeviceManage(unittest.TestCase): def test_cpu(self): @@ -78,6 +91,17 @@ def test_gpu(self): core.CUDAPlace), True) self.assertEqual(device, "gpu:0") + def test_xpu(self): + if core.is_compiled_with_xpu(): + with fluid.dygraph.guard(): + out 
= paddle.to_tensor([1, 2]) + device = paddle.get_device() + self.assertEqual( + isinstance(framework._current_expected_place(), + core.XPUPlace), True) + self.assertTrue(out.place.is_xpu_place()) + self.assertEqual(device, "xpu:0") + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py index ddf1240e4ef27..29f5a90726d8f 100644 --- a/python/paddle/fluid/tests/unittests/test_diag.py +++ b/python/paddle/fluid/tests/unittests/test_diag.py @@ -23,224 +23,6 @@ from paddle.fluid import Program, program_guard -class TestDiagV2Op(OpTest): - def setUp(self): - self.op_type = "diag_v2" - self.x = np.random.rand(10, 10) - self.offset = 0 - self.padding_value = 0.0 - self.out = np.diag(self.x, self.offset) - - self.init_config() - self.inputs = {'X': self.x} - self.attrs = { - 'offset': self.offset, - 'padding_value': self.padding_value - } - self.outputs = {'Out': self.out} - - def test_check_output(self): - self.check_output() - - def init_config(self): - pass - - -class TestDiagV2OpCase1(TestDiagV2Op): - def init_config(self): - self.offset = 1 - self.out = np.diag(self.x, self.offset) - - -class TestDiagV2OpCase2(TestDiagV2Op): - def init_config(self): - self.offset = -1 - self.out = np.diag(self.x, self.offset) - - -class TestDiagV2OpCase3(TestDiagV2Op): - def init_config(self): - self.x = np.random.randint(-10, 10, size=(10, 10)) - self.out = np.diag(self.x, self.offset) - - -class TestDiagV2OpCase4(TestDiagV2Op): - def init_config(self): - self.x = np.random.rand(100) - self.padding_value = 8 - n = self.x.size - self.out = self.padding_value * np.ones((n, n)) + np.diag( - self.x, self.offset) - np.diag(self.padding_value * np.ones(n)) - - -class TestDiagV2Error(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - - def test_diag_v2_type(): - x = [1, 2, 3] - output = paddle.diag(x) - - self.assertRaises(TypeError, test_diag_v2_type) - - x = paddle.static.data('data', [3, 3]) - self.assertRaises(TypeError, paddle.diag, x, offset=2.5) - - self.assertRaises(TypeError, paddle.diag, x, padding_value=[9]) - - x = paddle.static.data('data2', [3, 3, 3]) - self.assertRaises(ValueError, paddle.diag, x) - - -class TestDiagV2API(unittest.TestCase): - def setUp(self): - self.input_np = np.random.random(size=(10, 10)).astype(np.float32) - self.expected0 = np.diag(self.input_np) - self.expected1 = np.diag(self.input_np, k=1) - self.expected2 = np.diag(self.input_np, k=-1) - - self.input_np2 = np.random.rand(100) - self.offset = 0 - self.padding_value = 8 - n = self.input_np2.size - self.expected3 = self.padding_value * np.ones( - (n, n)) + np.diag(self.input_np2, self.offset) - np.diag( - self.padding_value * np.ones(n)) - - self.input_np3 = np.random.randint(-10, 10, size=(100)).astype(np.int64) - self.padding_value = 8.0 - n = self.input_np3.size - self.expected4 = self.padding_value * np.ones( - (n, n)) + np.diag(self.input_np3, self.offset) - np.diag( - self.padding_value * np.ones(n)) - - self.padding_value = -8 - self.expected5 = self.padding_value * np.ones( - (n, n)) + np.diag(self.input_np3, self.offset) - np.diag( - self.padding_value * np.ones(n)) - - self.input_np4 = np.random.random(size=(2000, 2000)).astype(np.float32) - self.expected6 = np.diag(self.input_np4) - self.expected7 = np.diag(self.input_np4, k=1) - self.expected8 = np.diag(self.input_np4, k=-1) - - self.input_np5 = np.random.random(size=(2000)).astype(np.float32) - self.expected9 = 
np.diag(self.input_np5) - self.expected10 = np.diag(self.input_np5, k=1) - self.expected11 = np.diag(self.input_np5, k=-1) - - def run_imperative(self): - x = paddle.to_tensor(self.input_np) - y = paddle.diag(x) - self.assertTrue(np.allclose(y.numpy(), self.expected0)) - - y = paddle.diag(x, offset=1) - self.assertTrue(np.allclose(y.numpy(), self.expected1)) - - y = paddle.diag(x, offset=-1) - self.assertTrue(np.allclose(y.numpy(), self.expected2)) - - x = paddle.to_tensor(self.input_np2) - y = paddle.diag(x, padding_value=8) - self.assertTrue(np.allclose(y.numpy(), self.expected3)) - - x = paddle.to_tensor(self.input_np3) - y = paddle.diag(x, padding_value=8.0) - self.assertTrue(np.allclose(y.numpy(), self.expected4)) - - y = paddle.diag(x, padding_value=-8) - self.assertTrue(np.allclose(y.numpy(), self.expected5)) - - x = paddle.to_tensor(self.input_np4) - y = paddle.diag(x) - self.assertTrue(np.allclose(y.numpy(), self.expected6)) - - y = paddle.diag(x, offset=1) - self.assertTrue(np.allclose(y.numpy(), self.expected7)) - - y = paddle.diag(x, offset=-1) - self.assertTrue(np.allclose(y.numpy(), self.expected8)) - - x = paddle.to_tensor(self.input_np5) - y = paddle.diag(x) - self.assertTrue(np.allclose(y.numpy(), self.expected9)) - - y = paddle.diag(x, offset=1) - self.assertTrue(np.allclose(y.numpy(), self.expected10)) - - y = paddle.diag(x, offset=-1) - self.assertTrue(np.allclose(y.numpy(), self.expected11)) - - def run_static(self, use_gpu=False): - x = paddle.data(name='input', shape=[10, 10], dtype='float32') - x2 = paddle.data(name='input2', shape=[100], dtype='float64') - x3 = paddle.data(name='input3', shape=[100], dtype='int64') - x4 = paddle.data(name='input4', shape=[2000, 2000], dtype='float32') - x5 = paddle.data(name='input5', shape=[2000], dtype='float32') - result0 = paddle.diag(x) - result1 = paddle.diag(x, offset=1) - result2 = paddle.diag(x, offset=-1) - result3 = paddle.diag(x, name='aaa') - result4 = paddle.diag(x2, padding_value=8) - result5 = paddle.diag(x3, padding_value=8.0) - result6 = paddle.diag(x3, padding_value=-8) - result7 = paddle.diag(x4) - result8 = paddle.diag(x4, offset=1) - result9 = paddle.diag(x4, offset=-1) - result10 = paddle.diag(x5) - result11 = paddle.diag(x5, offset=1) - result12 = paddle.diag(x5, offset=-1) - - place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - res0, res1, res2, res4, res5, res6, res7, res8, res9, res10, res11, res12 = exe.run( - feed={ - "input": self.input_np, - "input2": self.input_np2, - 'input3': self.input_np3, - 'input4': self.input_np4, - 'input5': self.input_np5 - }, - fetch_list=[ - result0, result1, result2, result4, result5, result6, result7, - result8, result9, result10, result11, result12 - ]) - - self.assertTrue(np.allclose(res0, self.expected0)) - self.assertTrue(np.allclose(res1, self.expected1)) - self.assertTrue(np.allclose(res2, self.expected2)) - self.assertTrue('aaa' in result3.name) - self.assertTrue(np.allclose(res4, self.expected3)) - self.assertTrue(np.allclose(res5, self.expected4)) - self.assertTrue(np.allclose(res6, self.expected5)) - self.assertTrue(np.allclose(res7, self.expected6)) - self.assertTrue(np.allclose(res8, self.expected7)) - self.assertTrue(np.allclose(res9, self.expected8)) - self.assertTrue(np.allclose(res10, self.expected9)) - self.assertTrue(np.allclose(res11, self.expected10)) - self.assertTrue(np.allclose(res12, self.expected11)) - - def test_cpu(self): - 
paddle.disable_static(place=paddle.fluid.CPUPlace()) - self.run_imperative() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static() - - def test_gpu(self): - if not fluid.core.is_compiled_with_cuda(): - return - - paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) - self.run_imperative() - paddle.enable_static() - - with fluid.program_guard(fluid.Program()): - self.run_static(use_gpu=True) - - class TestDiagOp(OpTest): def setUp(self): self.op_type = "diag" @@ -250,6 +32,7 @@ def setUp(self): self.outputs = {'Out': np.diag(self.inputs['Diagonal'])} def test_check_output(self): + paddle.enable_static() self.check_output() def init_config(self): @@ -263,6 +46,7 @@ def init_config(self): class TestDiagError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): def test_diag_type(): diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py new file mode 100644 index 0000000000000..c364fb0a19335 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py @@ -0,0 +1,260 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid import Program, program_guard + + +class TestDiagV2Op(OpTest): + def setUp(self): + self.op_type = "diag_v2" + self.x = np.random.rand(10, 10) + self.offset = 0 + self.padding_value = 0.0 + self.out = np.diag(self.x, self.offset) + + self.init_config() + self.inputs = {'X': self.x} + self.attrs = { + 'offset': self.offset, + 'padding_value': self.padding_value + } + self.outputs = {'Out': self.out} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + def init_config(self): + pass + + +class TestDiagV2OpCase1(TestDiagV2Op): + def init_config(self): + self.offset = 1 + self.out = np.diag(self.x, self.offset) + + +class TestDiagV2OpCase2(TestDiagV2Op): + def init_config(self): + self.offset = -1 + self.out = np.diag(self.x, self.offset) + + +class TestDiagV2OpCase3(TestDiagV2Op): + def init_config(self): + self.x = np.random.randint(-10, 10, size=(10, 10)) + self.out = np.diag(self.x, self.offset) + + +class TestDiagV2OpCase4(TestDiagV2Op): + def init_config(self): + self.x = np.random.rand(100) + self.padding_value = 8 + n = self.x.size + self.out = self.padding_value * np.ones((n, n)) + np.diag( + self.x, self.offset) - np.diag(self.padding_value * np.ones(n)) + + +class TestDiagV2Error(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + + def test_diag_v2_type(): + x = [1, 2, 3] + output = paddle.diag(x) + + self.assertRaises(TypeError, test_diag_v2_type) + + x = paddle.static.data('data', [3, 3]) + self.assertRaises(TypeError, paddle.diag, x, offset=2.5) + + 
self.assertRaises(TypeError, paddle.diag, x, padding_value=[9]) + + x = paddle.static.data('data2', [3, 3, 3]) + self.assertRaises(ValueError, paddle.diag, x) + + +class TestDiagV2API(unittest.TestCase): + def setUp(self): + self.input_np = np.random.random(size=(10, 10)).astype(np.float32) + self.expected0 = np.diag(self.input_np) + self.expected1 = np.diag(self.input_np, k=1) + self.expected2 = np.diag(self.input_np, k=-1) + + self.input_np2 = np.random.rand(100) + self.offset = 0 + self.padding_value = 8 + n = self.input_np2.size + self.expected3 = self.padding_value * np.ones( + (n, n)) + np.diag(self.input_np2, self.offset) - np.diag( + self.padding_value * np.ones(n)) + + self.input_np3 = np.random.randint(-10, 10, size=(100)).astype(np.int64) + self.padding_value = 8.0 + n = self.input_np3.size + self.expected4 = self.padding_value * np.ones( + (n, n)) + np.diag(self.input_np3, self.offset) - np.diag( + self.padding_value * np.ones(n)) + + self.padding_value = -8 + self.expected5 = self.padding_value * np.ones( + (n, n)) + np.diag(self.input_np3, self.offset) - np.diag( + self.padding_value * np.ones(n)) + + self.input_np4 = np.random.random(size=(2000, 2000)).astype(np.float32) + self.expected6 = np.diag(self.input_np4) + self.expected7 = np.diag(self.input_np4, k=1) + self.expected8 = np.diag(self.input_np4, k=-1) + + self.input_np5 = np.random.random(size=(2000)).astype(np.float32) + self.expected9 = np.diag(self.input_np5) + self.expected10 = np.diag(self.input_np5, k=1) + self.expected11 = np.diag(self.input_np5, k=-1) + + self.input_np6 = np.random.random(size=(2000, 1500)).astype(np.float32) + self.expected12 = np.diag(self.input_np6, k=-1) + + def run_imperative(self): + x = paddle.to_tensor(self.input_np) + y = paddle.diag(x) + self.assertTrue(np.allclose(y.numpy(), self.expected0)) + + y = paddle.diag(x, offset=1) + self.assertTrue(np.allclose(y.numpy(), self.expected1)) + + y = paddle.diag(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected2)) + + x = paddle.to_tensor(self.input_np2) + y = paddle.diag(x, padding_value=8) + self.assertTrue(np.allclose(y.numpy(), self.expected3)) + + x = paddle.to_tensor(self.input_np3) + y = paddle.diag(x, padding_value=8.0) + self.assertTrue(np.allclose(y.numpy(), self.expected4)) + + y = paddle.diag(x, padding_value=-8) + self.assertTrue(np.allclose(y.numpy(), self.expected5)) + + x = paddle.to_tensor(self.input_np4) + y = paddle.diag(x) + self.assertTrue(np.allclose(y.numpy(), self.expected6)) + + y = paddle.diag(x, offset=1) + self.assertTrue(np.allclose(y.numpy(), self.expected7)) + + y = paddle.diag(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected8)) + + x = paddle.to_tensor(self.input_np5) + y = paddle.diag(x) + self.assertTrue(np.allclose(y.numpy(), self.expected9)) + + y = paddle.diag(x, offset=1) + self.assertTrue(np.allclose(y.numpy(), self.expected10)) + + y = paddle.diag(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected11)) + + x = paddle.to_tensor(self.input_np6) + y = paddle.diag(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected12)) + + def run_static(self, use_gpu=False): + x = paddle.static.data(name='input', shape=[10, 10], dtype='float32') + x2 = paddle.static.data(name='input2', shape=[100], dtype='float64') + x3 = paddle.static.data(name='input3', shape=[100], dtype='int64') + x4 = paddle.static.data( + name='input4', shape=[2000, 2000], dtype='float32') + x5 = paddle.static.data(name='input5', shape=[2000], dtype='float32') + x6 = 
paddle.static.data( + name='input6', shape=[2000, 1500], dtype='float32') + result0 = paddle.diag(x) + result1 = paddle.diag(x, offset=1) + result2 = paddle.diag(x, offset=-1) + result3 = paddle.diag(x, name='aaa') + result4 = paddle.diag(x2, padding_value=8) + result5 = paddle.diag(x3, padding_value=8.0) + result6 = paddle.diag(x3, padding_value=-8) + result7 = paddle.diag(x4) + result8 = paddle.diag(x4, offset=1) + result9 = paddle.diag(x4, offset=-1) + result10 = paddle.diag(x5) + result11 = paddle.diag(x5, offset=1) + result12 = paddle.diag(x5, offset=-1) + result13 = paddle.diag(x6, offset=-1) + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + res0, res1, res2, res4, res5, res6, res7, res8, res9, res10, res11, res12, res13 = exe.run( + feed={ + "input": self.input_np, + "input2": self.input_np2, + 'input3': self.input_np3, + 'input4': self.input_np4, + 'input5': self.input_np5, + 'input6': self.input_np6 + }, + fetch_list=[ + result0, result1, result2, result4, result5, result6, result7, + result8, result9, result10, result11, result12, result13 + ]) + + self.assertTrue(np.allclose(res0, self.expected0)) + self.assertTrue(np.allclose(res1, self.expected1)) + self.assertTrue(np.allclose(res2, self.expected2)) + self.assertTrue('aaa' in result3.name) + self.assertTrue(np.allclose(res4, self.expected3)) + self.assertTrue(np.allclose(res5, self.expected4)) + self.assertTrue(np.allclose(res6, self.expected5)) + self.assertTrue(np.allclose(res7, self.expected6)) + self.assertTrue(np.allclose(res8, self.expected7)) + self.assertTrue(np.allclose(res9, self.expected8)) + self.assertTrue(np.allclose(res10, self.expected9)) + self.assertTrue(np.allclose(res11, self.expected10)) + self.assertTrue(np.allclose(res12, self.expected11)) + self.assertTrue(np.allclose(res13, self.expected12)) + + def test_cpu(self): + paddle.disable_static(place=paddle.fluid.CPUPlace()) + self.run_imperative() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static() + + def test_gpu(self): + if not fluid.core.is_compiled_with_cuda(): + return + + paddle.disable_static(place=paddle.fluid.CUDAPlace(0)) + self.run_imperative() + paddle.enable_static() + + with fluid.program_guard(fluid.Program()): + self.run_static(use_gpu=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 28232e9ba4dc0..72df01ac1bcad 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -37,20 +37,27 @@ def test_new_directory(self): new_directory = [ 'paddle.enable_static', 'paddle.disable_static', 'paddle.in_dynamic_mode', 'paddle.to_tensor', 'paddle.grad', - 'paddle.no_grad', 'paddle.save', 'paddle.load', - 'paddle.static.save', 'paddle.static.load', + 'paddle.no_grad', 'paddle.static.save', 'paddle.static.load', 'paddle.distributed.ParallelEnv', 'paddle.distributed.prepare_context', 'paddle.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', - 'paddle.jit.save', 'paddle.jit.load', 'paddle.NoamDecay', - 'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay', - 'paddle.ExponentialDecay', 'paddle.InverseTimeDecay', - 'paddle.PolynomialDecay', 'paddle.CosineDecay', - 'paddle.static.Executor', 
'paddle.static.global_scope', - 'paddle.static.scope_guard', 'paddle.static.append_backward', - 'paddle.static.gradients', 'paddle.static.BuildStrategy', - 'paddle.static.CompiledProgram', 'paddle.static.ExecutionStrategy', + 'paddle.jit.save', 'paddle.jit.load', + 'paddle.optimizer.lr.LRScheduler', 'paddle.optimizer.lr.NoamDecay', + 'paddle.optimizer.lr.PiecewiseDecay', + 'paddle.optimizer.lr.NaturalExpDecay', + 'paddle.optimizer.lr.ExponentialDecay', + 'paddle.optimizer.lr.InverseTimeDecay', + 'paddle.optimizer.lr.PolynomialDecay', + 'paddle.optimizer.lr.CosineAnnealingDecay', + 'paddle.optimizer.lr.MultiStepDecay', + 'paddle.optimizer.lr.StepDecay', 'paddle.optimizer.lr.LambdaDecay', + 'paddle.optimizer.lr.ReduceOnPlateau', + 'paddle.optimizer.lr.LinearWarmup', 'paddle.static.Executor', + 'paddle.static.global_scope', 'paddle.static.scope_guard', + 'paddle.static.append_backward', 'paddle.static.gradients', + 'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram', + 'paddle.static.ExecutionStrategy', 'paddle.static.default_main_program', 'paddle.static.default_startup_program', 'paddle.static.Program', 'paddle.static.name_scope', 'paddle.static.program_guard', @@ -86,8 +93,8 @@ def test_new_directory(self): stderr=subprocess.PIPE) stdout, stderr = ps_proc.communicate() - assert "Error" not in str(stderr), "Error: Can't" \ - " import Module {}".format(module) + self.assertFalse("Error" in str(stderr), + "ErrorMessage:\n{}".format(bytes.decode(stderr))) def test_old_directory(self): old_directory = [ @@ -169,7 +176,7 @@ def test_old_directory(self): stderr=subprocess.PIPE) stdout, stderr = ps_proc.communicate() - assert "Error" not in str(stdout), str(stdout) + self.assertFalse("Error" in str(stdout), bytes.decode(stdout)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index 218eb77d0b565..d9ef1cf50c9ee 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -183,8 +183,12 @@ def test(self): from paddle.fluid.communicator import LargeScaleKV kv = LargeScaleKV() + kv.save("__emb__.block0", os.path.join(model_dir, "__emb__", "__emb__.block0")) + + kv.size("__emb__.block0") + fluid.framework.switch_main_program(fluid.Program()) fleet.init_server(model_dir) shutil.rmtree(model_dir) diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py new file mode 100644 index 0000000000000..eddac64bab91b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py @@ -0,0 +1,122 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import os +import unittest +import numpy as np +import tempfile +import shutil +from op_test import OpTest, randomize_probability +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet import fleet + + +class SparseLoadOp(unittest.TestCase): + """ Test load operator. + """ + + def net(self, emb_array, fc_array): + with fluid.unique_name.guard(): + dense_input = fluid.data('input', shape=[None, 1], dtype="int64") + + emb = fluid.layers.embedding( + input=dense_input, + is_sparse=True, + size=[10, 10], + param_attr=fluid.ParamAttr( + name="embedding", + initializer=fluid.initializer.NumpyArrayInitializer( + emb_array)), ) + + fc1 = fluid.layers.fc( + input=emb, + size=10, + act="relu", + param_attr=fluid.ParamAttr( + name='fc', + initializer=fluid.initializer.NumpyArrayInitializer( + fc_array))) + loss = fluid.layers.reduce_mean(fc1) + return loss + + def save_origin_model(self, emb_array, fc_array): + startup_program = fluid.framework.Program() + test_program = fluid.framework.Program() + with fluid.framework.program_guard(test_program, startup_program): + with fluid.unique_name.guard(): + loss = self.net(emb_array, fc_array) + optimizer = fluid.optimizer.Adam(1e-3) + optimizer.minimize(loss) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + model_path = tempfile.mkdtemp() + fluid.io.save_persistables(executor=exe, dirname=model_path) + return model_path + + +class TestSparseLoadOpCase1(SparseLoadOp): + def test_2ps_0_load(self): + # init No.0 server env + env = {} + env["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + env["PADDLE_TRAINERS_NUM"] = str(2) + env["TRAINING_ROLE"] = "PSERVER" + env["PADDLE_PORT"] = "4001" + env["POD_IP"] = "127.0.0.1" + for k, v in env.items(): + os.environ[k] = str(v) + """ + array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. 
], + [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], + [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2], + [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3], + [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4], + [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6], + [0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7], + [0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8], + [0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9]]) + """ + emb_array = np.arange(0, 1, 0.1).repeat(10).reshape(10, 10) + fc_array = np.arange(0, 1, 0.1).repeat(10).reshape(10, 10) + model_path = self.save_origin_model(emb_array, fc_array) + + role = role_maker.PaddleCloudRoleMaker() + fleet.init(role) + loss = self.net(emb_array, fc_array) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = True + optimizer = fluid.optimizer.Adam(1e-3) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer.minimize(loss) + fleet.init_server(model_path) + + fc_w = np.array(fluid.global_scope().find_var("fc").get_tensor()) + + emb = np.array(fluid.global_scope().find_var("embedding.block0") + .get_tensor()) + + assert fc_w.all() == fc_array.all() + assert emb.all() == emb_array[::2].all() + shutil.rmtree(model_path) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py new file mode 100644 index 0000000000000..7d14a484f3442 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os +import unittest +import numpy as np +import tempfile +import shutil +from op_test import OpTest, randomize_probability +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet import fleet +from test_dist_sparse_load_ps0 import SparseLoadOp + + +class TestSparseLoadOpCase2(SparseLoadOp): + def test_2ps_0_load(self): + # init No.1 server env + env = {} + env["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + env["PADDLE_TRAINERS_NUM"] = str(2) + env["TRAINING_ROLE"] = "PSERVER" + env["PADDLE_PORT"] = "4002" + env["POD_IP"] = "127.0.0.1" + for k, v in env.items(): + os.environ[k] = str(v) + """ + array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. 
], + [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], + [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2], + [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3], + [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4], + [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6], + [0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7], + [0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8], + [0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9]]) + """ + emb_array = np.arange(0, 1, 0.1).repeat(10).reshape(10, 10) + fc_array = np.arange(0, 1, 0.1).repeat(10).reshape(10, 10) + model_path = self.save_origin_model(emb_array, fc_array) + + startup_program = fluid.framework.Program() + test_program = fluid.framework.Program() + role = role_maker.PaddleCloudRoleMaker() + fleet.init(role) + loss = self.net(emb_array, fc_array) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = True + optimizer = fluid.optimizer.Adam(1e-3) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer.minimize(loss) + fleet.init_server(model_path) + emb = np.array(fluid.global_scope().find_var("embedding.block1") + .get_tensor()) + assert emb.all() == emb_array[1::2].all() + shutil.rmtree(model_path) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py new file mode 100644 index 0000000000000..ff545319ccd29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os +import unittest +import numpy as np +import tempfile +import shutil +from op_test import OpTest, randomize_probability +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet import fleet +from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram + + +class TestSparseLoadProgramAdagrad(TestSparseLoadProgram): + """ + Test Sparse load operator. 
+ """ + + def test_server_init(self): + scope, train_program, startup_program, loss = self.net() + with fluid.scope_guard(scope): + with fluid.program_guard(train_program, startup_program): + optimizer = fluid.optimizer.Adagrad(1e-3) + optimizer = fleet.distributed_optimizer(optimizer, + self.strategy) + optimizer.minimize(loss) + fleet.init_server() + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adam.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adam.py new file mode 100644 index 0000000000000..60c3f7fc9f126 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adam.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os +import unittest +import numpy as np +import tempfile +import shutil +from op_test import OpTest, randomize_probability +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet import fleet +from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram + + +class TestSparseLoadProgramAdam(TestSparseLoadProgram): + """ + Test Sparse load operator. + """ + + def test_server_init(self): + scope, train_program, startup_program, loss = self.net() + with fluid.scope_guard(scope): + with fluid.program_guard(train_program, startup_program): + optimizer = fluid.optimizer.Adam(1e-3) + optimizer = fleet.distributed_optimizer(optimizer, + self.strategy) + optimizer.minimize(loss) + fleet.init_server() + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py new file mode 100644 index 0000000000000..fbba08e4e0665 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import os +import unittest +import numpy as np +import tempfile +import shutil +from op_test import OpTest, randomize_probability +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet import fleet +from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram + + +class TestSparseLoadProgramFtrl(TestSparseLoadProgram): + """ + Test Sparse load operator. + """ + + def test_server_init(self): + scope, train_program, startup_program, loss = self.net() + with fluid.scope_guard(scope): + with fluid.program_guard(train_program, startup_program): + optimizer = fluid.optimizer.Ftrl(1e-3) + optimizer = fleet.distributed_optimizer(optimizer, + self.strategy) + optimizer.minimize(loss) + fleet.init_server() + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py new file mode 100644 index 0000000000000..31635ede6f5d6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os +import unittest +import numpy as np +import tempfile +import shutil +from op_test import OpTest, randomize_probability +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet import fleet +from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram + + +class TestSparseLoadProgramMomentum(TestSparseLoadProgram): + """ + Test Sparse load operator. + """ + + def test_server_init(self): + scope, train_program, startup_program, loss = self.net() + with fluid.scope_guard(scope): + with fluid.program_guard(train_program, startup_program): + optimizer = fluid.optimizer.Momentum(1e-3, 0.9) + optimizer = fleet.distributed_optimizer(optimizer, + self.strategy) + optimizer.minimize(loss) + fleet.init_server() + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py new file mode 100644 index 0000000000000..4fb5f2a2ea4f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os +import unittest +import numpy as np +import tempfile +import shutil +from op_test import OpTest, randomize_probability +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet import fleet +from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram + + +class TestSparseLoadProgramRmsprop(TestSparseLoadProgram): + """ + Test Sparse load operator. + """ + + def test_server_init(self): + scope, train_program, startup_program, loss = self.net() + with fluid.scope_guard(scope): + with fluid.program_guard(train_program, startup_program): + optimizer = fluid.optimizer.RMSProp(1e-3) + optimizer = fleet.distributed_optimizer(optimizer, + self.strategy) + optimizer.minimize(loss) + fleet.init_server() + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py new file mode 100644 index 0000000000000..17bff651c4489 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os +import unittest +import numpy as np +import tempfile +import shutil +from op_test import OpTest, randomize_probability +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.distributed.fleet.base.role_maker as role_maker +from paddle.distributed.fleet import fleet + + +class TestSparseLoadProgram(unittest.TestCase): + """ + Test Sparse load operator. 
+ """ + + def setUp(self): + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + os.environ["PADDLE_TRAINERS_NUM"] = str(2) + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PORT"] = "4001" + os.environ["POD_IP"] = "127.0.0.1" + role = role_maker.PaddleCloudRoleMaker() + fleet.init(role) + self.strategy = paddle.distributed.fleet.DistributedStrategy() + self.strategy.a_sync = True + + def net(self): + train_program = fluid.Program() + startup_program = fluid.Program() + scope = fluid.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(train_program, startup_program): + with fluid.unique_name.guard(): + inputs = fluid.data('input', shape=[None, 1], dtype="int64") + emb = fluid.layers.embedding( + inputs, is_sparse=True, size=[10000, 128]) + fc1 = fluid.layers.fc(input=emb, size=128, act="relu") + fc2 = fluid.layers.fc(input=fc1, size=64, act="relu") + loss = fluid.layers.reduce_mean(fc2) + return scope, train_program, startup_program, loss + + +class TestSparseLoadProgramSGD(TestSparseLoadProgram): + def test_server_init(self): + scope, train_program, startup_program, loss = self.net() + with fluid.scope_guard(scope): + with fluid.program_guard(train_program, startup_program): + optimizer = fluid.optimizer.SGD(1e-3) + optimizer = fleet.distributed_optimizer(optimizer, + self.strategy) + optimizer.minimize(loss) + fleet.init_server() + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index e0e487eff11e7..aa85eb3df3527 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -170,7 +170,7 @@ def test_type(): x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100. 
x2 = x2.astype('float16') - x2_var = paddle.data(name='x2', shape=[3, 2, 4, 5], dtype='float16') + x2_var = paddle.fluid.data(name='x2', shape=[3, 2, 4, 5], dtype='float16') paddle.flatten(x2_var) self.assertRaises(TypeError, test_type) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 6bc1a310d0aea..eb4ac1356eaaf 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -103,6 +103,57 @@ def test_amp_recompute_optimizer(self): # recompute self.assertIn('subprog', ''.join(outs)) + def test_amp_recompute_lars_optimizer(self): + """ test amp + recompute """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lars') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + strategy = fleet._final_strategy() + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + # recompute + self.assertIn('subprog', ''.join(outs)) + + # lars + self.assertIn('lars_momentum', ops) + + def test_amp_recompute_lamb_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lamb') + + self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam') + + applied_meta_list = fleet._get_applied_meta_list() + applied_graph_list = fleet._get_applied_graph_list() + print(applied_meta_list, applied_graph_list) + self.assertEqual(len(applied_meta_list), 3) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + # recompute + self.assertIn('subprog', ''.join(outs)) + + # lamb + self.assertIn('lamb', ops) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py index 0a4e2f631d60c..3e5b479fab559 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_auto.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py @@ -48,6 +48,9 @@ def test_distributed_strategy_auto(self): optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) + applied_meta_list = fleet._get_applied_meta_list() + print("applied_meta_list: {}".format(applied_meta_list)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index 3d4b2e218f725..f50d80d215da8 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -171,45 +171,7 @@ def test_dygraph_method(self): final_strategy = fleet._final_strategy() -class LinearNet(nn.Layer): - def __init__(self): - super(LinearNet, self).__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) - - def forward(self, x): - return 
self._linear2(self._linear1(x)) - - -class TestFleetDygraphSingle(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" - os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" - os.environ["PADDLE_TRAINERS_NUM"] = "1" - os.environ["PADDLE_TRAINER_ID"] = "0" - - def test_dygraph_single(self): - paddle.disable_static() - fleet.init(is_collective=True) - - layer = LinearNet() - loss_fn = nn.MSELoss() - adam = paddle.optimizer.Adam( - learning_rate=0.001, parameters=layer.parameters()) - - adam = fleet.distributed_optimizer(adam) - dp_layer = fleet.distributed_model(layer) - for step in range(2): - inputs = paddle.randn([10, 10], 'float32') - outputs = dp_layer(inputs) - labels = paddle.randn([10, 1], 'float32') - loss = loss_fn(outputs, labels) - loss.backward() - adam.step() - adam.clear_grad() - - -class TestFleetBaseSingleRunCollective(unittest.TestCase): +class TestFleetBaseSingleError(unittest.TestCase): def setUp(self): os.environ.pop("PADDLE_TRAINER_ENDPOINTS") @@ -221,71 +183,23 @@ def gen_data(self): } def test_single_run_collective_minimize(self): - input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') - - fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.mean(x=cost) - - fleet.init(is_collective=True) - optimizer = fluid.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer) - optimizer.minimize(avg_cost) - - place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( - ) else fluid.CPUPlace() - - exe = fluid.Executor(place) - exe.run(paddle.static.default_startup_program()) - - for i in range(10): - cost_val = exe.run(feed=self.gen_data(), fetch_list=[avg_cost.name]) - print("cost of step[{}] = {}".format(i, cost_val)) - - -class TestFleetBaseSingleRunPS(unittest.TestCase): - def setUp(self): - os.environ.pop("PADDLE_PSERVERS_IP_PORT_LIST") - - def gen_data(self): - return { - "x": np.random.random(size=(128, 32)).astype('float32'), - "y": np.random.randint( - 2, size=(128, 1)).astype('int64') - } - - def test_single_run_ps_minimize(self): - input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') - - fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.mean(x=cost) - - fleet.init() - strategy = paddle.distributed.fleet.DistributedStrategy() - optimizer = fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - if fleet.is_server(): - fleet.init_server() - fleet.run_server() - elif fleet.is_worker(): - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(paddle.static.default_startup_program()) - step = 100 - for i in range(step): - cost_val = exe.run(program=fluid.default_main_program(), - feed=self.gen_data(), - fetch_list=[avg_cost.name]) - print("worker_index: %d, step%d cost = %f" % - (fleet.worker_index(), i, cost_val[0])) - fleet.save_persistables(exe, "fleet_single_model/") - print("save fleet models done.") + def test_single_error(): + input_x = paddle.static.data( + name="x", shape=[-1, 32], 
dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + fleet.init(is_collective=True) + + # in non_distributed mode(use `python` to launch), raise error if has multi cards + if fluid.core.is_compiled_with_cuda( + ) and fluid.core.get_cuda_device_count() > 1: + self.assertRaises(ValueError, test_single_error) + else: + test_single_error() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py index 25801793f1f2e..6be05f436328e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py @@ -18,6 +18,7 @@ import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker import paddle.fluid as fluid +paddle.enable_static() class TestFleetBase(unittest.TestCase): @@ -48,5 +49,44 @@ def test_collective_minimize(self): optimizer.minimize(avg_cost) +class TestFleetBase(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_fleet_get_applied_optimizer(self): + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + fleet.init(is_collective=True) + + meta_list = fleet._get_applied_meta_list() + graph_list = fleet._get_applied_graph_list() + # not called minimize function + self.assertEqual(len(meta_list), 0) + self.assertEqual(len(graph_list), 0) + + strategy = fleet.DistributedStrategy() + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + meta_list = fleet._get_applied_meta_list() + graph_list = fleet._get_applied_graph_list() + self.assertEqual(len(meta_list), 0) + self.assertEqual(len(graph_list), 1) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py index 1b3fbb86a4af5..dba409ec9200e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py @@ -16,6 +16,9 @@ import paddle import os import paddle.fluid as fluid +import paddle.distributed.fleet as fleet + +paddle.enable_static() class TestFleetBase(unittest.TestCase): @@ -27,7 +30,6 @@ def setUp(self): "127.0.0.1:36001,127.0.0.2:36001" def test_fleet_init(self): - import paddle.distributed.fleet as fleet os.environ["TRAINING_ROLE"] = "PSERVER" os.environ["POD_IP"] = "127.0.0.1" diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_single.py b/python/paddle/fluid/tests/unittests/test_fleet_base_single.py new file 
mode 100644 index 0000000000000..111a6331958ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_single.py @@ -0,0 +1,148 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import os +cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') +if cuda_visible_devices is None or cuda_visible_devices == "": + os.environ['CUDA_VISIBLE_DEVICES'] = '0' +else: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] +import paddle +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import paddle.fluid as fluid +import unittest +import paddle.nn as nn + + +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + +class TestFleetDygraphSingle(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def test_dygraph_single(self): + paddle.disable_static() + fleet.init(is_collective=True) + + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + for step in range(2): + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + adam.step() + adam.clear_grad() + + +class TestFleetBaseSingleRunCollective(unittest.TestCase): + def setUp(self): + pass + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_run_collective_minimize(self): + input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + fleet.init(is_collective=True) + optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) + + for i in range(10): + cost_val = exe.run(feed=self.gen_data(), fetch_list=[avg_cost.name]) + print("cost of step[{}] = {}".format(i, cost_val)) + + +class 
TestFleetBaseSingleRunPS(unittest.TestCase): + def setUp(self): + pass + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_run_ps_minimize(self): + input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + fleet.init() + strategy = paddle.distributed.fleet.DistributedStrategy() + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + if fleet.is_server(): + fleet.init_server() + fleet.run_server() + elif fleet.is_worker(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) + step = 10 + for i in range(step): + cost_val = exe.run(program=fluid.default_main_program(), + feed=self.gen_data(), + fetch_list=[avg_cost.name]) + print("worker_index: %d, step%d cost = %f" % + (fleet.worker_index(), i, cost_val[0])) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py index 0faafd76a799d..3a64c1818ccc6 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py @@ -128,6 +128,36 @@ def test_dgc_recompute_optimizer(self): # recompute self.assertIn('subprog', ''.join(outs)) + def test_amp_recompute_lars_dgc_not_apply_optimizer(self): + """ test amp + recompute + lars + dgc, + amp -/-> dgc, max_path is amp-->recompute-->lars + """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'dgc') + self.set_strategy(strategy, 'amp') + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lars') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + strategy = fleet._final_strategy() + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + # recompute + self.assertIn('subprog', ''.join(outs)) + + # lars + self.assertIn('lars_momentum', ops) + + # dgc not apply + self.assertFalse(strategy.dgc) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py index af72df5186876..29eb3d9ab16ac 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py @@ -18,35 +18,36 @@ import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker +from fleet_meta_optimizer_base import TestFleetMetaOptimizer -class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): - def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - 
os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" +paddle.enable_static() + +class TestFleetGradientMergeMetaOptimizer(TestFleetMetaOptimizer): def test_gradient_merge_optimizer(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.gradient_merge = True - strategy.gradient_merge_configs = {"k_steps": 2, "avg": True} - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'gradient_merge') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + vars = [x.name for x in train_prog.list_vars()] + with open("main_program", 'w') as f: + f.write(str(train_prog)) + + self.assertIn('@GradientMerge', ''.join(vars)) + + def test_recom_gm_optimizer(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'gradient_merge') + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + vars = [x.name for x in train_prog.list_vars()] + self.assertIn('@GradientMerge', ''.join(vars)) + self.assertIn('subprog', ''.join(vars)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py index d894762904ba6..05da44cd06133 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py @@ -60,7 +60,7 @@ def node_func(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index cf32c803ff810..ae2914d56db73 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -282,8 +282,7 @@ def test_fs_gloo4(self): os.environ["SYS_JOB_ID"] = "gloo_for_cluster" os.environ["PADDLE_WITH_GLOO"] = "1" os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3" - os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1" - os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019" + os.environ["PADDLE_GLOO_HTTP_ENDPOINT"] = "127.0.0.1:30019" role = role_maker.PaddleCloudRoleMaker() role._generate_role() @@ -541,8 +540,7 @@ def test_fs_gloo4(self): 
os.environ["SYS_JOB_ID"] = "gloo_for_cluster" os.environ["PADDLE_WITH_GLOO"] = "1" os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3" - os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1" - os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019" + os.environ["PADDLE_GLOO_HTTP_ENDPOINT"] = "127.0.0.1:30019" role = role_maker.PaddleCloudRoleMaker() role._generate_role() @@ -673,8 +671,7 @@ def test_http_gloo_v2(self): os.environ["SYS_JOB_ID"] = "gloo_for_cluster" os.environ["PADDLE_WITH_GLOO"] = "1" os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3" - os.environ["PADDLE_GLOO_HTTP_HOST"] = "" - os.environ["PADDLE_GLOO_HTTP_PORT"] = "" + os.environ["PADDLE_GLOO_HTTP_ENDPOINT"] = "" role = role_maker.PaddleCloudRoleMaker() diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py index ba14aeae99032..30bc097428c3b 100644 --- a/python/paddle/fluid/tests/unittests/test_full_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py @@ -31,7 +31,7 @@ def test_attr_tensor_API(self): train_program = Program() with program_guard(train_program, startup_program): fill_value = 2.0 - input = paddle.data(name='input', dtype='float32', shape=[2, 3]) + input = paddle.fluid.data(name='input', dtype='float32', shape=[2, 3]) output = paddle.full_like(input, fill_value) output_dtype = paddle.full_like(input, fill_value, dtype='float32') @@ -67,7 +67,7 @@ def test_errors(self): with program_guard(Program(), Program()): #for ci coverage - input_data = paddle.data( + input_data = paddle.fluid.data( name='input', dtype='float32', shape=[2, 3]) output = paddle.full_like(input_data, 2.0) diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py index bd934c76ebfa2..a2955c12fc0c4 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py @@ -192,9 +192,9 @@ def test_error(self): paddle.static.Program()): shape = [8, 9, 6] - x = paddle.data(shape=shape, dtype='float32', name='x') - index = paddle.data(shape=shape, dtype='bool', name='index') - index_float = paddle.data( + x = paddle.fluid.data(shape=shape, dtype='float32', name='x') + index = paddle.fluid.data(shape=shape, dtype='bool', name='index') + index_float = paddle.fluid.data( shape=shape, dtype='float32', name='index_float') np_x = np.random.random(shape).astype('float32') np_index = np.array(np.random.randint(2, size=shape, dtype=bool)) diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index 5dcce88acf16b..2e4b52c282d56 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -202,9 +202,9 @@ def test_out1(self): def test_out2(self): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - x = paddle.data('x', shape=[-1, 2], dtype='float64') - index = paddle.data('index', shape=[-1, 1], dtype='int32') - axis = paddle.data('axis', shape=[1], dtype='int32') + x = paddle.fluid.data('x', shape=[-1, 2], dtype='float64') + index = paddle.fluid.data('index', shape=[-1, 1], dtype='int32') + axis = paddle.fluid.data('axis', shape=[1], dtype='int32') out = paddle.gather(x, index, axis) place = paddle.CPUPlace() exe = paddle.static.Executor(place) @@ -252,10 +252,10 @@ def test_error1(self): paddle.static.Program()): shape = [8, 9, 6] - x = paddle.data(shape=shape, dtype='int8', name='x') - axis = 
paddle.data(shape=[1], dtype='float32', name='axis') - index = paddle.data(shape=shape, dtype='int32', name='index') - index_float = paddle.data( + x = paddle.fluid.data(shape=shape, dtype='int8', name='x') + axis = paddle.fluid.data(shape=[1], dtype='float32', name='axis') + index = paddle.fluid.data(shape=shape, dtype='int32', name='index') + index_float = paddle.fluid.data( shape=shape, dtype='float32', name='index_float') def test_x_type(): diff --git a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py new file mode 100644 index 0000000000000..cef77cc5f8453 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py @@ -0,0 +1,57 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import subprocess +import unittest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + + +class TestGPUPackagePaddle(unittest.TestCase): + def test_import_paddle(self): + if core.is_compiled_with_cuda(): + os.environ['CUDA_VISIBLE_DEVICES'] = '' + test_file = 'test_no_gpu_run_rand.py' + with open(test_file, 'w') as wb: + cmd_test = """ +import paddle +x = paddle.rand([3,4]) +assert x.place.is_gpu_place() is False, "There is no CUDA device, but Tensor's place is CUDAPlace" +""" + wb.write(cmd_test) + + _python = sys.executable + + ps_cmd = '{} {}'.format(_python, test_file) + ps_proc = subprocess.Popen( + ps_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = ps_proc.communicate() + + assert 'CPU device will be used by default' in str( + stderr + ), "GPU version Paddle is installed. 
But CPU device can't be used when CUDA device is not set properly" + assert "Error" not in str( + stderr + ), "There is no CUDA device, but Tensor's place is CUDAPlace" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index 833eeb33641c9..47761eb5eaf87 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -31,24 +31,24 @@ def test_dygraph(self): if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): places.append(fluid.CUDAPlace(0)) for p in places: - shape = [2, 6, 2, 2] + shape = [2, 2, 2, 2] def compute_v1(x): with fluid.dygraph.guard(p): - gn = fluid.dygraph.GroupNorm(channels=6, groups=2) + gn = fluid.dygraph.GroupNorm(channels=2, groups=2) y = gn(fluid.dygraph.to_variable(x)) return y.numpy() def compute_v2(x): with fluid.dygraph.guard(p): - gn = paddle.nn.GroupNorm(num_channels=6, num_groups=2) + gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) y = gn(fluid.dygraph.to_variable(x)) return y.numpy() def test_weight_bias_false(): with fluid.dygraph.guard(p): gn = paddle.nn.GroupNorm( - num_channels=6, + num_channels=2, num_groups=2, weight_attr=False, bias_attr=False) @@ -56,7 +56,7 @@ def test_weight_bias_false(): x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x) y2 = compute_v2(x) - result = np.allclose(y1, y2) + result = np.allclose(y1, y2, atol=1e-5) if not result: print("y1:", y1, "\ty2:", y2) self.assertTrue(result) diff --git a/python/paddle/fluid/tests/unittests/test_histogram_op.py b/python/paddle/fluid/tests/unittests/test_histogram_op.py index 0ccb6fce8e4ed..f540b885e12ee 100644 --- a/python/paddle/fluid/tests/unittests/test_histogram_op.py +++ b/python/paddle/fluid/tests/unittests/test_histogram_op.py @@ -73,7 +73,7 @@ def test_bins_error(self): """Test bins should be greater than or equal to 1.""" def net_func(): - input_value = paddle.fill_constant( + input_value = paddle.fluid.layers.fill_constant( shape=[3, 4], dtype='float32', value=3.0) paddle.histogram(input=input_value, bins=-1, min=1, max=5) @@ -84,7 +84,7 @@ def test_min_max_error(self): """Test max must be larger or equal to min.""" def net_func(): - input_value = paddle.fill_constant( + input_value = paddle.fluid.layers.fill_constant( shape=[3, 4], dtype='float32', value=3.0) paddle.histogram(input=input_value, bins=1, min=5, max=1) @@ -95,7 +95,7 @@ def test_min_max_range_error(self): """Test range of min, max is not finite""" def net_func(): - input_value = paddle.fill_constant( + input_value = paddle.fluid.layers.fill_constant( shape=[3, 4], dtype='float32', value=3.0) paddle.histogram(input=input_value, bins=1, min=-np.inf, max=5) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py index f61d1ab888a51..ab9a98588f76e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py @@ -31,11 +31,11 @@ def __init__(self, num_classes=10, classifier_activation='softmax'): nn.Conv2d( 1, 6, 3, stride=1, padding=1), nn.ReLU(), - nn.Pool2D(2, 'max', 2), + paddle.fluid.dygraph.Pool2D(2, 'max', 2), nn.Conv2d( 6, 16, 5, stride=1, padding=0), nn.ReLU(), - nn.Pool2D(2, 'max', 2)) + paddle.fluid.dygraph.Pool2D(2, 'max', 2)) if num_classes > 0: self.fc = nn.Sequential( @@ -54,17 
+54,17 @@ def forward(self, inputs): def init_weights(layer): if type(layer) == nn.Linear: - new_weight = paddle.fill_constant( + new_weight = paddle.fluid.layers.fill_constant( layer.weight.shape, layer.weight.dtype, value=0.9) layer.weight.set_value(new_weight) - new_bias = paddle.fill_constant( + new_bias = paddle.fluid.layers.fill_constant( layer.bias.shape, layer.bias.dtype, value=-0.1) layer.bias.set_value(new_bias) elif type(layer) == nn.Conv2d: - new_weight = paddle.fill_constant( + new_weight = paddle.fluid.layers.fill_constant( layer.weight.shape, layer.weight.dtype, value=0.7) layer.weight.set_value(new_weight) - new_bias = paddle.fill_constant( + new_bias = paddle.fluid.layers.fill_constant( layer.bias.shape, layer.bias.dtype, value=-0.2) layer.bias.set_value(new_bias) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py index c7e0902341a59..95d3b87f0e948 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py @@ -30,11 +30,11 @@ def __init__(self): nn.Conv2d( 1, 6, 3, stride=1, padding=1), nn.ReLU(), - nn.Pool2D(2, 'max', 2), + paddle.fluid.dygraph.Pool2D(2, 'max', 2), nn.Conv2d( 6, 16, 5, stride=1, padding=0), nn.ReLU(), - nn.Pool2D(2, 'max', 2)) + paddle.fluid.dygraph.Pool2D(2, 'max', 2)) def forward(self, inputs): x = self.features(inputs) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index 887e50f07c55c..e1b7847a6e6dd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -23,7 +23,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer +from paddle.fluid.optimizer import MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph.base import to_variable @@ -72,15 +72,17 @@ def _check_exception(self, exception_message, place=None): place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() - with fluid.dygraph.guard(place): - try: - paddle.manual_seed(seed) - paddle.framework.random._manual_program_seed(seed) - mlp = MLP() - optimizer = self.get_optimizer_dygraph( - parameter_list=mlp.parameters()) - except Exception as e: - assert str(e) == exception_message + try: + paddle.disable_static() + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + except Exception as e: + assert str(e) == exception_message + finally: + paddle.enable_static() def _check_mlp(self, place=None): seed = 90 @@ -90,47 +92,55 @@ def _check_mlp(self, place=None): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - with 
fluid.dygraph.guard(place): - paddle.manual_seed(seed) - paddle.framework.random._manual_program_seed(seed) + paddle.disable_static(place) + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) - mlp = MLP() - optimizer = self.get_optimizer_dygraph( - parameter_list=mlp.parameters()) + mlp = MLP() + optimizer = self.get_optimizer_dygraph(parameter_list=mlp.parameters()) - batch_py_reader = fluid.io.PyReader(capacity=1) - batch_py_reader.decorate_sample_list_generator( - paddle.batch( - self.reader_decorator(paddle.dataset.mnist.train()), - batch_size=batch_size, - drop_last=True), - places=fluid.CPUPlace()) + batch_py_reader = fluid.io.PyReader(capacity=1) + batch_py_reader.decorate_sample_list_generator( + paddle.batch( + self.reader_decorator(paddle.dataset.mnist.train()), + batch_size=batch_size, + drop_last=True), + places=fluid.CPUPlace()) - dy_param_init_value = {} - for batch_id, data in enumerate(batch_py_reader()): - if batch_id >= self.batch_num: - break + dy_param_init_value = {} + for batch_id, data in enumerate(batch_py_reader()): + if batch_id >= self.batch_num: + break - img = data[0] - label = data[1] - label.stop_gradient = True + img = data[0] + label = data[1] - img = fluid.layers.reshape(img, shape=[batch_size, -1]) - cost = mlp(img) - avg_loss = fluid.layers.reduce_mean(cost) - dy_out = avg_loss.numpy() + label.stop_gradient = True - if batch_id == 0: - for param in mlp.parameters(): - dy_param_init_value[param.name] = param.numpy() + img = fluid.layers.reshape(img, shape=[batch_size, -1]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + dy_out = avg_loss.numpy() - avg_loss.backward() - optimizer.minimize(avg_loss) - mlp.clear_gradients() - dy_param_value = {} + if batch_id == 0: for param in mlp.parameters(): - dy_param_value[param.name] = param.numpy() + dy_param_init_value[param.name] = param.numpy() + avg_loss.backward() + optimizer.minimize(avg_loss) + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.ReduceOnPlateau): + optimizer._learning_rate.step(avg_loss) + else: + optimizer._learning_rate.step() + mlp.clear_gradients() + dy_param_value = {} + for param in mlp.parameters(): + dy_param_value[param.name] = param.numpy() + + paddle.enable_static() with new_program_scope(): paddle.manual_seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -181,6 +191,13 @@ def _check_mlp(self, place=None): feed={"pixel": static_x_data, "label": y_data}, fetch_list=fetch_list) + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.ReduceOnPlateau): + optimizer._learning_rate.step(out[0]) + else: + optimizer._learning_rate.step() static_param_value = {} static_out = out[0] @@ -199,17 +216,19 @@ def _check_mlp(self, place=None): class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): bd = [3, 6, 9] - optimizer = SGDOptimizer( - learning_rate=paddle.optimizer.PiecewiseLR( + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), - parameter_list=parameter_list) + parameters=parameter_list) return optimizer def get_optimizer(self): bd = [3, 6, 9] - optimizer = SGDOptimizer(learning_rate=paddle.optimizer.PiecewiseLR( - boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + 
optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( + boundaries=bd, + values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) return optimizer def test_sgd(self): @@ -218,21 +237,16 @@ def test_sgd(self): class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.NaturalExpDecay( + learning_rate=0.5, gamma=0.9), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.NaturalExpDecay( + learning_rate=0.5, gamma=0.9)) return optimizer def test_sgd(self): @@ -241,21 +255,16 @@ def test_sgd(self): class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.ExponentialDecay( + learning_rate=0.5, gamma=0.9), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.ExponentialDecay( + learning_rate=0.5, gamma=0.9)) return optimizer def test_sgd(self): @@ -264,21 +273,16 @@ def test_sgd(self): class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = Adam( - learning_rate=fluid.layers.inverse_time_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list=parameter_list) + optimizer = paddle.optimizer.Adam( + learning_rate=paddle.optimizer.lr.InverseTimeDecay( + learning_rate=0.5, gamma=0.9), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) + optimizer = paddle.optimizer.Adam( + learning_rate=paddle.optimizer.lr.InverseTimeDecay( + learning_rate=0.5, gamma=0.9)) return optimizer def test_adam(self): @@ -287,15 +291,16 @@ def test_adam(self): class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.polynomial_decay( - learning_rate=0.1, decay_steps=5, cycle=self.cycle), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.PolynomialDecay( + learning_rate=0.5, decay_steps=5, cycle=self.cycle), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( - learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.PolynomialDecay( + learning_rate=0.5, 
decay_steps=5, cycle=self.cycle)) return optimizer def test_sgd_cycle(self): @@ -307,17 +312,18 @@ def test_sgd(self): self._check_mlp() -class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): +class TestImperativeOptimizerCosineAnnealingDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.cosine_decay( - learning_rate=0.1, step_each_epoch=10000, epochs=120), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.CosineAnnealingDecay( + learning_rate=0.5, T_max=5), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( - learning_rate=0.1, step_each_epoch=10000, epochs=120)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.CosineAnnealingDecay( + learning_rate=0.5, T_max=5)) return optimizer def test_sgd(self): @@ -326,15 +332,110 @@ def test_sgd(self): class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.noam_decay( - d_model=512, warmup_steps=8000), - parameter_list=parameter_list) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerLambdaDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.LambdaDecay( + learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.LambdaDecay( + learning_rate=0.5, lr_lambda=lambda epoch: 0.9**epoch)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerLinearWarmup(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.LinearWarmup( + learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.LinearWarmup( + learning_rate=0.5, + warmup_steps=20, + start_lr=0, + end_lr=0.5, + verbose=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerMultiStepDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.MultiStepDecay( + learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.MultiStepDecay( + learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerStepLR(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + 
learning_rate=paddle.optimizer.lr.StepDecay( + learning_rate=0.5, step_size=5, gamma=0.8), + parameters=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.StepDecay( + learning_rate=0.5, step_size=5, gamma=0.8)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerReduceOnPlateau(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.ReduceOnPlateau( + learning_rate=0.5), + parameters=parameter_list) return optimizer def get_optimizer(self): - optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( - d_model=512, warmup_steps=8000)) + optimizer = paddle.optimizer.SGD( + learning_rate=paddle.optimizer.lr.ReduceOnPlateau( + learning_rate=0.5)) return optimizer def test_sgd(self): @@ -381,7 +482,7 @@ def test_lr_decay(self): bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] - scheduler = paddle.optimizer.PiecewiseLR(bd, value) + scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value) adam = paddle.optimizer.Adam( scheduler, parameters=linear.parameters()) @@ -396,7 +497,7 @@ def test_lr_decay(self): self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) scheduler.step() - def test_lr_decay_natural_exp(self): + def test_lr_scheduler_natural_exp(self): with fluid.dygraph.guard(): a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") @@ -407,8 +508,7 @@ def test_lr_decay_natural_exp(self): loss = fluid.layers.reduce_mean(b) base_lr = 1.0 - scheduler = paddle.optimizer.NaturalExpLR(1.0, gamma=0.5) - print("scheduler.last_lr", scheduler.last_lr) + scheduler = paddle.optimizer.lr.NaturalExpDecay(1.0, gamma=0.5) adam = paddle.optimizer.Adam( scheduler, parameters=linear.parameters()) @@ -453,7 +553,7 @@ def test_set_lr(self): with self.assertRaises(RuntimeError): adam = paddle.optimizer.Adam( - paddle.optimizer.NaturalExpLR( + paddle.optimizer.lr.NaturalExpDecay( learning_rate=0.1, gamma=0.5), parameters=linear.parameters()) adam.set_lr(0.01) @@ -695,10 +795,10 @@ def test_parameter_list(self): linear_1 = Linear(10, 10) linear_2 = Linear(10, 10) - sgd = SGDOptimizer( - 1.0, - parameter_list=itertools.chain(linear_1.parameters(), - linear_2.parameters())) + sgd = paddle.optimizer.SGD(1.0, + parameters=itertools.chain( + linear_1.parameters(), + linear_2.parameters())) in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") in_data = fluid.dygraph.to_variable(in_np) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 5b7998198efa8..0335fa547616e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -239,7 +239,7 @@ def setUp(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, parameters=ptb_model.parameters()) @@ -328,7 +328,7 @@ def testLoadAndSetVarBase(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, parameters=ptb_model.parameters()) 
@@ -436,7 +436,7 @@ def testSetVariable(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, parameters=ptb_model.parameters()) @@ -544,7 +544,7 @@ def testSetNumpy(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, parameters=ptb_model.parameters()) @@ -829,7 +829,7 @@ def testSetNumpyBeforeTrain(self): place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) - scheduler = paddle.optimizer.PiecewiseLR( + scheduler = paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr_arr) adam = Adam( learning_rate=scheduler, diff --git a/python/paddle/fluid/tests/unittests/test_increment.py b/python/paddle/fluid/tests/unittests/test_increment.py new file mode 100755 index 0000000000000..e8cc7c8cf1819 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_increment.py @@ -0,0 +1,44 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +import numpy as np +import paddle +import paddle.fluid as fluid + + +class TestIncrement(unittest.TestCase): + def test_api(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.layers.fill_constant( + shape=[1], dtype='int64', value=5) + expected_result = np.array([8], dtype='int64') + + output = paddle.tensor.math.increment(input, value=3) + exe = fluid.Executor(fluid.CPUPlace()) + result = exe.run(fetch_list=[output]) + self.assertEqual((result == expected_result).all(), True) + + with fluid.dygraph.guard(): + input = paddle.ones(shape=[1], dtype='int64') + expected_result = np.array([2], dtype='int64') + output = paddle.tensor.math.increment(input, value=1) + self.assertEqual((output.numpy() == expected_result).all(), True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_index_sample_op.py b/python/paddle/fluid/tests/unittests/test_index_sample_op.py index bd71ca0c1c9e7..f640c0531192d 100644 --- a/python/paddle/fluid/tests/unittests/test_index_sample_op.py +++ b/python/paddle/fluid/tests/unittests/test_index_sample_op.py @@ -15,6 +15,8 @@ from __future__ import print_function import unittest +import paddle +import paddle.fluid as fluid import numpy as np from op_test import OpTest @@ -98,9 +100,7 @@ def config(self): class TestIndexSampleShape(unittest.TestCase): def test_shape(self): - import paddle.fluid as fluid - import paddle - + paddle.enable_static() # create x value x_shape = (2, 5) x_type = "float64" @@ -124,5 +124,22 @@ def test_shape(self): res = exe.run(feed=feed, fetch_list=[output]) + +class TestIndexSampleDynamic(unittest.TestCase): + def test_result(self): + with fluid.dygraph.guard(): + x = paddle.to_tensor( + [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]], + dtype='float32') + index = paddle.to_tensor( + [[0, 1, 2], [1, 2, 3], [0, 0, 0]], dtype='int32') + out_z1 = paddle.index_sample(x, index) + + expected_output = np.array( + [[1.0, 2.0, 3.0], [6.0, 7.0, 8.0], [9.0, 9.0, 9.0]]) + assert out_z1.numpy().all() == expected_output.all() + + if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 92135b113a0f9..952265e1195f5 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -599,9 +599,9 @@ def test_uniform_initializer(self, dtype="float32"): """ paddle.disable_static() - tensor = paddle.zeros([1024, 1024]) + tensor = paddle.zeros([1024, 1024, 16]) tensor.stop_gradient = False - self.assertTrue(np.allclose(np.zeros((1024, 1024)), tensor.numpy())) + self.assertTrue(np.allclose(np.zeros((1024, 1024, 16)), tensor.numpy())) uniform_ = paddle.nn.initializer.Uniform() uniform_(tensor) diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py index 6ad19658fd203..ce72b5effbc51 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer_nn.py +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -27,6 +27,12 @@ DELTA = 0.00001 +def get_uniform_min_and_max(weight): + min_value = np.min(weight) + max_value = np.max(weight) + return min_value, max_value + + def check_cast_op(op): return op.type == 'cast' and \ op.attr('in_dtype') == VarDesc.VarType.FP32 and \ @@ -104,5 +110,577 @@ def 
test_constant_initializer_fp16(self): self.test_constant_initializer_dygraph("float16") + +class TestKaimingInitializer(unittest.TestCase): + def static_test_kaiming_initializer_common(self, + init_inst, + dtype="float32", + uniform=False, + is_conv=False): + paddle.enable_static() + program = framework.Program() + block = program.global_block() + shape_mat = [5, 10, 15, 20] if is_conv else [5, 10] + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=shape_mat, + lod_level=0, + name="param", + initializer=init_inst) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + if uniform: + self.assertEqual(init_op.type, 'uniform_random') + if is_conv: + receptive_field_size = float(15 * 20) + limit = np.sqrt(6.0 / (param.shape[1] * receptive_field_size)) + else: + limit = np.sqrt(6.0 / param.shape[0]) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + else: + self.assertEqual(init_op.type, 'gaussian_random') + if is_conv: + receptive_field_size = float(15 * 20) + std = np.sqrt(2.0 / (param.shape[1] * receptive_field_size)) + else: + std = np.sqrt(2.0 / param.shape[0]) + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) + paddle.disable_static() + + def dygraph_test_kaiming_initializer_common(self, + init_inst, + dtype="float32", + uniform=False): + linear = nn.Linear(40, 20, weight_attr=init_inst) + + def test_kaiming_dygraph(self): + self.dygraph_test_kaiming_initializer_common( + init_inst=initializer.KaimingUniform(), + dtype="float32", + uniform=True) + self.dygraph_test_kaiming_initializer_common( + init_inst=initializer.KaimingNormal(), + dtype="float32", + uniform=False) + + def test_kaiming_uniform_initializer_static(self): + """Test Kaiming uniform initializer for matrix multiply. + """ + self.static_test_kaiming_initializer_common( + init_inst=initializer.KaimingUniform(), + dtype="float32", + uniform=True, + is_conv=False) + + def test_kaiming_uniform_initializer_conv_static(self): + """Test Kaiming uniform initializer for convolutions. + """ + self.static_test_kaiming_initializer_common( + init_inst=initializer.KaimingUniform(), + dtype="float32", + uniform=True, + is_conv=True) + + def test_kaiming_normal_initializer_static(self): + """Test Kaiming normal initializer for matrix multiply. + """ + self.static_test_kaiming_initializer_common( + init_inst=initializer.KaimingNormal(), + dtype="float32", + uniform=False, + is_conv=False) + + def test_kaiming_normal_initializer_conv_static(self): + """Test Kaiming normal initializer for convolutions. 
+ """ + self.static_test_kaiming_initializer_common( + init_inst=initializer.KaimingNormal(), + dtype="float32", + uniform=False, + is_conv=True) + + +class TestUniform(unittest.TestCase): + def test_uniform_common(self, dtype="float32", seed=0): + """Test the uniform initializer with default value + """ + paddle.enable_static() + + program = framework.Program() + program.random_seed = seed + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype=dtype, + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.Uniform()) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + self.assertAlmostEqual(init_op.attr('min'), -1.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA) + self.assertEqual(init_op.attr('seed'), seed) + + paddle.disable_static() + + return block + + def test_uniform_initializer_default_value(self, + dtype="float32", + seed=0, + min_value=-1.0, + max_vlaue=1.0): + """Test the uniform initializer with default value + """ + paddle.enable_static() + + program = framework.Program() + program.random_seed = seed + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype=dtype, + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.Uniform()) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + self.assertAlmostEqual(init_op.attr('min'), min_value, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), max_vlaue, delta=DELTA) + self.assertEqual(init_op.attr('seed'), seed) + + paddle.disable_static() + + return block + + def test_uniform_initializer(self, + dtype="float32", + seed=0, + min_value=-4.2, + max_vlaue=3.1): + """Test uniform initializer with supplied attributes + """ + paddle.enable_static() + + program = framework.Program() + program.random_seed = seed + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype=dtype, + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.Uniform(min_value, max_vlaue)) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + self.assertAlmostEqual(init_op.attr('min'), min_value, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), max_vlaue, delta=DELTA) + + paddle.disable_static() + + return block + + def test_uniform_initializer_two_op(self, + dtype="float32", + seed=123, + min_value=-4.2, + max_vlaue=0.0): + """Test uniform initializer with supplied attributes + """ + paddle.enable_static() + + program = framework.Program() + program.random_seed = seed + block = program.global_block() + for i in range(2): + block.create_parameter( + dtype=dtype, + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.Uniform(min_value, float(i))) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op0 = block.ops[0] + self.assertEqual(init_op0.type, 'uniform_random') + self.assertAlmostEqual(init_op0.attr('min'), min_value, delta=DELTA) + self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA) + self.assertEqual(init_op0.attr("seed"), seed) + + paddle.disable_static() + + return block + + def test_uniform_initializer_fp16(self): + """Test uniform initializer with float16 + """ + 
block = self.test_uniform_initializer_default_value("float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_uniform_initializer(dtype="float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_uniform_initializer_two_op("float16") + self.assertTrue(check_cast_op(block.ops[1])) + + def test_uniform_initializer_dygraph(self): + """Test uniform initializer in dygraph model. + """ + paddle.disable_static() + + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.Uniform( + low=-0.5, high=0.5)) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr) + + min_value, max_value = get_uniform_min_and_max(linear.weight.numpy()) + self.assertTrue(min_value >= -0.5, + 'min value {} should >= -0.5'.format(min_value)) + self.assertTrue(max_value <= 0.5, + 'max value {} should <= 0.5'.format(max_value)) + + +class TestNormal(unittest.TestCase): + def test_normal_initializer_default_value(self): + """Test the normal initializer with default value + """ + paddle.enable_static() + + program = framework.Program() + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.Normal()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'gaussian_random') + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), 1.0, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + paddle.disable_static() + + def test_normal_initializer(self, dtype="float32"): + """Test normal initializer with supplied attributes + """ + paddle.enable_static() + + program = framework.Program() + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype=dtype, + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.Normal(2.3, 1.9)) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'gaussian_random') + self.assertAlmostEqual(init_op.attr('mean'), 2.3, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), 1.9, delta=DELTA) + + paddle.disable_static() + + return block + + def test_normal_initializer_fp16(self): + """Test normal initializer with float16 + """ + block = self.test_normal_initializer("float16") + self.assertTrue(check_cast_op(block.ops[1])) + + def test_normal_initializer_dygraph(self): + """Test normal initializer in dygraph model. 
+ """ + paddle.disable_static() + + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.Normal( + mean=0.0, std=2.0)) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr) + + +class TestTruncatedNormal(unittest.TestCase): + def test_truncated_normal_initializer_default_value(self): + """Test the truncated normal initializer with default value + """ + paddle.enable_static() + + program = framework.Program() + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.TruncatedNormal()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'truncated_gaussian_random') + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), 1.0, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + paddle.disable_static() + + def test_truncated_normal_initializer(self, dtype="float32"): + """Test truncated normal initializer with supplied attributes + """ + paddle.enable_static() + + program = framework.Program() + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype=dtype, + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.TruncatedNormal(2.3, 1.9)) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'truncated_gaussian_random') + self.assertAlmostEqual(init_op.attr('mean'), 2.3, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), 1.9, delta=DELTA) + + paddle.disable_static() + + return block + + def test_truncated_normal_initializer_fp16(self): + """Test truncated normal initializer with float16 + """ + paddle.enable_static() + + block = self.test_truncated_normal_initializer("float16") + self.assertTrue(check_cast_op(block.ops[1])) + + def test_truncated_normal_initializer_dygraph(self): + """Test truncated normal initializer in dygraph model. + """ + paddle.disable_static() + + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0)) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr) + + +class TestXavierUniform(unittest.TestCase): + def test_xavier_uniform_initializer(self): + """Test Xavier initializer with uniform distribution on + for matrix multiply. + """ + paddle.enable_static() + + program = framework.Program() + block = program.global_block() + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierUniform()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + limit = np.sqrt(6.0 / (param.shape[0] + param.shape[1])) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + paddle.disable_static() + + def test_xavier_uniform_initializer_conv(self): + """Test Xavier initializer with uniform distribution on + for convolutions. 
+ """ + paddle.enable_static() + + program = framework.Program() + block = program.global_block() + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.XavierUniform()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'uniform_random') + receptive_field_size = float(15 * 20) + limit = np.sqrt(6.0 / ( + (param.shape[0] + param.shape[1]) * receptive_field_size)) + self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) + self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + def test_xavier_uniform_initializer_dygraph(self): + """Test xavier uniform initializer in dygraph model. + """ + paddle.disable_static() + + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.XavierUniform()) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr) + + +class TestXavierNormal(unittest.TestCase): + def test_xavier_normal_initializer(self): + """Test Xavier initializer with normal distribution on + for matrix multiply. + """ + paddle.enable_static() + + program = framework.Program() + block = program.global_block() + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierNormal()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'gaussian_random') + std = np.sqrt(2.0 / (param.shape[0] + param.shape[1])) + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + paddle.disable_static() + + def test_xavier_normal_initializer_conv(self): + """Test Xavier initializer with normal distribution on + for convolutions. + """ + paddle.enable_static() + + program = framework.Program() + block = program.global_block() + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.XavierNormal()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'gaussian_random') + receptive_field_size = float(15 * 20) + std = np.sqrt(2.0 / ( + (param.shape[0] + param.shape[1]) * receptive_field_size)) + self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA) + self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) + self.assertEqual(init_op.attr('seed'), 0) + + paddle.disable_static() + + def test_xavier_normal_initializer_dygraph(self): + """Test xavier normal initializer in dygraph model. 
+ """ + paddle.disable_static() + + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.XavierNormal()) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr) + + +class TestAssign(unittest.TestCase): + def test_assign_initializer(self, dtype="float32"): + """Test the numpy array initializer with supplied arguments + """ + paddle.enable_static() + + import numpy + program = framework.Program() + block = program.global_block() + np_array = numpy.random.random((10000)).astype(dtype) + for _ in range(2): + block.create_parameter( + dtype=np_array.dtype, + shape=np_array.shape, + lod_level=0, + name="param", + initializer=initializer.Assign(np_array)) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'assign_value') + assert (init_op.attr('fp32_values') == np_array).all() + + paddle.disable_static() + + return block + + def test_assign_initializer_fp16(self): + """Test the numpy array initializer with float16 + """ + block = self.test_assign_initializer("float16") + self.assertTrue(block.ops[1]) + + def test_assign_initializer_dygraph_1(self): + """Test assign initializer in dygraph model. + """ + paddle.disable_static() + + weight_attr_1 = paddle.framework.ParamAttr( + name="linear_weight_1", + initializer=paddle.nn.initializer.Assign(np.array([2, 2]))) + linear_1 = paddle.nn.Linear(2, 2, weight_attr=weight_attr_1) + + self.assertTrue((linear_1.weight.numpy() == [2.0, 2.0]).all(), '') + + def test_assign_initializer_dygraph_2(self): + """Test assign initializer in dygraph model. + """ + paddle.disable_static() + + weight_attr_2 = paddle.framework.ParamAttr( + name="linear_weight_2", + initializer=paddle.nn.initializer.Assign([2, 2])) + linear_2 = paddle.nn.Linear(2, 2, weight_attr=weight_attr_2) + + self.assertTrue((linear_2.weight.numpy() == [2.0, 2.0]).all(), '') + + def test_assign_initializer_dygraph_3(self): + """Test assign initializer in dygraph model. 
+ """ + paddle.disable_static() + + weight_attr_3 = paddle.framework.ParamAttr( + name="linear_weight_3", + initializer=paddle.nn.initializer.Assign(paddle.full([2], 2))) + linear_3 = paddle.nn.Linear(2, 2, weight_attr=weight_attr_3) + + self.assertTrue((linear_3.weight.numpy() == [2.0, 2.0]).all(), '') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_install_check.py b/python/paddle/fluid/tests/unittests/test_install_check.py index 5cb199d4967a4..83e17406c1004 100644 --- a/python/paddle/fluid/tests/unittests/test_install_check.py +++ b/python/paddle/fluid/tests/unittests/test_install_check.py @@ -14,13 +14,17 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid class TestInstallCheck(unittest.TestCase): - def test_install_check(self): + def test_paddle_fluid(self): fluid.install_check.run_check() + def test_paddle_utils(self): + paddle.utils.run_check() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_op.py index 4c7add3f271a2..83d86aff7ac9c 100644 --- a/python/paddle/fluid/tests/unittests/test_isfinite_op.py +++ b/python/paddle/fluid/tests/unittests/test_isfinite_op.py @@ -14,6 +14,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from op_test import OpTest @@ -132,6 +133,14 @@ def test_has_nan_bad_x(): self.assertRaises(TypeError, test_has_nan_bad_x) + with fluid.dygraph.guard(): + data = paddle.zeros([2, 3]) + result = paddle.fluid.layers.has_inf(data) + expect_value = np.array([False]) + self.assertEqual((result.numpy() == expect_value).all(), True) + result = paddle.fluid.layers.has_nan(data) + self.assertEqual((result.numpy() == expect_value).all(), True) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py index 281dc7caded1f..0d4d3b58e862c 100644 --- a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py @@ -27,7 +27,7 @@ def run_static(x_np, dtype, op_str, use_gpu=False): place = paddle.CUDAPlace(0) exe = fluid.Executor(place) with fluid.program_guard(main_program, startup_program): - x = paddle.data(name='x', shape=x_np.shape, dtype=dtype) + x = paddle.fluid.data(name='x', shape=x_np.shape, dtype=dtype) res = getattr(paddle.tensor, op_str)(x) exe.run(startup_program) static_result = exe.run(main_program, diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py index 3c37397cae1b5..fba16959901a8 100644 --- a/python/paddle/fluid/tests/unittests/test_l1_loss.py +++ b/python/paddle/fluid/tests/unittests/test_l1_loss.py @@ -44,8 +44,8 @@ def run_imperative(self): self.assertTrue(dy_result.shape, [10, 10, 5]) def run_static(self, use_gpu=False): - input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32') - label = paddle.data(name='label', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') + label = paddle.fluid.data(name='label', shape=[10, 10, 5], dtype='float32') result0 = paddle.nn.functional.l1_loss(input, label) result1 = paddle.nn.functional.l1_loss(input, label, reduction='sum') result2 = paddle.nn.functional.l1_loss(input, label, reduction='none') @@ -90,9 +90,9 @@ def 
test_gpu(self): # test case the raise message def test_errors(self): def test_value_error(): - input = paddle.data( + input = paddle.fluid.data( name='input', shape=[10, 10, 5], dtype='float32') - label = paddle.data( + label = paddle.fluid.data( name='label', shape=[10, 10, 5], dtype='float32') loss = paddle.nn.functional.l1_loss( input, label, reduction='reduce_mean') @@ -127,8 +127,8 @@ def run_imperative(self): self.assertTrue(dy_result.shape, [10, 10, 5]) def run_static(self, use_gpu=False): - input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32') - label = paddle.data(name='label', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') + label = paddle.fluid.data(name='label', shape=[10, 10, 5], dtype='float32') l1_loss = paddle.nn.loss.L1Loss() result0 = l1_loss(input, label) l1_loss = paddle.nn.loss.L1Loss(reduction='sum') diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e0ec676f1b14c..e3f477c1d9b5e 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -316,23 +316,6 @@ def test_relu(self): self.assertTrue(np.allclose(static_ret, dy_ret_value)) - def test_pad2d(self): - with self.static_graph(): - t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32') - ret = layers.pad2d(t, paddings=[1, 1, 1, 1]) - static_ret = self.get_static_graph_result( - feed={'t': np.ones( - [3, 3, 5, 5], dtype='float32')}, - fetch_list=[ret])[0] - - with self.dynamic_graph(): - t = np.ones([3, 3, 5, 5], dtype='float32') - my_pad2d = paddle.nn.Pad2D(paddings=1) - dy_ret = my_pad2d(base.to_variable(t)) - dy_ret_value = dy_ret.numpy() - - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - def test_matmul(self): with self.static_graph(): t = layers.data(name='t', shape=[3, 3], dtype='float32') @@ -1369,7 +1352,7 @@ def test_instance_norm(self): dy_rlt_value = dy_ret.numpy() with self.dynamic_graph(): - instanceNorm = paddle.nn.InstanceNorm(num_channels=shape[1]) + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) dy_ret = instanceNorm(base.to_variable(input)) dy_rlt_value2 = dy_ret.numpy() @@ -1380,7 +1363,7 @@ def test_instance_norm(self): with self.static_graph(): # the input of InstanceNorm must be Variable. 
def test_Variable(): - instanceNorm = paddle.nn.InstanceNorm(num_channels=shape[1]) + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) ret1 = instanceNorm(input) self.assertRaises(TypeError, test_Variable) @@ -1388,7 +1371,7 @@ def test_Variable(): # the input dtype of InstanceNorm must be float32 or float64 def test_type(): input = np.random.random(shape).astype('int32') - instanceNorm = paddle.nn.InstanceNorm(num_channels=shape[1]) + instanceNorm = nn.InstanceNorm(num_channels=shape[1]) ret2 = instanceNorm(input) self.assertRaises(TypeError, test_type) diff --git a/python/paddle/fluid/tests/unittests/test_load_op.py b/python/paddle/fluid/tests/unittests/test_load_op.py index eaa608e5f5b31..885c26e2be0a6 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op.py +++ b/python/paddle/fluid/tests/unittests/test_load_op.py @@ -41,14 +41,14 @@ def setUp(self): exe = fluid.Executor(fluid.CPUPlace()) exe.run(start_prog) fluid.io.save_persistables( - exe, dirname="/tmp/model", main_program=main_prog) + exe, dirname="./model", main_program=main_prog) def test_load(self): main_prog = fluid.Program() start_prog = fluid.Program() with fluid.program_guard(main_prog, start_prog): var = layers.create_tensor(dtype='float32') - layers.load(var, file_path='/tmp/model/w') + layers.load(var, file_path='./model/w') exe = fluid.Executor(fluid.CPUPlace()) exe.run(start_prog) diff --git a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py index 1d7f986507ca9..a5af6871be474 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py @@ -44,14 +44,14 @@ def setUp(self): exe = fluid.Executor(fluid.XPUPlace(0)) exe.run(start_prog) fluid.io.save_persistables( - exe, dirname="/tmp/model", main_program=main_prog) + exe, dirname="./model", main_program=main_prog) def test_load_xpu(self): main_prog = fluid.Program() start_prog = fluid.Program() with fluid.program_guard(main_prog, start_prog): var = layers.create_tensor(dtype='float32') - layers.load(var, file_path='/tmp/model/w') + layers.load(var, file_path='./model/w') exe = fluid.Executor(fluid.XPUPlace(0)) exe.run(start_prog) diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index 9ac4895f499f8..0dd6c9f893e2a 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -88,7 +88,7 @@ def check_api(self, axis=-1): logsoftmax = paddle.nn.LogSoftmax(axis) # test static api with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data(name='x', shape=self.x_shape) + x = paddle.fluid.data(name='x', shape=self.x_shape) y = logsoftmax(x) exe = paddle.static.Executor(self.place) out = exe.run(feed={'x': self.x}, fetch_list=[y]) @@ -120,7 +120,7 @@ def check_api(self, axis=-1, dtype=None): x = x.astype(dtype) ref_out = np.apply_along_axis(ref_log_softmax, axis, x) with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data(name='x', shape=self.x_shape) + x = paddle.fluid.data(name='x', shape=self.x_shape) y = F.log_softmax(x, axis, dtype) exe = paddle.static.Executor(self.place) out = exe.run(feed={'x': self.x}, fetch_list=[y]) @@ -139,10 +139,10 @@ def test_check_api(self): def test_errors(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data(name='X1', shape=[100], dtype='int32') + x = paddle.fluid.data(name='X1', 
shape=[100], dtype='int32') self.assertRaises(TypeError, F.log_softmax, x) - x = paddle.data(name='X2', shape=[100], dtype='float32') + x = paddle.fluid.data(name='X2', shape=[100], dtype='float32') self.assertRaises(TypeError, F.log_softmax, x, dtype='int32') diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index 9032293070a96..c48ec2a4fb458 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -90,7 +90,7 @@ class TestLogsumexpError(unittest.TestCase): def test_errors(self): with paddle.static.program_guard(paddle.static.Program()): self.assertRaises(TypeError, paddle.logsumexp, 1) - x1 = paddle.data(name='x1', shape=[120], dtype="int32") + x1 = paddle.fluid.data(name='x1', shape=[120], dtype="int32") self.assertRaises(TypeError, paddle.logsumexp, x1) @@ -104,7 +104,7 @@ def setUp(self): def api_case(self, axis=None, keepdim=False): out_ref = ref_logsumexp(self.x, axis, keepdim) with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.shape) + x = paddle.fluid.data('X', self.shape) out = paddle.logsumexp(x, axis, keepdim) exe = paddle.static.Executor(self.place) res = exe.run(feed={'X': self.x}, fetch_list=[out]) diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index f655e363e9648..0cdc413c2f68c 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -56,22 +56,22 @@ def is_better(current, best, m, n): return var_list[1] -class TestReduceLROnPlateauDecay(object): +class TestReduceOnPlateauDecay(object): def test_ReduceLR(self): # the decay rate must be less than 1.0 with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=2.0) + paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=2.0) # the mode must be "min" or "max" with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, mode="test") + paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, mode="test") # the threshold_mode must be "rel" or "abs" with self.assertRaises(ValueError): - paddle.optimizer.ReduceLROnPlateau( + paddle.optimizer.lr.ReduceOnPlateau( learning_rate=1.0, threshold_mode="test") with self.assertRaises(TypeError): - paddle.optimizer.ReduceLROnPlateau(learning_rate="test") + paddle.optimizer.lr.ReduceOnPlateau(learning_rate="test") with self.assertRaises(TypeError): - paddle.optimizer.ReduceLROnPlateau(learning_rate=0.5).step("test") + paddle.optimizer.lr.ReduceOnPlateau(learning_rate=0.5).step("test") places = [paddle.CPUPlace()] if core.is_compiled_with_cuda(): @@ -114,7 +114,7 @@ def _test_static(self, place, kwargs): [1], 1, 'float32', persistable=True) paddle.increment(x) loss = paddle.sin(x) - scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) + scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs) adam = paddle.optimizer.Adam(learning_rate=scheduler) adam.minimize(loss) lr_var = adam._global_learning_rate() @@ -158,7 +158,7 @@ def _test_dygraph(self, place, kwargs): var_list = [best, current_lr, cooldown_counter, num_bad_epochs] linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.ReduceLROnPlateau(**kwargs) + scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs) adam = paddle.optimizer.Adam( learning_rate=scheduler, parameters=linear.parameters()) @@ -180,7 +180,7 
@@ def _test_dygraph(self, place, kwargs): loss, var_list) self.assertEqual(current_lr, expected_lr) state_dict = adam.state_dict() - scheduler1 = paddle.optimizer.ReduceLROnPlateau(**kwargs) + scheduler1 = paddle.optimizer.lr.ReduceOnPlateau(**kwargs) adam1 = paddle.optimizer.Adam( learning_rate=scheduler1, parameters=linear.parameters()) adam1.set_state_dict(state_dict) @@ -414,13 +414,13 @@ def _test_dygraph(self, python_func, paddle_api, kwarg, place): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() adam.step() adam.clear_grad() current_lr = adam.get_lr() expected_lr = python_func(epoch, **kwarg) - if paddle_api.__name__ != "CosineAnnealingLR": + if paddle_api.__name__ != "CosineAnnealingDecay": self.assertEqual(current_lr, expected_lr) scheduler.step() else: @@ -429,74 +429,75 @@ def _test_dygraph(self, python_func, paddle_api, kwarg, place): def test_scheduler(self): with self.assertRaises(NotImplementedError): - paddle.optimizer.lr_scheduler._LRScheduler().step() + paddle.optimizer.lr.LRScheduler().step() with self.assertRaises(TypeError): - paddle.optimizer.MultiStepLR( + paddle.optimizer.lr.MultiStepDecay( learning_rate="test", milestones=[1, 2, 3]) with self.assertRaises(TypeError): - paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones='test') + paddle.optimizer.lr.MultiStepDecay( + learning_rate=0.5, milestones='test') with self.assertRaises(ValueError): - paddle.optimizer.MultiStepLR( + paddle.optimizer.lr.MultiStepDecay( learning_rate=0.5, milestones=[3, 2, 1]) with self.assertRaises(ValueError): - paddle.optimizer.MultiStepLR( + paddle.optimizer.lr.MultiStepDecay( learning_rate=0.5, milestones=[1, 2, 3], gamma=2) - func_api_kwargs = [(noam_lr, paddle.optimizer.NoamLR, { + func_api_kwargs = [(noam_lr, paddle.optimizer.lr.NoamDecay, { "d_model": 0.01, "warmup_steps": 100, "verbose": False - }), (piecewise_lr, paddle.optimizer.PiecewiseLR, { + }), (piecewise_lr, paddle.optimizer.lr.PiecewiseDecay, { "boundaries": [3, 6, 9, 15, 20], "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], "verbose": False - }), (natural_exp_lr, paddle.optimizer.NaturalExpLR, { + }), (natural_exp_lr, paddle.optimizer.lr.NaturalExpDecay, { "learning_rate": 0.5, "gamma": 0.1, "verbose": True - }), (inverse_time_lr, paddle.optimizer.InverseTimeLR, { + }), (inverse_time_lr, paddle.optimizer.lr.InverseTimeDecay, { "learning_rate": 0.5, "gamma": 0.1, "verbose": False - }), (polynomial_lr, paddle.optimizer.PolynomialLR, { + }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, { "learning_rate": 0.5, "decay_steps": 20, "end_lr": 0, "power": 1.0, "cycle": False, "verbose": True - }), (polynomial_lr, paddle.optimizer.PolynomialLR, { + }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, { "learning_rate": 0.5, "decay_steps": 20, "end_lr": 0, "power": 1.0, "cycle": True, "verbose": False - }), (linear_warmup_lr, paddle.optimizer.LinearLrWarmup, { + }), (linear_warmup_lr, paddle.optimizer.lr.LinearWarmup, { 'learning_rate': 0.5, 'warmup_steps': 20, 'start_lr': 0, 'end_lr': 0.5, "verbose": True - }), (exponential_lr, paddle.optimizer.ExponentialLR, { + }), (exponential_lr, paddle.optimizer.lr.ExponentialDecay, { "learning_rate": 0.5, "gamma": 0.9, "verbose": False - }), (multi_step_lr, paddle.optimizer.MultiStepLR, { + }), (multi_step_lr, paddle.optimizer.lr.MultiStepDecay, { "learning_rate": 0.5, "milestones": [3, 6, 9, 15, 20], "gamma": 0.8, "verbose": True - }), (step_lr, paddle.optimizer.StepLR, { + }), (step_lr, 
paddle.optimizer.lr.StepDecay, { "learning_rate": 0.5, "step_size": 2, "gamma": 0.8, "verbose": False - }), (lambda_lr, paddle.optimizer.LambdaLR, { + }), (lambda_lr, paddle.optimizer.lr.LambdaDecay, { "learning_rate": 0.5, "lr_lambda": lambda x: 0.95**x, "verbose": True - }), (cosine_annealing_lr, paddle.optimizer.CosineAnnealingLR, { + }), (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, { "learning_rate": 0.5, "T_max": 10, "verbose": False diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index da3b2b7a2066a..29e0a8d6f02db 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -14,6 +14,7 @@ from __future__ import print_function +import paddle import unittest import numpy as np import paddle.fluid as fluid @@ -154,5 +155,197 @@ def test_errors(self): self.assertRaises(TypeError, fluid.layers.lrn, in_w) +class TestLocalResponseNormFAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_3d_input(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + in_np1 = np.random.random([3, 40, 40]).astype("float32") + in_np2 = np.transpose(in_np1, (0, 2, 1)) + + input1 = fluid.data( + name="input1", shape=[3, 40, 40], dtype="float32") + input2 = fluid.data( + name="input2", shape=[3, 40, 40], dtype="float32") + res1 = paddle.nn.functional.local_response_norm( + x=input1, size=5, data_format='NCL') + res2 = paddle.nn.functional.local_response_norm( + x=input2, size=5, data_format='NLC') + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input1": in_np1, + "input2": in_np2}, + fetch_list=[res1, res2]) + + fetches1_tran = np.transpose(fetches[1], (0, 2, 1)) + self.assertTrue(np.allclose(fetches[0], fetches1_tran)) + + def check_static_4d_input(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input1 = fluid.data( + name="input1", shape=[3, 3, 40, 40], dtype="float32") + input2 = fluid.data( + name="input2", shape=[3, 40, 40, 3], dtype="float32") + + res1 = paddle.nn.functional.local_response_norm( + x=input1, size=5, data_format='NCHW') + res2 = paddle.nn.functional.local_response_norm( + x=input2, size=5, data_format='NHWC') + + in_np1 = np.random.random([3, 3, 40, 40]).astype("float32") + in_np2 = np.transpose(in_np1, (0, 2, 3, 1)) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input1": in_np1, + "input2": in_np2}, + fetch_list=[res1, res2]) + + fetches1_tran = np.transpose(fetches[1], (0, 3, 1, 2)) + self.assertTrue(np.allclose(fetches[0], fetches1_tran)) + + def check_static_5d_input(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input1 = fluid.data( + name="input1", shape=[3, 3, 3, 40, 40], dtype="float32") + input2 = fluid.data( + name="input2", shape=[3, 3, 40, 40, 3], dtype="float32") + res1 = paddle.nn.functional.local_response_norm( + x=input1, size=5, data_format='NCDHW') + res2 = paddle.nn.functional.local_response_norm( + x=input2, size=5, data_format='NDHWC') + + in_np1 = np.random.random([3, 3, 3, 40, 40]).astype("float32") + in_np2 = np.transpose(in_np1, (0, 2, 3, 4, 1)) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input1": in_np1, + "input2": in_np2}, + fetch_list=[res1, res2]) + + 
fetches1_tran = np.transpose(fetches[1], (0, 4, 1, 2, 3)) + self.assertTrue(np.allclose(fetches[0], fetches1_tran)) + + def test_static(self): + for place in self.places: + self.check_static_3d_input(place=place) + self.check_static_4d_input(place=place) + self.check_static_5d_input(place=place) + + def check_dygraph_3d_input(self, place): + with fluid.dygraph.guard(place): + in_np1 = np.random.random([3, 40, 40]).astype("float32") + in_np2 = np.transpose(in_np1, (0, 2, 1)) + + in1 = paddle.to_tensor(in_np1) + in2 = paddle.to_tensor(in_np2) + + res1 = paddle.nn.functional.local_response_norm( + x=in1, size=5, data_format='NCL') + res2 = paddle.nn.functional.local_response_norm( + x=in2, size=5, data_format='NLC') + + res2_tran = np.transpose(res2.numpy(), (0, 2, 1)) + self.assertTrue(np.allclose(res1.numpy(), res2_tran)) + + def check_dygraph_4d_input(self, place): + with fluid.dygraph.guard(place): + in_np1 = np.random.random([3, 3, 40, 40]).astype("float32") + in_np2 = np.transpose(in_np1, (0, 2, 3, 1)) + + in1 = paddle.to_tensor(in_np1) + in2 = paddle.to_tensor(in_np2) + + res1 = paddle.nn.functional.local_response_norm( + x=in1, size=5, data_format='NCHW') + res2 = paddle.nn.functional.local_response_norm( + x=in2, size=5, data_format='NHWC') + + res2_tran = np.transpose(res2.numpy(), (0, 3, 1, 2)) + self.assertTrue(np.allclose(res1.numpy(), res2_tran)) + + def check_dygraph_5d_input(self, place): + with fluid.dygraph.guard(place): + in_np1 = np.random.random([3, 3, 3, 40, 40]).astype("float32") + in_np2 = np.transpose(in_np1, (0, 2, 3, 4, 1)) + + in1 = paddle.to_tensor(in_np1) + in2 = paddle.to_tensor(in_np2) + + res1 = paddle.nn.functional.local_response_norm( + x=in1, size=5, data_format='NCDHW') + res2 = paddle.nn.functional.local_response_norm( + x=in2, size=5, data_format='NDHWC') + + res2_tran = np.transpose(res2.numpy(), (0, 4, 1, 2, 3)) + self.assertTrue(np.allclose(res1.numpy(), res2_tran)) + + def test_dygraph(self): + for place in self.places: + self.check_dygraph_3d_input(place) + self.check_dygraph_4d_input(place) + self.check_dygraph_5d_input(place) + + +class TestLocalResponseNormFAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + + def test_Variable(): + # the input of lrn must be Variable. 
+ x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + paddle.nn.functional.local_response_norm(x1, size=5) + + self.assertRaises(TypeError, test_Variable) + + def test_datatype(): + x = fluid.data(name='x', shape=[3, 4, 5, 6], dtype="int32") + paddle.nn.functional.local_response_norm(x, size=5) + + self.assertRaises(TypeError, test_datatype) + + def test_dataformat(): + x = fluid.data(name='x', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.local_response_norm( + x, size=5, data_format="NCTHW") + + self.assertRaises(ValueError, test_dataformat) + + def test_dim(): + x = fluid.data(name='x', shape=[3, 4], dtype="float32") + paddle.nn.functional.local_response_norm(x, size=5) + + self.assertRaises(ValueError, test_dim) + + +class TestLocalResponseNormCAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + in1 = paddle.rand(shape=(3, 3, 40, 40), dtype="float32") + in2 = paddle.transpose(in1, [0, 2, 3, 1]) + + m1 = paddle.nn.LocalResponseNorm(size=5, data_format='NCHW') + m2 = paddle.nn.LocalResponseNorm(size=5, data_format='NHWC') + + res1 = m1(in1) + res2 = m2(in2) + + res2_tran = np.transpose(res2.numpy(), (0, 3, 1, 2)) + self.assertTrue(np.allclose(res1.numpy(), res2_tran)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_masked_select_op.py b/python/paddle/fluid/tests/unittests/test_masked_select_op.py index 259a36e30d9a9..ed1a981d0306b 100644 --- a/python/paddle/fluid/tests/unittests/test_masked_select_op.py +++ b/python/paddle/fluid/tests/unittests/test_masked_select_op.py @@ -74,8 +74,8 @@ def test_imperative_mode(self): def test_static_mode(self): shape = [8, 9, 6] - x = paddle.data(shape=shape, dtype='float32', name='x') - mask = paddle.data(shape=shape, dtype='bool', name='mask') + x = paddle.fluid.data(shape=shape, dtype='float32', name='x') + mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask') np_x = np.random.random(shape).astype('float32') np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) @@ -97,9 +97,9 @@ def test_error(self): paddle.static.Program()): shape = [8, 9, 6] - x = paddle.data(shape=shape, dtype='float32', name='x') - mask = paddle.data(shape=shape, dtype='bool', name='mask') - mask_float = paddle.data( + x = paddle.fluid.data(shape=shape, dtype='float32', name='x') + mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask') + mask_float = paddle.fluid.data( shape=shape, dtype='float32', name='mask_float') np_x = np.random.random(shape).astype('float32') np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py index d85521f76621d..4795b49301507 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py @@ -397,8 +397,10 @@ def test_tensor_patch_method(self): self.assertTrue( np.array_equal(m.unique()[0].numpy(), paddle.unique(m)[0].numpy())) self.assertTrue( - np.array_equal(m.unique_with_counts()[2], - paddle.unique_with_counts(m)[2])) + np.array_equal( + m.unique(return_counts=True)[1], + paddle.unique( + m, return_counts=True)[1])) 
self.assertTrue(np.array_equal(x.flip([0]), paddle.flip(x, [0]))) self.assertTrue(np.array_equal(x.unbind(0), paddle.unbind(x, 0))) self.assertTrue(np.array_equal(x.roll(1), paddle.roll(x, 1))) @@ -471,12 +473,6 @@ def test_tensor_patch_method(self): # 3. Bool tensor operation x = paddle.to_tensor([[True, False], [True, False]]) y = paddle.to_tensor([[False, False], [False, True]]) - self.assertTrue( - np.array_equal(x.reduce_all().numpy(), paddle.reduce_all(x).numpy( - ))) - self.assertTrue( - np.array_equal(x.reduce_any().numpy(), paddle.reduce_any(x).numpy( - ))) self.assertTrue( np.array_equal( x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy())) @@ -499,22 +495,12 @@ def test_tensor_patch_method(self): x.where(a, b).numpy(), paddle.where(x, a, b).numpy())) self.assertTrue(inspect.ismethod(a.dot)) - self.assertTrue(inspect.ismethod(a.elementwise_add)) - self.assertTrue(inspect.ismethod(a.elementwise_div)) - self.assertTrue(inspect.ismethod(a.elementwise_floordiv)) - self.assertTrue(inspect.ismethod(a.elementwise_mod)) - self.assertTrue(inspect.ismethod(a.elementwise_sub)) self.assertTrue(inspect.ismethod(a.logsumexp)) self.assertTrue(inspect.ismethod(a.multiplex)) self.assertTrue(inspect.ismethod(a.prod)) - self.assertTrue(inspect.ismethod(a.reduce_max)) - self.assertTrue(inspect.ismethod(a.reduce_min)) - self.assertTrue(inspect.ismethod(a.reduce_prod)) - self.assertTrue(inspect.ismethod(a.reduce_sum)) self.assertTrue(inspect.ismethod(a.scale)) self.assertTrue(inspect.ismethod(a.stanh)) - self.assertTrue(inspect.ismethod(a.sums)) - self.assertTrue(inspect.ismethod(a.elementwise_sum)) + self.assertTrue(inspect.ismethod(a.add_n)) self.assertTrue(inspect.ismethod(a.max)) self.assertTrue(inspect.ismethod(a.maximum)) self.assertTrue(inspect.ismethod(a.min)) @@ -527,7 +513,6 @@ def test_tensor_patch_method(self): self.assertTrue(inspect.ismethod(a.inverse)) self.assertTrue(inspect.ismethod(a.log1p)) self.assertTrue(inspect.ismethod(a.erf)) - self.assertTrue(inspect.ismethod(a.addcmul)) self.assertTrue(inspect.ismethod(a.addmm)) self.assertTrue(inspect.ismethod(a.clip)) self.assertTrue(inspect.ismethod(a.trace)) @@ -547,8 +532,6 @@ def test_tensor_patch_method(self): self.assertTrue(inspect.ismethod(a.argmax)) self.assertTrue(inspect.ismethod(a.argmin)) self.assertTrue(inspect.ismethod(a.argsort)) - self.assertTrue(inspect.ismethod(a.has_inf)) - self.assertTrue(inspect.ismethod(a.has_nan)) self.assertTrue(inspect.ismethod(a.masked_select)) self.assertTrue(inspect.ismethod(a.topk)) self.assertTrue(inspect.ismethod(a.index_select)) @@ -556,7 +539,6 @@ def test_tensor_patch_method(self): self.assertTrue(inspect.ismethod(a.sort)) self.assertTrue(inspect.ismethod(a.index_sample)) self.assertTrue(inspect.ismethod(a.mean)) - self.assertTrue(inspect.ismethod(a.reduce_mean)) self.assertTrue(inspect.ismethod(a.std)) self.assertTrue(inspect.ismethod(a.numel)) diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py index 1d38c833773ca..fac400caacdab 100644 --- a/python/paddle/fluid/tests/unittests/test_maxout_op.py +++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py @@ -92,7 +92,7 @@ def setUp(self): def test_static_api(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.maxout(x, self.groups, self.axis) m = paddle.nn.Maxout(self.groups, self.axis) out2 = m(x) @@ -137,11 +137,11 @@ 
def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.maxout, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data( + x_int32 = paddle.fluid.data( name='x_int32', shape=[2, 4, 6, 8], dtype='int32') self.assertRaises(TypeError, F.maxout, x_int32) - x_float32 = paddle.data(name='x_float32', shape=[2, 4, 6, 8]) + x_float32 = paddle.fluid.data(name='x_float32', shape=[2, 4, 6, 8]) self.assertRaises(ValueError, F.maxout, x_float32, 2, 2) diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index f0094e703cd0d..e2a2dcf44f056 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -185,7 +185,7 @@ def setUp(self): def test_api_static(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_shape) + x = paddle.fluid.data('X', self.x_shape) out1 = paddle.mean(x) out2 = paddle.tensor.mean(x) out3 = paddle.tensor.stat.mean(x) @@ -249,7 +249,7 @@ def test_errors(self): self.assertRaises(Exception, paddle.mean, x, 2) paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12], 'int32') + x = paddle.fluid.data('X', [10, 12], 'int32') self.assertRaises(TypeError, paddle.mean, x) diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py index e327307e95530..bc5d35d3254bc 100644 --- a/python/paddle/fluid/tests/unittests/test_mse_loss.py +++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py @@ -191,8 +191,8 @@ def test_NNFunctionalMseLoss_mean(self): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.data(name='input', shape=dim, dtype='float32') - target = paddle.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data(name='input', shape=dim, dtype='float32') + target = paddle.fluid.data(name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'mean') exe = paddle.static.Executor(place) @@ -225,8 +225,8 @@ def test_NNFunctionalMseLoss_sum(self): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.data(name='input', shape=dim, dtype='float32') - target = paddle.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data(name='input', shape=dim, dtype='float32') + target = paddle.fluid.data(name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'sum') exe = paddle.static.Executor(place) @@ -259,8 +259,8 @@ def test_NNFunctionalMseLoss_none(self): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.data(name='input', shape=dim, dtype='float32') - target = paddle.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data(name='input', shape=dim, dtype='float32') + target = paddle.fluid.data(name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'none') exe = paddle.static.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py index 7cca7738efd05..db4978930e049 
100644 --- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py +++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py @@ -17,12 +17,14 @@ import unittest import paddle import paddle.fluid as fluid +from paddle.fluid import core from op_test import OpTest import numpy as np class TestMultinomialOp(OpTest): def setUp(self): + paddle.enable_static() self.op_type = "multinomial" self.init_data() self.inputs = {"X": self.input_np} @@ -175,5 +177,39 @@ def test_alias(self): paddle.tensor.random.multinomial(x, num_samples=10, replacement=True) +class TestMultinomialError(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + def test_num_sample(self): + def test_num_sample_less_than_0(): + x = paddle.rand([4]) + paddle.multinomial(x, num_samples=-2) + + self.assertRaises(core.EnforceNotMet, test_num_sample_less_than_0) + + def test_replacement_False(self): + def test_samples_larger_than_categories(): + x = paddle.rand([4]) + paddle.multinomial(x, num_samples=5, replacement=False) + + self.assertRaises(core.EnforceNotMet, + test_samples_larger_than_categories) + + def test_input_probs_dim(self): + def test_dim_larger_than_2(): + x = paddle.rand([2, 3, 3]) + paddle.multinomial(x) + + self.assertRaises(core.EnforceNotMet, test_dim_larger_than_2) + + def test_dim_less_than_1(): + x_np = np.random.random([]) + x = paddle.to_tensor(x_np) + paddle.multinomial(x) + + self.assertRaises(core.EnforceNotMet, test_dim_less_than_1) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py index 2feca1c30689c..1f88568b5bc8e 100755 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py @@ -526,20 +526,6 @@ def test_case(self): self.assertTrue(np.allclose(results[i + 1], expect_res)) -class TestUpsampleNearest2dInterpOpAPI2_0(unittest.TestCase): - def test_case(self): - - # dygraph - x_data = np.random.random((1, 3, 6, 6)).astype("float32") - upsample = paddle.nn.UpsamplingNearest2d(scale_factor=[2, 2]) - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(x_data) - interp = upsample(x) - expect = nearest_neighbor_interp_np( - x_data, out_h=12, out_w=12, align_corners=False) - self.assertTrue(np.allclose(interp.numpy(), expect)) - - class TestNearestInterpException(unittest.TestCase): def test_exception(self): input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_nll_loss.py b/python/paddle/fluid/tests/unittests/test_nll_loss.py index c07bf949af39e..2b741fcd0797d 100644 --- a/python/paddle/fluid/tests/unittests/test_nll_loss.py +++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py @@ -884,8 +884,8 @@ def test_name(self): startup_prog = paddle.static.Program() place = paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - x = paddle.data(name='x', shape=[10, 10], dtype='float64') - label = paddle.data(name='label', shape=[10], dtype='int64') + x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float64') + label = paddle.fluid.data(name='label', shape=[10], dtype='int64') nll_loss = paddle.nn.loss.NLLLoss(name='nll_loss') res = nll_loss(x, label) self.assertTrue(res.name.startswith('nll_loss')) @@ -898,8 +898,8 @@ def test_x_dim_lt_2(): startup_prog = paddle.static.Program() place = paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - x = 
paddle.data(name='x', shape=[10, ], dtype='float64') - label = paddle.data(name='label', shape=[10, ], dtype='float64') + x = paddle.fluid.data(name='x', shape=[10, ], dtype='float64') + label = paddle.fluid.data(name='label', shape=[10, ], dtype='float64') nll_loss = paddle.nn.loss.NLLLoss() res = nll_loss(x, label) @@ -922,8 +922,8 @@ def test_NLLLoss_reduction_not_sum_mean_none(): startup_prog = paddle.static.Program() place = paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - x = paddle.data(name='x', shape=[10, 10], dtype='float64') - label = paddle.data(name='label', shape=[10], dtype='int64') + x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float64') + label = paddle.fluid.data(name='label', shape=[10], dtype='int64') nll_loss = paddle.nn.loss.NLLLoss(reduction='') res = nll_loss(x, label) @@ -946,8 +946,8 @@ def test_nll_loss_function_reduction_not_sum_mean_none(): startup_prog = paddle.static.Program() place = paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - x = paddle.data(name='x', shape=[10, 10], dtype='float64') - label = paddle.data(name='label', shape=[10], dtype='int64') + x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float64') + label = paddle.fluid.data(name='label', shape=[10], dtype='int64') res = paddle.nn.functional.nll_loss(x, label, reduction='') self.assertRaises(ValueError, diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 1675f935f7d6a..899c1f798e69d 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -161,10 +161,10 @@ def func(self, place): y_shapes = [[2], [3, 2], [2, 4, 5], [2, 3, 3, 5], [4, 3]] transpose_xs = [False, True, True, False, False] transpose_ys = [False, True, False, True, False] - dtypes = [np.float64, np.float64, np.float32, np.float32, np.float64] - typenames = ["float64", "float64", "float32", "float32", "float64"] - for i, (x_shape, y_shape, transpose_x, transpose_y, dtype, typename) \ - in enumerate(zip(x_shapes, y_shapes, transpose_xs, transpose_ys, dtypes, typenames)): + dtype = np.float64 + typename = "float64" + for i, (x_shape, y_shape, transpose_x, transpose_y) \ + in enumerate(zip(x_shapes, y_shapes, transpose_xs, transpose_ys)): x = layers.create_parameter( dtype=typename, shape=x_shape, name='x{}'.format(i)) y = layers.create_parameter( diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py index 995a1f26ff6eb..595e0bb480051 100644 --- a/python/paddle/fluid/tests/unittests/test_normal.py +++ b/python/paddle/fluid/tests/unittests/test_normal.py @@ -61,8 +61,8 @@ def static_api(self): if isinstance(self.mean, np.ndarray) \ and isinstance(self.std, np.ndarray): with paddle.static.program_guard(paddle.static.Program()): - mean = paddle.data('Mean', self.mean.shape, self.mean.dtype) - std = paddle.data('Std', self.std.shape, self.std.dtype) + mean = paddle.fluid.data('Mean', self.mean.shape, self.mean.dtype) + std = paddle.fluid.data('Std', self.std.shape, self.std.dtype) out = paddle.normal(mean, std, self.shape) exe = paddle.static.Executor(self.place) @@ -76,7 +76,7 @@ def static_api(self): return ret_all elif isinstance(self.mean, np.ndarray): with paddle.static.program_guard(paddle.static.Program()): - mean = paddle.data('Mean', self.mean.shape, self.mean.dtype) + mean = paddle.fluid.data('Mean', self.mean.shape, self.mean.dtype) out = paddle.normal(mean, 
self.std, self.shape) exe = paddle.static.Executor(self.place) @@ -86,7 +86,7 @@ def static_api(self): return ret_all elif isinstance(self.std, np.ndarray): with paddle.static.program_guard(paddle.static.Program()): - std = paddle.data('Std', self.std.shape, self.std.dtype) + std = paddle.fluid.data('Std', self.std.shape, self.std.dtype) out = paddle.normal(self.mean, std, self.shape) exe = paddle.static.Executor(self.place) @@ -180,17 +180,17 @@ def test_errors(self): std = [1, 2, 3] self.assertRaises(TypeError, paddle.normal, std=std) - mean = paddle.data('Mean', [100], 'int32') + mean = paddle.fluid.data('Mean', [100], 'int32') self.assertRaises(TypeError, paddle.normal, mean) - std = paddle.data('Std', [100], 'int32') + std = paddle.fluid.data('Std', [100], 'int32') self.assertRaises(TypeError, paddle.normal, mean=1.0, std=std) self.assertRaises(TypeError, paddle.normal, shape=1) self.assertRaises(TypeError, paddle.normal, shape=[1.0]) - shape = paddle.data('Shape', [100], 'float32') + shape = paddle.fluid.data('Shape', [100], 'float32') self.assertRaises(TypeError, paddle.normal, shape=shape) diff --git a/python/paddle/fluid/tests/unittests/test_normalize.py b/python/paddle/fluid/tests/unittests/test_normalize.py index 614e0e897613b..274a4ebee7c3c 100644 --- a/python/paddle/fluid/tests/unittests/test_normalize.py +++ b/python/paddle/fluid/tests/unittests/test_normalize.py @@ -56,8 +56,8 @@ def run_imperative(self): self.assertRaises(BaseException, F.normalize, x) def run_static(self, use_gpu=False): - x = paddle.data(name='input', shape=[10, 10], dtype='float32') - x2 = paddle.data(name='input2', shape=[2], dtype='float32') + x = paddle.fluid.data(name='input', shape=[10, 10], dtype='float32') + x2 = paddle.fluid.data(name='input2', shape=[2], dtype='float32') result0 = F.normalize(x) result1 = F.normalize(x, p=1.5) result2 = F.normalize(x, axis=0) diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py index 800706e5965df..d106484d91e2f 100644 --- a/python/paddle/fluid/tests/unittests/test_numel_op.py +++ b/python/paddle/fluid/tests/unittests/test_numel_op.py @@ -55,8 +55,8 @@ def test_numel_static(self): with fluid.program_guard(main_program, startup_program): shape1 = [2, 1, 4, 5] shape2 = [1, 4, 5] - x_1 = paddle.data(shape=shape1, dtype='int32', name='x_1') - x_2 = paddle.data(shape=shape2, dtype='int32', name='x_2') + x_1 = paddle.fluid.data(shape=shape1, dtype='int32', name='x_1') + x_2 = paddle.fluid.data(shape=shape2, dtype='int32', name='x_2') input_1 = np.random.random(shape1).astype("int32") input_2 = np.random.random(shape2).astype("int32") out_1 = paddle.numel(x_1) diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py index bb0d6f07bdbde..db7fc9d2b2e99 100644 --- a/python/paddle/fluid/tests/unittests/test_ones_like.py +++ b/python/paddle/fluid/tests/unittests/test_ones_like.py @@ -25,7 +25,7 @@ class TestOnesLikeAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): - x = paddle.data('x', [3, 4]) + x = paddle.fluid.data('x', [3, 4]) self.assertRaises(TypeError, ones_like, x, 'int8') @@ -35,7 +35,7 @@ def test_api(self): startup_program = Program() train_program = Program() with program_guard(train_program, startup_program): - x = paddle.data('X', shape) + x = paddle.fluid.data('X', shape) # 'bool', 'float32', 'float64', 'int32', 'int64' out1 = ones_like(x) diff --git 
a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py index 6dcd9850273be..69298f0f6a55d 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py @@ -22,6 +22,9 @@ import paddle.fluid.optimizer as optimizer from paddle.fluid.backward import _append_grad_suffix_ +import paddle +paddle.enable_static() + np.random.seed(10) SHAPE = [16, 10] @@ -255,8 +258,8 @@ def _apply_gradient(self, param, grad, name): moment2_out = beta2 * moment2 + (1. - beta2) * np.square(grad) lr = attr['lr'] * np.sqrt(1. - beta2_pow) / (1. - beta1_pow) - param_out = param - lr * (moment1_out / - (np.sqrt(moment2_out) + epsilon)) + param_out = param - lr * (moment1_out / (np.sqrt(moment2_out) + epsilon + * np.sqrt(1 - beta2_pow))) # update hyper-parameter of optimizer self.param_attr[name]['beta1_pow'] = beta1_pow * beta1 diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py index 11719a9c4a928..c29352bb51af6 100644 --- a/python/paddle/fluid/tests/unittests/test_pad3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py @@ -165,7 +165,7 @@ def check_static_result_1(self, place): mode = "constant" value = 100 input_data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.data(name="x", shape=input_shape) + x = paddle.fluid.data(name="x", shape=input_shape) result = F.pad(x=x, pad=pad, value=value, @@ -186,7 +186,7 @@ def check_static_result_2(self, place): pad = [1, 2, 1, 1, 1, 2] mode = "reflect" input_data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.data(name="x", shape=input_shape) + x = paddle.fluid.data(name="x", shape=input_shape) result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW") result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC") exe = Executor(place) @@ -208,7 +208,7 @@ def check_static_result_3(self, place): pad = [1, 2, 1, 1, 3, 4] mode = "replicate" input_data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.data(name="x", shape=input_shape) + x = paddle.fluid.data(name="x", shape=input_shape) result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW") result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC") exe = Executor(place) @@ -230,7 +230,7 @@ def check_static_result_4(self, place): pad = [1, 2, 1, 1, 3, 4] mode = "circular" input_data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.data(name="x", shape=input_shape) + x = paddle.fluid.data(name="x", shape=input_shape) result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW") result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC") exe = Executor(place) @@ -314,7 +314,6 @@ def test_static(self): def test_dygraph_1(self): paddle.disable_static() - input_shape = (1, 2, 3, 4, 5) pad = [1, 2, 1, 1, 3, 4] mode = "constant" @@ -342,7 +341,6 @@ def test_dygraph_1(self): def test_dygraph_2(self): paddle.disable_static() - input_shape = (2, 3, 4, 5) pad = [1, 1, 3, 4] mode = "constant" @@ -370,38 +368,8 @@ def test_dygraph_2(self): self.assertTrue(np.allclose(y1.numpy(), np_out1)) self.assertTrue(np.allclose(y2.numpy(), np_out2)) - def test_dygraph_2(self): - paddle.disable_static() - - input_shape = (2, 3, 4, 5) - pad = [1, 1, 3, 4] - mode = "constant" - value = 100 - input_data = np.random.rand(*input_shape).astype(np.float32) - np_out1 = self._get_numpy_out( - input_data, pad, mode, value, data_format="NCHW") - np_out2 = self._get_numpy_out( - 
input_data, pad, mode, value, data_format="NHWC") - tensor_data = paddle.to_tensor(input_data) - tensor_pad = paddle.to_tensor(pad, dtype="int32") - - y1 = F.pad(tensor_data, - pad=tensor_pad, - mode=mode, - value=value, - data_format="NCHW") - y2 = F.pad(tensor_data, - pad=tensor_pad, - mode=mode, - value=value, - data_format="NHWC") - - self.assertTrue(np.allclose(y1.numpy(), np_out1)) - self.assertTrue(np.allclose(y2.numpy(), np_out2)) - def test_dygraph_3(self): paddle.disable_static() - input_shape = (3, 4, 5) pad = [3, 4] mode = "constant" @@ -455,6 +423,8 @@ def _get_numpy_out(self, out = np.pad(input_data, pad, mode=mode) elif mode == "replicate": out = np.pad(input_data, pad, mode="edge") + elif mode == "circular": + out = np.pad(input_data, pad, mode="wrap") return out @@ -471,9 +441,10 @@ def test_class(self): value = 100 input_data = np.random.rand(*input_shape).astype(np.float32) - pad_reflection = nn.ReflectionPad1d(padding=pad) - pad_replication = nn.ReplicationPad1d(padding=pad) - pad_constant = nn.ConstantPad1d(padding=pad, value=value) + pad_reflection = nn.Pad1D(padding=pad, mode="reflect") + pad_replication = nn.Pad1D(padding=pad, mode="replicate") + pad_constant = nn.Pad1D(padding=pad, mode="constant", value=value) + pad_circular = nn.Pad1D(padding=pad, mode="circular") data = paddle.to_tensor(input_data) @@ -492,6 +463,11 @@ def test_class(self): input_data, pad, "constant", value=value, data_format="NCL") self.assertTrue(np.allclose(output.numpy(), np_out)) + output = pad_circular(data) + np_out = self._get_numpy_out( + input_data, pad, "circular", value=value, data_format="NCL") + self.assertTrue(np.allclose(output.numpy(), np_out)) + class TestPad2dAPI(unittest.TestCase): def _get_numpy_out(self, @@ -521,6 +497,8 @@ def _get_numpy_out(self, out = np.pad(input_data, pad, mode=mode) elif mode == "replicate": out = np.pad(input_data, pad, mode="edge") + elif mode == "circular": + out = np.pad(input_data, pad, mode="wrap") return out @@ -537,10 +515,10 @@ def test_class(self): value = 100 input_data = np.random.rand(*input_shape).astype(np.float32) - pad_reflection = nn.ReflectionPad2d(padding=pad) - pad_replication = nn.ReplicationPad2d(padding=pad) - pad_constant = nn.ConstantPad2d(padding=pad, value=value) - pad_zero = nn.ZeroPad2d(padding=pad) + pad_reflection = nn.Pad2D(padding=pad, mode="reflect") + pad_replication = nn.Pad2D(padding=pad, mode="replicate") + pad_constant = nn.Pad2D(padding=pad, mode="constant", value=value) + pad_circular = nn.Pad2D(padding=pad, mode="circular") data = paddle.to_tensor(input_data) @@ -559,9 +537,9 @@ def test_class(self): input_data, pad, "constant", value=value, data_format="NCHW") self.assertTrue(np.allclose(output.numpy(), np_out)) - output = pad_zero(data) + output = pad_circular(data) np_out = self._get_numpy_out( - input_data, pad, "constant", value=0, data_format="NCHW") + input_data, pad, "circular", data_format="NCHW") self.assertTrue(np.allclose(output.numpy(), np_out)) @@ -595,6 +573,8 @@ def _get_numpy_out(self, out = np.pad(input_data, pad, mode=mode) elif mode == "replicate": out = np.pad(input_data, pad, mode="edge") + elif mode == "circular": + out = np.pad(input_data, pad, mode="wrap") return out @@ -611,11 +591,18 @@ def test_class(self): value = 100 input_data = np.random.rand(*input_shape).astype(np.float32) - pad_replication = nn.ReplicationPad3d(padding=pad) - pad_constant = nn.ConstantPad3d(padding=pad, value=value) + pad_reflection = nn.Pad3D(padding=pad, mode="reflect") + pad_replication = 
nn.Pad3D(padding=pad, mode="replicate") + pad_constant = nn.Pad3D(padding=pad, mode="constant", value=value) + pad_circular = nn.Pad3D(padding=pad, mode="circular") data = paddle.to_tensor(input_data) + output = pad_reflection(data) + np_out = self._get_numpy_out( + input_data, pad, "reflect", data_format="NCDHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + output = pad_replication(data) np_out = self._get_numpy_out( input_data, pad, "replicate", data_format="NCDHW") @@ -626,6 +613,11 @@ def test_class(self): input_data, pad, "constant", value=value, data_format="NCDHW") self.assertTrue(np.allclose(output.numpy(), np_out)) + output = pad_circular(data) + np_out = self._get_numpy_out( + input_data, pad, "circular", data_format="NCDHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + class TestPad3dOpError(unittest.TestCase): def test_errors(self): @@ -637,7 +629,7 @@ def test_variable(): def test_reflect_1(): input_shape = (1, 2, 3, 4, 5) data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.data(name="x", shape=input_shape) + x = paddle.fluid.data(name="x", shape=input_shape) y = F.pad(x, pad=[5, 6, 1, 1, 1, 1], value=1, mode='reflect') place = paddle.CPUPlace() exe = Executor(place) @@ -646,7 +638,7 @@ def test_reflect_1(): def test_reflect_2(): input_shape = (1, 2, 3, 4, 5) data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.data(name="x", shape=input_shape) + x = paddle.fluid.data(name="x", shape=input_shape) y = F.pad(x, pad=[1, 1, 4, 3, 1, 1], value=1, mode='reflect') place = paddle.CPUPlace() exe = Executor(place) @@ -655,7 +647,7 @@ def test_reflect_2(): def test_reflect_3(): input_shape = (1, 2, 3, 4, 5) data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.data(name="x", shape=input_shape) + x = paddle.fluid.data(name="x", shape=input_shape) y = F.pad(x, pad=[1, 1, 1, 1, 2, 3], value=1, mode='reflect') place = paddle.CPUPlace() exe = Executor(place) @@ -673,32 +665,30 @@ def test_reflect_3(): class TestPadDataformatError(unittest.TestCase): def test_errors(self): def test_ncl(): - paddle.disable_static(paddle.CPUPlace()) input_shape = (1, 2, 3, 4) pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) data = np.arange( np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 - my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCL") + my_pad = nn.Pad1D(padding=pad, mode="replicate", data_format="NCL") data = paddle.to_tensor(data) result = my_pad(data) def test_nchw(): - paddle.disable_static(paddle.CPUPlace()) input_shape = (1, 2, 4) pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) data = np.arange( np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 - my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCHW") + my_pad = nn.Pad1D(padding=pad, mode="replicate", data_format="NCHW") data = paddle.to_tensor(data) result = my_pad(data) def test_ncdhw(): - paddle.disable_static(paddle.CPUPlace()) input_shape = (1, 2, 3, 4) pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) data = np.arange( np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 - my_pad = nn.ReplicationPad1d(padding=pad, data_format="NCDHW") + my_pad = nn.Pad1D( + padding=pad, mode="replicate", data_format="NCDHW") data = paddle.to_tensor(data) result = my_pad(data) diff --git a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py index cf138e6772616..c91616b06ee47 100644 --- 
a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py +++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py @@ -32,8 +32,8 @@ def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False): ) else fluid.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - x = paddle.data(name='x', shape=x_np.shape, dtype=x_np.dtype) - y = paddle.data(name='y', shape=y_np.shape, dtype=x_np.dtype) + x = paddle.fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = paddle.fluid.data(name='y', shape=y_np.shape, dtype=x_np.dtype) dist = paddle.nn.layer.distance.PairwiseDistance( p=p, epsilon=epsilon, keepdim=keepdim) distance = dist(x, y) diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py index cf93f39ab8c5c..f75d6e9df540b 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -97,8 +97,8 @@ def test_static_graph_functional(self): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x_1 = paddle.data(name="x", shape=[2, 9, 4, 4], dtype="float64") - x_2 = paddle.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") + x_1 = paddle.fluid.data(name="x", shape=[2, 9, 4, 4], dtype="float64") + x_2 = paddle.fluid.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") out_1 = F.pixel_shuffle(x_1, 3) out_2 = F.pixel_shuffle(x_2, 3, "NHWC") @@ -123,8 +123,8 @@ def test_static_graph_layer(self): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x_1 = paddle.data(name="x", shape=[2, 9, 4, 4], dtype="float64") - x_2 = paddle.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") + x_1 = paddle.fluid.data(name="x", shape=[2, 9, 4, 4], dtype="float64") + x_2 = paddle.fluid.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") # init instance ps_1 = paddle.nn.PixelShuffle(3) ps_2 = paddle.nn.PixelShuffle(3, "NHWC") diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index 16388ff8f5f04..f33b375029b26 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -49,8 +49,8 @@ def setUp(self): def static_check(self, weight_np): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, 'float32') - weight = paddle.data('Alpha', weight_np.shape, 'float32') + x = paddle.fluid.data('X', self.x_np.shape, 'float32') + weight = paddle.fluid.data('Alpha', weight_np.shape, 'float32') out = F.prelu(x, weight) exe = paddle.static.Executor(self.place) res = exe.run(feed={'X': self.x_np, @@ -78,15 +78,15 @@ def test_dygraph_api(self): def test_error(self): with paddle.static.program_guard(paddle.static.Program()): - weight_fp32 = paddle.data( + weight_fp32 = paddle.fluid.data( name='weight_fp32', shape=[1], dtype='float32') # The input type must be Variable. self.assertRaises(TypeError, F.prelu, x=1, weight=weight_fp32) # The input dtype must be float16, float32, float64. 
- x_int32 = paddle.data(name='x_int32', shape=[2, 3], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[2, 3], dtype='int32') self.assertRaises(TypeError, F.prelu, x=x_int32, weight=weight_fp32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[2, 3], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[2, 3], dtype='float16') F.prelu(x=x_fp16, weight=weight_fp32) @@ -100,7 +100,7 @@ def test_static_api(self): startup_program = paddle.static.Program() train_program = paddle.static.Program() with paddle.static.program_guard(train_program, startup_program): - x = paddle.data(name='X', shape=self.x_np.shape, dtype='float32') + x = paddle.fluid.data(name='X', shape=self.x_np.shape, dtype='float32') m = paddle.nn.PReLU() out = m(x) exe = paddle.static.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/test_prod_op.py b/python/paddle/fluid/tests/unittests/test_prod_op.py index 158683907253e..15fd79542d608 100644 --- a/python/paddle/fluid/tests/unittests/test_prod_op.py +++ b/python/paddle/fluid/tests/unittests/test_prod_op.py @@ -55,7 +55,7 @@ def run_imperative(self): self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) def run_static(self, use_gpu=False): - input = paddle.data(name='input', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') result0 = paddle.prod(input) result1 = paddle.prod(input, axis=1) result2 = paddle.prod(input, axis=-1) @@ -113,8 +113,8 @@ class TestProdOpError(unittest.TestCase): def test_error(self): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - x = paddle.data(name='x', shape=[2, 2, 4], dtype='float32') - bool_x = paddle.data(name='bool_x', shape=[2, 2, 4], dtype='bool') + x = paddle.fluid.data(name='x', shape=[2, 2, 4], dtype='float32') + bool_x = paddle.fluid.data(name='bool_x', shape=[2, 2, 4], dtype='bool') # The argument x shoule be a Tensor self.assertRaises(TypeError, paddle.prod, [1]) diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 7880b48cd7d5a..82bfb88d54d51 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -125,8 +125,8 @@ def test_api(self): out3 = paddle.randint( low=-100, high=100, shape=(32, 32, 3), dtype='int64') # shape is a tensorlist and dtype is 'float32' - dim_1 = paddle.fill_constant([1], "int64", 32) - dim_2 = paddle.fill_constant([1], "int32", 50) + dim_1 = paddle.fluid.layers.fill_constant([1], "int64", 32) + dim_2 = paddle.fluid.layers.fill_constant([1], "int32", 50) out4 = paddle.randint( low=-100, high=100, shape=[dim_1, 5, dim_2], dtype='int32') # shape is a tensor and dtype is 'float64' diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py index 4ddd98a8a7342..6d33b468ee1d0 100644 --- a/python/paddle/fluid/tests/unittests/test_randn_op.py +++ b/python/paddle/fluid/tests/unittests/test_randn_op.py @@ -30,8 +30,8 @@ def test_api(self): x1 = paddle.randn(shape, 'float32') x2 = paddle.randn(shape, 'float64') - dim_1 = paddle.fill_constant([1], "int64", 20) - dim_2 = paddle.fill_constant([1], "int32", 50) + dim_1 = paddle.fluid.layers.fill_constant([1], "int64", 20) + dim_2 = paddle.fluid.layers.fill_constant([1], "int32", 50) x3 = paddle.randn([dim_1, dim_2, 784]) var_shape = paddle.static.data('X', [2], 'int32') @@ 
-59,8 +59,8 @@ def test_api(self): x1 = paddle.randn(shape, 'float32') x2 = paddle.randn(shape, 'float64') - dim_1 = paddle.fill_constant([1], "int64", 20) - dim_2 = paddle.fill_constant([1], "int32", 50) + dim_1 = paddle.fluid.layers.fill_constant([1], "int64", 20) + dim_2 = paddle.fluid.layers.fill_constant([1], "int32", 50) x3 = paddle.randn(shape=[dim_1, dim_2, 784]) var_shape = paddle.to_tensor(np.array(shape)) diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 275f9d21f9f8e..f41099bda39f8 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -226,11 +226,29 @@ def init_dtype(self): self.dtype = np.uint8 +class TestReshapeOpBool(TestReshapeOp): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + self.inputs = { + "X": np.random.choice( + [True, False], size=self.ori_shape) + } + self.attrs = {"shape": self.new_shape} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def test_check_grad(self): + pass + + # Test python API class TestReshapeAPI(unittest.TestCase): def _set_paddle_api(self): - self.fill_constant = paddle.fill_constant - self.data = paddle.data + self.fill_constant = paddle.fluid.layers.fill_constant + self.data = paddle.fluid.data self.reshape = paddle.reshape self.to_tensor = paddle.to_tensor @@ -305,7 +323,7 @@ def test_imperative(self): # Test Input Error class TestReshapeOpError(unittest.TestCase): def _set_paddle_api(self): - self.data = paddle.data + self.data = paddle.fluid.data self.reshape = paddle.reshape def _set_fluid_api(self): @@ -324,7 +342,7 @@ def test_x_type(): # The x dtype of reshape_op must be float16, float32, float64, int32 or int64. 
def test_x_dtype(): - x2 = self.data(name="x2", shape=[2, 25], dtype="bool") + x2 = self.data(name="x2", shape=[2, 25], dtype="int8") self.reshape(x2, shape=[2, 5, 5]) self.assertRaises(TypeError, test_x_dtype) diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py index 98c7e3800c20c..3e1dd4ef57320 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -73,8 +73,8 @@ def cal_gradient_penalty(self, fake_AB = paddle.concat((real_data.detach(), interpolatesv), 1) disc_interpolates = netD(fake_AB) - outs = paddle.fill_constant(disc_interpolates.shape, - disc_interpolates.dtype, 1.0) + outs = paddle.fluid.layers.fill_constant( + disc_interpolates.shape, disc_interpolates.dtype, 1.0) gradients = paddle.grad( outputs=disc_interpolates, inputs=fake_AB, @@ -85,9 +85,9 @@ def cal_gradient_penalty(self, gradients = paddle.reshape(gradients[0], [real_data.shape[0], -1]) - gradient_penalty = paddle.reduce_mean((paddle.norm( - gradients + 1e-16, 2, 1) - constant)** - 2) * lambda_gp # added eps + gradient_penalty = paddle.mean((paddle.norm(gradients + 1e-16, 2, 1) + - constant)** + 2) * lambda_gp # added eps return gradient_penalty, gradients else: return 0.0, None @@ -113,7 +113,8 @@ def run_retain(self, need_retain): fake_AB = paddle.concat((realA, fakeB), 1) G_pred_fake = d(fake_AB.detach()) - false_target = paddle.fill_constant(G_pred_fake.shape, 'float32', 0.0) + false_target = paddle.fluid.layers.fill_constant(G_pred_fake.shape, + 'float32', 0.0) G_gradient_penalty, _ = self.cal_gradient_penalty( d, realA, fakeB, lambda_gp=10.0) @@ -125,7 +126,8 @@ def run_retain(self, need_retain): optim_g.clear_gradients() fake_AB = paddle.concat((realA, fakeB), 1) G_pred_fake = d(fake_AB) - true_target = paddle.fill_constant(G_pred_fake.shape, 'float32', 1.0) + true_target = paddle.fluid.layers.fill_constant(G_pred_fake.shape, + 'float32', 1.0) loss_g = l1_criterion(fakeB, realB) + gan_criterion(G_pred_fake, true_target) diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 6ca194b2694b6..066d0a37e1361 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,9 +14,17 @@ from __future__ import print_function +import random import unittest import numpy as np +import paddle +import paddle.nn as nn +from paddle import Model, set_device +from paddle.static import InputSpec as Input +from paddle.fluid.dygraph import Layer +from paddle.nn import BeamSearchDecoder, dynamic_decode + import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core @@ -24,6 +32,8 @@ from paddle.fluid.executor import Executor from paddle.fluid import framework +paddle.enable_static() + class EncoderCell(layers.RNNCell): def __init__(self, num_layers, hidden_size, dropout_prob=0.): @@ -436,6 +446,7 @@ def setUp(self): self.exe = Executor(place) def test_mle_train(self): + paddle.enable_static() self.model_hparams["decoding_strategy"] = "train_greedy" agent = SeqPGAgent( model_cls=Seq2SeqModel, @@ -468,6 +479,7 @@ def test_mle_train(self): (iter_idx, reward.mean(), cost)) def test_greedy_train(self): + paddle.enable_static() self.model_hparams["decoding_strategy"] = "infer_greedy" agent = SeqPGAgent( model_cls=Seq2SeqModel, @@ -493,6 +505,7 @@ def test_greedy_train(self): (iter_idx, reward.mean(), cost)) def test_sample_train(self): + paddle.enable_static() self.model_hparams["decoding_strategy"] = "infer_sample" agent = SeqPGAgent( model_cls=Seq2SeqModel, @@ -518,6 +531,8 @@ def test_sample_train(self): (iter_idx, reward.mean(), cost)) def test_beam_search_infer(self): + paddle.set_default_dtype("float32") + paddle.enable_static() self.model_hparams["decoding_strategy"] = "beam_search" main_program = fluid.Program() startup_program = fluid.Program() @@ -542,5 +557,154 @@ def test_beam_search_infer(self): fetch_list=[output])[0] +class ModuleApiTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._np_rand_state = np.random.get_state() + cls._py_rand_state = random.getstate() + cls._random_seed = 123 + np.random.seed(cls._random_seed) + random.seed(cls._random_seed) + + cls.model_cls = type(cls.__name__ + "Model", (Layer, ), { + "__init__": cls.model_init_wrapper(cls.model_init), + "forward": cls.model_forward + }) + + @classmethod + def tearDownClass(cls): + np.random.set_state(cls._np_rand_state) + random.setstate(cls._py_rand_state) + + @staticmethod + def model_init_wrapper(func): + def __impl__(self, *args, **kwargs): + Layer.__init__(self) + func(self, *args, **kwargs) + + return __impl__ + + @staticmethod + def model_init(model, *args, **kwargs): + raise NotImplementedError( + "model_init acts as `Model.__init__`, thus must implement it") + + @staticmethod + def model_forward(model, *args, **kwargs): + return model.module(*args, **kwargs) + + def make_inputs(self): + # TODO(guosheng): add default from `self.inputs` + raise NotImplementedError( + "model_inputs makes inputs for model, thus must implement it") + + def setUp(self): + """ + For the model which wraps the module to be tested: + Set input data by `self.inputs` list + Set init argument values by `self.attrs` list/dict + Set model parameter values by `self.param_states` dict + Set expected output data by `self.outputs` list + We can create a model instance and run once with these. 
+ """ + self.inputs = [] + self.attrs = {} + self.param_states = {} + self.outputs = [] + + def _calc_output(self, place, mode="test", dygraph=True): + if dygraph: + fluid.enable_dygraph(place) + else: + fluid.disable_dygraph() + gen = paddle.manual_seed(self._random_seed) + gen._is_init_py = False + paddle.framework.random._manual_program_seed(self._random_seed) + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + layer = self.model_cls(**self.attrs) if isinstance( + self.attrs, dict) else self.model_cls(*self.attrs) + model = Model(layer, inputs=self.make_inputs()) + model.prepare() + if self.param_states: + model.load(self.param_states, optim_state=None) + return model.test_batch(self.inputs) + + def check_output_with_place(self, place, mode="test"): + dygraph_output = self._calc_output(place, mode, dygraph=True) + stgraph_output = self._calc_output(place, mode, dygraph=False) + expect_output = getattr(self, "outputs", None) + for actual_t, expect_t in zip(dygraph_output, stgraph_output): + self.assertTrue(np.allclose(actual_t, expect_t, rtol=1e-5, atol=0)) + if expect_output: + for actual_t, expect_t in zip(dygraph_output, expect_output): + self.assertTrue( + np.allclose( + actual_t, expect_t, rtol=1e-5, atol=0)) + + def check_output(self): + devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"] + for device in devices: + place = set_device(device) + self.check_output_with_place(place) + + +class TestBeamSearch(ModuleApiTest): + def setUp(self): + paddle.set_default_dtype("float64") + shape = (8, 32) + self.inputs = [ + np.random.random(shape).astype("float64"), + np.random.random(shape).astype("float64") + ] + self.outputs = None + self.attrs = { + "vocab_size": 100, + "embed_dim": 32, + "hidden_size": 32, + } + self.param_states = {} + + @staticmethod + def model_init(self, + vocab_size, + embed_dim, + hidden_size, + bos_id=0, + eos_id=1, + beam_size=2, + max_step_num=2): + embedder = paddle.fluid.dygraph.Embedding( + size=[vocab_size, embed_dim], dtype="float64") + output_layer = nn.Linear(hidden_size, vocab_size) + cell = nn.LSTMCell(embed_dim, hidden_size) + self.max_step_num = max_step_num + self.beam_search_decoder = BeamSearchDecoder( + cell, + start_token=bos_id, + end_token=eos_id, + beam_size=beam_size, + embedding_fn=embedder, + output_fn=output_layer) + + @staticmethod + def model_forward(model, init_hidden, init_cell): + return dynamic_decode( + model.beam_search_decoder, [init_hidden, init_cell], + max_step_num=model.max_step_num, + impute_finished=True, + is_test=True)[0] + + def make_inputs(self): + inputs = [ + Input([None, self.inputs[0].shape[-1]], "float64", "init_hidden"), + Input([None, self.inputs[1].shape[-1]], "float64", "init_cell"), + ] + return inputs + + def test_check_output(self): + self.check_output() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_row_conv.py b/python/paddle/fluid/tests/unittests/test_row_conv.py index abec23c7f658a..7b6068c32cab1 100644 --- a/python/paddle/fluid/tests/unittests/test_row_conv.py +++ b/python/paddle/fluid/tests/unittests/test_row_conv.py @@ -69,7 +69,7 @@ def functional_declarative(self, place): x = fluid.data( "input", [-1, -1, self.num_channels], dtype=self.dtype) w = fluid.data("weight", self.weight_shape, dtype=self.dtype) - y = F.row_conv(x, w, act=self.act) + y = F.extension.row_conv(x, w, act=self.act) exe = fluid.Executor(place) exe.run(start) y_np, = exe.run(main, @@ -82,7 +82,7 @@ def functional_imperative(self, place): with 
dg.guard(place): x_var = dg.to_variable(self.input) w_var = dg.to_variable(self.weight) - y_var = F.row_conv(x_var, w_var, act=self.act) + y_var = F.extension.row_conv(x_var, w_var, act=self.act) y_np = y_var.numpy() return y_np diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py index b5a2e84a53ef6..95ae1eecc6614 100644 --- a/python/paddle/fluid/tests/unittests/test_selu_op.py +++ b/python/paddle/fluid/tests/unittests/test_selu_op.py @@ -93,7 +93,7 @@ def setUp(self): def test_static_api(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) out1 = F.selu(x, self.scale, self.alpha) selu = paddle.nn.SELU(self.scale, self.alpha) out2 = selu(x) @@ -128,15 +128,15 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.selu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.selu, x_int32) # The scale must be greater than 1.0 - x_fp32 = paddle.data(name='x_fp32', shape=[12, 10], dtype='float32') + x_fp32 = paddle.fluid.data(name='x_fp32', shape=[12, 10], dtype='float32') self.assertRaises(ValueError, F.selu, x_fp32, -1.0) # The alpha must be no less than 0 self.assertRaises(ValueError, F.selu, x_fp32, 1.6, -1.0) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') F.selu(x_fp16) diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py index 71e119739e777..85f9501e53f4a 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py @@ -42,13 +42,13 @@ def test_static(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - logit = paddle.data(name='logit', shape=logit_np.shape, dtype='float64') - label = paddle.data(name='label', shape=label_np.shape, dtype='float64') + logit = paddle.fluid.data(name='logit', shape=logit_np.shape, dtype='float64') + label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') feed_dict = {"logit": logit_np, "label": label_np} normalizer = None if normalizer_np is not None: - normalizer = paddle.data( + normalizer = paddle.fluid.data( name='normalizer', shape=normalizer_np.shape, dtype='float64') feed_dict["normalizer"] = normalizer_np diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index a37fad9cf0ca0..71df2c4acc467 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -315,7 +315,7 @@ def setUp(self): def test_static_check(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, 'float32') + x = paddle.fluid.data('X', self.x_np.shape, 'float32') out1 = F.softmax(x) m = paddle.nn.Softmax() out2 = m(x) @@ -354,10 +354,10 @@ def test_error(self): # The input type must be Variable. 
self.assertRaises(TypeError, F.softmax, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.data(name='x_int32', shape=[2, 3], dtype='int32') + x_int32 = paddle.fluid.data(name='x_int32', shape=[2, 3], dtype='int32') self.assertRaises(TypeError, F.softmax, x_int32) # support the input dtype is float16 - x_fp16 = paddle.data(name='x_fp16', shape=[2, 3], dtype='float16') + x_fp16 = paddle.fluid.data(name='x_fp16', shape=[2, 3], dtype='float16') F.softmax(x_fp16) diff --git a/python/paddle/fluid/tests/unittests/test_std_layer.py b/python/paddle/fluid/tests/unittests/test_std_layer.py index e455151481443..2196996afffc9 100644 --- a/python/paddle/fluid/tests/unittests/test_std_layer.py +++ b/python/paddle/fluid/tests/unittests/test_std_layer.py @@ -44,7 +44,7 @@ def set_attrs(self): def static(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.shape, self.dtype) + x = paddle.fluid.data('X', self.shape, self.dtype) out = paddle.std(x, self.axis, self.unbiased, self.keepdim) exe = paddle.static.Executor(self.place) res = exe.run(feed={'X': self.x}, fetch_list=[out]) @@ -111,7 +111,7 @@ def test_alias(self): class TestStdError(unittest.TestCase): def test_error(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [2, 3, 4], 'int32') + x = paddle.fluid.data('X', [2, 3, 4], 'int32') self.assertRaises(TypeError, paddle.std, x) diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index c0cd88a0a6aa0..35dc92ffb08c6 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -225,7 +225,7 @@ def test_w_is_selected_rows(self): globals()[cls_name] = TestSumFp16Case -class API_Test_Elementwise_Sum(unittest.TestCase): +class API_Test_Add_n(unittest.TestCase): def test_api(self): with fluid.program_guard(fluid.Program(), fluid.Program()): input0 = fluid.layers.fill_constant( @@ -234,11 +234,19 @@ def test_api(self): shape=[2, 3], dtype='int64', value=3) expected_result = np.empty((2, 3)) expected_result.fill(8) - sum_value = paddle.elementwise_sum([input0, input1]) + sum_value = paddle.add_n([input0, input1]) exe = fluid.Executor(fluid.CPUPlace()) result = exe.run(fetch_list=[sum_value]) - self.assertEqual((result == expected_result).all(), True) + self.assertEqual((result == expected_result).all(), True) + + with fluid.dygraph.guard(): + input0 = paddle.ones(shape=[2, 3], dtype='float32') + expected_result = np.empty((2, 3)) + expected_result.fill(2) + sum_value = paddle.add_n([input0, input0]) + + self.assertEqual((sum_value.numpy() == expected_result).all(), True) class TestRaiseSumError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py index 1fbc0fc4604c2..a102bcea995c7 100644 --- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py @@ -81,7 +81,7 @@ def initTestCase(self): class TestTemporalShiftAPI(unittest.TestCase): def test_api(self): input = paddle.randn([6, 4, 2, 2]) - out = paddle.nn.functional.temporal_shift( + out = paddle.fluid.layers.temporal_shift( x=input, seg_num=2, shift_ratio=0.2) diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py index a2c60d870e5e1..a4bef436e1375 100644 --- 
a/python/paddle/fluid/tests/unittests/test_unique.py +++ b/python/paddle/fluid/tests/unittests/test_unique.py @@ -254,7 +254,7 @@ def test_dygraph_attr_dtype(self): def test_static_graph(self): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - x = paddle.data(name='x', shape=[3, 2], dtype='float64') + x = paddle.fluid.data(name='x', shape=[3, 2], dtype='float64') unique, inverse, counts = paddle.unique( x, return_inverse=True, return_counts=True, axis=0) place = paddle.CPUPlace() @@ -274,13 +274,13 @@ def test_input_dtype(self): def test_x_dtype(): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): - x = paddle.data(name='x', shape=[10, 10], dtype='float16') + x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float16') result = paddle.unique(x) self.assertRaises(TypeError, test_x_dtype) def test_attr(self): - x = paddle.data(name='x', shape=[10, 10], dtype='float64') + x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float64') def test_return_index(): result = paddle.unique(x, return_index=0) diff --git a/python/paddle/fluid/tests/unittests/test_unique_name.py b/python/paddle/fluid/tests/unittests/test_unique_name.py index 8f116db855b05..4ffff252ee97d 100644 --- a/python/paddle/fluid/tests/unittests/test_unique_name.py +++ b/python/paddle/fluid/tests/unittests/test_unique_name.py @@ -50,10 +50,10 @@ def test_name_generator(self): with fluid.dygraph.guard(): tracer = fluid.framework._dygraph_tracer() tmp_var_0 = tracer._generate_unique_name() - self.assertEqual(tmp_var_0, "eager_tmp_0") + self.assertEqual(tmp_var_0, "dygraph_tmp_0") - tmp_var_1 = tracer._generate_unique_name("eager_tmp") - self.assertEqual(tmp_var_1, "eager_tmp_1") + tmp_var_1 = tracer._generate_unique_name("dygraph_tmp") + self.assertEqual(tmp_var_1, "dygraph_tmp_1") if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index e3edf82ab9959..6d4258a426d05 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -55,6 +55,15 @@ def _test_place(place): np.array_equal(x.numpy(), np.array([1.2]).astype( 'float32'))) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) + clone_x = x.clone() + self.assertTrue( + np.array_equal(clone_x.numpy(), + np.array([1.2]).astype('float32'))) + self.assertEqual(clone_x.dtype, core.VarDesc.VarType.FP32) + y = clone_x**2 + y.backward() + self.assertTrue( + np.array_equal(x.grad, np.array([2.4]).astype('float32'))) # set_default_dtype take effect on complex x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) @@ -404,6 +413,37 @@ def _assert_to_static(self, var_base, static_var, is_param=False): self.assertListEqual(list(var_base.shape), list(static_var.shape)) + def test_tensor_str(self): + paddle.enable_static() + paddle.disable_static(paddle.CPUPlace()) + paddle.manual_seed(10) + a = paddle.rand([10, 20]) + paddle.set_printoptions(4, 100, 3) + a_str = str(a) + + if six.PY2: + expected = '''Tensor(shape=[10L, 20L], dtype=float32, place=CPUPlace, stop_gradient=True, + [[0.2727, 0.5489, 0.8655, ..., 0.2916, 0.8525, 0.9000], + [0.3806, 0.8996, 0.0928, ..., 0.9535, 0.8378, 0.6409], + [0.1484, 0.4038, 0.8294, ..., 0.0148, 0.6520, 0.4250], + ..., + [0.3426, 0.1909, 0.7240, ..., 0.4218, 0.2676, 0.5679], + [0.5561, 0.2081, 0.0676, ..., 0.9778, 0.3302, 0.9559], + [0.2665, 0.8483, 0.5389, ..., 0.4956, 0.6862, 0.9178]])''' + + else: + expected = 
'''Tensor(shape=[10, 20], dtype=float32, place=CPUPlace, stop_gradient=True, + [[0.2727, 0.5489, 0.8655, ..., 0.2916, 0.8525, 0.9000], + [0.3806, 0.8996, 0.0928, ..., 0.9535, 0.8378, 0.6409], + [0.1484, 0.4038, 0.8294, ..., 0.0148, 0.6520, 0.4250], + ..., + [0.3426, 0.1909, 0.7240, ..., 0.4218, 0.2676, 0.5679], + [0.5561, 0.2081, 0.0676, ..., 0.9778, 0.3302, 0.9559], + [0.2665, 0.8483, 0.5389, ..., 0.4956, 0.6862, 0.9178]])''' + + self.assertEqual(a_str, expected) + paddle.enable_static() + class TestVarBaseSetitem(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_variance_layer.py b/python/paddle/fluid/tests/unittests/test_variance_layer.py index b5bb3cc978a55..13e3cf4df111e 100644 --- a/python/paddle/fluid/tests/unittests/test_variance_layer.py +++ b/python/paddle/fluid/tests/unittests/test_variance_layer.py @@ -44,7 +44,7 @@ def set_attrs(self): def static(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.shape, self.dtype) + x = paddle.fluid.data('X', self.shape, self.dtype) out = paddle.var(x, self.axis, self.unbiased, self.keepdim) exe = paddle.static.Executor(self.place) res = exe.run(feed={'X': self.x}, fetch_list=[out]) @@ -111,7 +111,7 @@ def test_alias(self): class TestVarError(unittest.TestCase): def test_error(self): with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [2, 3, 4], 'int32') + x = paddle.fluid.data('X', [2, 3, 4], 'int32') self.assertRaises(TypeError, paddle.var, x) diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py index 2cea3072809ec..6546d7b99f441 100644 --- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py @@ -25,7 +25,7 @@ class TestZerosLikeAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): - x = paddle.data('x', [3, 4]) + x = paddle.fluid.data('x', [3, 4]) self.assertRaises(TypeError, zeros_like, x, 'int8') @@ -35,7 +35,7 @@ def test_api(self): startup_program = Program() train_program = Program() with program_guard(train_program, startup_program): - x = paddle.data('X', shape) + x = paddle.fluid.data('X', shape) # 'bool', 'float32', 'float64', 'int32', 'int64' out1 = zeros_like(x) diff --git a/python/paddle/fluid/tests/unittests/xpu/elementwise.py b/python/paddle/fluid/tests/unittests/xpu/elementwise.py new file mode 100644 index 0000000000000..f4f2ddb19cf7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/elementwise.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
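# Editor's note: a hypothetical sketch (not part of this patch) of how a
# concrete XPU elementwise op test might reuse the TestXPUElementwiseOpBase
# helper added below. The class name, the mix-in order with OpTest and the
# choice of elementwise_add are assumptions for illustration only.
import sys
sys.path.append("..")
import unittest
from op_test import OpTest
from elementwise import TestXPUElementwiseOpBase


class TestXPUElementwiseAddOp(OpTest, TestXPUElementwiseOpBase):
    def setUp(self):
        TestXPUElementwiseOpBase.setUp(self, "elementwise_add")
        self.make_input()        # fills self.inputs with X, Y of shape [13, 17]
        self.make_output()
        self.grad_implemented = True

    def make_output(self, x_shape=None, y_shape=None):
        # reference result computed from the (optionally reshaped) inputs
        x, y = self.reshape_input(x_shape, y_shape)
        self.outputs = {'Out': x + y}


if __name__ == "__main__":
    unittest.main()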
+import numpy as np +import paddle +import paddle.fluid as fluid +paddle.enable_static() + + +class TestXPUElementwiseOpBase(object): + def setUp(self, op_type): + self.op_type = op_type + self.attrs = {'use_xpu': True} + self.is_common_broadcast = False + self.is_x_size_less_than_y = False + self.grad_implemented = False + self.y_grad_implemented = True + self.dtype = np.float32 + self.__class__.op_type = self.op_type + self.__class__.use_xpu = True + self.__class__.dtype = self.dtype + + def net(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.layers.data( + name='X', shape=self.inputs['X'].shape, dtype=self.dtype) + y = fluid.layers.data( + name='Y', shape=self.inputs['Y'].shape, dtype=self.dtype) + op = getattr(fluid.layers, self.op_type) + z = op(x, y) + exe = fluid.Executor(place) + z_value = exe.run(feed=self.inputs, fetch_list=[z.name]) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + if not self.is_common_broadcast and not self.is_x_size_less_than_y: + self.check_output_with_place(place, atol=1e-3) + else: + with self.assertRaises(BaseException): + self.net(place) + + def _check_grad_xpu_helper(self, + inputs_to_check, + output_names, + no_grad_set=None, + max_relative_error=0.01): + if self.grad_implemented and not self.is_common_broadcast \ + and not self.is_x_size_less_than_y: + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + max_relative_error=max_relative_error) + + def test_check_grad_normal(self): + self._check_grad_xpu_helper(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self._check_grad_xpu_helper(['Y'], 'Out', set("X")) + + def test_check_grad_ingore_y(self): + if self.y_grad_implemented: + self._check_grad_xpu_helper(['X'], 'Out', set("Y")) + + def init_axis(self): + self.axis = -1 + + def make_input(self, x_shape=[13, 17], y_shape=[13, 17]): + self.inputs = { + 'X': np.random.uniform(0.1, 1, x_shape).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, y_shape).astype(self.dtype) + } + + def reshape_input(self, x_shape=None, y_shape=None): + if x_shape is None: + x = self.inputs['X'] + else: + x = self.inputs['X'].reshape(x_shape) + if y_shape is None: + y = self.inputs['Y'] + else: + y = self.inputs['Y'].reshape(y_shape) + return x, y + + def make_output(self, x_shape=None, y_shape=None): + pass diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py new file mode 100644 index 0000000000000..147824f341be4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py @@ -0,0 +1,268 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
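# Editor's note: a minimal standalone sketch of the Adam update that the
# adam_step reference below encodes. Plain NumPy, a toy 2x3 parameter and the
# common default hyper-parameters are illustrative assumptions, not part of
# the patch itself.
import numpy as np


def adam_update(param, grad, m1, m2, lr, beta1, beta2, beta1_pow, beta2_pow, eps):
    m1 = beta1 * m1 + (1 - beta1) * grad                   # first moment
    m2 = beta2 * m2 + (1 - beta2) * np.square(grad)        # second (raw) moment
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)   # bias-corrected step size
    param = param - lr_t * m1 / (np.sqrt(m2) + eps)
    return param, m1, m2


param = np.zeros((2, 3), dtype=np.float32)
grad = np.ones_like(param)
m1, m2 = np.zeros_like(param), np.zeros_like(param)
param, m1, m2 = adam_update(param, grad, m1, m2,
                            lr=0.001, beta1=0.9, beta2=0.999,
                            beta1_pow=0.9 ** 10, beta2_pow=0.999 ** 10, eps=1e-8)
print(param)  # every weight takes a small step against the gradient direction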
+ +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest +from paddle.fluid import core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +import paddle + + +class TestAdamOp1(OpTest): + def setUp(self): + '''Test Adam Op with supplied attributes + ''' + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output_with_place(place=paddle.XPUPlace(0), atol=1e-2) + + +class TestAdamOp2(OpTest): + def setUp(self): + '''Test Adam Op with supplied attributes + ''' + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.001 + beta1 = 0.9 + beta2 = 0.999 + epsilon = 1e-8 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output_with_place(place=paddle.XPUPlace(0), atol=1e-2) + + +class TestAdamOpMultipleSteps(OpTest): + def setUp(self): + '''Test Adam Operator with supplied attributes + ''' + self.op_type = "adam" + self.num_steps = 10 + + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.001 + self.beta1 = 0.9 + self.beta2 = 0.999 + epsilon = 1e-8 + self.beta1_pow = self.beta1**10 + self.beta2_pow = self.beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': 
moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([self.beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([self.beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': self.beta1, + 'beta2': self.beta2 + } + + def test_check_output(self): + for _ in range(self.num_steps): + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1 + beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2 + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': beta1_pow_out, + 'Beta2PowOut': beta2_pow_out + } + + # Verify output for this step + self.check_output_with_place(place=paddle.XPUPlace(0), atol=1e-2) + + # Output of this step becomes input for next step + self.inputs['Param'] = param_out + self.inputs['Moment1'] = moment1_out + self.inputs['Moment2'] = moment2_out + + # Update powers of Beta1 and Beta2 for next time step + self.inputs['Beta1Pow'] = beta1_pow_out + self.inputs['Beta2Pow'] = beta2_pow_out + + # Randomize gradient for next step + self.inputs['Grad'] = np.random.uniform( + -1, 1, (102, 105)).astype("float32") + + +def adam_step(inputs, attributes): + ''' + Simulate one step of the adam optimizer + :param inputs: dict of inputs + :param attributes: dict of attributes + :return tuple: tuple of output param, moment1, moment2, + beta1 power accumulator and beta2 power accumulator + ''' + param = inputs['Param'] + grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + epsilon = attributes['epsilon'] + + if 'beta1' in attributes: + beta1 = attributes['beta1'] + else: + beta1 = inputs['Beta1Tensor'][0] + if 'beta2' in attributes: + beta2 = attributes['beta2'] + else: + beta2 = inputs['Beta2Tensor'][0] + + moment1_out = beta1 * moment1 + (1 - beta1) * grad + moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) + lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) + param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + return param_out, moment1_out, moment2_out + + +class TestAdamOpBetaVariable(OpTest): + def setUp(self): + '''Test Adam Op with beta as Variable + ''' + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + beta1 = 0.85 + beta2 = 0.95 + + learning_rate = 0.001 + epsilon = 1e-8 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + "Beta1Tensor": np.array([beta1]).astype("float32"), + "Beta2Tensor": np.array([beta2]).astype("float32"), + } + + attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': 
np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output_with_place(place=paddle.XPUPlace(0), atol=1e-2) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py new file mode 100644 index 0000000000000..110e7bb3cbf41 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py @@ -0,0 +1,97 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys + +sys.path.append("..") +import op_test +import numpy as np +import unittest +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.backward import append_backward + + +class TestAssignOp(op_test.OpTest): + def setUp(self): + self.op_type = "assign" + x = np.random.random(size=(100, 10)).astype('float32') + self.inputs = {'X': x} + self.outputs = {'Out': x} + + def test_forward(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_backward(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +class TestAssignOpWithLoDTensorArray(unittest.TestCase): + def test_assign_LoDTensorArray(self): + main_program = Program() + startup_program = Program() + with program_guard(main_program): + x = fluid.data(name='x', shape=[100, 10], dtype='float32') + x.stop_gradient = False + y = fluid.layers.fill_constant( + shape=[100, 10], dtype='float32', value=1) + z = fluid.layers.elementwise_add(x=x, y=y) + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) + init_array = fluid.layers.array_write(x=z, i=i) + array = fluid.layers.assign(init_array) + sums = fluid.layers.array_read(array=init_array, i=i) + mean = fluid.layers.mean(sums) + append_backward(mean) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + feed_x = np.random.random(size=(100, 10)).astype('float32') + ones = np.ones((100, 10)).astype('float32') + feed_add = feed_x + ones + res = exe.run(main_program, + feed={'x': feed_x}, + fetch_list=[sums.name, x.grad_name]) + self.assertTrue(np.allclose(res[0], feed_add)) + self.assertTrue(np.allclose(res[1], ones / 1000.0)) + + +class TestAssignOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The type of input must be Variable or numpy.ndarray. + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.XPUPlace(0)) + self.assertRaises(TypeError, fluid.layers.assign, x1) + # When the type of input is Variable, the dtype of input must be float16, float32, float64, int32, int64, bool. 
+ x3 = fluid.layers.data(name='x3', shape=[4], dtype="uint8") + self.assertRaises(TypeError, fluid.layers.assign, x3) + # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. + x4 = np.array([[2.5, 2.5]], dtype='float64') + self.assertRaises(TypeError, fluid.layers.assign, x4) + x5 = np.array([[2.5, 2.5]], dtype='uint8') + self.assertRaises(TypeError, fluid.layers.assign, x5) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py new file mode 100644 index 0000000000000..0d9387d6b75a7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -0,0 +1,269 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +from scipy.special import expit, erf +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.fluid import compiler, Program, program_guard + + +def ref_batch_norm_infer(x, scale, bias, mean, variance, momentum, epsilon, + data_layout): + if data_layout == "NCHW": + n, c, h, w = x.shape + mean_tile = np.reshape(mean, (1, c, 1, 1)) + mean_tile = np.tile(mean_tile, (n, 1, h, w)) + variance_tile = np.reshape(variance, (1, c, 1, 1)) + variance_tile = np.tile(variance_tile, (n, 1, h, w)) + normalized_x = (x - mean_tile) / np.sqrt(variance_tile + epsilon) + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + bias_tile = np.reshape(bias, (1, c, 1, 1)) + bias_tile = np.reshape(bias_tile, (1, c, 1, 1)) + y = normalized_x * scale_tile + bias_tile + elif data_layout == "NHWC": + normalized_x = (x - mean) / np.sqrt(variance + epsilon) + y = normalized_x * scale + bias + else: + raise ValueError( + "Unsupported data layout! 
Only NCHW and NHWC is supported, but received " + + data_layout) + return y + + +def ref_batch_norm_train(x, y_grad, scale, bias, mean, variance, momentum, + epsilon, data_layout): + # Forward + if data_layout == "NCHW": + n, c, h, w = x.shape + x_square = x * x + x_square_sum = np.sum(x_square, (0, 2, 3)) + x_sum = np.sum(x, axis=(0, 2, 3)) + element_count = np.size(x) / int(np.shape(x)[1]) + saved_mean = x_sum / element_count + saved_variance = x_square_sum / element_count - saved_mean * saved_mean + saved_mean_tile = np.reshape(saved_mean, (1, c, 1, 1)) + saved_mean_tile = np.tile(saved_mean_tile, (n, 1, h, w)) + saved_variance_tile = np.reshape(saved_variance, (1, c, 1, 1)) + saved_variance_tile = np.tile(saved_variance_tile, (n, 1, h, w)) + normalized_x = ( + x - saved_mean_tile) / np.sqrt(saved_variance_tile + epsilon) + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + bias_tile = np.reshape(bias, (1, c, 1, 1)) + bias_tile = np.reshape(bias_tile, (1, c, 1, 1)) + y = normalized_x * scale_tile + bias_tile + elif data_layout == "NHWC": + x_square = x * x + x_square_sum = np.sum(x_square, (0, 1, 2)) + x_sum = np.sum(x, axis=(0, 1, 2)) + element_count = np.size(x) / int(np.shape(x)[-1]) + saved_mean = x_sum / element_count + saved_variance = x_square_sum / element_count - saved_mean * saved_mean + normalized_x = (x - saved_mean) / np.sqrt(saved_variance + epsilon) + y = normalized_x * scale + bias + else: + raise ValueError( + "Unsupported data layout! Only NCHW and NHWC is supported, but received " + + data_layout) + mean_out = saved_mean * (1. - momentum) + momentum * mean + variance_out = saved_variance * (1. - momentum) + momentum * variance + saved_inv_std = 1. / np.sqrt(saved_variance + epsilon) + # Backward + # Use the following formulas to calculate gradients: + # grad_scale = + # sum(grad_y * (x - mean)) * rsqrt(variance + epsilon) + # + # grad_bias = sum(y) + # + # x_grad = + # 1/N * scale * rsqrt(variance + epsilon) * (N * grad_y - sum(grad_y) - + # (x - mean) * sum(grad_y * (x - mean)) / (variance + epsilon)) + # Transfer from (N, C, H, W) to (N, H, W, C) to simplify computation + if data_layout == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + y_grad = np.transpose(y_grad, (0, 2, 3, 1)) + x_grad = scale * ( + y_grad - np.mean( + y_grad, axis=(0, 1, 2)) - (x - saved_mean) * np.mean( + y_grad * (x - saved_mean), axis=(0, 1, 2)) / + (saved_variance + epsilon)) / np.sqrt(saved_variance + epsilon) + scale_grad = np.sum(y_grad * (x - saved_mean) / + np.sqrt(saved_variance + epsilon), + axis=(0, 1, 2)) + bias_grad = np.sum(y_grad, axis=(0, 1, 2)) + # Transfer back to N, C, H, W + if data_layout == "NCHW": + x_grad = np.transpose(x_grad, (0, 3, 1, 2)) + x = np.transpose(x, (0, 3, 1, 2)) + y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + return y, mean_out, variance_out, saved_mean, saved_inv_std, x_grad, scale_grad, bias_grad + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUBatchNormOp(unittest.TestCase): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.op_type = "batch_norm" + self.dtype = np.float32 + self.shape = [2, 3, 4, 5] + self.data_layout = "NCHW" + self.epsilon = 1e-05 + self.momentum = 0.9 + self.set_attrs() + + if self.data_layout == "NHWC": + channel_size = self.shape[3] + elif self.data_layout == "NCHW": + channel_size = self.shape[1] + else: + raise ValueError( + "Unsupported data layout! 
Only NCHW and NHWC is supported, but received " + + data_layout) + np.random.seed(1024) + self.x_np = np.random.random_sample(self.shape).astype(self.dtype) + self.scale_np = np.random.random_sample( + [channel_size]).astype(self.dtype) + self.bias_np = np.random.random_sample( + [channel_size]).astype(self.dtype) + self.mean_np = np.zeros([channel_size]).astype(self.dtype) + self.variance_np = np.ones([channel_size]).astype(self.dtype) + self.saved_mean_np = np.zeros([channel_size]).astype(self.dtype) + self.saved_variance_np = np.ones([channel_size]).astype(self.dtype) + + def set_attrs(self): + pass + + def test_infer(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + scale = paddle.data('Scale', self.scale_np.shape, + self.scale_np.dtype) + bias = paddle.data('Bias', self.bias_np.shape, self.bias_np.dtype) + mean = paddle.data('Mean', self.mean_np.shape, self.mean_np.dtype) + variance = paddle.data('Variance', self.variance_np.shape, + self.variance_np.dtype) + y = F.batch_norm(x, mean, variance, scale, bias, False, + self.momentum, self.epsilon, self.data_layout) + exe = paddle.static.Executor(self.place) + [y_np] = exe.run(feed={ + 'X': self.x_np, + 'Scale': self.scale_np, + 'Bias': self.bias_np, + 'Mean': self.mean_np, + 'Variance': self.variance_np + }, + fetch_list=[y]) + y_np_ref = ref_batch_norm_infer( + self.x_np, self.scale_np, self.bias_np, self.mean_np, + self.variance_np, self.momentum, self.epsilon, self.data_layout) + self.assertEqual(np.allclose(y_np_ref, y_np), True) + + def test_train(self): + y_grad_np = np.random.random_sample(self.shape).astype(self.dtype) + y_np, mean_out_np, variance_out_np, saved_mean_np, saved_variance_np, x_grad_np, scale_grad_np, bias_grad_np = ref_batch_norm_train( + self.x_np, y_grad_np, self.scale_np, self.bias_np, self.mean_np, + self.variance_np, self.momentum, self.epsilon, self.data_layout) + inputs = { + 'X': self.x_np, + 'Scale': self.scale_np, + 'Bias': self.bias_np, + 'Mean': self.mean_np, + 'Variance': self.variance_np, + 'Y@GRAD': y_grad_np + } + outputs = { + 'Y': y_np, + 'Mean': mean_out_np, + 'Variance': variance_out_np, + 'SavedMean': saved_mean_np, + 'SavedVariance': saved_variance_np, + 'X@GRAD': x_grad_np, + 'Scale@GRAD': scale_grad_np, + 'Bias@GRAD': bias_grad_np + } + attrs = { + 'momentum': self.momentum, + 'epsilon': self.epsilon, + 'is_test': False, + 'data_layout': self.data_layout, + 'use_mkldnn': False, + 'fuse_with_relu': False, + 'use_global_stats': False, + } + paddle.enable_static() + program = paddle.static.Program() + with paddle.static.program_guard(program): + block = program.global_block() + # Set inputs, outputs and attributes to the forward op of batch_norm + input_vars = {} + for var_name in inputs: + arg_name = var_name + np_value = inputs[var_name] + if not block.has_var(var_name): + block.create_var( + name=var_name, + shape=np_value.shape, + dtype=np_value.dtype) + input_vars[arg_name] = block.var(var_name) + fetch_list = [] + output_vars = {} + for var_name in outputs: + arg_name = var_name + np_value = outputs[var_name] + if not block.has_var(var_name): + block.create_var( + name=var_name, + shape=np_value.shape, + dtype=np_value.dtype) + if var_name == 'Mean': + arg_name = 'MeanOut' # Share memory + if var_name == 'Variance': + arg_name = 'VarianceOut' # Share memory + output_vars[arg_name] = block.var(var_name) + fetch_list.append(var_name) + batch_norm_op = block.append_op( + type="batch_norm", + 
inputs=input_vars, + outputs=output_vars, + attrs=attrs) + # Generate the backward op_desc of batch_norm + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + batch_norm_op.desc, set(), []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + program._sync_with_cpp() + exe = paddle.static.Executor(self.place) + outs = exe.run(program, feed=inputs, fetch_list=fetch_list) + for id, name in enumerate(fetch_list): + self.assertEqual( + np.allclose( + outputs[name], outs[id], atol=1e-4), True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py new file mode 100644 index 0000000000000..cb64cb90e8c2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py @@ -0,0 +1,106 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys + +sys.path.append("..") +import op_test +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + + +class TestCastOp1(op_test.OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + self.inputs = {'X': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float32')} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.op_type = 'cast' + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], ['Out']) + + +class TestCastOp2(op_test.OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + self.inputs = {'X': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float32')} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.op_type = 'cast' + + def test_check_output(self): + #self.check_output(atol=1e-3) + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-3) + + +class TestCastOp3(op_test.OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + self.inputs = {'X': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float32')} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.op_type = 'cast' + + def test_check_output(self): + #self.check_output(atol=1e-3) + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-3) + + +class TestCastOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The input type of 
cast_op must be Variable. + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.XPUPlace(0)) + self.assertRaises(TypeError, fluid.layers.cast, x1, 'int32') + # The input dtype of cast_op must be float32, int32, int64. + x2 = fluid.layers.data(name='x2', shape=[4], dtype='int16') + self.assertRaises(TypeError, fluid.layers.cast, x2, 'int32') + + def test_dtype_type(): + x4 = fluid.layers.data(name='x4', shape=[4], dtype='int32') + output = fluid.layers.cast(x=x4, dtype='int16') + + self.assertRaises(TypeError, test_dtype_type) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py new file mode 100644 index 0000000000000..bb5d7134a1bad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py @@ -0,0 +1,240 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys + +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard, core +import paddle + + +class TestConcatOp(OpTest): + def setUp(self): + self.op_type = "concat" + self.dtype = self.get_dtype() + self.init_test_data() + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = {'axis': self.axis} + if self.axis < 0: + self.actual_axis = self.axis + len(self.x0.shape) + self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 + else: + self.actual_axis = self.axis + + self.outputs = { + 'Out': np.concatenate( + (self.x0, self.x1, self.x2), axis=self.actual_axis) + } + + def get_dtype(self): + return "float64" + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['x0'], 'Out') + self.check_grad_with_place(place, ['x1'], 'Out') + self.check_grad_with_place(place, ['x2'], 'Out') + + def init_test_data(self): + self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype) + self.axis = 1 + + +class TestConcatOp2(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.axis = 1 + + +@skip_check_grad_ci( + reason="The function 'check_grad' for large inputs is too slow.") +class TestConcatOp3(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype) + self.x1 = np.random.random((1, 128, 170, 
256)).astype(self.dtype) + self.x2 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.axis = 1 + + def test_check_grad(self): + pass + + +@skip_check_grad_ci( + reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015." +) +class TestConcatOp4(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((0, 3, 4, 5)).astype(self.dtype) + self.axis = 0 + + def test_check_grad(self): + pass + + +class TestConcatOp5(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype) + self.axis = -3 + + +class TestConcatOp6(TestConcatOp): + def setUp(self): + self.op_type = "concat" + self.dtype = self.get_dtype() + self.init_test_data() + self.lod = [[20, 80]] + self.out_lod = [[20, 80, 20, 80, 20, 80]] + self.inputs = { + 'X': [('x0', (self.x0, self.lod)), ('x1', (self.x1, self.lod)), + ('x2', (self.x2, self.lod))] + } + self.attrs = {'axis': self.axis} + if self.axis < 0: + self.actual_axis = self.axis + len(self.x0.shape) + self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 + else: + self.actual_axis = self.axis + out = np.concatenate((self.x0, self.x1, self.x2), axis=self.actual_axis) + self.outputs = {'Out': (out, self.out_lod)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['x0'], 'Out') + self.check_grad_with_place(place, ['x1'], 'Out') + self.check_grad_with_place(place, ['x2'], 'Out') + + def init_test_data(self): + self.x0 = np.random.random([100]).astype(self.dtype) + self.x1 = np.random.random([100]).astype(self.dtype) + self.x2 = np.random.random([100]).astype(self.dtype) + self.axis = 0 + + +class TestConcatOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The input type of concat_op should be list. + x1 = fluid.layers.data(shape=[4], dtype='int32', name='x1') + fluid.layers.concat(x1) + # The item in input must be Variable. + x2 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + x3 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.concat, [x2]) + # The input dtype of concat_op must be float16, float32, float64, int32, int64. + x4 = fluid.layers.data(shape=[4], dtype='uint8', name='x4') + x5 = fluid.layers.data(shape=[4], dtype='uint8', name='x5') + self.assertRaises(TypeError, fluid.layers.concat, [x4, x5]) + x6 = fluid.layers.data(shape=[4], dtype='float16', name='x6') + x7 = fluid.layers.data(shape=[4], dtype='float16', name='x7') + x8 = fluid.layers.data(shape=[4], dtype='float32', name='x8') + fluid.layers.concat([x6, x7]) + + # The type of axis in concat_op should be int or Variable. 
+ def test_axis_type(): + fluid.layers.concat([x6, x7], 3.2) + + self.assertRaises(TypeError, test_axis_type) + + def test_input_same_dtype(): + fluid.layers.concat([x7, x8]) + + self.assertRaises(TypeError, test_input_same_dtype) + + +class TestConcatAPI(unittest.TestCase): + def test_fluid_api(self): + x_1 = fluid.data(shape=[None, 1, 4, 5], dtype='float32', name='x_1') + fluid.layers.concat([x_1, x_1], 0) + + input_2 = np.random.random([2, 1, 4, 5]).astype("float32") + input_3 = np.random.random([2, 2, 4, 5]).astype("float32") + x_2 = fluid.data(shape=[2, 1, 4, 5], dtype='float32', name='x_2') + x_3 = fluid.data(shape=[2, 2, 4, 5], dtype='float32', name='x_3') + positive_1_int32 = fluid.layers.fill_constant([1], "float32", 1) + positive_1_int64 = fluid.layers.fill_constant([1], "float32", 1) + out_1 = fluid.layers.concat(input=[x_2, x_3], axis=1) + out_2 = fluid.layers.concat(input=[x_2, x_3], axis=1) + out_3 = fluid.layers.concat(input=[x_2, x_3], axis=1) + + exe = fluid.Executor(place=fluid.XPUPlace(0)) + [res_1, res_2, res_3] = exe.run( + fluid.default_main_program(), + feed={"x_1": input_2, + "x_2": input_2, + "x_3": input_3}, + fetch_list=[out_1, out_2, out_3]) + assert np.array_equal(res_1, np.concatenate((input_2, input_3), axis=1)) + assert np.array_equal(res_2, np.concatenate((input_2, input_3), axis=1)) + assert np.array_equal(res_3, np.concatenate((input_2, input_3), axis=1)) + + def test_errors(self): + with program_guard(Program(), Program()): + # The item in input must be Variable. + x2 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.XPUPlace(0)) + x3 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.XPUPlace(0)) + self.assertRaises(TypeError, paddle.concat, [x2]) + # The input dtype of concat_op must be float32. + x4 = fluid.data(shape=[4], dtype='uint8', name='x4') + x5 = fluid.data(shape=[4], dtype='uint8', name='x5') + self.assertRaises(TypeError, fluid.layers.concat, [x4, x5]) + + # The type of axis in concat_op should be int or Variable. + x6 = fluid.layers.data(shape=[4], dtype='float16', name='x6') + x7 = fluid.layers.data(shape=[4], dtype='float16', name='x7') + x8 = fluid.layers.data(shape=[4], dtype='float32', name='x8') + + def test_axis_type(): + paddle.concat([x6, x7], 3.2) + + self.assertRaises(TypeError, test_axis_type) + + def test_input_same_dtype(): + paddle.concat([x7, x8]) + + self.assertRaises(TypeError, test_input_same_dtype) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py new file mode 100644 index 0000000000000..f826448c59664 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py @@ -0,0 +1,600 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
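# Editor's note: a minimal sketch of the output-size and SAME-padding
# arithmetic used by the conv2d_forward_naive reference below; the concrete
# sizes (5x5 input, 3x3 kernel, stride 1) are illustrative assumptions.
def same_padding(in_size, k, stride):
    # total padding needed so the output keeps ceil(in_size / stride) elements
    out_size = (in_size + stride - 1) // stride
    pad_sum = max((out_size - 1) * stride + k - in_size, 0)
    return pad_sum // 2, pad_sum - pad_sum // 2


def conv_out_size(in_size, k, pad_0, pad_1, stride, dilation=1):
    return 1 + (in_size + pad_0 + pad_1 - (dilation * (k - 1) + 1)) // stride


pad_0, pad_1 = same_padding(5, 3, 1)
print(conv_out_size(5, 3, pad_0, pad_1, 1))  # SAME keeps the spatial size: 5
print(conv_out_size(5, 3, 0, 0, 1))          # no padding (VALID): 3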
+ +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import numpy as np + +import paddle.fluid.core as core +import paddle.fluid as fluid +from op_test import OpTest +import paddle +from paddle.fluid import Program, program_guard + + +def conv2d_forward_naive(input, + filter, + group, + conv_param, + padding_algorithm='EXPLICIT', + data_format='NCHW'): + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError("Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." % + str(padding_algorithm)) + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError("Unknown Attr(data_format): '%s' ." + "It can only be 'NCHW' or 'NHWC'." % str(data_format)) + + channel_last = (data_format == "NHWC") + if channel_last: + input = np.transpose(input, [0, 3, 1, 2]) + + in_n, in_c, in_h, in_w = input.shape + f_n, f_c, f_h, f_w = filter.shape + out_n = in_n + out_c = f_n + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + sub_out_c = out_c // group + sub_f_n = f_n // group + + stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[ + 'dilation'] + + # update pad and dilation + def _get_padding_with_SAME(input_shape, pool_size, pool_stride): + padding = [] + for input_size, filter_size, stride_size in zip(input_shape, pool_size, + pool_stride): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max(( + (out_size - 1) * stride_size + filter_size - input_size, 0)) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + ksize = filter.shape[2:4] + if padding_algorithm == "VALID": + pad = [0, 0, 0, 0] + elif padding_algorithm == "SAME": + dilation = [1, 1] + input_data_shape = input.shape[2:4] + pad = _get_padding_with_SAME(input_data_shape, ksize, stride) + + pad_h_0, pad_h_1 = pad[0], pad[0] + pad_w_0, pad_w_1 = pad[1], pad[1] + if len(pad) == 4: + pad_h_0, pad_h_1 = pad[0], pad[1] + pad_w_0, pad_w_1 = pad[2], pad[3] + out_h = 1 + (in_h + pad_h_0 + pad_h_1 - (dilation[0] * + (f_h - 1) + 1)) // stride[0] + out_w = 1 + (in_w + pad_w_0 + pad_w_1 - (dilation[1] * + (f_w - 1) + 1)) // stride[1] + out = np.zeros((out_n, out_c, out_h, out_w)) + + d_bolck_h = (dilation[0] * (f_h - 1) + 1) + d_bolck_w = (dilation[1] * (f_w - 1) + 1) + + input_pad = np.pad(input, ((0, 0), (0, 0), (pad_h_0, pad_h_1), + (pad_w_0, pad_w_1)), + mode='constant', + constant_values=0) + + filter_dilation = np.zeros((f_n, f_c, d_bolck_h, d_bolck_w)) + filter_dilation[:, :, 0:d_bolck_h:dilation[0], 0:d_bolck_w:dilation[ + 1]] = filter + + for i in range(out_h): + for j in range(out_w): + for g in range(group): + input_pad_masked = \ + input_pad[:, g * f_c:(g + 1) * f_c, + i * stride[0]:i * stride[0] + d_bolck_h, + j * stride[1]:j * stride[1] + d_bolck_w] + + f_sub = filter_dilation[g * sub_f_n:(g + 1) * sub_f_n, :, :, :] + # sub_f_n == sub_out_c + for k in range(sub_out_c): + # Multiplication of Corresponding Elements, then sum all + out[:, g * sub_out_c + k, i, j] = \ + np.sum(input_pad_masked * f_sub[k, :, :, :], + axis=(1, 2, 3)) + + if channel_last: + out = np.transpose(out, [0, 2, 3, 1]) + + return out, in_n, out_h, out_w, out_c + + +def create_test_channel_last_class(parent): + class TestChannelLastCase(parent): + def init_data_format(self): + self.data_format = "NHWC" + + def init_test_case_2(self): + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + + cls_name = "{0}_{1}".format(parent.__name__, 
"ChannelLast") + TestChannelLastCase.__name__ = cls_name + globals()[cls_name] = TestChannelLastCase + + +def create_test_padding_SAME_class(parent): + class TestPaddingSMAECase(parent): + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "SAME" + + cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp") + TestPaddingSMAECase.__name__ = cls_name + globals()[cls_name] = TestPaddingSMAECase + + +def create_test_padding_VALID_class(parent): + class TestPaddingVALIDCase(parent): + def init_paddings(self): + self.pad = [1, 1] + self.padding_algorithm = "VALID" + + cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp") + TestPaddingVALIDCase.__name__ = cls_name + globals()[cls_name] = TestPaddingVALIDCase + + +class TestConv2dOp(OpTest): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False + self.fuse_relu_before_depthwise_conv = False + self.data_format = "AnyLayout" + self.dtype = np.float64 + self.init_kernel_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + input = np.random.random(self.input_size).astype(self.dtype) + if not self.has_cuda(): + self.fuse_relu_before_depthwise_conv = False + if self.fuse_relu_before_depthwise_conv: + input = input - 0.5 + input -= (input < 0) * 0.1 + input += (input >= 0) * 0.1 + input2 = np.maximum(input, 0.0) + else: + input2 = input + filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) + + output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups, + conv2d_param) + output = output.astype(self.dtype) + + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'fuse_relu_before_depthwise_conv': + self.fuse_relu_before_depthwise_conv, + 'exhaustive_search': self.exhaustive_search + } + self.outputs = {'Output': output} + + def has_cuda(self): + return core.is_compiled_with_cuda() and (self.use_cudnn or + self.use_cuda) + + def test_check_output(self): + if core.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output') + + def test_check_grad_no_filter(self): + if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set=set(['Filter'])) + + def test_check_grad_no_input(self): + if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and + self.no_need_check_grad == True): + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set=set(['Input'])) + + def init_test_case(self): + self.pad = [0, 0] + 
self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_test_case_2(self): + pass + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_kernel_type(self): + pass + + +class TestWithPad(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + +class TestWithStride(TestConv2dOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + +class TestWithGroup(TestConv2dOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.group = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [18, f_c, 3, 3] + + +class TestWith1x1(TestConv2dOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [120, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + +class TestWithDilation(TestConv2dOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + +class TestWithInput1x1Filter1x1(TestConv2dOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [100, 3, 1, 1] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [120, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + +# Please Don't remove the following code. +# Currently, CI use cudnn V5.0 which not support dilation conv. 
+# class TestCUDNNWithDilation(TestWithDilation): +# def init_op_type(self): +# self.op_type = "conv_cudnn" + +# ---- test asymmetric padding ---- + + +class TestConv2dOp_v2(OpTest): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False + self.fuse_relu_before_depthwise_conv = False + self.dtype = np.float64 + self.init_kernel_type() + self.init_group() + self.init_dilation() + self.init_data_format() + self.init_test_case() + self.init_paddings() + self.init_test_case_2() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + input = np.random.random(self.input_size).astype(self.dtype) + if not self.has_cuda(): + self.fuse_relu_before_depthwise_conv = False + if self.fuse_relu_before_depthwise_conv: + input = input - 0.5 + input -= (input < 0) * 0.1 + input += (input >= 0) * 0.1 + input2 = np.maximum(input, 0.0) + else: + input2 = input + filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) + output, _, _, _, _ = conv2d_forward_naive( + input2, filter, self.groups, conv2d_param, self.padding_algorithm, + self.data_format) + output = output.astype(self.dtype) + + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'fuse_relu_before_depthwise_conv': + self.fuse_relu_before_depthwise_conv, + 'exhaustive_search': self.exhaustive_search + } + self.outputs = {'Output': output} + + def has_cuda(self): + return core.is_compiled_with_cuda() and (self.use_cudnn or + self.use_cuda) + + def test_check_output(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if core.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output') + + def test_check_grad_no_filter(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set=set(['Filter'])) + + def test_check_grad_no_input(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + if core.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set=set(['Input'])) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 2] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 4, 3] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_kernel_type(self): + pass + + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + + def init_data_format(self): + self.data_format = 
"NCHW" + + def init_test_case_2(self): + pass + + +class TestConv2dOp_AsyPadding(TestConv2dOp_v2): + def init_paddings(self): + self.pad = [0, 0, 1, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithPad_AsyPadding(TestConv2dOp_v2): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_paddings(self): + self.pad = [2, 1, 3, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithStride_AsyPadding(TestConv2dOp_v2): + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_paddings(self): + self.pad = [2, 1, 3, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithGroup_AsyPadding(TestConv2dOp_v2): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.group = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 4, 3] + + +class TestWith1x1_AsyPadding(TestConv2dOp_v2): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [120, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [2, 2, 4, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDilation_AsyPadding(TestConv2dOp_v2): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 1, 3, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithInput1x1Filter1x1_AsyPadding(TestConv2dOp_v2): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [40, 3, 1, 1] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [120, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 3, 4, 0] + self.padding_algorithm = "EXPLICIT" + + +#---------- test SAME VALID ----------- +create_test_padding_SAME_class(TestConv2dOp_AsyPadding) +create_test_padding_SAME_class(TestWithPad_AsyPadding) +create_test_padding_SAME_class(TestWithStride_AsyPadding) +create_test_padding_SAME_class(TestWithGroup_AsyPadding) +create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding) + +create_test_padding_VALID_class(TestConv2dOp_AsyPadding) +create_test_padding_VALID_class(TestWithPad_AsyPadding) +create_test_padding_VALID_class(TestWithStride_AsyPadding) +create_test_padding_VALID_class(TestWithGroup_AsyPadding) +create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding) + +# ------------ test channel last --------- +create_test_channel_last_class(TestConv2dOp_AsyPadding) +create_test_channel_last_class(TestWithPad_AsyPadding) +create_test_channel_last_class(TestWithGroup_AsyPadding) +create_test_channel_last_class(TestWith1x1_AsyPadding) 
+create_test_channel_last_class(TestWithInput1x1Filter1x1_AsyPadding) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py new file mode 100644 index 0000000000000..6c3368c3b6bfc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py @@ -0,0 +1,112 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +class TestDropoutOp(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64)).astype("float32")} + self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False} + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64)).astype('uint8') + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +class TestDropoutOpInput1d(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((2000, )).astype("float32")} + self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False} + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((2000)).astype('uint8') + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +class TestDropoutOp2(TestDropoutOp): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64)).astype("float32")} + self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False} + self.outputs = { + 'Out': np.zeros((32, 64)).astype('float32'), + 'Mask': np.zeros((32, 64)).astype('uint8') + } + + +class TestDropoutOp3(TestDropoutOp): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} + self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False} + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64, 2)).astype('uint8') + } + + +class TestDropoutOp6(TestDropoutOp): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 
'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64, 2)).astype('uint8') + } + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py new file mode 100644 index 0000000000000..cb6e412cb0f01 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest, skip_check_grad_ci +from elementwise import TestXPUElementwiseOpBase +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseDivOp(OpTest, TestXPUElementwiseOpBase): + def setUp(self): + TestXPUElementwiseOpBase.setUp(self, "elementwise_div") + self.make_input() + self.make_output() + + def make_output(self, x_shape=None, y_shape=None): + x, y = self.reshape_input(x_shape, y_shape) + self.outputs = {'Out': np.divide(x, y)} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_scalar(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_scalar, self).setUp() + self.grad_implemented = False + self.make_input([20, 3, 4], [1]) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_Vector(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_Vector, self).setUp() + self.make_input([100, ], [100, ]) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_broadcast_0(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_broadcast_0, self).setUp() + self.attrs['axis'] = 0 + self.make_input([100, 3, 4], [100, ]) + self.make_output(y_shape=[100, 1, 1]) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_broadcast_1(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_broadcast_1, self).setUp() + self.attrs['axis'] = 1 + self.make_input([2, 100, 4], [100, ]) + self.make_output(y_shape=[1, 100, 1]) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_broadcast_2(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_broadcast_2, self).setUp() + self.make_input([2, 3, 100], [100, ]) + self.make_output(y_shape=[1, 1, 100]) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_broadcast_3(TestXPUElementwiseDivOp): + def setUp(self): + 
super(TestElementwiseDivOp_broadcast_3, self).setUp() + self.attrs['axis'] = 1 + self.make_input([2, 10, 12, 5], [10, 12]) + self.make_output(y_shape=[1, 10, 12, 1]) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_broadcast_4(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_broadcast_4, self).setUp() + self.is_common_broadcast = True + self.make_input([2, 3, 50], [2, 1, 50]) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_broadcast_5(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_broadcast_5, self).setUp() + self.is_common_broadcast = True + self.make_input([2, 3, 4, 20], [2, 3, 1, 20]) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_commonuse_1(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_commonuse_1, self).setUp() + self.is_common_broadcast = True + self.make_input([2, 3, 100], [1, 1, 100]) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseDivOp_xsize_lessthan_ysize(TestXPUElementwiseDivOp): + def setUp(self): + super(TestElementwiseDivOp_xsize_lessthan_ysize, self).setUp() + self.is_x_size_less_than_y = True + self.attrs['axis'] = 2 + self.make_input([10, 12], [2, 3, 10, 12]) + self.make_output(x_shape=[1, 1, 10, 12]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py new file mode 100644 index 0000000000000..340c5895c1359 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py @@ -0,0 +1,129 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
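+# Note on this file: Y is generated as X plus a small signed offset so the two
+# inputs never tie, and outputs are checked against np.maximum for scalar,
+# vector and broadcast shapes on the XPU elementwise_max kernel.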
+import sys +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest, skip_check_grad_ci +import paddle +from elementwise import TestXPUElementwiseOpBase +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseOp(OpTest, TestXPUElementwiseOpBase): + def setUp(self): + TestXPUElementwiseOpBase.setUp(self, "elementwise_max") + self.make_input() + self.make_output() + + def make_input(self, x_shape=[13, 17], y_shape=[13, 17], idx_list=None): + x = np.random.random(x_shape).astype(self.dtype) + sgn = np.random.choice([-1, 1], y_shape).astype(self.dtype) + if idx_list is None: + y = x + sgn * np.random.uniform(0.1, 1, y_shape).astype(self.dtype) + else: + x_temp = x + for idx in idx_list: + x_temp = np.take(x_temp, [0], axis=idx) + sgn = sgn.reshape(x_temp.shape) + y = x_temp + sgn * np.random.uniform(0.1, 1, x_temp.shape) + y = y.reshape(y_shape).astype(self.dtype) + + self.inputs = {'X': x, 'Y': y} + + def make_output(self, x_shape=None, y_shape=None): + x, y = self.reshape_input(x_shape, y_shape) + self.outputs = {'Out': np.maximum(x, y)} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMaxOp_scalar(TestXPUElementwiseOp): + def setUp(self): + super(TestElementwiseMaxOp_scalar, self).setUp() + self.make_input([2, 3, 20], [1]) + self.make_output() + self.grad_implemented = False + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMaxOp_Vector(TestXPUElementwiseOp): + def setUp(self): + super(TestElementwiseMaxOp_Vector, self).setUp() + self.make_input([100, ], [100, ]) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMaxOp_broadcast_0(TestXPUElementwiseOp): + def setUp(self): + super(TestElementwiseMaxOp_broadcast_0, self).setUp() + self.attrs['axis'] = 0 + self.make_input([100, 5, 2], [100, ], [1, 2]) + self.make_output(y_shape=[100, 1, 1]) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMaxOp_broadcast_1(TestXPUElementwiseOp): + def setUp(self): + super(TestElementwiseMaxOp_broadcast_1, self).setUp() + self.attrs['axis'] = 1 + self.make_input([2, 100, 3], [100, ], [0, 2]) + self.make_output(y_shape=[1, 100, 1]) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMaxOp_broadcast_2(TestXPUElementwiseOp): + def setUp(self): + super(TestElementwiseMaxOp_broadcast_2, self).setUp() + self.make_input([1, 3, 100], [100, ], [0, 1]) + self.make_output(y_shape=[1, 1, 100]) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMaxOp_broadcast_3(TestXPUElementwiseOp): + def setUp(self): + super(TestElementwiseMaxOp_broadcast_3, self).setUp() + self.attrs['axis'] = 1 + self.make_input([2, 50, 2, 1], [50, 2], [0, 3]) + self.make_output(y_shape=[1, 50, 2, 1]) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMaxOp_broadcast_4(TestXPUElementwiseOp): + def setUp(self): + super(TestElementwiseMaxOp_broadcast_4, self).setUp() + self.make_input([2, 3, 4, 5], [2, 3, 1, 5]) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class 
TestElementwiseMaxOp_broadcast_5(TestXPUElementwiseOp): + def setUp(self): + super(TestElementwiseMaxOp_broadcast_5, self).setUp() + self.make_input([2, 3, 100], [1, 1, 100]) + self.make_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py new file mode 100644 index 0000000000000..3fa9c6d84e24d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py @@ -0,0 +1,153 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +from elementwise import TestXPUElementwiseOpBase +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseMulOp(OpTest, TestXPUElementwiseOpBase): + def init_kernel_type(self): + self.use_mkldnn = False + + def setUp(self): + TestXPUElementwiseOpBase.setUp(self, "elementwise_mul") + self.init_kernel_type() + self.init_axis() + self.attrs['axis'] = self.axis + self.attrs['use_mkldnn'] = self.use_mkldnn + self.grad_implemented = True + self.make_input() + self.make_output() + + def make_output(self, x_shape=None, y_shape=None): + x, y = self.reshape_input(x_shape, y_shape) + self.outputs = {'Out': np.multiply(x, y)} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseMulOp_scalar(TestXPUElementwiseMulOp): + def setUp(self): + super(TestXPUElementwiseMulOp_scalar, self).setUp() + self.make_input((10, 3, 4), (1, )) + self.make_output() + self.grad_implemented = False + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseMulOp_Vector(TestXPUElementwiseMulOp): + def setUp(self): + super(TestXPUElementwiseMulOp_Vector, self).setUp() + self.make_input((100, ), (100, )) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseMulOp_broadcast_0(TestXPUElementwiseMulOp): + def setUp(self): + super(TestXPUElementwiseMulOp_broadcast_0, self).setUp() + self.make_input((100, 2, 3), (100, )) + self.make_output(y_shape=(100, 1, 1)) + self.y_grad_implemented = False + + def init_axis(self): + self.axis = 0 + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMulOp_broadcast_1(TestXPUElementwiseMulOp): + def setUp(self): + super(TestElementwiseMulOp_broadcast_1, self).setUp() + self.attrs['axis'] = 1 + self.y_grad_implemented = False + self.make_input((2, 100, 3), (100, )) + self.make_output(y_shape=(1, 100, 1)) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core 
is not compiled with XPU") +class TestElementwiseMulOp_broadcast_2(TestXPUElementwiseMulOp): + def setUp(self): + super(TestElementwiseMulOp_broadcast_2, self).setUp() + self.y_grad_implemented = False + self.make_input((2, 3, 100), (100, )) + self.make_output(y_shape=(1, 1, 100)) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMulOp_broadcast_3(TestXPUElementwiseMulOp): + def setUp(self): + super(TestElementwiseMulOp_broadcast_3, self).setUp() + self.attrs['axis'] = 1 + self.y_grad_implemented = False + self.make_input((2, 10, 12, 3), (10, 12)) + self.make_output(y_shape=(1, 10, 12, 1)) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMulOp_broadcast_4(TestXPUElementwiseMulOp): + def setUp(self): + super(TestElementwiseMulOp_broadcast_4, self).setUp() + self.is_common_broadcast = True + self.make_input((10, 2, 11), (10, 1, 11)) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseMulOp_broadcast_5(TestXPUElementwiseMulOp): + def setUp(self): + super(TestElementwiseMulOp_broadcast_5, self).setUp() + self.is_common_broadcast = True + self.make_input((10, 4, 2, 3), (10, 4, 1, 3)) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseMulOp_commonuse_1(TestXPUElementwiseMulOp): + def setUp(self): + super(TestXPUElementwiseMulOp_commonuse_1, self).setUp() + self.is_common_broadcast = True + self.make_input((2, 3, 100), (1, 1, 100)) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseMulOp_xsize_lessthan_ysize(TestXPUElementwiseMulOp): + def setUp(self): + super(TestXPUElementwiseMulOp_xsize_lessthan_ysize, self).setUp() + self.attrs['axis'] = 2 + self.is_x_size_less_than_y = True + self.make_input((10, 10), (2, 2, 10, 10)) + self.make_output(x_shape=(1, 1, 10, 10)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py new file mode 100644 index 0000000000000..22aa07be951a9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py @@ -0,0 +1,128 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
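+# Note on this file: the XPU elementwise_sub kernel is checked against plain
+# NumPy subtraction (x - y) for scalar, vector and broadcast input shapes.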
+import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +from elementwise import TestXPUElementwiseOpBase +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseSubOp(OpTest, TestXPUElementwiseOpBase): + def setUp(self): + TestXPUElementwiseOpBase.setUp(self, "elementwise_sub") + self.make_input() + self.make_output() + self.grad_implemented = True + + def make_output(self, x_shape=None, y_shape=None): + x, y = self.reshape_input(x_shape, y_shape) + self.outputs = {'Out': x - y} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_scalar(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_scalar, self).setUp() + self.grad_implemented = False + self.make_input((10, 3, 4), (1, )) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_Vector(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_Vector, self).setUp() + self.make_input((100, ), (100, )) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_broadcast_0(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_broadcast_0, self).setUp() + self.attrs['axis'] = 0 + self.make_input((100, 3, 2), (100, )) + self.make_output(y_shape=(100, 1, 1)) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_broadcast_1(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_broadcast_1, self).setUp() + self.attrs['axis'] = 1 + self.make_input((2, 100, 3), (100, )) + self.make_output(y_shape=(1, 100, 1)) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_broadcast_2(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_broadcast_2, self).setUp() + self.make_input((2, 3, 100), (100, )) + self.make_output(y_shape=(1, 1, 100)) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_broadcast_3(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_broadcast_3, self).setUp() + self.attrs['axis'] = 1 + self.make_input((2, 10, 12, 3), (10, 12)) + self.make_output(y_shape=(1, 10, 12, 1)) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_broadcast_4(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_broadcast_4, self).setUp() + self.is_common_broadcast = True + self.make_input((2, 5, 3, 12), (2, 5, 1, 12)) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_commonuse_1(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_commonuse_1, self).setUp() + self.is_common_broadcast = True + self.make_input((2, 3, 100), (1, 1, 100)) + self.make_output() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestElementwiseSubOp_xsize_lessthan_ysize(TestXPUElementwiseSubOp): + def setUp(self): + super(TestElementwiseSubOp_xsize_lessthan_ysize, self).setUp() + self.attrs['axis'] = 2 + self.is_x_size_less_than_y = 
True + self.make_input((10, 12), (2, 3, 10, 12)) + self.make_output(x_shape=(1, 1, 10, 12)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py new file mode 100644 index 0000000000000..b31c80ee9e7e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py @@ -0,0 +1,241 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import unittest +from op_test import OpTest + +import paddle +import numpy as np + + +# Situation 1: Attr(shape) is a list(without tensor) +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp1(OpTest): + def setUp(self): + '''Test fill_constant op with specified value''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'dtype': 5, 'value': 3.8} + self.outputs = {'Out': np.full((123, 92), 3.8)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp2(OpTest): + def setUp(self): + '''Test fill_constant op with default value''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'dtype': 5} + self.outputs = {'Out': np.full((123, 92), 0.0)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp3(OpTest): + def setUp(self): + '''Test fill_constant op with specified int64 value''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'dtype': 3, 'value': 10000000000} + self.outputs = {'Out': np.full((123, 92), 10000000000)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp4(OpTest): + def setUp(self): + '''Test fill_constant op with specified int value''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'dtype': 2, 'value': 3} + self.outputs = {'Out': np.full((123, 92), 3)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +# Situation 2: Attr(shape) is a list(with tensor) +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp1_ShapeTensorList(OpTest): + def setUp(self): + '''Test fill_constant op with specified value''' + self.op_type = "fill_constant" + self.init_data() + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + 
shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = { + 'shape': self.infer_shape, + 'dtype': 5, + 'value': self.value + } + self.outputs = {'Out': np.full(self.shape, self.value)} + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, 92] + self.value = 3.8 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp2_ShapeTensorList(OpTest): + def setUp(self): + '''Test fill_constant op with default value''' + self.op_type = "fill_constant" + self.init_data() + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = {'shape': self.infer_shape, 'dtype': 5} + self.outputs = {'Out': np.full(self.shape, 0.0)} + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, -1] + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp3_ShapeTensorList(TestFillConstantOp1_ShapeTensorList): + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.value = 10000000000 + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp4_ShapeTensorList(TestFillConstantOp1_ShapeTensorList): + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.value = 3 + + +# Situation 3: shape is a tensor +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp1_ShapeTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + self.attrs = {'value': self.value, 'dtype': 5} + self.outputs = {'Out': np.full(self.shape, self.value)} + + def init_data(self): + self.shape = [123, 92] + self.value = 3.8 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +# Situation 4: value is a tensor +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp1_ValueTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = { + "ShapeTensor": np.array(self.shape).astype("int32"), + 'ValueTensor': np.array([self.value]).astype("float32") + } + self.attrs = {'value': self.value + 1.0, 'dtype': 5} + self.outputs = {'Out': np.full(self.shape, self.value)} + + def init_data(self): + self.shape = [123, 92] + self.value = 3.8 + self.dtype = np.float32 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +# Situation 5: value is a tensor +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestFillConstantOp2_ValueTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = { + "ShapeTensor": np.array(self.shape).astype("int32"), + 
'ValueTensor': np.array([self.value]).astype("int32") + } + self.attrs = {'value': self.value, 'dtype': 2} + self.outputs = {'Out': np.full(self.shape, self.value)} + + def init_data(self): + self.shape = [123, 92] + self.value = 3 + self.dtype = np.int32 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py new file mode 100644 index 0000000000000..9bea33e484e19 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid + + +def gather_numpy(x, index, axis): + x_transpose = np.swapaxes(x, 0, axis) + tmp_gather = x_transpose[index, ...] + gather = np.swapaxes(tmp_gather, 0, axis) + return gather + + +class TestGatherOp(OpTest): + def setUp(self): + self.op_type = "gather" + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + 'X': xnp, + 'Index': np.array(self.index).astype(self.index_type) + } + self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 20) + self.x_type = "float64" + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestXPUGatherOp(OpTest): + def setUp(self): + self.op_type = "gather" + self.dtype = np.float32 + self.attrs = {'use_xpu': True} + + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + 'X': xnp, + 'Index': np.array(self.index).astype(self.index_type) + } + self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} + + def test_check_output(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 20) + self.x_type = self.dtype + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase1(TestXPUGatherOp): + def config(self): + """ + For one dimension input + """ + self.x_shape = (100) + self.x_type = "float32" + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase2(TestXPUGatherOp): + def config(self): + """ + For int64_t index type + """ + self.x_shape = (100) + self.x_type = "float32" + self.index = [1, 3, 5] + 
self.index_type = "int32" + + +class TestCase3(TestXPUGatherOp): + def config(self): + """ + For other input type + """ + self.x_shape = (10, 20) + self.x_type = "float32" + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase4(TestXPUGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {'use_xpu': True, 'overwrite': False} + self.x_type = "float32" + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase5(TestXPUGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {'use_xpu': True, 'overwrite': False} + self.x_type = "float32" + self.index = [1, 1, 3] + self.index_type = "int32" + + +class TestCase6(TestXPUGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {'use_xpu': True, 'overwrite': True} + self.x_type = "float32" + self.index = [1, 3] + self.index_type = "int32" + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py new file mode 100644 index 0000000000000..454c3144908cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py @@ -0,0 +1,43 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.executor import Executor +from op_test import OpTest +from test_gaussian_random_op import TestGaussianRandomOp + +paddle.enable_static() + + +class TestXPUGaussianRandomOp(TestGaussianRandomOp): + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + outs = self.calc_output(place) + outs = [np.array(out) for out in outs] + outs.sort(key=len) + self.verify_output(outs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py new file mode 100644 index 0000000000000..b166661c3d6bc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
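+# Note on this file: ref_layer_norm below flattens the input around
+# begin_norm_axis and normalizes with NumPy mean/var; the XPU layer_norm
+# forward results and the X gradient are compared against it for several
+# shapes and axis choices.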
+ +import paddle +import numpy as np +import sys +import unittest +from functools import reduce +sys.path.append("..") +from op_test import OpTest +from operator import mul + +paddle.enable_static() + + +def ref_layer_norm(x, scale, bias, epsilon, begin_norm_axis=1): + x_shape = x.shape + left = reduce(mul, x_shape[0:begin_norm_axis], 1) + right = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + x.shape = [left, right] + mean = np.mean(x, axis=1) + variance = np.var(x, axis=1) + epsilon + y = np.divide((x - mean.reshape([left, 1])), + (np.sqrt(variance)).reshape([left, 1])) + if scale is not None: + y = scale.reshape([1, right]) * y + if bias is not None: + y = y + bias.reshape([1, right]) + x.shape, y.shape = x_shape, x_shape + return y, mean, variance + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPULayerNormOp(OpTest): + def setUp(self): + self.op_type = "layer_norm" + self.dtype = np.float32 + self.shape = [2, 3, 4, 5] + self.epsilon = 1e-05 + self.begin_norm_axis = 1 + self.set_attrs() + + right = reduce(mul, self.shape[self.begin_norm_axis:len(self.shape)], 1) + np.random.seed(10) + x_np = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + scale_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) + bias_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) + ref_y_np, ref_mean_np, ref_variance_np = ref_layer_norm( + x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis) + + self.inputs = {'X': x_np, 'Scale': scale_np, 'Bias': bias_np} + self.outputs = { + 'Y': ref_y_np, + 'Mean': ref_mean_np, + 'Variance': ref_variance_np + } + self.attrs = {'begin_norm_axis': self.begin_norm_axis, 'use_xpu': True} + + def set_attrs(self): + pass + + def test_check_output(self): + self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) + + def test_check_grad(self): + self.check_grad_with_place( + paddle.XPUPlace(0), ['X'], 'Y', max_relative_error=0.02) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPULayerNormOpAxis2(TestXPULayerNormOp): + def set_attrs(self): + self.begin_norm_axis = 2 + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPULayerNormOpAxis3(TestXPULayerNormOp): + def set_attrs(self): + self.begin_norm_axis = 3 + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPULayerNormOp2D(TestXPULayerNormOp): + def set_attrs(self): + self.shape = [10, 12] + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPULayerNormOp3D(TestXPULayerNormOp): + def set_attrs(self): + self.shape = [4, 5, 6] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py new file mode 100644 index 0000000000000..3ba3a8b5eef30 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py @@ -0,0 +1,65 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import paddle.fluid.core as core +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def sigmoid_array(x): + return 1 / (1 + np.exp(-x)) + + +class TestXPULogLossOp(OpTest): + def setUp(self): + self.op_type = 'log_loss' + samples_num = 100 + + x = np.random.random((samples_num, 1)).astype("float32") + predicted = sigmoid_array(x) + labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32") + epsilon = 1e-7 + self.inputs = { + 'Predicted': predicted, + 'Labels': labels, + } + + self.attrs = {'epsilon': epsilon} + loss = -labels * np.log(predicted + epsilon) - ( + 1 - labels) * np.log(1 - predicted + epsilon) + self.outputs = {'Loss': loss} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad(['Predicted'], 'Loss') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py new file mode 100644 index 0000000000000..0a33c875bf30c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py @@ -0,0 +1,223 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
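+# Note on this file: covers dense and tensor-id lookups, the padding_idx path,
+# cases driven through the raw lookup_table_v2 Operator, and error checks for
+# the fluid.embedding API on XPU.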
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +import paddle.compat as cpt +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + +paddle.enable_static() + + +class TestDygraphEmbeddingAPIError(unittest.TestCase): + def test_errors(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + dict_size = 20 + layer = fluid.dygraph.nn.Embedding( + size=[dict_size, 32], param_attr='emb.w', is_sparse=False) + # the input must be Variable + x0 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], paddle.XPUPlace(0)) + self.assertRaises(TypeError, layer, x0) + # the input dtype must be int64 + data_t = fluid.data(name='word', shape=[1], dtype='int32') + self.assertRaises(TypeError, layer, data_t) + + +class TestLookupTableOp(OpTest): + def setUp(self): + self.op_type = "lookup_table_v2" + table = np.random.random((17, 31)).astype("float64") + ids = np.random.randint(0, 17, 4).astype("int64") + self.inputs = {'W': table, 'Ids': ids} + self.outputs = {'Out': table[ids]} + + def test_check_output_with_place(self): + self.check_output_with_place(place=paddle.XPUPlace(0)) + + def test_check_grad(self): + + self.check_grad_with_place( + inputs_to_check=['W'], + output_names='Out', + no_grad_set=set('Ids'), + place=paddle.XPUPlace(0), + in_place=True) + + +class TestLookupTableOpWithTensorIds(OpTest): + def setUp(self): + self.op_type = "lookup_table_v2" + table = np.random.random((17, 31)).astype("float64") + ids = np.random.randint(low=0, high=17, size=(2, 4, 5)).astype("int32") + self.inputs = {'W': table, 'Ids': ids} + self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} + + def test_check_output(self): + self.check_output_with_place(place=paddle.XPUPlace(0)) + + def test_check_grad(self): + self.check_grad_with_place( + inputs_to_check=['W'], + output_names='Out', + no_grad_set=set('Ids'), + place=paddle.XPUPlace(0), + in_place=True) + + +@skip_check_grad_ci( + reason="Since paddings are not trainable and fixed in forward," + "the gradient of paddings makes no sense and we don't " + "test the gradient here.") +class TestLookupTableOpWithPadding(TestLookupTableOp): + def test_check_output(self): + ids = np.squeeze(self.inputs['Ids']) + padding_idx = np.random.choice(ids, 1)[0] + self.outputs['Out'][ids == padding_idx] = np.zeros(31) + self.attrs = {'padding_idx': int(padding_idx)} + self.check_output_with_place(place=paddle.XPUPlace(0)) + + +@skip_check_grad_ci( + reason="Since paddings are not trainable and fixed in forward," + "the gradient of paddings makes no sense and we don't " + "test the gradient here.") +class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds): + def test_check_output(self): + ids = self.inputs['Ids'] + flatten_idx = ids.flatten() + padding_idx = np.random.choice(flatten_idx, 1)[0] + self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) + self.attrs = {'padding_idx': cpt.long_type(padding_idx)} + self.check_output_with_place(place=paddle.XPUPlace(0)) + + +class TestLookupTableWIsSelectedRows(unittest.TestCase): + def prepare_ids(self, scope, place): + ids_tensor = scope.var('Ids').get_tensor() + ids_array = np.array([0, 4, 3, 5]).astype("int64") + ids_tensor.set(ids_array, place) + return ids_array + + def prepare_w(self, scope, place): + rows 
= [0, 1, 2, 3, 4, 5, 6] + row_numel = 12 + w_selected_rows = scope.var('W') + w_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + w_array[i] *= i + w_tensor = w_selected_rows.get_tensor() + w_tensor.set(w_array, place) + + def create_out_tensor(self, scope, place): + return scope.var('Out').get_tensor() + + def check_result(self, ids_array, result_array): + # all(): return True if all elements of the iterable are true (or if the iterable is empty) + for idx, row in enumerate(ids_array): + assert (row == result_array[idx]).all() + + def check_with_place(self, place): + scope = core.Scope() + ids_array = self.prepare_ids(scope, place) + + self.prepare_w(scope, place) + + out_tensor = self.create_out_tensor(scope, place) + + # create and run lookup_table_v2 operator + lookup_table = Operator("lookup_table_v2", W='W', Ids='Ids', Out='Out') + lookup_table.run(scope, place) + + # get result from Out + result_array = np.array(out_tensor) + + self.check_result(ids_array, result_array) + + def test_w_is_selected_rows(self): + places = [paddle.XPUPlace(0)] + for place in places: + self.check_with_place(place) + + +class TestLookupTableWithTensorIdsWIsSelectedRows( + TestLookupTableWIsSelectedRows): + def prepare_ids(self, scope, place): + ids_tensor = scope.var('Ids').get_tensor() + ids_array = np.random.randint( + low=0, high=6, size=(2, 4, 3)).astype("int64") + ids_tensor.set(ids_array, place) + return ids_array + + def check_result(self, ids_array, result_array): + for idx, row in np.ndenumerate(ids_array): + assert (row == result_array[idx]).all() + + +class TestLookupTableApi(unittest.TestCase): + def test_api(self): + x = fluid.layers.data(name='x', shape=[20], dtype='int64') + emb = fluid.embedding(input=x, size=[128, 64]) + + place = paddle.XPUPlace(0) + x_data = np.random.randint(0, 127, [2, 20]).astype("int64") + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + ret = exe.run(feed={'x': x_data, }, + fetch_list=[emb], + return_numpy=False) + + +class TestEmbedOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + input_data = np.random.randint(0, 10, (4, 6)).astype("int64") + + def test_Variable(): + # the input type must be Variable + fluid.embedding(input=input_data, size=(10, 64)) + + self.assertRaises(TypeError, test_Variable) + + def test_input_dtype(): + # the input dtype must be int64 + input = fluid.data(name='x1', shape=[4, 6], dtype='float32') + fluid.embedding(input=input, size=(10, 64)) + + self.assertRaises(TypeError, test_input_dtype) + + def test_param_dtype(): + # dtype must be float32 or float64 + input2 = fluid.data(name='x2', shape=[4, 6], dtype='int64') + fluid.embedding(input=input2, size=(10, 64), dtype='int64') + + self.assertRaises(TypeError, test_param_dtype) + input3 = fluid.data(name='x3', shape=[4, 6], dtype='int64') + fluid.embedding(input=input3, size=(10, 64), dtype='float16') + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py new file mode 100644 index 0000000000000..1cc9950f9a15b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -0,0 +1,277 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid.core as core + +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. + Out = np.array([Out], dtype="float64") + return Out + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMulV2Op(OpTest): + """ + case 1 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False + + def init_kernel_type(self): + self.dtype = "float32" + + def setUp(self): + self.init_kernel_type() + self.config() + self.op_type = "matmul_v2" + self.use_xpu = True + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y + result = reference_matmul(x, y, self.trans_x, self.trans_y) + result = result.astype(self.dtype) + self.inputs = { + 'X': x, + 'Y': y, + } + self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} + self.outputs = {'Out': result} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=0.01) + + def test_check_grad(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.1) + + +''' +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp2(TestMatMulV2Op): + """ + case 2 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.trans_x = False + self.trans_y = True + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp3(TestMatMulV2Op): + """ + case 3 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp4(TestMatMulV2Op): + """ + case 4 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 
2, 100, 2) + self.trans_x = False + self.trans_y = False + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp5(TestMatMulV2Op): + """ + case 5 + """ + + def config(self): + self.x_shape = (1, 1, 100, 1) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp6(TestMatMulV2Op): + """ + case 6 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp7(TestMatMulV2Op): + """ + case 7 + """ + + def config(self): + self.x_shape = (1, 2, 1, 100) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False +''' + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp8(TestMatMulV2Op): + """ + case 8 + """ + + def config(self): + self.x_shape = (1, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp13(TestMatMulV2Op): + """ + case 13 + """ + + def config(self): + self.x_shape = (2, 2, 2, 50) + self.y_shape = (2, 2, 2, 50) + self.trans_x = True + self.trans_y = False + + +''' +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp16(TestMatMulV2Op): + """ + case 16 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.trans_x = False + self.trans_y = False + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMuklOp17(TestMatMulV2Op): + """ + case 17 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.trans_x = False + self.trans_y = False +''' + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestMatMulV2API(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + self.places.append(fluid.XPUPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = fluid.data(name="input_x", shape=[4, 3], dtype="float32") + input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32") + + result = paddle.matmul(input_x, input_y) + + x_np = np.random.random([4, 3]).astype("float32") + y_np = np.random.random([3, 4]).astype("float32") + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": x_np, + "input_y": y_np}, + fetch_list=[result]) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py new file mode 100644 index 0000000000000..f43516235c057 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py @@ -0,0 +1,144 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + +np.random.seed(10) + + +class TestMeanOp(OpTest): + def setUp(self): + self.op_type = "mean" + self.dtype = np.float64 + self.init_dtype_type() + self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)} + self.outputs = {'Out': np.mean(self.inputs["X"])} + + def init_dtype_type(self): + pass + + def test_check_output(self): + self.check_output() + + def test_checkout_grad(self): + self.check_grad(['X'], 'Out') + + +class TestMeanOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The input type of mean_op must be Variable. + input1 = 12 + self.assertRaises(TypeError, fluid.layers.mean, input1) + # The input dtype of mean_op must be float16, float32, float64. + input2 = fluid.layers.data( + name='input2', shape=[12, 10], dtype="int32") + self.assertRaises(TypeError, fluid.layers.mean, input2) + input3 = fluid.layers.data( + name='input3', shape=[4], dtype="float16") + fluid.layers.softmax(input3) + + +class TestXPUMeanOp(TestMeanOp): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_checkout_grad(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +class TestMeanAPI(unittest.TestCase): + # test paddle.tensor.stat.mean + + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + self.place = paddle.XPUPlace(0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_shape) + out1 = paddle.mean(x) + out2 = paddle.tensor.mean(x) + out3 = paddle.tensor.stat.mean(x) + axis = np.arange(len(self.x_shape)).tolist() + out4 = paddle.mean(x, axis) + out5 = paddle.mean(x, tuple(axis)) + + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x}, + fetch_list=[out1, out2, out3, out4, out5]) + out_ref = np.mean(self.x) + for out in res: + self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def test_case(x, axis=None, keepdim=False): + x_tensor = paddle.to_tensor(x) + out = paddle.mean(x_tensor, axis, keepdim) + if isinstance(axis, list): + axis = tuple(axis) + if len(axis) == 0: + axis = None + out_ref = np.mean(x, axis, keepdims=keepdim) + self.assertEqual( + np.allclose( + out.numpy(), out_ref, rtol=1e-04), True) + + test_case(self.x) + test_case(self.x, []) + test_case(self.x, -1) + test_case(self.x, keepdim=True) + test_case(self.x, 2, keepdim=True) + test_case(self.x, [0, 2]) + test_case(self.x, (0, 2)) + test_case(self.x, [0, 1, 2, 3]) + 
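+        # switch back to static mode so later static-graph tests are unaffected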
paddle.enable_static() + + def test_errors(self): + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 12]).astype('float32') + x = paddle.to_tensor(x) + self.assertRaises(Exception, paddle.mean, x, -3) + self.assertRaises(Exception, paddle.mean, x, 2) + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', [10, 12], 'int32') + self.assertRaises(TypeError, paddle.mean, x) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py new file mode 100644 index 0000000000000..ccee79e8cd77a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +import os +sys.path.append("..") +from op_test import OpTest +import paddle +from paddle.fluid import core +from paddle.fluid.op import Operator + + +class TestMomentumOp1(OpTest): + def setUp(self): + self.op_type = "momentum" + self.dtype = np.float32 + self.init_dtype() + + param = np.random.random((123, 321)).astype(self.dtype) + grad = np.random.random((123, 321)).astype(self.dtype) + velocity = np.zeros((123, 321)).astype(self.dtype) + learning_rate = np.array([0.001]).astype(self.dtype) + mu = 0.0001 + use_nesterov = False + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Velocity': velocity, + 'LearningRate': learning_rate + } + + self.attrs = {'mu': mu} + + velocity_out = mu * velocity + grad + if use_nesterov: + param_out = param - grad * learning_rate - \ + velocity_out * mu * learning_rate + else: + param_out = param - learning_rate * velocity_out + + self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + + def init_dtype(self): + pass + + def test_check_output_with_place(self): + self.check_output_with_place(paddle.XPUPlace(0)) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py index 94ab5b71e4fbf..7cf005fefa613 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py @@ -23,37 +23,13 @@ from op_test import OpTest import paddle.fluid as fluid from paddle.fluid import Program, program_guard +import time - -class TestMulOp(OpTest): - def setUp(self): - self.op_type = "mul" - self.dtype = np.float64 - self.init_dtype_type() - self.inputs = { - 'X': np.random.random((20, 5)).astype(self.dtype), - 'Y': np.random.random((5, 21)).astype(self.dtype) - } - self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - - def init_dtype_type(self): - pass - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - 
self.check_grad(['X', 'Y'], 'Out') - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) +paddle.enable_static() +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") class TestMulOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -69,11 +45,15 @@ def test_errors(self): self.assertRaises(TypeError, fluid.layers.mul, x3, x4) -class TestMulOp2(OpTest): +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp1(OpTest): def setUp(self): self.op_type = "mul" - self.dtype = np.float64 + self.dtype = np.float32 + self.use_xpu = True self.init_dtype_type() + np.random.seed((int)(time.time())) self.inputs = { 'X': np.random.random((3, 4, 2, 9)).astype(self.dtype), 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.dtype) @@ -90,71 +70,62 @@ def setUp(self): def init_dtype_type(self): pass - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X')) - - def test_check_grad_ignore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp1(TestMulOp): - def init_dtype_type(self): - self.dtype = np.float32 - def test_check_output(self): place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=1e-1) + self.check_output_with_place(place, atol=0.01) def test_check_grad_normal(self): place = paddle.XPUPlace(0) self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.5) + place, ['X', 'Y'], 'Out', max_relative_error=0.1) def test_check_grad_ingore_x(self): place = paddle.XPUPlace(0) self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + place, ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - def test_check_grad_ingore_y(self): + def test_check_grad_ignore_y(self): place = paddle.XPUPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + place, ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") -class TestXPUMulOp2(TestMulOp2): +class TestXPUMulOp2(OpTest): + def setUp(self): + self.op_type = "mul" + self.use_xpu = True + self.dtype = np.float32 + self.init_dtype_type() + np.random.seed((int)(time.time())) + self.inputs = { + 'X': np.random.random((20, 5)).astype(self.dtype), + 'Y': np.random.random((5, 21)).astype(self.dtype) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + def init_dtype_type(self): self.dtype = np.float32 def test_check_output(self): place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=2e-1) + self.check_output_with_place(place, atol=0.01) def test_check_grad_normal(self): place = paddle.XPUPlace(0) self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.9) + place, ['X', 'Y'], 'Out', max_relative_error=0.1) def test_check_grad_ingore_x(self): place = paddle.XPUPlace(0) self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.5, 
no_grad_set=set("X")) + place, ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) def test_check_grad_ingore_y(self): place = paddle.XPUPlace(0) self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y')) + place, ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py new file mode 100644 index 0000000000000..7f20c83aacb1f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py @@ -0,0 +1,338 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import paddle.fluid.core as core +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def max_pool2D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False, + data_type=np.float64): + N, C, H, W = x.shape + if global_pool == 1: + ksize = [H, W] + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + out = np.zeros((N, C, H_out, W_out)) + for i in range(H_out): + for j in range(W_out): + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + x_masked = x[:, :, r_start:r_end, c_start:c_end] + + out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) + return out + + +def avg_pool2D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False, + data_type=np.float64): + N, C, H, W = x.shape + if global_pool == 1: + ksize = [H, W] + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + out = np.zeros((N, C, H_out, W_out)) + for i in range(H_out): + for j in range(W_out): + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = 
adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = i * strides[0] - paddings[0] + r_end = i * strides[0] + ksize[0] - paddings[0] + c_start = j * strides[1] - paddings[1] + c_end = j * strides[1] + ksize[1] - paddings[1] + field_size = (r_end - r_start) * (c_end - c_start) + r_start = np.max((r_start, 0)) + r_end = np.min((r_end, H)) + c_start = np.max((c_start, 0)) + c_end = np.min((c_end, W)) + + x_masked = x[:, :, r_start:r_end, c_start:c_end] + + if (exclusive or adaptive): + field_size = (r_end - r_start) * (c_end - c_start) + + if data_type == np.int8 or data_type == np.uint8: + out[:, :, i, j] = (np.rint( + np.sum(x_masked, axis=(2, 3)) / + field_size)).astype(data_type) + else: + out[:, :, i, j] = (np.sum(x_masked, axis=(2, 3)) / + field_size).astype(data_type) + return out + + +def pool2D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False, + data_format='NCHW', + pool_type="max", + padding_algorithm="EXPLICIT"): + + # update paddings + def _get_padding_with_SAME(input_shape, pool_size, pool_stride): + padding = [] + for input_size, filter_size, stride_size in zip(input_shape, pool_size, + pool_stride): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max(( + (out_size - 1) * stride_size + filter_size - input_size, 0)) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + if isinstance(padding_algorithm, str): + padding_algorithm = padding_algorithm.upper() + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError("Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." % + str(padding_algorithm)) + + if padding_algorithm == "VALID": + paddings = [0, 0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)" + " must be False. 
" + "Received ceil_mode: True.") + elif padding_algorithm == "SAME": + input_data_shape = [] + if data_format == "NCHW": + input_data_shape = x.shape[2:4] + elif data_format == "NHWC": + input_data_shape = x.shape[1:3] + paddings = _get_padding_with_SAME(input_data_shape, ksize, strides) + + assert len(paddings) == 2 or len(paddings) == 4 + is_sys = True if len(paddings) == 2 else False + + N = x.shape[0] + C, H, W = [x.shape[1], x.shape[2], x.shape[3]] if data_format == 'NCHW' \ + else [x.shape[3], x.shape[1], x.shape[2]] + + if global_pool == 1: + ksize = [H, W] + paddings = [0 for _ in range(len(paddings))] + + pad_h_up = paddings[0] if is_sys else paddings[0] + pad_h_down = paddings[0] if is_sys else paddings[1] + pad_w_left = paddings[1] if is_sys else paddings[2] + pad_w_right = paddings[1] if is_sys else paddings[3] + + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + pad_h_up + pad_h_down + strides[0] - 1) // strides[0] + 1 \ + if ceil_mode else (H - ksize[0] + pad_h_up + pad_h_down) // strides[0] + 1 + W_out = (W - ksize[1] + pad_w_left + pad_w_right + strides[1] - 1) // strides[1] + 1 \ + if ceil_mode else (W - ksize[1] + pad_w_left + pad_w_right) // strides[1] + 1 + + out = np.zeros((N, C, H_out, W_out)) if data_format=='NCHW' \ + else np.zeros((N, H_out, W_out, C)) + for i in range(H_out): + if adaptive: + in_h_start = adaptive_start_index(i, H, ksize[0]) + in_h_end = adaptive_end_index(i, H, ksize[0]) + else: + in_h_start = np.max((i * strides[0] - pad_h_up, 0)) + in_h_end = np.min((i * strides[0] + ksize[0] - pad_h_up, H)) + + for j in range(W_out): + if adaptive: + in_w_start = adaptive_start_index(j, W, ksize[1]) + in_w_end = adaptive_end_index(j, W, ksize[1]) + else: + in_h_start = i * strides[0] - pad_h_up + in_w_start = j * strides[1] - pad_w_left + in_h_end = i * strides[0] + ksize[0] - pad_h_up + in_w_end = j * strides[1] + ksize[1] - pad_w_left + + field_size = (in_h_end - in_h_start) * (in_w_end - in_w_start) + in_h_start = np.max((in_h_start, 0)) + in_w_start = np.max((in_w_start, 0)) + in_h_end = np.min((in_h_end, H)) + in_w_end = np.min((in_w_end, W)) + + if data_format == 'NCHW': + x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end] + if pool_type == 'avg': + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) + +# if (exclusive or adaptive) else (ksize[0] * ksize[1]) + out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size + elif pool_type == 'max': + out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) + elif data_format == 'NHWC': + x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :] + if pool_type == 'avg': + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) + out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size + elif pool_type == 'max': + out[:, i, j, :] = np.max(x_masked, axis=(1, 2)) + return out + + +class TestPool2D_Op(OpTest): + def setUp(self): + self.op_type = "pool2d" + self.use_cudnn = False + self.init_kernel_type() + self.use_mkldnn = False + self.init_data_type() + self.init_test_case() + self.padding_algorithm = "EXPLICIT" + self.init_paddings() + self.init_global_pool() + self.init_kernel_type() + self.init_pool_type() + self.init_ceil_mode() + self.init_exclusive() + self.init_adaptive() + self.init_data_format() + self.init_shape() + + input = np.random.random(self.shape).astype(self.dtype) + output = pool2D_forward_naive( + input, self.ksize, self.strides, self.paddings, self.global_pool, + 
self.ceil_mode, self.exclusive, self.adaptive, self.data_format, + self.pool_type, self.padding_algorithm).astype(self.dtype) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} + + self.attrs = { + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'pooling_type': self.pool_type, + 'global_pooling': self.global_pool, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'ceil_mode': self.ceil_mode, + 'data_format': self.data_format, + 'exclusive': self.exclusive, + 'adaptive': self.adaptive, + "padding_algorithm": self.padding_algorithm, + } + + self.outputs = {'Out': output} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, set(['X']), 'Out', max_relative_error=0.07) + + def init_data_format(self): + self.data_format = "NCHW" + + def init_shape(self): + self.shape = [2, 3, 5, 5] + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + + def init_paddings(self): + self.paddings = [0, 0] + self.padding_algorithm = "EXPLICIT" + + def init_kernel_type(self): + self.use_cudnn = False + + def init_data_type(self): + self.dtype = np.float64 + + def init_pool_type(self): + self.pool_type = "avg" + self.pool2D_forward_naive = avg_pool2D_forward_naive + + def init_global_pool(self): + self.global_pool = True + + def init_ceil_mode(self): + self.ceil_mode = False + + def init_exclusive(self): + self.exclusive = True + + def init_adaptive(self): + self.adaptive = False + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py new file mode 100644 index 0000000000000..44c356ca65f29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
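+# The reduce_mean XPU cases below mark the op with 'use_xpu': True and check
+# the forward result against numpy's mean() on paddle.XPUPlace(0).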
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +class TestMeanOp(OpTest): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def check_grad_(self): + self.check_grad(['X'], 'Out') + + +class TestMeanOp5D(OpTest): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 2, 5, 6, 10)).astype("float64") + } + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestMeanOp6D(OpTest): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float64") + } + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestMeanOp8D(OpTest): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float64") + } + self.attrs = {'dim': (0, 3), 'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class Test1DReduce(OpTest): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random(120).astype("float64")} + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class Test2DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + self.attrs = {'dim': [0], 'use_xpu': True} + self.inputs = {'X': np.random.random((20, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + self.attrs = {'dim': [1], 'use_xpu': True} + self.inputs = {'X': np.random.random((20, 10)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + self.attrs = {'dim': [1], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + 
self.attrs = {'dim': [2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + self.attrs = {'dim': [-2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce3(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + self.attrs = {'dim': [1, 2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class TestKeepDimReduce(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'dim': [1], 'keep_dim': True, 'use_xpu': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestKeepDim8DReduce(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True, 'use_xpu': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestReduceAll(Test1DReduce): + def setUp(self): + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} + self.attrs = {'reduce_all': True, 'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean()} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py new file mode 100644 index 0000000000000..2a0457d186229 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
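+# The reduce_sum XPU cases below mark the op with 'use_xpu': True and check
+# the forward result against numpy's sum() on paddle.XPUPlace(0).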
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +class TestSumOp(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].sum(axis=0)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def check_grad_(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp5D(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((1, 2, 5, 6, 10)).astype("float64") + } + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].sum(axis=0)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp6D(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float64") + } + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].sum(axis=0)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp8D(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float64") + } + self.attrs = {'dim': (0, 3), 'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].sum(axis=(0, 3))} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class Test1DReduce(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random(120).astype("float64")} + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].sum(axis=0)} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class Test2DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [0], 'use_xpu': True} + self.inputs = {'X': np.random.random((20, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].sum(axis=0)} + + +class Test2DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1], 'use_xpu': True} + self.inputs = {'X': np.random.random((20, 10)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': 
[2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [-2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce3(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1, 2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class TestKeepDimReduce(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'dim': [1], 'keep_dim': True, 'use_xpu': True} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + +class TestKeepDim8DReduce(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True, 'use_xpu': True} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + +class TestReduceAll(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} + self.attrs = {'reduce_all': True, 'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].sum()} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py new file mode 100644 index 0000000000000..813bbffefcb34 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py @@ -0,0 +1,183 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
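+# The roi_align XPU test builds a NumPy reference via bilinear interpolation
+# (pre_calc / calc_roi_align) and compares the kernel output on paddle.XPUPlace(0);
+# gradient checking is skipped because there is no XPU grad kernel.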
+ +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import math +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +@skip_check_grad_ci(reason="There is no grad kernel for roi_align_xpu kernel.") +class TestROIAlignOp(OpTest): + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_roi_align() + + self.inputs = { + 'X': self.x, + 'ROIs': (self.rois[:, 1:5], self.rois_lod), + } + self.attrs = { + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width, + 'sampling_ratio': self.sampling_ratio + } + + self.outputs = {'Out': self.out_data} + + def init_test_case(self): + self.batch_size = 3 + self.channels = 3 + self.height = 8 + self.width = 6 + + # n, c, h, w + self.x_dim = (self.batch_size, self.channels, self.height, self.width) + + self.spatial_scale = 1.0 / 2.0 + self.pooled_height = 2 + self.pooled_width = 2 + self.sampling_ratio = -1 + + self.x = np.random.random(self.x_dim).astype('float64') + + def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w, + bin_size_h, bin_size_w): + count = roi_bin_grid_h * roi_bin_grid_w + bilinear_pos = np.zeros( + [self.channels, self.pooled_height, self.pooled_width, count, 4], + np.float64) + bilinear_w = np.zeros( + [self.pooled_height, self.pooled_width, count, 4], np.float64) + for ph in range(self.pooled_width): + for pw in range(self.pooled_height): + c = 0 + for iy in range(roi_bin_grid_h): + y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \ + bin_size_h / roi_bin_grid_h + for ix in range(roi_bin_grid_w): + x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \ + bin_size_w / roi_bin_grid_w + if y < -1.0 or y > self.height or \ + x < -1.0 or x > self.width: + continue + if y <= 0: + y = 0 + if x <= 0: + x = 0 + y_low = int(y) + x_low = int(x) + if y_low >= self.height - 1: + y = y_high = y_low = self.height - 1 + else: + y_high = y_low + 1 + if x_low >= self.width - 1: + x = x_high = x_low = self.width - 1 + else: + x_high = x_low + 1 + ly = y - y_low + lx = x - x_low + hy = 1 - ly + hx = 1 - lx + for ch in range(self.channels): + bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low, + x_low] + bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low, + x_high] + bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high, + x_low] + bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high, + x_high] + bilinear_w[ph, pw, c, 0] = hy * hx + bilinear_w[ph, pw, c, 1] = hy * lx + bilinear_w[ph, pw, c, 2] = ly * hx + bilinear_w[ph, pw, c, 3] = ly * lx + c = c + 1 + return bilinear_pos, bilinear_w + + def calc_roi_align(self): + self.out_data = np.zeros( + (self.rois_num, self.channels, self.pooled_height, + self.pooled_width)).astype('float64') + + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = int(roi[0]) + x_i = self.x[roi_batch_id] + roi_xmin = roi[1] * self.spatial_scale + roi_ymin = roi[2] * self.spatial_scale + roi_xmax = roi[3] * self.spatial_scale + roi_ymax = roi[4] * self.spatial_scale + roi_width = max(roi_xmax - roi_xmin, 1) + roi_height = max(roi_ymax - roi_ymin, 1) + bin_size_h = float(roi_height) / float(self.pooled_height) + bin_size_w = float(roi_width) / float(self.pooled_width) + roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \ + math.ceil(roi_height / self.pooled_height) + roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \ + 
math.ceil(roi_width / self.pooled_width) + count = int(roi_bin_grid_h * roi_bin_grid_w) + pre_size = count * self.pooled_width * self.pooled_height + bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin, + int(roi_bin_grid_h), + int(roi_bin_grid_w), + bin_size_h, bin_size_w) + for ch in range(self.channels): + align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1) + output_val = align_per_bin.mean(axis=-1) + self.out_data[i, ch, :, :] = output_val + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, self.height // self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers(x1 + self.pooled_width, + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, self.height // self.spatial_scale) + + roi = [bno, x1, y1, x2, y2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype("float64") + + def setUp(self): + self.op_type = "roi_align" + self.set_data() + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py new file mode 100644 index 0000000000000..c29150ef921c2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
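+# The sgd XPU tests check ParamOut == Param - LearningRate * Grad on
+# paddle.XPUPlace(0), plus one optimizer step over a large embedding layer.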
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +import os +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.op import Operator + + +class TestSGDOp(OpTest): + def setUp(self): + self.op_type = "sgd" + self.conf() + w = np.random.random((self.h, self.w)).astype("float32") + g = np.random.random((self.h, self.w)).astype("float32") + lr = np.array([0.1]).astype("float32") + + self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} + self.outputs = {'ParamOut': w - lr * g} + + def conf(self): + self.h = 102 + self.w = 105 + + def test_check_output_with_place(self): + self.check_output_with_place(paddle.XPUPlace(0)) + + +class TestSGDOpCase8X(TestSGDOp): + def conf(self): + self.h = 10 + self.w = 64 + + +class TestSGDOpWithLargeInput(unittest.TestCase): + def runTest(self): + data = fluid.layers.fill_constant(shape=[1], value=128, dtype='int64') + label = fluid.layers.fill_constant( + shape=[1, 150], value=0.5, dtype='float32') + emb = fluid.embedding(input=data, size=(10000, 150), dtype='float32') + out = fluid.layers.l2_normalize(x=emb, axis=-1) + + cost = fluid.layers.square_error_cost(input=out, label=label) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + place = paddle.XPUPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + result = exe.run(fluid.default_main_program(), fetch_list=[avg_cost]) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py new file mode 100644 index 0000000000000..44c8821be06bc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py @@ -0,0 +1,171 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
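+# The slice XPU tests cover list starts/ends with and without the decrease_axis
+# attribute; outputs are compared against the equivalent NumPy slicing.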
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +import paddle +import paddle.fluid.core as core +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.layers as layers + + +# Situation 1: starts(list, no tensor), ends(list, no tensor) +# 1.1 without attr(decrease) +class TestSliceOp(OpTest): + def setUp(self): + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float64") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['Input'], 'Out') + + +class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float64") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + +class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float64") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, :, 2:-1] + + +# 1.2 with attr(decrease) +class TestSliceOp_decs_dim(OpTest): + def setUp(self): + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float64") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['Input'], 'Out') + + +class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float64") + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float64") + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + +class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype("float64") + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): + def 
config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float64") + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + +class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float64") + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py new file mode 100644 index 0000000000000..80e83e030fed6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py @@ -0,0 +1,391 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +import paddle +import paddle.fluid.core as core + +from op_test import OpTest +from test_softmax_op import stable_softmax + + +def cross_entropy(softmax, label, soft_label, axis, ignore_index=-1): + if soft_label: + return (-label * np.log(softmax)).sum(axis=axis, keepdims=True) + + shape = softmax.shape + axis %= len(shape) + n = int(np.prod(shape[:axis])) + axis_dim = shape[axis] + remain = int(np.prod(shape[axis + 1:])) + softmax_reshape = softmax.reshape((n, axis_dim, remain)) + label_reshape = label.reshape((n, 1, remain)) + result = np.zeros_like(label_reshape, dtype=softmax.dtype) + for i in range(n): + for j in range(remain): + lbl = label_reshape[i, 0, j] + if lbl != ignore_index: + result[i, 0, j] -= np.log(softmax_reshape[i, lbl, j]) + return result.reshape(label.shape) + + +class TestSoftmaxWithCrossEntropyOp(OpTest): + """ + Test softmax with cross entropy operator with discreate one-hot labels. 
+ """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = False + self.soft_label = False + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -1 + self.shape = [41, 37] + + def setUp(self): + self.initParams() + + logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, logits) + + if self.soft_label: + labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype) + labels /= np.sum(labels, axis=self.axis, keepdims=True) + else: + axis_dim = self.shape[self.axis] + self.shape[self.axis] = 1 + labels = np.random.randint(0, axis_dim, self.shape, dtype="int64") + + loss = cross_entropy(softmax, labels, self.soft_label, self.axis, + self.ignore_index) + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype(self.dtype), + "Loss": loss.astype(self.dtype) + } + self.attrs = { + "numeric_stable_mode": self.numeric_stable_mode, + "soft_label": self.soft_label, + } + if self.ignore_index >= 0: + self.attrs['ignore_index'] = self.ignore_index + if self.axis != -1: + self.attrs['axis'] = self.axis + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-2) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ["Logits"], "Loss", max_relative_error=0.1) + + +class TestXPUSoftmaxWithCrossEntropyOp(TestSoftmaxWithCrossEntropyOp): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.shape = [3, 5, 7, 11] + self.axis = -1 + self.ignore_index = -1 + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-2) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ["Logits"], "Loss", max_relative_error=0.1) + + +class TestXPUSoftmaxWithCrossEntropyOp2(TestXPUSoftmaxWithCrossEntropyOp): + """ + Test softmax with cross entropy operator with soft labels. + """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = True + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -1 + self.shape = [41, 37] + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-2) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ["Logits"], "Loss", max_relative_error=0.1) + + +class TestXPUSoftmaxWithCrossEntropyOp3(TestXPUSoftmaxWithCrossEntropyOp): + """ + Test softmax with cross entropy operator with ignore_index. 
+ """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.shape = [41, 37] + self.ignore_index = 5 + self.axis = -1 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpAxis1(TestXPUSoftmaxWithCrossEntropyOp): + """ + Test softmax with cross entropy operator with discreate one-hot labels. + Given axis != -1 + """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.dtype = np.float64 + self.axis = 0 + self.ignore_index = -1 + self.shape = [3, 5, 7, 11] + + +class TestXPUSoftmaxWithCrossEntropyOpAxis2(TestXPUSoftmaxWithCrossEntropyOp): + """ + Test softmax with cross entropy operator with discreate one-hot labels. + Given axis != -1 + """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.dtype = np.float64 + self.axis = 1 + self.ignore_index = -1 + self.shape = [3, 5, 7, 11] + + +class TestXPUSoftmaxWithCrossEntropyOpAxis3(TestXPUSoftmaxWithCrossEntropyOp): + """ + Test softmax with cross entropy operator with discreate one-hot labels. + Given axis != -1 + """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.dtype = np.float64 + self.axis = 2 + self.ignore_index = -1 + self.shape = [3, 5, 7, 11] + + +class TestXPUSoftmaxWithCrossEntropyOpAxis4(TestXPUSoftmaxWithCrossEntropyOp): + """ + Test softmax with cross entropy operator with discreate one-hot labels. + Given axis != -1 + """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.dtype = np.float64 + self.axis = 3 + self.ignore_index = -1 + self.shape = [3, 5, 7, 11] + + +class TestXPUSoftmaxWithCrossEntropyOpAxisDimEqualOne( + TestXPUSoftmaxWithCrossEntropyOp): + """ + Test softmax with cross entropy operator with discreate one-hot labels. 
+ Given axis != -1 + """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -1 + self.shape = [3, 5, 7, 1] + + +class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis1( + TestXPUSoftmaxWithCrossEntropyOp): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = True + self.shape = [3, 5, 7, 11] + self.axis = 0 + self.ignore_index = -1 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis2( + TestXPUSoftmaxWithCrossEntropyOp2): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = True + self.shape = [3, 5, 7, 11] + self.axis = 1 + self.ignore_index = -1 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis3( + TestXPUSoftmaxWithCrossEntropyOp2): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = True + self.shape = [3, 5, 7, 11] + self.axis = 2 + self.ignore_index = -1 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpSoftLabelAxis4( + TestXPUSoftmaxWithCrossEntropyOp2): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = True + self.shape = [3, 5, 7, 11] + self.axis = 3 + self.ignore_index = -1 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis1( + TestXPUSoftmaxWithCrossEntropyOp3): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.shape = [3, 5, 7, 11] + self.ignore_index = 1 + self.axis = 0 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis2( + TestXPUSoftmaxWithCrossEntropyOp3): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.shape = [3, 5, 7, 11] + self.ignore_index = 0 + self.axis = 1 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis3( + TestXPUSoftmaxWithCrossEntropyOp3): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.shape = [3, 5, 7, 11] + self.ignore_index = 3 + self.axis = 2 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4( + TestXPUSoftmaxWithCrossEntropyOp3): + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.shape = [3, 5, 7, 11] + self.ignore_index = 3 + self.axis = 3 + self.dtype = np.float64 + + +class TestXPUSoftmaxWithCrossEntropyOpBoundary0( + TestXPUSoftmaxWithCrossEntropyOp): + """ + Test stable softmax with cross entropy operator will not product INF + with small logits value. + """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.shape = [3, 5, 7, 11] + self.axis = -1 + self.ignore_index = -1 + self.dtype = np.float64 + self.logits = np.full(self.shape, -500.0).astype(self.dtype) + + +class TestXPUSoftmaxWithCrossEntropyOpBoundary1( + TestXPUSoftmaxWithCrossEntropyOp): + """ + Test stable softmax with cross entropy operator will not product INF + with small logits value. 
+ """ + + def initParams(self): + self.op_type = "softmax_with_cross_entropy" + self.numeric_stable_mode = True + self.soft_label = False + self.shape = [3, 5, 7, 11] + self.axis = -1 + self.ignore_index = -1 + self.dtype = np.float64 + self.logits = np.full(self.shape, 1000.0).astype(self.dtype) + self.logits[:, :, 0, :] = -1000.0 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py new file mode 100644 index 0000000000000..13de73fef6f3d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +@skip_check_grad_ci(reason="There is no grad kernel for stack_xpu op.") +class TestStackOpBase(OpTest): + def initDefaultParameters(self): + self.num_inputs = 4 + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = 'float64' + + def initParameters(self): + pass + + def get_x_names(self): + x_names = [] + for i in range(self.num_inputs): + x_names.append('x{}'.format(i)) + return x_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'stack' + self.x = [] + for i in range(self.num_inputs): + self.x.append( + np.random.random(size=self.input_dim).astype(self.dtype)) + + tmp = [] + x_names = self.get_x_names() + for i in range(self.num_inputs): + tmp.append((x_names[i], self.x[i])) + + self.inputs = {'X': tmp} + self.outputs = {'Y': np.stack(self.x, axis=self.axis)} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +class TestStackOp1(TestStackOpBase): + def initParameters(self): + self.num_inputs = 16 + + +class TestStackOp2(TestStackOpBase): + def initParameters(self): + self.num_inputs = 20 + + +class TestStackOp3(TestStackOpBase): + def initParameters(self): + self.axis = -1 + + +class TestStackOp4(TestStackOpBase): + def initParameters(self): + self.axis = -4 + + +class TestStackOp5(TestStackOpBase): + def initParameters(self): + self.axis = 1 + + +class TestStackOp6(TestStackOpBase): + def initParameters(self): + self.axis = 3 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py new file mode 100644 index 0000000000000..c191e5f0b2966 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py @@ -0,0 +1,230 @@ +# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + + +class TestXPUTransposeOp(OpTest): + def setUp(self): + self.init_op_type() + self.initTestCase() + self.inputs = {'X': np.random.random(self.shape).astype("float64")} + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': False, + 'use_xpu': True + } + self.outputs = { + 'XShape': np.random.random(self.shape).astype("float64"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + def init_op_type(self): + self.op_type = "transpose2" + self.use_mkldnn = False + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place=place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + def initTestCase(self): + self.shape = (3, 40) + self.axis = (1, 0) + + +class TestCase0(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (100, ) + self.axis = (0, ) + + +class TestCase1(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + + +class TestCase2(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +class TestCase5(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (2, 16, 96) + self.axis = (0, 2, 1) + + +class TestCase6(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (2, 10, 12, 16) + self.axis = (3, 1, 2, 0) + + +class TestCase7(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (2, 10, 2, 16) + self.axis = (0, 1, 3, 2) + + +class TestCase8(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6, 7) + + +class TestCase9(TestXPUTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + + +class TestTransposeOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[10, 5, 3], dtype='float64') + + def test_x_Variable_check(): + # the Input(x)'s type must be Variable + fluid.layers.transpose("not_variable", perm=[1, 0, 2]) + + self.assertRaises(TypeError, test_x_Variable_check) + + def test_x_dtype_check(): + # the Input(x)'s dtype must be one of [float16, float32, float64, int32, int64] + x1 = fluid.layers.data( + name='x1', shape=[10, 
5, 3], dtype='bool') + fluid.layers.transpose(x1, perm=[1, 0, 2]) + + self.assertRaises(TypeError, test_x_dtype_check) + + def test_perm_list_check(): + # Input(perm)'s type must be list + fluid.layers.transpose(x, perm="[1, 0, 2]") + + self.assertRaises(TypeError, test_perm_list_check) + + def test_perm_length_and_x_dim_check(): + # Input(perm) is the permutation of dimensions of Input(input) + # its length should be equal to dimensions of Input(input) + fluid.layers.transpose(x, perm=[1, 0, 2, 3, 4]) + + self.assertRaises(ValueError, test_perm_length_and_x_dim_check) + + def test_each_elem_value_check(): + # Each element in Input(perm) should be less than Input(x)'s dimension + fluid.layers.transpose(x, perm=[3, 5, 7]) + + self.assertRaises(ValueError, test_each_elem_value_check) + + +class TestTAPI(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[10], dtype="float64", name="data") + data_t = paddle.t(data) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + data_np = np.random.random([10]).astype("float64") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[10, 5], dtype="float64", name="data") + data_t = paddle.t(data) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + data_np = np.random.random([10, 5]).astype("float64") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[1, 5], dtype="float64", name="data") + data_t = paddle.t(data) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + data_np = np.random.random([1, 5]).astype("float64") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([10]).astype("float64") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([10, 5]).astype("float64") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([1, 5]).astype("float64") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + def test_errors(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name='x', shape=[10, 5, 3], dtype='float64') + + def test_x_dimension_check(): + paddle.t(x) + + self.assertRaises(ValueError, test_x_dimension_check) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/functional/rnn.py b/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py similarity index 50% rename from python/paddle/nn/functional/rnn.py rename to python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py index b7a97bc5aa303..d096cb8ec13f4 100644 --- a/python/paddle/nn/functional/rnn.py +++ 
b/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py @@ -12,6 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid.layers.rnn import rnn, birnn +from __future__ import print_function -__all__ = ['rnn', 'birnn'] +import sys +sys.path.append("..") +import unittest +import numpy + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.executor import Executor +from test_truncated_gaussian_random_op import TestTrunctedGaussianRandomOp + +paddle.enable_static() + + +class TestXPUTrunctedGaussianRandomOp(TestTrunctedGaussianRandomOp): + def test_xpu(self): + if paddle.is_compiled_with_xpu(): + self.gaussian_random_test(place=fluid.XPUPlace(0)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py new file mode 100644 index 0000000000000..ab59fd2665679 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import subprocess +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +from test_uniform_random_op import TestUniformRandomOp, TestUniformRandomOpSelectedRows + +paddle.enable_static() + + +class TestXPUUniformRandomOp(TestUniformRandomOp): + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + outs = self.calc_output(place) + outs = [np.array(out) for out in outs] + outs.sort(key=len) + self.verify_output(outs) + + +class TestXPUUniformRandomOpSelectedRows(TestUniformRandomOpSelectedRows): + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_with_place(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 7e2f0eb2fb8bb..e52d9da99c3b9 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -14,7 +14,7 @@ # TODO: import framework api under this directory __all__ = [ - 'create_global_var', 'create_parameter', 'ParamAttr', 'Variable', + 'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', 'get_default_dtype', 'set_default_dtype' ] @@ -24,20 +24,14 @@ 'DataParallel' ] -__all__ += [ - 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay', - 'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay' -] - from . 
import random from .random import manual_seed from .framework import get_default_dtype from .framework import set_default_dtype -from ..fluid.framework import Variable #DEFINE_ALIAS from ..fluid.framework import ComplexVariable #DEFINE_ALIAS from ..fluid.param_attr import ParamAttr #DEFINE_ALIAS -from ..fluid.layers.tensor import create_global_var #DEFINE_ALIAS +# from ..fluid.layers.tensor import create_global_var #DEFINE_ALIAS from ..fluid.layers.tensor import create_parameter #DEFINE_ALIAS from ..fluid.core import CPUPlace #DEFINE_ALIAS from ..fluid.core import CUDAPlace #DEFINE_ALIAS @@ -51,11 +45,3 @@ from .io import save from .io import load from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS - -from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import NaturalExpDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay #DEFINE_ALIAS -from ..fluid.dygraph.learning_rate_scheduler import CosineDecay #DEFINE_ALIAS diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index c196c1d689bfe..7e8c717bb1deb 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -204,10 +204,13 @@ def save(obj, path): Now only supports save ``state_dict`` of Layer or Optimizer. .. note:: - ``paddle.save`` will not add a suffix to the saved results, - but we recommend that you use the following paddle standard suffixes: - 1. for ``Layer.state_dict`` -> ``.pdparams`` - 2. for ``Optimizer.state_dict`` -> ``.pdopt`` + Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, + there is no need to distinguish multiple saved files by adding a suffix. The argument ``path`` + of ``paddle.save`` will be directly used as the saved file name instead of a prefix. + In order to unify the saved file name format, we recommend using the paddle standard suffix: + 1. for ``Layer.state_dict`` , recommend to use ``.pdparams`` ; + 2. for ``Optimizer.state_dict`` , recommend to use ``.pdopt`` . + For specific examples, please refer to API code examples. Args: obj(Object) : The object to be saved. @@ -228,7 +231,7 @@ def save(obj, path): layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, @@ -272,9 +275,10 @@ def load(path, **configs): Now only supports load ``state_dict`` of Layer or Optimizer. .. note:: - ``paddle.load`` supports loading ``state_dict`` of Layer or Optimizer from - the result of other save APIs except ``paddle.load`` , but the argument - ``path`` format is different: + In order to use the model parameters saved by paddle more efficiently, + ``paddle.load`` supports loading ``state_dict`` of Layer from the result of + other save APIs except ``paddle.save`` , but the argument ``path`` format is + different: 1. 
loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` , ``path`` needs to be a complete file name, such as ``model.pdparams`` or ``model.pdopt`` ; @@ -287,22 +291,23 @@ def load(path, **configs): directory, such as ``model`` and model is a directory. .. note:: - If you load ``state_dict`` from the saved result of + If you load ``state_dict`` from the saved result of static mode API such as ``paddle.static.save`` or ``paddle.static.save_inference_model`` , - the structured variable name will cannot be restored. You need to set the argument - ``use_structured_name=False`` when using ``Layer.set_state_dict`` later. + the structured variable name in dynamic mode will cannot be restored. + You need to set the argument ``use_structured_name=False`` when using + ``Layer.set_state_dict`` later. Args: path(str) : The path to load the target object. Generally, the path is the target - file path. When compatible with loading the saved results other APIs, the path - can be a file prefix or directory. + file path. When loading state_dict from the saved result of the API used to save + the inference model, the path may be a file prefix or directory. **configs (dict, optional): other load configuration options for compatibility. We do not recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: - (1) model_filename (string): The inference model file name of the paddle 1.x + (1) model_filename (str): The inference model file name of the paddle 1.x ``save_inference_model`` save format. Default file name is :code:`__model__` . - (2) params_filename (string): The persistable variables file name of the paddle 1.x + (2) params_filename (str): The persistable variables file name of the paddle 1.x ``save_inference_model`` save format. No default file name, save variables separately by default. 
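# A minimal sketch of the compatibility path documented above: loading a
# state_dict from a paddle 1.x ``save_inference_model`` result via the
# ``model_filename`` / ``params_filename`` configs. The directory and file
# names below are hypothetical; only the two config keys come from the docstring.
import paddle

state_dict = paddle.load(
    "inference_model_dir",          # hypothetical directory saved by the 1.x API
    model_filename="__model__",     # 1.x default model file name
    params_filename="__params__")   # only needed if variables were saved in one file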
@@ -320,7 +325,7 @@ def load(path, **configs): layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr_scheduler.NoamLR( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 21e3054dde7d7..4f36effe6dd62 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -453,6 +453,12 @@ def _run(self, inputs, labels=None): if len(name) > 0: rets.insert(i, feed[name]) + # step learning rate scheduler on each batch end + if self.model._optimizer and \ + isinstance(self.model._optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + self.model._optimizer._learning_rate.step() + # LoDTensor cannot be fetch as numpy directly rets = [np.array(v) for v in rets] if self.mode == 'test': @@ -652,6 +658,13 @@ def train_batch(self, inputs, labels=None): self.model._optimizer.minimize(final_loss) self.model.network.clear_gradients() + + # step learning rate scheduler on each batch end + if self.model._optimizer and \ + isinstance(self.model._optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + self.model._optimizer._learning_rate.step() + metrics = [] for metric in self.model._metrics: metric_outs = metric.compute(*(to_list(outputs) + labels)) diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py index fba45523889db..2e7f55bdd1481 100644 --- a/python/paddle/metric/__init__.py +++ b/python/paddle/metric/__init__.py @@ -15,12 +15,9 @@ from .metrics import * from . import metrics -from ..fluid.layers.metric_op import accuracy, auc from ..fluid.layers.nn import chunk_eval, mean_iou __all__ = metrics.__all__ + [ - 'accuracy', - 'auc', 'chunk_eval', 'mean_iou', ] diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index f4a9b8c01d02a..fed659562cbb0 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -20,9 +20,13 @@ import abc import numpy as np +from ..fluid.data_feeder import check_variable_and_dtype +from ..fluid.layer_helper import LayerHelper +from ..fluid.layers.nn import topk +from ..fluid.framework import core, _varbase_creator, in_dygraph_mode import paddle -__all__ = ['Metric', 'Accuracy', 'Precision', 'Recall', 'Auc'] +__all__ = ['Metric', 'Accuracy', 'Precision', 'Recall', 'Auc', 'accuracy'] def _is_numpy_(var): @@ -733,3 +737,70 @@ def name(self): Returns metric name """ return self._name + + +def accuracy(input, label, k=1, correct=None, total=None, name=None): + """ + accuracy layer. + Refer to the https://en.wikipedia.org/wiki/Precision_and_recall + + This function computes the accuracy using the input and label. + If the correct label occurs in top k predictions, then correct will increment by one. + Note: the dtype of accuracy is determined by input. the input and label dtype can be different. + + Args: + input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. + The shape is ``[sample_number, class_dim]`` . + label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` . + k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. + correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. + total(Tensor, optional): The total entries count. A tensor with type int64 or int32. 
+ name(str, optional): The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor, the correct rate. A Tensor with type float32. + + Examples: + .. code-block:: python + + import paddle + + predictions = paddle.to_tensor([[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], dtype='float32') + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.metric.accuracy(input=predictions, label=label, k=1) + # [0.5] + """ + if in_dygraph_mode(): + if correct is None: + correct = _varbase_creator(dtype="int32") + if total is None: + total = _varbase_creator(dtype="int32") + + topk_out, topk_indices = topk(input, k=k) + _acc, _, _ = core.ops.accuracy(topk_out, topk_indices, label, correct, + total) + return _acc + + helper = LayerHelper("accuracy", **locals()) + check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], + 'accuracy') + topk_out, topk_indices = topk(input, k=k) + acc_out = helper.create_variable_for_type_inference(dtype="float32") + if correct is None: + correct = helper.create_variable_for_type_inference(dtype="int32") + if total is None: + total = helper.create_variable_for_type_inference(dtype="int32") + helper.append_op( + type="accuracy", + inputs={ + "Out": [topk_out], + "Indices": [topk_indices], + "Label": [label] + }, + outputs={ + "Accuracy": [acc_out], + "Correct": [correct], + "Total": [total], + }) + return acc_out diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 1dddef0cace1d..1d626c38c21bd 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -37,19 +37,16 @@ # from .clip import set_gradient_clip #DEFINE_ALIAS from .clip import clip #DEFINE_ALIAS from .clip import clip_by_norm #DEFINE_ALIAS -from .control_flow import cond #DEFINE_ALIAS +# from .control_flow import cond #DEFINE_ALIAS # from .control_flow import DynamicRNN #DEFINE_ALIAS # from .control_flow import StaticRNN #DEFINE_ALIAS -from .control_flow import while_loop #DEFINE_ALIAS +# from .control_flow import while_loop #DEFINE_ALIAS # from .control_flow import rnn #DEFINE_ALIAS -# from .decode import BeamSearchDecoder #DEFINE_ALIAS +from .decode import BeamSearchDecoder #DEFINE_ALIAS +from .decode import dynamic_decode #DEFINE_ALIAS # from .decode import Decoder #DEFINE_ALIAS -# from .decode import beam_search #DEFINE_ALIAS -# from .decode import beam_search_decode #DEFINE_ALIAS # from .decode import crf_decoding #DEFINE_ALIAS # from .decode import ctc_greedy_decoder #DEFINE_ALIAS -# from .decode import dynamic_decode #DEFINE_ALIAS -from .decode import gather_tree #DEFINE_ALIAS # from .input import Input #DEFINE_ALIAS from .layer.activation import ELU #DEFINE_ALIAS from .layer.activation import GELU #DEFINE_ALIAS @@ -76,23 +73,14 @@ from .layer.activation import Maxout #DEFINE_ALIAS from .layer.common import BilinearTensorProduct #DEFINE_ALIAS from .layer.common import Pool2D #DEFINE_ALIAS +from .layer.common import Pad1D #DEFINE_ALIAS from .layer.common import Pad2D #DEFINE_ALIAS -from .layer.common import ReflectionPad1d #DEFINE_ALIAS -from .layer.common import ReplicationPad1d #DEFINE_ALIAS -from .layer.common import ConstantPad1d #DEFINE_ALIAS -from .layer.common import ReflectionPad2d #DEFINE_ALIAS -from .layer.common import ReplicationPad2d #DEFINE_ALIAS -from .layer.common import ConstantPad2d #DEFINE_ALIAS -from .layer.common import ZeroPad2d #DEFINE_ALIAS -from .layer.common import ReplicationPad3d 
#DEFINE_ALIAS -from .layer.common import ConstantPad3d #DEFINE_ALIAS +from .layer.common import Pad3D #DEFINE_ALIAS from .layer.common import CosineSimilarity #DEFINE_ALIAS from .layer.common import Embedding #DEFINE_ALIAS from .layer.common import Linear #DEFINE_ALIAS from .layer.common import Flatten #DEFINE_ALIAS from .layer.common import Upsample #DEFINE_ALIAS -from .layer.common import UpsamplingNearest2d #DEFINE_ALIAS -from .layer.common import UpsamplingBilinear2d #DEFINE_ALIAS from .layer.common import Bilinear #DEFINE_ALIAS from .layer.common import Dropout #DEFINE_ALIAS from .layer.common import Dropout2d #DEFINE_ALIAS @@ -121,13 +109,6 @@ # from .layer.conv import TreeConv #DEFINE_ALIAS # from .layer.conv import Conv1D #DEFINE_ALIAS from .layer.extension import RowConv #DEFINE_ALIAS -# from .layer.learning_rate import CosineDecay #DEFINE_ALIAS -# from .layer.learning_rate import ExponentialDecay #DEFINE_ALIAS -# from .layer.learning_rate import InverseTimeDecay #DEFINE_ALIAS -# from .layer.learning_rate import NaturalExpDecay #DEFINE_ALIAS -# from .layer.learning_rate import NoamDecay #DEFINE_ALIAS -# from .layer.learning_rate import PiecewiseDecay #DEFINE_ALIAS -# from .layer.learning_rate import PolynomialDecay #DEFINE_ALIAS from .layer.common import Linear # from .layer.loss import NCELoss #DEFINE_ALIAS from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS @@ -146,13 +127,13 @@ from .layer.norm import GroupNorm #DEFINE_ALIAS from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS -from .layer.norm import InstanceNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm1d #DEFINE_ALIAS from .layer.norm import InstanceNorm2d #DEFINE_ALIAS from .layer.norm import InstanceNorm3d #DEFINE_ALIAS from .layer.norm import BatchNorm1d #DEFINE_ALIAS from .layer.norm import BatchNorm2d #DEFINE_ALIAS from .layer.norm import BatchNorm3d #DEFINE_ALIAS +from .layer.norm import LocalResponseNorm #DEFINE_ALIAS from .layer.rnn import RNNCellBase #DEFINE_ALIAS from .layer.rnn import SimpleRNNCell #DEFINE_ALIAS diff --git a/python/paddle/nn/control_flow.py b/python/paddle/nn/control_flow.py deleted file mode 100644 index a78b65c3c6c82..0000000000000 --- a/python/paddle/nn/control_flow.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# TODO: define the control flow api -from ..fluid.layers import cond #DEFINE_ALIAS -from ..fluid.layers import while_loop #DEFINE_ALIAS - -__all__ = [ - 'cond', - # 'DynamicRNN', - # 'StaticRNN', - 'while_loop', - # 'rnn' -] diff --git a/python/paddle/nn/decode.py b/python/paddle/nn/decode.py index 214744217e957..bba5aba0da9ad 100644 --- a/python/paddle/nn/decode.py +++ b/python/paddle/nn/decode.py @@ -12,19 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
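# The nn/__init__.py hunk above replaces the per-mode padding layers
# (ReflectionPad2d, ReplicationPad2d, ConstantPad2d, ZeroPad2d, ...) with the
# consolidated Pad1D/Pad2D/Pad3D classes. A minimal sketch, assuming the new
# Pad2D selects the former behaviour through a ``mode`` argument:
import paddle

x = paddle.rand([1, 1, 3, 3])
reflect = paddle.nn.Pad2D(padding=[1, 1, 1, 1], mode='reflect')                # was ReflectionPad2d
constant = paddle.nn.Pad2D(padding=[1, 1, 1, 1], mode='constant', value=0.0)   # was ConstantPad2d / ZeroPad2d
print(reflect(x).shape, constant(x).shape)   # both [1, 1, 5, 5]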
-# TODO: define api to implement decoding algorithm -# from ..fluid.layers import beam_search #DEFINE_ALIAS -# from ..fluid.layers import beam_search_decode #DEFINE_ALIAS - -from ..fluid.layers import gather_tree #DEFINE_ALIAS +from ..fluid.layers import BeamSearchDecoder #DEFINE_ALIAS +from ..fluid.layers import dynamic_decode #DEFINE_ALIAS __all__ = [ - # 'BeamSearchDecoder', - # 'Decoder', - # 'beam_search', - # 'beam_search_decode', - # 'crf_decoding', - # 'ctc_greedy_decoder', - # 'dynamic_decode', - 'gather_tree' + 'BeamSearchDecoder', + 'dynamic_decode', ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 30eefb2c3912b..5f9307845ae9d 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -30,7 +30,7 @@ from . import loss __all__ += loss.__all__ from .activation import elu #DEFINE_ALIAS -from .activation import erf #DEFINE_ALIAS +# from .activation import erf #DEFINE_ALIAS from .activation import gelu #DEFINE_ALIAS from .activation import hardshrink #DEFINE_ALIAS from .activation import hardtanh #DEFINE_ALIAS @@ -44,7 +44,7 @@ from .activation import relu6 #DEFINE_ALIAS from .activation import selu #DEFINE_ALIAS from .activation import sigmoid #DEFINE_ALIAS -from .activation import soft_relu #DEFINE_ALIAS +# from .activation import soft_relu #DEFINE_ALIAS from .activation import softmax #DEFINE_ALIAS from .activation import softplus #DEFINE_ALIAS from .activation import softshrink #DEFINE_ALIAS @@ -61,10 +61,10 @@ # from .common import embedding #DEFINE_ALIAS # from .common import fc #DEFINE_ALIAS from .common import label_smooth -from .common import one_hot #DEFINE_ALIAS +# from .common import one_hot #DEFINE_ALIAS from .common import pad #DEFINE_ALIAS -from .common import pad_constant_like #DEFINE_ALIAS -from .common import pad2d #DEFINE_ALIAS +# from .common import pad_constant_like #DEFINE_ALIAS +# from .common import pad2d #DEFINE_ALIAS from .common import cosine_similarity #DEFINE_ALIAS from .common import unfold #DEFINE_ALIAS # from .common import bilinear_tensor_product #DEFINE_ALIAS @@ -79,30 +79,22 @@ from .conv import conv_transpose2d #DEFINE_ALIAS from .conv import conv3d #DEFINE_ALIAS from .conv import conv_transpose3d #DEFINE_ALIAS -from .extension import add_position_encoding #DEFINE_ALIAS +# from .extension import add_position_encoding #DEFINE_ALIAS # from .extension import autoincreased_step_counter #DEFINE_ALIAS -from .extension import continuous_value_model #DEFINE_ALIAS -from .extension import filter_by_instag #DEFINE_ALIAS +# from .extension import continuous_value_model #DEFINE_ALIAS +# from .extension import filter_by_instag #DEFINE_ALIAS # from .extension import linear_chain_crf #DEFINE_ALIAS # from .extension import merge_selected_rows #DEFINE_ALIAS -from .extension import multiclass_nms #DEFINE_ALIAS -from .extension import polygon_box_transform #DEFINE_ALIAS -from .extension import random_crop #DEFINE_ALIAS -from .extension import row_conv #DEFINE_ALIAS -from .extension import rpn_target_assign #DEFINE_ALIAS -from .extension import similarity_focus #DEFINE_ALIAS -from .extension import target_assign #DEFINE_ALIAS -from .extension import temporal_shift #DEFINE_ALIAS -from .extension import warpctc #DEFINE_ALIAS +# from .extension import multiclass_nms #DEFINE_ALIAS +# from .extension import polygon_box_transform #DEFINE_ALIAS +# from .extension import random_crop #DEFINE_ALIAS +# from .extension import row_conv #DEFINE_ALIAS +# from .extension import 
rpn_target_assign #DEFINE_ALIAS +# from .extension import similarity_focus #DEFINE_ALIAS +# from .extension import target_assign #DEFINE_ALIAS +# from .extension import temporal_shift #DEFINE_ALIAS +# from .extension import warpctc #DEFINE_ALIAS from .extension import diag_embed #DEFINE_ALIAS -from .learning_rate import cosine_decay #DEFINE_ALIAS -from .learning_rate import exponential_decay #DEFINE_ALIAS -from .learning_rate import inverse_time_decay #DEFINE_ALIAS -from .learning_rate import natural_exp_decay #DEFINE_ALIAS -from .learning_rate import noam_decay #DEFINE_ALIAS -from .learning_rate import piecewise_decay #DEFINE_ALIAS -from .learning_rate import polynomial_decay #DEFINE_ALIAS -from .learning_rate import linear_lr_warmup #DEFINE_ALIAS # from .lod import sequence_concat #DEFINE_ALIAS # from .lod import sequence_conv #DEFINE_ALIAS # from .lod import sequence_enumerate #DEFINE_ALIAS @@ -123,7 +115,7 @@ # from .lod import array_read #DEFINE_ALIAS # from .lod import array_write #DEFINE_ALIAS # from .lod import create_array #DEFINE_ALIAS -from .lod import hash #DEFINE_ALIAS +# from .lod import hash #DEFINE_ALIAS # from .lod import im2sequence #DEFINE_ALIAS # from .lod import lod_append #DEFINE_ALIAS # from .lod import lod_reset #DEFINE_ALIAS @@ -134,11 +126,10 @@ # from .lod import dynamic_lstmp #DEFINE_ALIAS from .loss import binary_cross_entropy #DEFINE_ALIAS from .loss import binary_cross_entropy_with_logits #DEFINE_ALIAS -from .loss import bpr_loss #DEFINE_ALIAS -from .loss import center_loss #DEFINE_ALIAS +# from .loss import bpr_loss #DEFINE_ALIAS +# from .loss import center_loss #DEFINE_ALIAS from .loss import cross_entropy #DEFINE_ALIAS from .loss import dice_loss #DEFINE_ALIAS -from .loss import edit_distance #DEFINE_ALIAS from .loss import hsigmoid_loss #DEFINE_ALIAS from .loss import iou_similarity #DEFINE_ALIAS from .loss import kl_div #DEFINE_ALIAS @@ -149,27 +140,24 @@ from .loss import nll_loss #DEFINE_ALIAS # from .loss import nce #DEFINE_ALIAS from .loss import npair_loss #DEFINE_ALIAS -from .loss import rank_loss #DEFINE_ALIAS -from .loss import sampled_softmax_with_cross_entropy #DEFINE_ALIAS -from .loss import sigmoid_cross_entropy_with_logits #DEFINE_ALIAS from .loss import sigmoid_focal_loss #DEFINE_ALIAS -from .loss import smooth_l1 #DEFINE_ALIAS +# from .loss import smooth_l1 #DEFINE_ALIAS from .loss import smooth_l1_loss #DEFINE_ALIAS from .loss import softmax_with_cross_entropy #DEFINE_ALIAS from .loss import square_error_cost #DEFINE_ALIAS from .loss import ssd_loss #DEFINE_ALIAS -from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS +# from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS from .loss import ctc_loss #DEFINE_ALIAS # from .norm import data_norm #DEFINE_ALIAS # from .norm import group_norm #DEFINE_ALIAS from .norm import batch_norm #DEFINE_ALIAS from .norm import instance_norm #DEFINE_ALIAS from .norm import layer_norm #DEFINE_ALIAS -from .norm import lrn #DEFINE_ALIAS +from .norm import local_response_norm #DEFINE_ALIAS from .norm import normalize #DEFINE_ALIAS # from .norm import spectral_norm #DEFINE_ALIAS -from .pooling import pool2d #DEFINE_ALIAS -from .pooling import pool3d #DEFINE_ALIAS +# from .pooling import pool2d #DEFINE_ALIAS +# from .pooling import pool3d #DEFINE_ALIAS from .pooling import avg_pool1d #DEFINE_ALIAS from .pooling import avg_pool2d #DEFINE_ALIAS from .pooling import avg_pool3d #DEFINE_ALIAS @@ -184,47 +172,48 @@ from .pooling import adaptive_avg_pool2d #DEFINE_ALIAS from .pooling import 
adaptive_avg_pool3d #DEFINE_ALIAS -from .rnn import rnn #DEFINE_ALIAS -from .rnn import birnn #DEFINE_ALIAS +# from .rnn import rnn #DEFINE_ALIAS +# from .rnn import birnn #DEFINE_ALIAS # from .rnn import gru_unit #DEFINE_ALIAS # from .rnn import lstm #DEFINE_ALIAS # from .rnn import lstm_unit #DEFINE_ALIAS -from .vision import affine_channel #DEFINE_ALIAS +# from .vision import affine_channel #DEFINE_ALIAS from .vision import affine_grid #DEFINE_ALIAS -from .vision import anchor_generator #DEFINE_ALIAS -from .vision import bipartite_match #DEFINE_ALIAS -from .vision import box_clip #DEFINE_ALIAS -from .vision import box_coder #DEFINE_ALIAS -from .vision import box_decoder_and_assign #DEFINE_ALIAS -from .vision import collect_fpn_proposals #DEFINE_ALIAS +# from .vision import anchor_generator #DEFINE_ALIAS +# from .vision import bipartite_match #DEFINE_ALIAS +# from .vision import box_clip #DEFINE_ALIAS +# from .vision import box_coder #DEFINE_ALIAS +# from .vision import box_decoder_and_assign #DEFINE_ALIAS +# from .vision import collect_fpn_proposals #DEFINE_ALIAS # from .vision import deformable_conv #DEFINE_ALIAS -from .vision import deformable_roi_pooling #DEFINE_ALIAS -from .vision import density_prior_box #DEFINE_ALIAS -from .vision import detection_output #DEFINE_ALIAS -from .vision import distribute_fpn_proposals #DEFINE_ALIAS -from .vision import fsp_matrix #DEFINE_ALIAS -from .vision import generate_mask_labels #DEFINE_ALIAS -from .vision import generate_proposal_labels #DEFINE_ALIAS -from .vision import generate_proposals #DEFINE_ALIAS +# from .vision import deformable_roi_pooling #DEFINE_ALIAS +# from .vision import density_prior_box #DEFINE_ALIAS +# from .vision import detection_output #DEFINE_ALIAS +# from .vision import distribute_fpn_proposals #DEFINE_ALIAS +# from .vision import fsp_matrix #DEFINE_ALIAS +# from .vision import generate_mask_labels #DEFINE_ALIAS +# from .vision import generate_proposal_labels #DEFINE_ALIAS +# from .vision import generate_proposals #DEFINE_ALIAS from .vision import grid_sample #DEFINE_ALIAS -from .vision import image_resize #DEFINE_ALIAS -from .vision import image_resize_short #DEFINE_ALIAS +# from .vision import image_resize #DEFINE_ALIAS +# from .vision import image_resize_short #DEFINE_ALIAS # from .vision import multi_box_head #DEFINE_ALIAS from .vision import pixel_shuffle #DEFINE_ALIAS -from .vision import prior_box #DEFINE_ALIAS -from .vision import prroi_pool #DEFINE_ALIAS -from .vision import psroi_pool #DEFINE_ALIAS -from .vision import resize_bilinear #DEFINE_ALIAS -from .vision import resize_nearest #DEFINE_ALIAS -from .vision import resize_trilinear #DEFINE_ALIAS -from .vision import retinanet_detection_output #DEFINE_ALIAS -from .vision import retinanet_target_assign #DEFINE_ALIAS -from .vision import roi_align #DEFINE_ALIAS -from .vision import roi_perspective_transform #DEFINE_ALIAS -from .vision import roi_pool #DEFINE_ALIAS -from .vision import shuffle_channel #DEFINE_ALIAS -from .vision import space_to_depth #DEFINE_ALIAS -from .vision import yolo_box #DEFINE_ALIAS -from .vision import yolov3_loss #DEFINE_ALIAS +# from .vision import prior_box #DEFINE_ALIAS +# from .vision import prroi_pool #DEFINE_ALIAS +# from .vision import psroi_pool #DEFINE_ALIAS +# from .vision import resize_bilinear #DEFINE_ALIAS +# from .vision import resize_nearest #DEFINE_ALIAS +# from .vision import resize_trilinear #DEFINE_ALIAS +# from .vision import retinanet_detection_output #DEFINE_ALIAS +# from .vision import retinanet_target_assign 
#DEFINE_ALIAS +# from .vision import roi_align #DEFINE_ALIAS +# from .vision import roi_perspective_transform #DEFINE_ALIAS +# from .vision import roi_pool #DEFINE_ALIAS +# from .vision import shuffle_channel #DEFINE_ALIAS +# from .vision import space_to_depth #DEFINE_ALIAS +# from .vision import yolo_box #DEFINE_ALIAS +# from .vision import yolov3_loss #DEFINE_ALIAS from .input import one_hot #DEFINE_ALIAS from .input import embedding #DEFINE_ALIAS +from ...fluid.layers import gather_tree diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 33ecd29162c12..0f79aa012ca32 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -13,14 +13,18 @@ # limitations under the License. # TODO: define activation functions of neural network -from ...fluid.layers import erf #DEFINE_ALIAS -from ...fluid.layers import soft_relu #DEFINE_ALIAS +from ...fluid.layers import brelu #DEFINE_ALIAS +# from ...fluid.layers import erf #DEFINE_ALIAS +from ...fluid.layers import hard_sigmoid #DEFINE_ALIAS +from ...fluid.layers import hard_swish #DEFINE_ALIAS +from ...fluid.layers import maxout #DEFINE_ALIAS +# from ...fluid.layers import soft_relu #DEFINE_ALIAS +from ...fluid.layers import swish #DEFINE_ALIAS from ...fluid.layers import sigmoid #DEFINE_ALIAS from ...tensor.math import tanh #DEFINE_ALIAS __all__ = [ 'elu', - 'erf', 'gelu', 'hardshrink', 'hardtanh', @@ -33,7 +37,6 @@ 'relu', 'relu6', 'selu', - 'soft_relu', 'softmax', 'softplus', 'softshrink', diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 81c38c0be6557..0b18dec943d5f 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -20,13 +20,12 @@ from ...fluid.layers import core from ...fluid import dygraph_utils # TODO: define the common functions to build a neural network -from ...fluid import one_hot #DEFINE_ALIAS -from ...fluid.layers import pad2d #DEFINE_ALIAS +# from ...fluid import one_hot #DEFINE_ALIAS +# from ...fluid.layers import pad2d #DEFINE_ALIAS from ...fluid.layers import unfold #DEFINE_ALIAS from ...fluid.layers import assign #DEFINE_ALIAS from ...fluid.layers import squeeze #DEFINE_ALIAS from ...fluid.layers import unsqueeze #DEFINE_ALIAS -from ...fluid.layers import elementwise_mul #DEFINE_ALIAS from ...tensor import clip from ...tensor import sum from ...tensor import sqrt @@ -36,7 +35,7 @@ from ...fluid.framework import Variable, in_dygraph_mode, _varbase_creator #from ...fluid.layers import fc #DEFINE_ALIAS -from ...fluid.layers import pad_constant_like #DEFINE_ALIAS +# from ...fluid.layers import pad_constant_like #DEFINE_ALIAS from ...fluid.framework import in_dygraph_mode from ...fluid import core, dygraph_utils from ...fluid import core, layers @@ -51,10 +50,7 @@ # 'fc', 'label_smooth', 'linear', - 'one_hot', 'pad', - 'pad_constant_like', - 'pad2d', 'unfold', # 'bilinear_tensor_product', 'assign', @@ -1395,9 +1391,9 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): # [0.99806249 0.9817672 0.94987036] """ - w12 = sum(elementwise_mul(x1, x2), axis=axis) - w1 = sum(elementwise_mul(x1, x1), axis=axis) - w2 = sum(elementwise_mul(x2, x2), axis=axis) + w12 = sum(paddle.multiply(x1, x2), axis=axis) + w1 = sum(paddle.multiply(x1, x1), axis=axis) + w2 = sum(paddle.multiply(x2, x2), axis=axis) n12 = sqrt(clip(w1 * w2, min=eps * eps)) cos_sim = w12 / n12 return cos_sim diff --git a/python/paddle/nn/functional/extension.py 
b/python/paddle/nn/functional/extension.py index 87210b3832fb0..4ec0f8407fa91 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -13,36 +13,10 @@ # limitations under the License. # TODO: define the extention functions -from ...fluid.layers import add_position_encoding #DEFINE_ALIAS -from ...fluid.layers import multiclass_nms #DEFINE_ALIAS -from ...fluid.layers import target_assign #DEFINE_ALIAS -from ...fluid.layers import temporal_shift #DEFINE_ALIAS - -from ...fluid.layers import continuous_value_model #DEFINE_ALIAS -from ...fluid.layers import filter_by_instag #DEFINE_ALIAS -from ...fluid.layers import polygon_box_transform #DEFINE_ALIAS -from ...fluid.layers import random_crop #DEFINE_ALIAS -from ...fluid.layers import rpn_target_assign #DEFINE_ALIAS -from ...fluid.layers import similarity_focus #DEFINE_ALIAS -from ...fluid.layers import warpctc #DEFINE_ALIAS __all__ = [ - 'add_position_encoding', - # 'autoincreased_step_counter', - 'continuous_value_model', - 'filter_by_instag', - # 'linear_chain_crf', - # 'merge_selected_rows', - 'multiclass_nms', - 'polygon_box_transform', - 'random_crop', - 'row_conv', - 'rpn_target_assign', - 'similarity_focus', - 'target_assign', - 'temporal_shift', - 'warpctc', - 'diag_embed' + 'diag_embed', + 'row_conv' ] import numpy as np @@ -176,8 +150,6 @@ def __check_input(input, offset, dim1, dim2): @templatedoc() def row_conv(input, weight, act=None): """ - :alias_main: paddle.nn.functional.row_conv - :alias: paddle.nn.functional.row_conv,paddle.nn.functional.extension.row_conv ${comment} @@ -217,7 +189,7 @@ def row_conv(input, weight, act=None): with dg.guard(place): x_var = dg.to_variable(x) w_var = dg.to_variable(weight) - y_var = F.row_conv(x_var, w_var) + y_var = F.extension.row_conv(x_var, w_var) y_np = y_var.numpy() print(y_np.shape) diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 0794b95c80101..2e4bbd99a726d 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -74,7 +74,7 @@ def one_hot(x, num_classes, name=None): import paddle # Correspond to the first example above, where label.shape is 4 and one_hot_label.shape is [4, 4]. - label = paddle.data(name="label", shape=[4, 1], dtype="int64") + label = paddle.static.data(name="label", shape=[4, 1], dtype="int64") # label.shape = [4] # label.data = [1, 1, 3, 0] one_hot_label = paddle.nn.functional.one_hot(x=label, num_classes=4) @@ -183,7 +183,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): weight = prog.global_block().create_parameter( (128, 100), dtype="float32", default_initializer=Constant(1.0)) - label = paddle.data( + label = paddle.static.data( name="label", shape=[4], append_batch_size=False, diff --git a/python/paddle/nn/functional/learning_rate.py b/python/paddle/nn/functional/learning_rate.py deleted file mode 100644 index 83837fc5d46ac..0000000000000 --- a/python/paddle/nn/functional/learning_rate.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# TODO: define learning rate decay -from ...fluid.layers import cosine_decay #DEFINE_ALIAS -from ...fluid.layers import exponential_decay #DEFINE_ALIAS -from ...fluid.layers import inverse_time_decay #DEFINE_ALIAS -from ...fluid.layers import natural_exp_decay #DEFINE_ALIAS -from ...fluid.layers import noam_decay #DEFINE_ALIAS -from ...fluid.layers import piecewise_decay #DEFINE_ALIAS -from ...fluid.layers import polynomial_decay #DEFINE_ALIAS -from ...fluid.layers import linear_lr_warmup #DEFINE_ALIAS - -__all__ = [ - 'cosine_decay', 'exponential_decay', 'inverse_time_decay', - 'natural_exp_decay', 'noam_decay', 'piecewise_decay', 'polynomial_decay', - 'linear_lr_warmup' -] diff --git a/python/paddle/nn/functional/lod.py b/python/paddle/nn/functional/lod.py deleted file mode 100644 index 266e3f9c71efd..0000000000000 --- a/python/paddle/nn/functional/lod.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# TODO: define functions which accept only LoDTensor as input -from ...fluid.layers import hash #DEFINE_ALIAS - -__all__ = [ - # 'sequence_concat', - # 'sequence_conv', - # 'sequence_enumerate', - # 'sequence_expand_as', - # 'sequence_expand', - # 'sequence_first_step', - # 'sequence_last_step', - # 'sequence_mask', - # 'sequence_pad', - # 'sequence_pool', - # 'sequence_reshape', - # 'sequence_reverse', - # 'sequence_scatter', - # 'sequence_slice', - # 'sequence_softmax', - # 'sequence_unpad', - # 'array_length', - # 'array_read', - # 'array_write', - # 'create_array', - 'hash', - # 'im2sequence', - # 'lod_append', - # 'lod_reset', - # 'reorder_lod_tensor_by_rank', - # 'tensor_array_to_tensor', - # 'dynamic_gru', - # 'dynamic_lstm', - # 'dynamic_lstmp' -] diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index d085213dffc23..b056029fb5aa1 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -23,20 +23,14 @@ import paddle.fluid as fluid from ...fluid.framework import core, in_dygraph_mode from ...fluid.layers.nn import _elementwise_op_in_dygraph -from ...fluid.layers import bpr_loss #DEFINE_ALIAS -from ...fluid.layers import center_loss #DEFINE_ALIAS from ...fluid.layers import dice_loss #DEFINE_ALIAS from ...fluid.layers import iou_similarity #DEFINE_ALIAS from ...fluid.layers import log_loss #DEFINE_ALIAS from ...fluid.layers import npair_loss #DEFINE_ALIAS -from ...fluid.layers import rank_loss #DEFINE_ALIAS from ...fluid.layers import reshape -from ...fluid.layers import sigmoid_cross_entropy_with_logits #DEFINE_ALIAS -from ...fluid.layers import smooth_l1 #DEFINE_ALIAS from ...fluid.layers import softmax_with_cross_entropy #DEFINE_ALIAS from ...fluid.layers import square_error_cost #DEFINE_ALIAS from ...fluid.layers import ssd_loss #DEFINE_ALIAS -from ...fluid.layers import teacher_student_sigmoid_loss #DEFINE_ALIAS from ...fluid.layers import edit_distance #DEFINE_ALIAS from ...fluid.layers import sampled_softmax_with_cross_entropy #DEFINE_ALIAS @@ -49,11 +43,8 @@ __all__ = [ 'binary_cross_entropy', 'binary_cross_entropy_with_logits', - 'bpr_loss', - 'center_loss', 'cross_entropy', 'dice_loss', - 'edit_distance', 'hsigmoid_loss', 'iou_similarity', 'kl_div', @@ -64,16 +55,11 @@ # 'nce', 'nll_loss', 'npair_loss', - 'rank_loss', - 'sampled_softmax_with_cross_entropy', - 'sigmoid_cross_entropy_with_logits', 'sigmoid_focal_loss', - 'smooth_l1', 'smooth_l1_loss', 'softmax_with_cross_entropy', 'square_error_cost', 'ssd_loss', - 'teacher_student_sigmoid_loss', 'ctc_loss', ] @@ -181,7 +167,7 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', outputs={'Out': [out]}) if weight is not None: - if isinstance(weight, paddle.framework.Variable): + if isinstance(weight, paddle.static.Variable): weight_name = name if reduction is 'none' else None out = paddle.multiply(out, weight, axis=-1, name=weight_name) else: @@ -316,16 +302,18 @@ def binary_cross_entropy_with_logits(logit, if reduction == 'none' and pos_weight is None and weight is None: sigmoid_name = name - out = paddle.nn.functional.sigmoid_cross_entropy_with_logits( + out = paddle.fluid.layers.sigmoid_cross_entropy_with_logits( logit, label, name=sigmoid_name) - one = paddle.fill_constant(shape=[1], value=1.0, dtype=logit.dtype) + one = paddle.fluid.layers.fill_constant( + shape=[1], value=1.0, dtype=logit.dtype) if pos_weight is not None: fluid.data_feeder.check_variable_and_dtype( pos_weight, 'pos_weight', ['float32', 'float64'], 
'binary_cross_entropy_with_logits') log_weight = paddle.add( - paddle.multiply(label, paddle.elementwise_sub(pos_weight, one)), + paddle.multiply( + label, paddle.fluid.layers.elementwise_sub(pos_weight, one)), one) pos_weight_name = name if reduction == 'none' and weight is None else None out = paddle.multiply(out, log_weight, name=pos_weight_name) @@ -627,12 +615,13 @@ def margin_ranking_loss(input, fluid.data_feeder.check_variable_and_dtype( label, 'label', ['float32', 'float64'], 'margin_rank_loss') - out = paddle.elementwise_sub(other, input) + out = paddle.fluid.layers.elementwise_sub(other, input) out = paddle.multiply(out, label) if margin != 0.0: margin_var = out.block.create_var(dtype=out.dtype) - paddle.fill_constant([1], out.dtype, margin, out=margin_var) + paddle.fluid.layers.fill_constant( + [1], out.dtype, margin, out=margin_var) out = paddle.add(out, margin_var) result_out = helper.create_variable_for_type_inference(input.dtype) @@ -737,13 +726,14 @@ def l1_loss(input, label, reduction='mean', name=None): label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss') if reduction == 'sum': - unreduced = paddle.elementwise_sub(input, label, act='abs') + unreduced = paddle.fluid.layers.elementwise_sub(input, label, act='abs') return paddle.sum(unreduced, name=name) elif reduction == 'mean': - unreduced = paddle.elementwise_sub(input, label, act='abs') + unreduced = paddle.fluid.layers.elementwise_sub(input, label, act='abs') return paddle.mean(unreduced, name=name) else: - return paddle.elementwise_sub(input, label, act='abs', name=name) + return paddle.fluid.layers.elementwise_sub( + input, label, act='abs', name=name) def nll_loss(input, @@ -1010,8 +1000,8 @@ def mse_loss(input, label, reduction='mean', name=None): # static graph mode paddle.enable_static() mse_loss = paddle.nn.loss.MSELoss() - input = paddle.data(name="input", shape=[1]) - label = paddle.data(name="label", shape=[1]) + input = paddle.fluid.data(name="input", shape=[1]) + label = paddle.fluid.data(name="label", shape=[1]) place = paddle.CPUPlace() output = mse_loss(input,label) @@ -1356,7 +1346,7 @@ def sigmoid_focal_loss(logit, label = paddle.to_tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32') one = paddle.to_tensor([1.], dtype='float32') fg_label = paddle.greater_equal(label, one) - fg_num = paddle.reduce_sum(paddle.cast(fg_label, dtype='float32')) + fg_num = paddle.sum(paddle.cast(fg_label, dtype='float32')) output = paddle.nn.functional.sigmoid_focal_loss(logit, label, normalizer=fg_num) print(output.numpy()) # [0.65782464] diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 75d1b549b08d5..9b78368259127 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -19,7 +19,6 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode, core from ...framework import create_parameter -from ...fluid.layers import lrn #DEFINE_ALIAS from ...fluid.initializer import Constant from ...fluid.param_attr import ParamAttr from ...fluid import core, dygraph_utils @@ -29,7 +28,7 @@ # 'data_norm', 'instance_norm', 'layer_norm', - 'lrn', + 'local_response_norm', 'normalize', # 'spectral_norm' ] @@ -110,8 +109,9 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): helper.append_op( type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs) eps = out.block.create_var(dtype=out.dtype) - paddle.fill_constant([1], out.dtype, epsilon, out=eps) - return paddle.elementwise_div(x, 
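The loss.py hunks above only reroute the internal elementwise_sub / fill_constant calls through paddle.fluid.layers; the public loss APIs keep their signatures. A short usage sketch for one of the touched functions, with illustrative values:

    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]])
    label = paddle.to_tensor([[1.7, 1.0], [0.2, 0.5]])

    print(F.l1_loss(input, label))                    # 'mean' reduction by default
    print(F.l1_loss(input, label, reduction='sum'))   # still computed via elementwise_sub(act='abs')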
paddle.maximum(out, eps), name=name) + paddle.fluid.layers.fill_constant([1], out.dtype, epsilon, out=eps) + return paddle.fluid.layers.elementwise_div( + x, paddle.maximum(out, eps), name=name) def batch_norm(x, @@ -138,7 +138,7 @@ def batch_norm(x, epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Defalut False. - data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW". + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Defalut "NCHW". name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Returns: @@ -174,13 +174,13 @@ def batch_norm(x, mean_out = running_mean variance_out = running_var - true_data_format = ['NC', 'NCL', 'NCHW', 'NCDHW'] + true_data_format = ['NC', 'NCL', 'NCHW', 'NCDHW', 'NLC', 'NHWC', 'NDHWC'] if data_format not in true_data_format: raise ValueError( - "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', but receive {}". - format(data_format)) + "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', " + "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)) - data_format = 'NCHW' + data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC' if in_dygraph_mode(): # for dygraph need tuple @@ -403,3 +403,109 @@ def instance_norm(x, helper.append_op( type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs) return instance_norm_out + + +def local_response_norm(x, + size, + alpha=1e-4, + beta=0.75, + k=1., + data_format="NCHW", + name=None): + """ + Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. + For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ + + The formula is as follows: + + .. math:: + + Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C-1, i + size/2)}_{j = \\max(0, i - size/2)}(Input(j, x, y))^2\\right)^{\\beta} + + In the above equation: + + - :math:`size` : The number of channels to sum over. + - :math:`k` : The offset (avoid being divided by 0). + - :math:`\\alpha` : The scaling parameter. + - :math:`\\beta` : The exponent parameter. + + + Args: + x (Tensor): The input 3-D/4-D/5-D tensor. The data type is float32. + size (int): The number of channels to sum over. + alpha (float, optional): The scaling parameter, positive. Default:1e-4 + beta (float, optional): The exponent, positive. Default:0.75 + k (float, optional): An offset, positive. Default: 1.0 + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: + If x is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`, + the data is stored in the order of: `[batch_size, input_channels, feature_length]`. + If x is 4-D Tensor, the string could be `"NCHW"`, `"NHWC"`. When it is `"NCHW"`, + the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. 
+ If x is 5-D Tensor, the string could be `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`, + the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. + name (str, optional): Name for the operation (optional, default is None). For more information, + please refer to :ref:`api_guide_Name`. + + Returns: + A tensor storing the transformation result with the same shape and data type as input. + + + Examples: + + .. code-block:: python + + import paddle + + x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") + y = paddle.nn.functional.local_response_norm(x, size=5) + print(y.shape) # [3, 3, 112, 112] + """ + if not in_dygraph_mode(): + check_variable_and_dtype(x, 'x', ['float32'], 'local_response_norm') + if data_format not in ['NCL', 'NLC', 'NCHW', 'NHWC', 'NCDHW', 'NDHWC']: + raise ValueError( + "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \ + "but got {}".format(data_format)) + + sizes = x.shape + dim = len(sizes) + if dim < 3: + raise ValueError( + 'Expected 3D or higher dimensionality input, but got {} dimensions'. + format(dim)) + + channel_last = True if data_format[-1] == "C" else False + + div = paddle.unsqueeze(paddle.multiply(x, x), axis=1) + if not channel_last: + pad4d_shape = [0, 0, size // 2, (size - 1) // 2] + pool2d_shape = (size, 1) + reshape_shape = [sizes[0], 1, sizes[1], sizes[2], -1] + pad5d_shape = [0, 0, 0, 0, size // 2, (size - 1) // 2] + pool3d_shape = (size, 1, 1) + else: + pad4d_shape = [size // 2, (size - 1) // 2, 0, 0] + pool2d_shape = (1, size) + reshape_shape = [sizes[0], 1, sizes[1], -1, sizes[-1]] + pad5d_shape = [size // 2, (size - 1) // 2, 0, 0, 0, 0] + pool3d_shape = (1, 1, size) + + if dim == 3: + div = paddle.nn.functional.pad(div, pad=pad4d_shape) + div = paddle.nn.functional.avg_pool2d( + div, kernel_size=pool2d_shape, stride=1) + div = paddle.squeeze(div, axis=1) + else: + div = paddle.reshape(div, shape=reshape_shape) + div = paddle.nn.functional.pad(div, + pad=pad5d_shape, + data_format='NCDHW') + div = paddle.nn.functional.avg_pool3d( + div, kernel_size=pool3d_shape, stride=1) + div = paddle.reshape(paddle.squeeze(div, axis=1), sizes) + + div = paddle.scale(div, scale=alpha, bias=k) + div = paddle.pow(div, beta) + res = paddle.divide(x, div, name=name) + return res diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 1b8e1fb576880..73652ff1266f5 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -13,16 +13,12 @@ # limitations under the License. 
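The new local_response_norm above also accepts 3-D and channel-last inputs, which its own docstring example does not exercise. A hedged sketch with toy shapes (only the shapes are checked here):

    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()

    # a 3-D "NCL" input goes through the pad + avg_pool2d branch of the new code
    x3 = paddle.rand([4, 8, 16], dtype='float32')
    print(F.local_response_norm(x3, size=3, data_format='NCL').shape)    # [4, 8, 16]

    # a 5-D channel-last input goes through the reshape + avg_pool3d branch
    x5 = paddle.rand([2, 4, 8, 8, 6], dtype='float32')
    print(F.local_response_norm(x5, size=5, data_format='NDHWC').shape)  # [2, 4, 8, 8, 6]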
# TODO: define pooling functions -from ...fluid.layers import pool2d #DEFINE_ALIAS -from ...fluid.layers import pool3d #DEFINE_ALIAS from ...fluid import core from ...fluid.framework import in_dygraph_mode from ...fluid.layers import utils, LayerHelper, unsqueeze, squeeze from ...fluid.data_feeder import check_type, check_variable_and_dtype __all__ = [ - 'pool2d', - 'pool3d', 'avg_pool1d', 'avg_pool2d', 'avg_pool3d', diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 7f86e56df1b54..5e1cb377bd72b 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -20,79 +20,44 @@ import numpy as np # TODO: define specitial functions used in computer vision task -from ...fluid.layers import affine_channel #DEFINE_ALIAS -from ...fluid.layers import anchor_generator #DEFINE_ALIAS -from ...fluid.layers import bipartite_match #DEFINE_ALIAS -from ...fluid.layers import box_clip #DEFINE_ALIAS -from ...fluid.layers import box_coder #DEFINE_ALIAS -from ...fluid.layers import box_decoder_and_assign #DEFINE_ALIAS -from ...fluid.layers import collect_fpn_proposals #DEFINE_ALIAS -from ...fluid.layers import deformable_roi_pooling #DEFINE_ALIAS -from ...fluid.layers import density_prior_box #DEFINE_ALIAS -from ...fluid.layers import detection_output #DEFINE_ALIAS -from ...fluid.layers import distribute_fpn_proposals #DEFINE_ALIAS -from ...fluid.layers import generate_mask_labels #DEFINE_ALIAS -from ...fluid.layers import generate_proposal_labels #DEFINE_ALIAS -from ...fluid.layers import generate_proposals #DEFINE_ALIAS -from ...fluid.layers import image_resize #DEFINE_ALIAS -from ...fluid.layers import prior_box #DEFINE_ALIAS -from ...fluid.layers import prroi_pool #DEFINE_ALIAS -from ...fluid.layers import psroi_pool #DEFINE_ALIAS -from ...fluid.layers import resize_bilinear #DEFINE_ALIAS -from ...fluid.layers import resize_nearest #DEFINE_ALIAS -from ...fluid.layers import resize_trilinear #DEFINE_ALIAS -from ...fluid.layers import roi_align #DEFINE_ALIAS -from ...fluid.layers import roi_pool #DEFINE_ALIAS -from ...fluid.layers import space_to_depth #DEFINE_ALIAS -from ...fluid.layers import yolo_box #DEFINE_ALIAS -from ...fluid.layers import yolov3_loss #DEFINE_ALIAS - -from ...fluid.layers import fsp_matrix #DEFINE_ALIAS -from ...fluid.layers import image_resize_short #DEFINE_ALIAS +# from ...fluid.layers import affine_channel #DEFINE_ALIAS +# from ...fluid.layers import anchor_generator #DEFINE_ALIAS +# from ...fluid.layers import bipartite_match #DEFINE_ALIAS +# from ...fluid.layers import box_clip #DEFINE_ALIAS +# from ...fluid.layers import box_coder #DEFINE_ALIAS +# from ...fluid.layers import box_decoder_and_assign #DEFINE_ALIAS +# from ...fluid.layers import collect_fpn_proposals #DEFINE_ALIAS +# from ...fluid.layers import deformable_roi_pooling #DEFINE_ALIAS +# from ...fluid.layers import density_prior_box #DEFINE_ALIAS +# from ...fluid.layers import detection_output #DEFINE_ALIAS +# from ...fluid.layers import distribute_fpn_proposals #DEFINE_ALIAS +# from ...fluid.layers import generate_mask_labels #DEFINE_ALIAS +# from ...fluid.layers import generate_proposal_labels #DEFINE_ALIAS +# from ...fluid.layers import generate_proposals #DEFINE_ALIAS +# from ...fluid.layers import image_resize #DEFINE_ALIAS +# from ...fluid.layers import prior_box #DEFINE_ALIAS +# from ...fluid.layers import prroi_pool #DEFINE_ALIAS +# from ...fluid.layers import psroi_pool #DEFINE_ALIAS +# from ...fluid.layers import resize_bilinear 
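With pool2d/pool3d dropped from the functional namespace above, the size-specific pooling functions that remain in __all__ are the intended replacements. An illustrative call with made-up shapes (assuming the avg_pool* signatures listed in this file):

    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x2d = paddle.rand([1, 3, 32, 32], dtype='float32')
    print(F.avg_pool2d(x2d, kernel_size=2, stride=2).shape)  # [1, 3, 16, 16]

    x1d = paddle.rand([1, 3, 32], dtype='float32')
    print(F.avg_pool1d(x1d, kernel_size=2, stride=2).shape)  # [1, 3, 16]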
#DEFINE_ALIAS +# from ...fluid.layers import resize_nearest #DEFINE_ALIAS +# from ...fluid.layers import resize_trilinear #DEFINE_ALIAS +# from ...fluid.layers import roi_align #DEFINE_ALIAS +# from ...fluid.layers import roi_pool #DEFINE_ALIAS +# from ...fluid.layers import space_to_depth #DEFINE_ALIAS +# from ...fluid.layers import yolo_box #DEFINE_ALIAS +# from ...fluid.layers import yolov3_loss #DEFINE_ALIAS +# from ...fluid.layers import fsp_matrix #DEFINE_ALIAS +# from ...fluid.layers import image_resize_short #DEFINE_ALIAS # from ...fluid.layers import pixel_shuffle #DEFINE_ALIAS -from ...fluid.layers import retinanet_detection_output #DEFINE_ALIAS -from ...fluid.layers import retinanet_target_assign #DEFINE_ALIAS -from ...fluid.layers import roi_perspective_transform #DEFINE_ALIAS -from ...fluid.layers import shuffle_channel #DEFINE_ALIAS +# from ...fluid.layers import retinanet_detection_output #DEFINE_ALIAS +# from ...fluid.layers import retinanet_target_assign #DEFINE_ALIAS +# from ...fluid.layers import roi_perspective_transform #DEFINE_ALIAS +# from ...fluid.layers import shuffle_channel #DEFINE_ALIAS __all__ = [ - 'affine_channel', 'affine_grid', - 'anchor_generator', - 'bipartite_match', - 'box_clip', - 'box_coder', - 'box_decoder_and_assign', - 'collect_fpn_proposals', - # 'deformable_conv', - 'deformable_roi_pooling', - 'density_prior_box', - 'detection_output', - 'distribute_fpn_proposals', - 'fsp_matrix', - 'generate_mask_labels', - 'generate_proposal_labels', - 'generate_proposals', 'grid_sample', - 'image_resize', - 'image_resize_short', - # 'multi_box_head', - 'pixel_shuffle', - 'prior_box', - 'prroi_pool', - 'psroi_pool', - 'resize_bilinear', - 'resize_nearest', - 'resize_trilinear', - 'retinanet_detection_output', - 'retinanet_target_assign', - 'roi_align', - 'roi_perspective_transform', - 'roi_pool', - 'shuffle_channel', - 'space_to_depth', - 'yolo_box', - 'yolov3_loss' + 'pixel_shuffle' ] diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py index db0f5dbff2b80..5d80386838435 100644 --- a/python/paddle/nn/initializer/__init__.py +++ b/python/paddle/nn/initializer/__init__.py @@ -14,22 +14,34 @@ # TODO: define the initializers to create a Parameter in neural network from ...fluid.initializer import Bilinear #DEFINE_ALIAS -from ...fluid.initializer import MSRA #DEFINE_ALIAS -from ...fluid.initializer import Normal #DEFINE_ALIAS -from ...fluid.initializer import TruncatedNormal #DEFINE_ALIAS -from ...fluid.initializer import Uniform #DEFINE_ALIAS -from ...fluid.initializer import Xavier #DEFINE_ALIAS from . import constant from .constant import Constant #DEFINE_ALIAS -__all__ = [ - 'Bilinear', - 'MSRA', - 'Normal', - 'TruncatedNormal', - 'Uniform', - 'Xavier', -] +from . import kaiming +from .kaiming import KaimingNormal #DEFINE_ALIAS +from .kaiming import KaimingUniform #DEFINE_ALIAS + +__all__ = ['Bilinear', ] __all__ += constant.__all__ +__all__ += kaiming.__all__ + +from . import xavier +from .xavier import XavierNormal #DEFINE_ALIAS +from .xavier import XavierUniform #DEFINE_ALIAS + +from . import assign +from .assign import Assign #DEFINE_ALIAS + +from . import normal +from .normal import Normal #DEFINE_ALIAS +from .normal import TruncatedNormal #DEFINE_ALIAS + +from . 
import uniform +from .uniform import Uniform #DEFINE_ALIAS + +__all__ += xavier.__all__ +__all__ += assign.__all__ +__all__ += normal.__all__ +__all__ += uniform.__all__ diff --git a/python/paddle/nn/initializer/assign.py b/python/paddle/nn/initializer/assign.py new file mode 100644 index 0000000000000..a33301230e89e --- /dev/null +++ b/python/paddle/nn/initializer/assign.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...fluid import framework +from ...fluid import core +from ...fluid import unique_name +from ...fluid.core import VarDesc +from ...fluid.data_feeder import check_type +from ...fluid.initializer import NumpyArrayInitializer + +__all__ = ['Assign'] + + +class Assign(NumpyArrayInitializer): + """Init an parameter with a numpy array, list, or tensor. + + Args: + value (Tensor|numpy.ndarray|list): numpy array, list, or tensor to initialize the parameter. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A parameter initialized by the input numpy array, list, or tensor. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + # numpy array + data_1 = paddle.ones(shape=[1, 2], dtype='float32') + weight_attr_1 = paddle.framework.ParamAttr( + name="linear_weight_1", + initializer=paddle.nn.initializer.Assign(np.array([2, 2]))) + bias_attr_1 = paddle.framework.ParamAttr( + name="linear_bias_1", + initializer=paddle.nn.initializer.Assign(np.array([2]))) + linear_1 = paddle.nn.Linear(2, 2, weight_attr=weight_attr_1, bias_attr=bias_attr_1) + # linear_1.weight: [2. 2.] + # linear_1.bias: [2.] + + res_1 = linear_1(data_1) + # res_1: [6.] + + # python list + data_2 = paddle.ones(shape=[1, 2], dtype='float32') + weight_attr_2 = paddle.framework.ParamAttr( + name="linear_weight_2", + initializer=paddle.nn.initializer.Assign([2, 2])) + bias_attr_2 = paddle.framework.ParamAttr( + name="linear_bias_2", + initializer=paddle.nn.initializer.Assign([2])) + linear_2 = paddle.nn.Linear(2, 2, weight_attr=weight_attr_2, bias_attr=bias_attr_2) + # linear_2.weight: [2. 2.] + # linear_2.bias: [2.] + + res_2 = linear_2(data_2) + # res_2: [6.] + + # tensor + data_3 = paddle.ones(shape=[1, 2], dtype='float32') + weight_attr_3 = paddle.framework.ParamAttr( + name="linear_weight_3", + initializer=paddle.nn.initializer.Assign(paddle.full([2], 2))) + bias_attr_3 = paddle.framework.ParamAttr( + name="linear_bias_3", + initializer=paddle.nn.initializer.Assign(paddle.full([1], 2))) + linear_3 = paddle.nn.Linear(2, 2, weight_attr=weight_attr_3, bias_attr=bias_attr_3) + # linear_3.weight: [2. 2.] + # linear_3.bias: [2.] + + res_3 = linear_3(data_3) + # res_3: [6.] 
+ """ + + def __init__(self, value, name=None): + import numpy + check_type(value, 'value', (numpy.ndarray, list, framework.Variable), + 'Assign') + + if (isinstance(value, list)): + value = numpy.array(value) + + # TODO: value is already is a tensor, accounting efficiency maybe it does not need to convert tensor to numpy data and then initialized. + if (isinstance(value, framework.Variable)): + value = value.numpy() + + super(Assign, self).__init__(value) diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py new file mode 100644 index 0000000000000..f0c6880e89d8e --- /dev/null +++ b/python/paddle/nn/initializer/kaiming.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: define the initializers of Kaiming functions in neural network +from ...fluid.initializer import MSRAInitializer + +__all__ = ['KaimingUniform', 'KaimingNormal'] + + +class KaimingNormal(MSRAInitializer): + """Implements the Kaiming Normal initializer + + This class implements the weight initialization from the paper + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. math:: + + \sqrt{\\frac{2.0}{fan\_in}} + + Args: + fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\ + inferred from the variable. default is None. + + Note: + It is recommended to set fan_in to None for most cases. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + + linear = nn.Linear(2, + 4, + weight_attr=nn.initializer.KaimingNormal()) + data = paddle.rand([30, 10, 2], dtype='float32') + res = linear(data) + + """ + + def __init__(self, fan_in=None): + super(KaimingNormal, self).__init__( + uniform=False, fan_in=fan_in, seed=0) + + +class KaimingUniform(MSRAInitializer): + """Implements the Kaiming Uniform initializer + + This class implements the weight initialization from the paper + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. + + In case of Uniform distribution, the range is [-x, x], where + + .. math:: + + x = \sqrt{\\frac{6.0}{fan\_in}} + + Args: + fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\ + inferred from the variable. default is None. + + Note: + It is recommended to set fan_in to None for most cases. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn as nn + + linear = nn.Linear(2, + 4, + weight_attr=nn.initializer.KaimingUniform()) + data = paddle.rand([30, 10, 2], dtype='float32') + res = linear(data) + + """ + + def __init__(self, fan_in=None): + super(KaimingUniform, self).__init__( + uniform=True, fan_in=fan_in, seed=0) diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py new file mode 100644 index 0000000000000..a572d0e2c9216 --- /dev/null +++ b/python/paddle/nn/initializer/normal.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...fluid.initializer import NormalInitializer +from ...fluid.initializer import TruncatedNormalInitializer + +__all__ = ['Normal', 'TruncatedNormal'] + + +class Normal(NormalInitializer): + """The Random Normal (Gaussian) distribution initializer. + + Args: + mean (float, optional): mean of the normal distribution. The default value is 0.0. + std (float, optional): standard deviation of the normal distribution. The default value is 1.0. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A parameter initialized by Random Normal (Gaussian) distribution. + + Examples: + .. code-block:: python + + import paddle + + data = paddle.ones(shape=[3, 1, 2], dtype='float32') + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0)) + bias_attr = paddle.framework.ParamAttr( + name="linear_bias", + initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0)) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + # linear.weight: [[ 2.1973135 -2.2697184] + # [-1.9104223 -1.0541488]] + # linear.bias: [ 0.7885926 -0.74719954] + + res = linear(data) + # res: [[[ 1.0754838 -4.071067 ]] + # [[ 1.0754838 -4.071067 ]] + # [[ 1.0754838 -4.071067 ]]] + """ + + def __init__(self, mean=0.0, std=1.0, name=None): + assert mean is not None, 'mean should not be None' + assert std is not None, 'std should not be None' + super(Normal, self).__init__(loc=mean, scale=std, seed=0) + + +class TruncatedNormal(TruncatedNormalInitializer): + """The Random TruncatedNormal (Gaussian) distribution initializer. + + Args: + mean (float, optional): mean of the normal distribution. The default value is 0.0. + std (float, optional): standard deviation of the normal distribution. The default value is 1.0. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A parameter initialized by Random TruncatedNormal (Gaussian) distribution. + + Examples: + .. 
code-block:: python + + import paddle + + data = paddle.ones(shape=[3, 1, 2], dtype='float32') + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.TruncatedNormal(mean=0.0, std=2.0)) + bias_attr = paddle.framework.ParamAttr( + name="linear_bias", + initializer=paddle.nn.initializer.TruncatedNormal(mean=0.0, std=2.0)) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + # linear.weight: [[-1.0981836 1.4140984] + # [ 3.1390522 -2.8266568]] + # linear.bias: [-2.1546738 -1.6570673] + + res = linear(data) + # res: [[[-0.11380529 -3.0696259 ]] + # [[-0.11380529 -3.0696259 ]] + # [[-0.11380529 -3.0696259 ]] + """ + + def __init__(self, mean=0.0, std=1.0, name=None): + assert mean is not None, 'mean should not be None' + assert std is not None, 'std should not be None' + super(TruncatedNormal, self).__init__(loc=mean, scale=std, seed=0) diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py new file mode 100644 index 0000000000000..a5d7d34efcf66 --- /dev/null +++ b/python/paddle/nn/initializer/uniform.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...fluid.initializer import UniformInitializer + +__all__ = ['Uniform'] + + +class Uniform(UniformInitializer): + """The random uniform distribution initializer. + + Args: + low (float, optional): lower boundary of the uniform distribution. The default value is -1.0. + high (float, optional): upper boundary of the uniform distribution. The default value is 1.0. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A parameter initialized by random uniform distribution. + + Examples: + .. 
code-block:: python + + import paddle + + data = paddle.ones(shape=[3, 1, 2], dtype='float32') + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.Uniform(low=-0.5, high=0.5)) + bias_attr = paddle.framework.ParamAttr( + name="linear_bias", + initializer=paddle.nn.initializer.Uniform(low=-0.5, high=0.5)) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + # linear.weight: [[-0.46245047 0.05260676] + # [ 0.38054508 0.29169726]] + # linear.bias: [-0.2734719 0.23939109] + + res = linear(data) + # res: [[[-0.3553773 0.5836951]] + # [[-0.3553773 0.5836951]] + # [[-0.3553773 0.5836951]]] + """ + + def __init__(self, low=-1.0, high=1.0, name=None): + assert low is not None, 'low should not be None' + assert high is not None, 'high should not be None' + assert high >= low, 'high should greater or equal than low' + super(Uniform, self).__init__( + low=low, high=high, seed=0, diag_num=0, diag_step=0, diag_val=1.0) diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py new file mode 100644 index 0000000000000..5a4e7fec057e7 --- /dev/null +++ b/python/paddle/nn/initializer/xavier.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...fluid.initializer import XavierInitializer + +__all__ = ['XavierNormal', 'XavierUniform'] + + +class XavierNormal(XavierInitializer): + """ + This class implements the Xavier weight initializer from the paper + `Understanding the difficulty of training deep feedforward neural + networks `_ + by Xavier Glorot and Yoshua Bengio, using a normal distribution. + + The mean is 0 and the standard deviation is + + .. math:: + + \sqrt{\\frac{2.0}{fan\_in + fan\_out}} + + + Args: + fan_in (float, optional): fan_in for Xavier initialization, It is + inferred from the tensor. The default value is None. + fan_out (float, optional): fan_out for Xavier initialization, it is + inferred from the tensor. The default value is None. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A parameter initialized by Xavier weight, using a normal distribution. + + Examples: + .. 
code-block:: python + + import paddle + + data = paddle.ones(shape=[3, 1, 2], dtype='float32') + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.XavierNormal()) + bias_attr = paddle.framework.ParamAttr( + name="linear_bias", + initializer=paddle.nn.initializer.XavierNormal()) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + # inear.weight: [[ 0.06910077 -0.18103665] + # [-0.02546741 -1.0402188 ]] + # linear.bias: [-0.5012929 0.12418364] + + res = linear(data) + # res: [[[-0.4576595 -1.0970719]] + # [[-0.4576595 -1.0970719]] + # [[-0.4576595 -1.0970719]]] + """ + + def __init__(self, fan_in=None, fan_out=None, name=None): + super(XavierNormal, self).__init__( + uniform=False, fan_in=fan_in, fan_out=fan_out, seed=0) + + +class XavierUniform(XavierInitializer): + """ + This class implements the Xavier weight initializer from the paper + `Understanding the difficulty of training deep feedforward neural + networks `_ + by Xavier Glorot and Yoshua Bengio. + + This initializer is designed to keep the scale of the gradients + approximately same in all the layers. In case of Uniform distribution, + the range is [-x, x], where + + .. math:: + + x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} + + Args: + fan_in (float, optional): fan_in for Xavier initialization, it is + inferred from the tensor. The default value is None. + fan_out (float, optional): fan_out for Xavier initialization, it is + inferred from the tensor. The default value is None. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A parameter initialized by Xavier weight, using a uniform distribution. + + Examples: + .. 
code-block:: python + + import paddle + + data = paddle.ones(shape=[3, 1, 2], dtype='float32') + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.framework.ParamAttr( + name="linear_bias", + initializer=paddle.nn.initializer.XavierUniform()) + linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + # linear.weight: [[-0.04229349 -1.1248565 ] + # [-0.10789523 -0.5938053 ]] + # linear.bias: [ 1.1983747 -0.40201235] + + res = linear(data) + # res: [[[ 1.0481861 -2.1206741]] + # [[ 1.0481861 -2.1206741]] + # [[ 1.0481861 -2.1206741]]] + """ + + def __init__(self, fan_in=None, fan_out=None, name=None): + super(XavierUniform, self).__init__( + uniform=True, fan_in=fan_in, fan_out=fan_out, seed=0) diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 3a5bcaa21fe5b..1defed3362c1c 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -44,23 +44,14 @@ from .common import BilinearTensorProduct #DEFINE_ALIAS from .common import Bilinear #DEFINE_ALIAS from .common import Pool2D #DEFINE_ALIAS +from .common import Pad1D #DEFINE_ALIAS from .common import Pad2D #DEFINE_ALIAS -from .common import ReflectionPad1d #DEFINE_ALIAS -from .common import ReplicationPad1d #DEFINE_ALIAS -from .common import ConstantPad1d #DEFINE_ALIAS -from .common import ReflectionPad2d #DEFINE_ALIAS -from .common import ReplicationPad2d #DEFINE_ALIAS -from .common import ConstantPad2d #DEFINE_ALIAS -from .common import ZeroPad2d #DEFINE_ALIAS -from .common import ReplicationPad3d #DEFINE_ALIAS -from .common import ConstantPad3d #DEFINE_ALIAS +from .common import Pad3D #DEFINE_ALIAS from .common import CosineSimilarity #DEFINE_ALIAS from .common import Embedding #DEFINE_ALIAS from .common import Linear #DEFINE_ALIAS from .common import Flatten #DEFINE_ALIAS from .common import Upsample #DEFINE_ALIAS -from .common import UpsamplingNearest2d #DEFINE_ALIAS -from .common import UpsamplingBilinear2d #DEFINE_ALIAS from .common import Dropout #DEFINE_ALIAS from .common import Dropout2d #DEFINE_ALIAS from .common import Dropout3d #DEFINE_ALIAS @@ -86,13 +77,6 @@ # from .conv import TreeConv #DEFINE_ALIAS # from .conv import Conv1D #DEFINE_ALIAS from .extension import RowConv #DEFINE_ALIAS -# from .learning_rate import CosineDecay #DEFINE_ALIAS -# from .learning_rate import ExponentialDecay #DEFINE_ALIAS -# from .learning_rate import InverseTimeDecay #DEFINE_ALIAS -# from .learning_rate import NaturalExpDecay #DEFINE_ALIAS -# from .learning_rate import NoamDecay #DEFINE_ALIAS -# from .learning_rate import PiecewiseDecay #DEFINE_ALIAS -# from .learning_rate import PolynomialDecay #DEFINE_ALIAS # from .loss import NCELoss #DEFINE_ALIAS from .loss import BCEWithLogitsLoss #DEFINE_ALIAS from .loss import CrossEntropyLoss #DEFINE_ALIAS @@ -109,7 +93,8 @@ from .norm import GroupNorm #DEFINE_ALIAS from .norm import LayerNorm #DEFINE_ALIAS from .norm import SpectralNorm #DEFINE_ALIAS -from .norm import InstanceNorm #DEFINE_ALIAS +#from .norm import InstanceNorm #DEFINE_ALIAS +from .norm import LocalResponseNorm #DEFINE_ALIAS # from .rnn import RNNCell #DEFINE_ALIAS # from .rnn import GRUCell #DEFINE_ALIAS # from .rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 05cbd96863c28..71bddefdb13e7 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -13,6 +13,7 @@ # 
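Taken together, the new initializer modules above replace the re-exported MSRA/Xavier/Normal/Uniform aliases with thin subclasses under paddle.nn.initializer. A hedged sketch of mixing two of them in one layer (layer sizes are illustrative):

    import paddle
    import paddle.nn as nn

    paddle.disable_static()
    weight_attr = paddle.framework.ParamAttr(
        initializer=nn.initializer.KaimingUniform())  # uniform form of the former MSRA initializer
    bias_attr = paddle.framework.ParamAttr(
        initializer=nn.initializer.XavierNormal())    # normal form of the former Xavier initializer

    linear = nn.Linear(4, 2, weight_attr=weight_attr, bias_attr=bias_attr)
    print(nn.initializer.__all__)  # now lists Kaiming*, Xavier*, Normal, TruncatedNormal, Uniform, Assign, ...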
limitations under the License. # TODO: define the common classes to build a neural network +import paddle from ...fluid.dygraph import BilinearTensorProduct #DEFINE_ALIAS from ...fluid.dygraph import Pool2D #DEFINE_ALIAS from ...fluid.dygraph import Flatten #DEFINE_ALIAS @@ -26,18 +27,9 @@ 'Embedding', 'Linear', 'Upsample', + 'Pad1D', 'Pad2D', - 'UpsamplingNearest2d', - 'UpsamplingBilinear2d', - 'ReflectionPad1d', - 'ReplicationPad1d', - 'ConstantPad1d', - 'ReflectionPad2d', - 'ReplicationPad2d', - 'ConstantPad2d', - 'ZeroPad2d', - 'ConstantPad3d', - 'ReplicationPad3d', + 'Pad3D', 'CosineSimilarity', 'Dropout', 'Dropout2d', @@ -388,256 +380,6 @@ def forward(self, x): return out -class UpsamplingNearest2d(layers.Layer): - """ - This op upsamples a batch of images, using nearest neighbours' pixel values. - The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), - where in_w is width of the input tensor, in_h is the height of the input tensor. - And the upsampling only applies on the two dimensions(height and width). - - Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimension(in height direction) and the 4th dimension(in width - direction) on input tensor. - - For details of nearest neighbor interpolation, please refer to Wikipedia: - https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - - x (Tensor): 4-D Tensor, its data type is float32, float64, or uint8, - its data format is specified by :attr:`data_format`. - size (list|tuple|Tensor|None): Output shape of image resize - layer, the shape is (out_h, out_w) when input is a 4-D Tensor. - Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. - If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At - least one of :attr:`size` or :attr:`scale_factor` must be set. - And :attr:`size` has a higher priority than :attr:`scale_factor`. - Has to match input size if it is either a list or a tuple or a Tensor. - Default: None. - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, - `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. When it is `"NCHW"`, the data is stored - in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. - name(str, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` - Returns: - A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), - Raises: - TypeError: size should be a list or tuple or Tensor. - ValueError: 'nearest' only support 4-D tensor. - ValueError: One of size and scale_factor must not be None. - ValueError: size length should be 2 for input 4-D tensor. - ValueError: scale_factor should be greater than zero. - ValueError: data_format can only be 'NCHW', 'NHWC'. - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - input_data = np.random.rand(2,3,6,10).astype("float32") - upsample_out = paddle.nn.UpsamplingNearest2d(size=[12,12]) - - input = paddle.to_tensor(input_data) - output = upsample_out(x=input) - print(output.shape) - # [2L, 3L, 12L, 12L] - - """ - - def __init__(self, - size=None, - scale_factor=None, - data_format='NCHW', - name=None): - super(UpsamplingNearest2d, self).__init__() - self.size = size - self.scale_factor = scale_factor - self.data_format = data_format - self.name = name - - def forward(self, x): - out = F.interpolate( - x, - size=self.size, - scale_factor=self.scale_factor, - mode='nearest', - align_corners=False, - align_mode=0, - data_format=self.data_format, - name=self.name) - - return out - - -class UpsamplingBilinear2d(layers.Layer): - """ - This op upsamples a batch of images, using bilinear' pixel values. - The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), - where in_w is width of the input tensor, in_h is the height of the input tensor. - And the upsampling only applies on the two dimensions(height and width). - - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this op) on a rectilinear 2D grid. The key idea is - to perform linear interpolation first in one direction, and then - again in the other direction. - - For details of bilinear interpolation, please refer to Wikipedia: - https://en.wikipedia.org/wiki/Bilinear_interpolation. - - x (Tensor): 4-D Tensor, its data type is float32, float64, or uint8, - its data format is specified by :attr:`data_format`. - size (list|tuple|Tensor|None): Output shape of image resize - layer, the shape is (out_h, out_w) when input is a 4-D Tensor. - Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. - If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At - least one of :attr:`size` or :attr:`scale_factor` must be set. - And :attr:`size` has a higher priority than :attr:`scale_factor`. - Has to match input size if it is either a list or a tuple or a Tensor. - Default: None. - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, - `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. When it is `"NCHW"`, the data is stored - in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. - name(str, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` - Returns: - A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), - Raises: - TypeError: size should be a list or tuple or Tensor. - ValueError: 'bilinear' only support 4-D tensor. - ValueError: One of size and scale_factor must not be None. - ValueError: size length should be 2 for input 4-D tensor. - ValueError: scale_factor should be greater than zero. - ValueError: data_format can only be 'NCHW', 'NHWC'. - Examples: - .. 
code-block:: python - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - input_data = np.random.rand(2,3,6,10).astype("float32") - upsample_out = paddle.nn.UpsamplingBilinear2d(size=[12,12]) - - input = paddle.to_tensor(input_data) - output = upsample_out(x=input) - print(output.shape) - # [2L, 3L, 12L, 12L] - """ - - def __init__(self, - size=None, - scale_factor=None, - data_format='NCHW', - name=None): - super(UpsamplingBilinear2d, self).__init__() - self.size = size - self.scale_factor = scale_factor - self.data_format = data_format - self.name = name - - def forward(self, x): - out = F.interpolate( - x, - size=self.size, - scale_factor=self.scale_factor, - mode='bilinear', - align_corners=True, - align_mode=0, - data_format=self.data_format, - name=self.name) - - return out - - -class Pad2D(layers.Layer): - """ - :alias_main: paddle.nn.Pad2D - :alias: paddle.nn.Pad2D,paddle.nn.layer.Pad2D,paddle.nn.layer.common.Pad2D - This interface is used to construct a callable object of the ``Pad2D`` class. - The Pad2D layer pads the input tensor boundaries according to 'paddings' and 'mode'. - If mode is 'reflect', paddings[0] and paddings[1] must be no greater - than height-1. And the width dimension has the same condition. - Parameters: - paddings (int | List[int32]): The padding size. If padding is a int, uses the same - padding in all boundaries, if padding is a List, it must contain four integers, - (padding_top, padding_bottom, padding_left, padding_right). - Default is [0, 0, 0, 0]. - mode (str): Three modes: 'constant' (default), 'reflect', 'edge' . - When in 'constant' mode, this op uses a constant value to pad the input tensor. - When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. - When in 'edge' mode, uses input boundaries to pad the input tensor. - Default is 'constant' - pad_value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0 - data_format (str): An string from: "NHWC", "NCHW". Specify the data format of - the input data. - Default is "NCHW" - Returns: - None - Examples: - .. code-block:: text - Input = [[[[1., 2., 3.], - [4., 5., 6.]]]] - Case 0: - paddings = [0, 1, 2, 3], - mode = 'constant' - pad_value = 0 - Out = [[[[0., 0., 1., 2., 3., 0., 0., 0.], - [0., 0., 4., 5., 6., 0., 0., 0.], - [0., 0., 0., 0., 0., 0., 0., 0.]]]] - Case 1: - paddings = [0, 1, 2, 1], - mode = 'reflect' - Out = [[[[3., 2., 1., 2., 3., 2.], - [6., 5., 4., 5., 6., 5.], - [3., 2., 1., 2., 3., 2.]]]] - Case 2: - paddings = [0, 1, 2, 1], - mode = 'edge' - Out = [[[[1., 1., 1., 2., 3., 3.], - [4., 4., 4., 5., 6., 6.], - [4., 4., 4., 5., 6., 6.]]]] - Code Examples: - .. 
code-block:: python - import paddle.fluid as fluid - import paddle.nn as nn - import numpy as np - data = np.ones((2, 2, 2, 2)).astype('float32') - my_pad = nn.Pad2D(paddings=[1, 1, 1, 1]) - with fluid.dygraph.guard(): - data = fluid.dygraph.to_variable(data) - result = my_pad(data) - """ - - def __init__(self, - paddings=0, - mode='constant', - pad_value=0.0, - data_format="NCHW"): - super(Pad2D, self).__init__() - self._mode = mode - self._pad_value = pad_value - self._data_format = data_format - self._paddings = [paddings] * 4 if isinstance(paddings, - int) else paddings - - def forward(self, input): - return F.pad2d( - input, - paddings=self._paddings, - mode=self._mode, - pad_value=self._pad_value, - data_format=self._data_format) - - class Bilinear(layers.Layer): """ @@ -961,132 +703,21 @@ def forward(self, input): return out -class ReflectionPad1d(layers.Layer): +class Pad1D(layers.Layer): """ - This interface is used to construct a callable object of the ``ReflectionPad1d`` class. - Uses reflection of the input boundaries to pad the input tensor. - - Parameters: - padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions - of input will be padded. The pad has the form (pad_left, pad_right). - data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data. - Default is "NCL" - name (str, optional) : The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - None - - Examples: - .. code-block:: text - - x = [[[1., 2., 3.], - [4., 5., 6.]]] - padding = [1, 2], - Out = [[[2. 1. 2. 3. 2. 1.] - [5. 4. 5. 6. 5. 4.]]] - - Code Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - input_shape = (1, 2, 3) - pad = [1, 2] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ReflectionPad1d(padding=pad) - data = paddle.to_tensor(data) - result = my_pad(data) - print(result.numpy()) - # [[[2. 1. 2. 3. 2. 1.] - # [5. 4. 5. 6. 5. 4.]]] - """ - - def __init__(self, padding, data_format="NCL", name=None): - super(ReflectionPad1d, self).__init__() - self._mode = "reflect" - self._data_format = data_format - self._pad = padding - self._name = name - - def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - data_format=self._data_format, - name=self._name) - - -class ReplicationPad1d(layers.Layer): - """ - This interface is used to construct a callable object of the ``ReplicationPad1d`` class. - Uses input boundaries to pad the input tensor. - - Parameters: - padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions - of input will be padded. The pad has the form (pad_left, pad_right). - data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data. - Default is "NCL" - name (str, optional) : The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - None - - Examples: - .. code-block:: text - - x = [[[1., 2., 3.], - [4., 5., 6.]]] - padding = [1, 2], - Out = [[[2. 1. 2. 3. 2. 1.] - [5. 4. 5. 6. 5. 4.]]] - - Code Examples: - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - input_shape = (1, 2, 3) - pad = [1, 2] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ReplicationPad1d(padding=pad) - data = paddle.to_tensor(data) - result = my_pad(data) - print(result.numpy()) - # [[[1. 1. 2. 3. 3. 3.] - # [1. 4. 5. 6. 6. 6.]]] - """ - - def __init__(self, padding, data_format="NCL", name=None): - super(ReplicationPad1d, self).__init__() - self._mode = "replicate" - self._data_format = data_format - self._pad = padding - self._name = name - - def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - data_format=self._data_format, - name=self._name) - - -class ConstantPad1d(layers.Layer): - """ - This interface is used to construct a callable object of the ``ConstantPad1d`` class. - Uses a constant value to pad the input tensor. + This interface is used to construct a callable object of the ``Pad1D`` class. + Pad tensor according to 'pad', 'mode' and 'value'. + If mode is 'reflect', pad[0] and pad[1] must be no greater than width-1. Parameters: padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions of input will be padded. The pad has the form (pad_left, pad_right). + mode (str): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. + When in 'constant' mode, this op uses a constant value to pad the input tensor. + When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. + When in 'replicate' mode, uses input boundaries to pad the input tensor. + When in 'circular' mode, uses circular input to pad the input tensor. + Default is 'constant'. value (float32): The value to fill the padded areas. Default is 0.0 data_format (str): An string from: "NCL", "NLC". Specify the data format of the input data. Default is "NCL" @@ -1102,6 +733,7 @@ class ConstantPad1d(layers.Layer): x = [[[1., 2., 3.], [4., 5., 6.]]] padding = [1, 2], + mode = "constant" value = 0.0 Out = [[[0. 1. 2. 3. 0. 0.] [0. 4. 5. 6. 0. 0.]]] @@ -1116,21 +748,26 @@ class ConstantPad1d(layers.Layer): input_shape = (1, 2, 3) pad = [1, 2] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ConstantPad1d(padding=pad) - data = paddle.to_tensor(data) + mode = "constant" + data = paddle.arange(np.prod(input_shape), dtype="float32").reshape(input_shape) + 1 + my_pad = nn.Pad1D(padding=pad, mode=mode) result = my_pad(data) print(result.numpy()) # [[[0. 1. 2. 3. 0. 0.] # [0. 4. 5. 6. 0. 0.]]] """ - def __init__(self, padding, value=0.0, data_format="NCL", name=None): - super(ConstantPad1d, self).__init__() - self._mode = "constant" - self._data_format = data_format + def __init__(self, + padding, + mode='constant', + value=0.0, + data_format="NCL", + name=None): + super(Pad1D, self).__init__() self._pad = padding + self._mode = mode self._value = value + self._data_format = data_format self._name = name def forward(self, x): @@ -1142,14 +779,22 @@ def forward(self, x): name=self._name) -class ConstantPad2d(layers.Layer): +class Pad2D(layers.Layer): """ - This interface is used to construct a callable object of the ``ConstantPad2d`` class. - Uses a constant value to pad the input tensor. + This interface is used to construct a callable object of the ``Pad2D`` class. + Pad tensor according to 'pad', 'mode' and 'value'. + If mode is 'reflect', pad[0] and pad[1] must be no greater + than width-1. 
The height dimension has the same condition. Parameters: padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom). + mode (str): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. + When in 'constant' mode, this op uses a constant value to pad the input tensor. + When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. + When in 'replicate' mode, uses input boundaries to pad the input tensor. + When in 'circular' mode, uses circular input to pad the input tensor. + Default is 'constant'. value (float32): The value to fill the padded areas. Default is 0.0 data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data. Default is "NCHW" @@ -1165,6 +810,7 @@ class ConstantPad2d(layers.Layer): x = [[[[1., 2., 3.], [4., 5., 6.]]]] padding = [1, 1, 0, 0] + mode = "constant" value = 0.0 Out = [[[[0. 1. 2. 3. 0.] [0. 4. 5. 6. 0.]]]] @@ -1176,12 +822,11 @@ class ConstantPad2d(layers.Layer): import paddle.nn as nn import numpy as np paddle.disable_static() - input_shape = (1, 1, 2, 3) pad = [1, 0, 1, 2] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ConstantPad2d(padding=pad) - data = paddle.to_tensor(data) + mode = "constant" + data = paddle.arange(np.prod(input_shape), dtype="float32").reshape(input_shape) + 1 + my_pad = nn.Pad2D(padding=pad, mode=mode) result = my_pad(data) print(result.numpy()) # [[[[0. 0. 0. 0.] @@ -1191,219 +836,44 @@ class ConstantPad2d(layers.Layer): # [0. 0. 0. 0.]]]] """ - def __init__(self, padding, value=0.0, data_format="NCHW", name=None): - super(ConstantPad2d, self).__init__() - self._mode = "constant" - self._data_format = data_format + def __init__(self, + padding, + mode='constant', + value=0.0, + data_format="NCHW", + name=None): + super(Pad2D, self).__init__() self._pad = padding + self._mode = mode self._value = value - self._name = name - - def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - value=self._value, - data_format=self._data_format, - name=self._name) - - -class ZeroPad2d(layers.Layer): - """ - This interface is used to construct a callable object of the ``ZeroPad2d`` class. - Uses 0 to pad the input tensor. - - Parameters: - padding (Variable | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions - of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom). - data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data. - Default is "NCHW" - name (str, optional) : The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - None - - Examples: - .. code-block:: text - - x = [[[[1., 2., 3.], - [4., 5., 6.]]]] - padding = [1, 1, 0, 0] - Out = [[[[0. 1. 2. 3. 0.] - [0. 4. 5. 6. 0.]]]] - - Code Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - input_shape = (1, 1, 2, 3) - pad = [1, 0, 1, 2] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ZeroPad2d(padding=pad) - data = paddle.to_tensor(data) - result = my_pad(data) - print(result.numpy()) - # [[[[0. 0. 0. 0.] - # [0. 1. 2. 3.] - # [0. 4. 5. 6.] - # [0. 0. 0. 0.] - # [0. 0. 0. 
0.]]]] - """ - - def __init__(self, padding, data_format="NCHW", name=None): - super(ZeroPad2d, self).__init__() - self._mode = "constant" self._data_format = data_format - self._pad = padding - self._name = name - - def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - data_format=self._data_format, - name=self._name) - - -class ReplicationPad2d(layers.Layer): - """ - This interface is used to construct a callable object of the ``ReplicationPad2d`` class. - Uses input boundaries to pad the input tensor. - - Parameters: - padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions - of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom). - data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data. - Default is "NCHW" - name (str, optional) : The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - None - - Examples: - .. code-block:: text - - x = [[[[1., 2., 3.], - [4., 5., 6.]]]] - padding = [1, 1, 0, 0] - Out = [[[[1. 1. 2. 3. 3.] - [4. 4. 5. 6. 6.]]]] - - Code Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - input_shape = (1, 1, 2, 3) - pad = [1, 0, 1, 2] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ReplicationPad2d(padding=pad) - data = paddle.to_tensor(data) - result = my_pad(data) - print(result.numpy()) - # [[[[1. 1. 2. 3.] - # [1. 1. 2. 3.] - # [4. 4. 5. 6.] - # [4. 4. 5. 6.] - # [4. 4. 5. 6.]]]] - """ - - def __init__(self, padding, data_format="NCHW", name=None): - super(ReplicationPad2d, self).__init__() - self._mode = "replicate" - self._data_format = data_format - self._pad = padding - self._name = name - - def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - data_format=self._data_format, - name=self._name) - - -class ReflectionPad2d(layers.Layer): - """ - This interface is used to construct a callable object of the ``ReflectionPad2d`` class. - Uses reflection of the input boundaries to pad the input tensor. - - Parameters: - padding (Variable | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions - of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom). - data_format (str): An string from: "NCHW", "NHWC". Specify the data format of the input data. - Default is "NCHW" - name (str, optional) : The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - None - - Examples: - .. code-block:: text - - x = [[[[1., 2., 3.], - [4., 5., 6.]]]] - padding = [1, 1, 0, 0] - Out = [[[[2. 1. 2. 3. 2.] - [5. 4. 5. 6. 5.]]]] - - Code Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - input_shape = (1, 1, 4, 3) - pad = [1, 0, 1, 2] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ReflectionPad2d(padding=pad) - data = paddle.to_tensor(data) - result = my_pad(data) - print(result.numpy()) - # [[[[ 5. 4. 5. 6.] - # [ 2. 1. 2. 3.] - # [ 5. 4. 5. 6.] - # [ 8. 7. 8. 9.] - # [11. 10. 11. 12.] - # [ 8. 7. 8. 9.] - # [ 5. 4. 5. 
6.]]]] - """ - - def __init__(self, padding, data_format="NCHW", name=None): - super(ReflectionPad2d, self).__init__() - self._mode = "reflect" - self._data_format = data_format - self._pad = padding self._name = name def forward(self, x): return F.pad(x, pad=self._pad, mode=self._mode, + value=self._value, data_format=self._data_format, name=self._name) -class ConstantPad3d(layers.Layer): +class Pad3D(layers.Layer): """ - This interface is used to construct a callable object of the ``ConstantPad3d`` class. - Uses a constant value to pad the input tensor. + This interface is used to construct a callable object of the ``Pad3D`` class. + Pad tensor according to 'pad', 'mode' and 'value'. + If mode is 'reflect', pad[0] and pad[1] must be no greater + than width-1. The height and depth dimension has the same condition. Parameters: padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). + mode (str): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. + When in 'constant' mode, this op uses a constant value to pad the input tensor. + When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. + When in 'replicate' mode, uses input boundaries to pad the input tensor. + When in 'circular' mode, uses circular input to pad the input tensor. + Default is 'constant'. value (float32): The value to fill the padded areas. Default is 0.0 data_format (str): An string from: "NCDHW", "NDHWC". Specify the data format of the input data. Default is "NCDHW" @@ -1419,6 +889,7 @@ class ConstantPad3d(layers.Layer): x = [[[[[1., 2., 3.], [4., 5., 6.]]]]] padding = [1, 2, 0, 0, 0, 0] + mode = "constant" value = 0.0 Out = [[[[[0. 1. 2. 3. 0. 0.] [0. 4. 5. 6. 0. 0.]]]]] @@ -1429,13 +900,11 @@ class ConstantPad3d(layers.Layer): import paddle import paddle.nn as nn import numpy as np - paddle.disable_static() - input_shape = (1, 1, 1, 2, 3) pad = [1, 0, 1, 2, 0, 0] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ConstantPad3d(padding=pad) - data = paddle.to_tensor(data) + mode = "constant" + data = paddle.arange(np.prod(input_shape), dtype="float32").reshape(input_shape) + 1 + my_pad = nn.Pad3D(padding=pad, mode=mode) result = my_pad(data) print(result.numpy()) # [[[[[0. 0. 0. 0.] @@ -1445,81 +914,24 @@ class ConstantPad3d(layers.Layer): # [0. 0. 0. 0.]]]]] """ - def __init__(self, padding, value=0.0, data_format="NCDHW", name=None): - super(ConstantPad3d, self).__init__() - self._mode = "constant" - self._data_format = data_format + def __init__(self, + padding, + mode='constant', + value=0.0, + data_format="NCDHW", + name=None): + super(Pad3D, self).__init__() self._pad = padding + self._mode = mode self._value = value - self._name = name - - def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - value=self._value, - data_format=self._data_format, - name=self._name) - - -class ReplicationPad3d(layers.Layer): - """ - This interface is used to construct a callable object of the ``ReplicationPad3d`` class. - Uses input boundaries to pad the input tensor. - - Parameters: - padding (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions - of input will be padded. The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). - data_format (str): An string from: "NCDHW", "NDHWC". 
Specify the data format of the input data. - Default is "NCDHW" - name (str, optional) : The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - None - - Examples: - .. code-block:: text - - x = [[[[[1., 2., 3.], - [4., 5., 6.]]]]] - padding = [1, 2, 0, 0, 0, 0] - Out = [[[[[1. 1. 2. 3. 3. 3.] - [4. 4. 5. 6. 6. 6.]]]]] - - Code Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() - - input_shape = (1, 1, 1, 2, 3) - pad = [1, 0, 1, 2, 0, 0] - data = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) + 1 - my_pad = nn.ReplicationPad3d(padding=pad) - data = paddle.to_tensor(data) - result = my_pad(data) - print(result.numpy()) - # [[[[[1. 1. 2. 3.] - # [1. 1. 2. 3.] - # [4. 4. 5. 6.] - # [4. 4. 5. 6.] - # [4. 4. 5. 6.]]]]] - """ - - def __init__(self, padding, data_format="NCDHW", name=None): - super(ReplicationPad3d, self).__init__() - self._mode = "replicate" self._data_format = data_format - self._pad = padding self._name = name def forward(self, x): return F.pad(x, pad=self._pad, mode=self._mode, + value=self._value, data_format=self._data_format, name=self._name) diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index 334b71151b563..28b29a583d8a3 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -86,7 +86,7 @@ def forward(self, x, y): 'PairwiseDistance') check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'PairwiseDistance') - sub = paddle.elementwise_sub(x, y) + sub = paddle.fluid.layers.elementwise_sub(x, y) helper = LayerHelper("PairwiseDistance", name=self.name) attrs = { diff --git a/python/paddle/nn/layer/extension.py b/python/paddle/nn/layer/extension.py index 01ca472315f3d..3972a1b715712 100644 --- a/python/paddle/nn/layer/extension.py +++ b/python/paddle/nn/layer/extension.py @@ -102,5 +102,5 @@ def __init__(self, filter_shape, attr=param_attr, dtype=dtype) def forward(self, input): - out = F.row_conv(input, self.weight, act=self._act) + out = F.extension.row_conv(input, self.weight, act=self._act) return out diff --git a/python/paddle/nn/layer/learning_rate.py b/python/paddle/nn/layer/learning_rate.py deleted file mode 100644 index e91f755cb0615..0000000000000 --- a/python/paddle/nn/layer/learning_rate.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
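Editor's note on the two one-line namespace fixes above: ``PairwiseDistance`` now calls ``paddle.fluid.layers.elementwise_sub`` and ``RowConv`` calls ``F.extension.row_conv``; only the internal symbol changes, so user-facing behaviour is unchanged. A minimal usage sketch for ``PairwiseDistance``, assuming the default ``p=2`` norm:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1., 3.], [3., 3.]])
    y = paddle.to_tensor([[5., 6.], [7., 8.]])

    # Internally this subtracts x and y (the patched elementwise_sub call)
    # and takes the p-norm of the difference along the last axis.
    dist = paddle.nn.PairwiseDistance()
    print(dist(x, y).numpy())  # approximately [5.     6.4031]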
- -# TODO: define learning rate decay - -__all__ = [ - # 'CosineDecay', - # 'ExponentialDecay', - # 'InverseTimeDecay', - # 'NaturalExpDecay', - # 'NoamDecay', - # 'PiecewiseDecay', - # 'PolynomialDecay' -] diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 2000fbf388f88..ad8dc9b64e78a 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -28,7 +28,7 @@ # TODO: define normalization api import six -from ...fluid.dygraph.nn import InstanceNorm +#from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS #from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS @@ -51,11 +51,12 @@ import numbers import warnings from ...fluid.dygraph.base import no_grad +from .. import functional as F __all__ = [ - 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', - 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', - 'InstanceNorm2d', 'InstanceNorm3d', 'SyncBatchNorm' + 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'BatchNorm1d', + 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', 'InstanceNorm2d', + 'InstanceNorm3d', 'SyncBatchNorm', 'LocalResponseNorm' ] @@ -718,14 +719,15 @@ class BatchNorm1d(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL". + data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Defalut "NCL". track_running_stats(bool, optional): Whether to use global mean and variance. In train period, True will track global mean and variance used for inference. When inference, track_running_stats must be True. Default: True. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length) when data_format is "NC" or "NCL", + (batch, length, num_features) when data_format is "NLC". - output: 3-D tensor with same shape as input x. Returns: @@ -754,8 +756,11 @@ class BatchNorm1d(_BatchNormBase): def _check_data_format(self, input): if input == 'NCHW' or input == 'NC' or input == 'NCL': self._data_format = 'NCHW' + elif input == "NHWC" or input == 'NLC': + self._data_format = "NHWC" else: - raise ValueError('expected NC , NCL or None for data_format input') + raise ValueError( + 'expected NC , NCL, NLC or None for data_format input') def _check_input_dim(self, input): if len(input.shape) != 2 and len(input.shape) != 3: @@ -811,14 +816,15 @@ class BatchNorm2d(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, the data format can be "NCHW". Default: NCHW. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train period, True will track global mean and variance used for inference. 
When inference, track_running_stats must be True. Default: True. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - - x: 4-D tensor with shape: (batch, num_features, height, weight). + - x: 4-D tensor with shape: (batch, num_features, height, weight) when data_format is "NCHW", + or (batch, height, weight, num_features) when data_format is "NHWC". - output: 4-D tensor with same shape as input x. Returns: @@ -846,8 +852,10 @@ class BatchNorm2d(_BatchNormBase): def _check_data_format(self, input): if input == 'NCHW': self._data_format = input + elif input == "NHWC": + self._data_format = input else: - raise ValueError('expected NCHW for data_format input') + raise ValueError('expected NCHW or NHWC for data_format input') def _check_input_dim(self, input): if len(input.shape) != 4: @@ -903,14 +911,15 @@ class BatchNorm3d(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, the data format can be "NCDHW". Default: NCDHW. + data_format(str, optional): Specify the input data format, the data format can be "NCDHW" or "NDHWC. Default: NCDHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train period, True will track global mean and variance used for inference. When inference, track_running_stats must be True. Default: True. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - - x: 5-D tensor with shape: (batch, num_features, dims, height, weight). + - x: 5-D tensor with shape: (batch, num_features, dims, height, weight) when data_format is "NCDHW", + or (batch, dims, height, weight, num_features) when data_format is "NDHWC". - output: 5-D tensor with same shape as input x. Returns: @@ -938,8 +947,11 @@ class BatchNorm3d(_BatchNormBase): def _check_data_format(self, input): if input == 'NCHW' or input == 'NCDHW': self._data_format = 'NCHW' + elif input == "NHWC" or input == "NDHWC": + self._data_format = 'NHWC' else: - raise ValueError('expected NCDHW or None for data_format input') + raise ValueError( + 'expected NCDHW, NDHWC or None for data_format input') def _check_input_dim(self, input): if len(input.shape) != 5: @@ -993,6 +1005,11 @@ class SyncBatchNorm(_BatchNormBase): - :math:`\\gamma` : trainable scale parameter vector - :math:`\\beta` : trainable shift parameter vector + Note: + If you want to use container to pack your model and has ``SyncBatchNorm`` in the + evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of + ``list`` to pack the model. + Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. @@ -1147,3 +1164,63 @@ def convert_sync_batchnorm(cls, layer): cls.convert_sync_batchnorm(sublayer)) del layer return layer_output + + +class LocalResponseNorm(layers.Layer): + """ + Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. + For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ + + See more details in :ref:`api_paddle_nn_functional_local_response_norm` . 
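Editor's note: the ``_check_data_format`` changes above add channel-last layouts ("NLC", "NHWC", "NDHWC") to the 1-D/2-D/3-D batch norm layers. A small sketch of the 2-D case, assuming the ``BatchNorm2d(num_features, ..., data_format=...)`` signature used elsewhere in this file:

.. code-block:: python

    import paddle

    # Channel-last input: channels sit on the last axis, so num_features is 3 here.
    x = paddle.rand(shape=(4, 8, 8, 3), dtype="float32")   # [batch, height, width, channels]
    bn = paddle.nn.BatchNorm2d(3, data_format="NHWC")      # previously only "NCHW" was accepted
    y = bn(x)
    print(y.shape)  # [4, 8, 8, 3]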
+ + Parameters: + size (int): The number of channels to sum over. + alpha (float, optional): The scaling parameter, positive. Default:1e-4 + beta (float, optional): The exponent, positive. Default:0.75 + k (float, optional): An offset, positive. Default: 1.0 + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: + If input is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`, + the data is stored in the order of: `[batch_size, input_channels, feature_length]`. + If input is 4-D Tensor, the string could be `"NCHW"`, `"NHWC"`. When it is `"NCHW"`, + the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. + If input is 5-D Tensor, the string could be `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`, + the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. + name (str, optional): Name for the operation (optional, default is None). For more information, + please refer to :ref:`api_guide_Name`. + + Shape: + - input: 3-D/4-D/5-D tensor. + - output: 3-D/4-D/5-D tensor, the same shape as input. + + Examples: + + .. code-block:: python + + import paddle + + x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") + m = paddle.nn.LocalResponseNorm(size=5) + y = m(x) + print(y.shape) # [3, 3, 112, 112] + """ + + def __init__(self, + size, + alpha=0.0001, + beta=0.75, + k=1.0, + data_format="NCHW", + name=None): + super(LocalResponseNorm, self).__init__() + self.size = size + self.alpha = alpha + self.beta = beta + self.k = k + self.data_format = data_format + self.name = name + + def forward(self, input): + out = F.local_response_norm(input, self.size, self.alpha, self.beta, + self.k, self.data_format, self.name) + return out diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 0687fefe00506..33904524862d4 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -21,8 +21,11 @@ import warnings from functools import partial, reduce +import numpy as np import paddle +import paddle.fluid as fluid from paddle import framework +from paddle.device import get_device, get_cudnn_version from paddle.nn import functional as F from paddle.nn import initializer as I from paddle.fluid.dygraph import Layer, LayerList @@ -48,7 +51,7 @@ def split_states(states, bidirectional=False, state_components=1): Split states of RNN network into possibly nested list or tuple of states of each RNN cells of the RNN network. - Arguments: + Parameters: states (Tensor|tuple|list): the concatenated states for RNN network. When `state_components` is 1, states in a Tensor with shape `(L*D, N, C)` where `L` is the number of layers of the RNN @@ -101,7 +104,7 @@ def concat_states(states, bidirectional=False, state_components=1): Concatenate a possibly nested list or tuple of RNN cell states into a compact form. - Arguments: + Parameters: states (list|tuple): a possibly nested list or tuple of RNN cell states. 
If `bidirectional` is True, it can be indexed twice to get an @@ -135,7 +138,7 @@ def concat_states(states, bidirectional=False, state_components=1): componnets = [] for i in range(state_components): componnets.append(states[i::state_components]) - return [paddle.stack(item) for item in componnets] + return tuple([paddle.stack(item) for item in componnets]) class RNNCellBase(Layer): @@ -154,13 +157,14 @@ def get_initial_states(self, r""" Generate initialized states according to provided shape, data type and value. - Arguments: + + Parameters: batch_ref (Tensor): A tensor, which shape would be used to determine the batch size, which is used to generate initial states. For `batch_ref`'s shape d, `d[batch_dim_idx]` is treated as batch size. shape (list|tuple, optional): A (possibly nested structure of) shape[s], - where a shape is a list/tuple of integer). `-1` (for batch size) + where a shape is a list/tuple of integer. `-1` (for batch size) will be automatically prepended if a shape does not starts with it. If None, property `state_shape` will be used. Defaults to None. @@ -174,6 +178,7 @@ def get_initial_states(self, Defaults to 0. batch_dim_idx (int, optional): An integer indicating which dimension of the of `batch_ref` represents batch. Defaults to 0. + Returns: init_states (Tensor|tuple|list): tensor of the provided shape and dtype, or list of tensors that each satisfies the requirements, @@ -268,16 +273,17 @@ class SimpleRNNCell(RNNCellBase): The formula used is as follows: .. math:: - h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + where :math:`act` is for :attr:`activation` , and * is the elemetwise multiplication operator. Please refer to `Finding Structure in Time `_ for more details. - Arguments: + Parameters: input_size (int): The input size. hidden_size (int): The hidden size. activation (str, optional): The activation in the SimpleRNN cell. @@ -293,7 +299,7 @@ class SimpleRNNCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - Parameters: + Attributes: weight_ih (Parameter): shape (hidden_size, input_size), input to hidden weight, corresponding to :math:`W_{ih}` in the formula. weight_hh (Parameter): shape (hidden_size, hidden_size), hidden to @@ -329,13 +335,15 @@ class SimpleRNNCell(RNNCellBase): .. code-block:: python import paddle - paddle.disable_static() x = paddle.randn((4, 16)) prev_h = paddle.randn((4, 32)) cell = paddle.nn.SimpleRNNCell(16, 32) y, h = cell(x, prev_h) + print(y.shape) + + #[4,32] """ @@ -407,20 +415,26 @@ class LSTMCell(RNNCellBase): .. math:: i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi}) + f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf}) + o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho}) - \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) - c_{t} & = f_{t} \* c{t-1} + i{t} \* \\widetile{c}_{t} - h_{t} & = o_{t} \* \\tanh(c_{t}) + + \widetilde{c}_{t} & = \tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) + + c_{t} & = f_{t} * c_{t-1} + i_{t} * \widetilde{c}_{t} + + h_{t} & = o_{t} * \tanh(c_{t}) + y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise multiplication operator. 
Please refer to `An Empirical Exploration of Recurrent Network Architectures `_ for more details. - Arguments: + Parameters: input_size (int): The input size. hidden_size (int): The hidden size. weight_ih_attr(ParamAttr, optional): The parameter attribute for @@ -434,7 +448,7 @@ class LSTMCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - Parameters: + Attributes: weight_ih (Parameter): shape (4 * hidden_size, input_size), input to hidden weight, which corresponds to the concatenation of :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula. @@ -462,7 +476,7 @@ class LSTMCell(RNNCellBase): corresponding to :math:`h_{t}` in the formula. states (tuple): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the new hidden states, - corresponding to :math:`h_{t}, c{t}` in the formula. + corresponding to :math:`h_{t}, c_{t}` in the formula. Notes: All the weights and bias are initialized with `Uniform(-std, std)` by @@ -475,7 +489,6 @@ class LSTMCell(RNNCellBase): .. code-block:: python import paddle - paddle.disable_static() x = paddle.randn((4, 16)) prev_h = paddle.randn((4, 32)) @@ -484,6 +497,14 @@ class LSTMCell(RNNCellBase): cell = paddle.nn.LSTMCell(16, 32) y, (h, c) = cell(x, (prev_h, prev_c)) + print(y.shape) + print(h.shape) + print(c.shape) + + #[4,32] + #[4,32] + #[4,32] + """ def __init__(self, @@ -559,15 +580,19 @@ class GRUCell(RNNCellBase): The formula for GRU used is as follows: - .. math:: + .. math:: r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr}) - z_{t} & = \sigma(W_{iz)x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz}) - \\widetilde{h}_{t} & = \\tanh(W_{ic)x_{t} + b_{ic} + r_{t} \* (W_{hc}x_{t} + b{hc})) - h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t} + + z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz}) + + \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}x_{t} + b_{hc})) + + h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} + y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise multiplication operator. Please refer to `An Empirical Exploration of Recurrent Network Architectures @@ -587,7 +612,7 @@ class GRUCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - Parameters: + Attributes: weight_ih (Parameter): shape (3 * hidden_size, input_size), input to hidden weight, which corresponds to the concatenation of :math:`W_{ir}, W_{iz}, W_{ic}` in the formula. @@ -625,7 +650,6 @@ class GRUCell(RNNCellBase): .. code-block:: python import paddle - paddle.disable_static() x = paddle.randn((4, 16)) prev_h = paddle.randn((4, 32)) @@ -633,6 +657,12 @@ class GRUCell(RNNCellBase): cell = paddle.nn.GRUCell(16, 32) y, h = cell(x, prev_h) + print(y.shape) + print(h.shape) + + #[4,32] + #[4,32] + """ def __init__(self, @@ -707,7 +737,7 @@ class RNN(Layer): It performs :code:`cell.forward()` repeatedly until reaches to the maximum length of `inputs`. - Arguments: + Parameters: cell(RNNCellBase): An instance of `RNNCellBase`. is_reverse (bool, optional): Indicate whether to calculate in the reverse order of input sequences. Defaults to False. @@ -717,8 +747,8 @@ class RNN(Layer): Inputs: inputs (Tensor): A (possibly nested structure of) tensor[s]. The input sequences. 
- If time major is True, the shape is `[batch_size, time_steps, input_size]` - If time major is False, the shape is [time_steps, batch_size, input_size]` + If time major is False, the shape is `[batch_size, time_steps, input_size]` + If time major is True, the shape is `[time_steps, batch_size, input_size]` where `input_size` is the input size of the cell. initial_states (Tensor|list|tuple, optional): Tensor of a possibly nested structure of tensors, representing the initial state for @@ -753,7 +783,6 @@ class RNN(Layer): .. code-block:: python import paddle - paddle.disable_static() inputs = paddle.rand((4, 23, 16)) prev_h = paddle.randn((4, 32)) @@ -762,6 +791,12 @@ class RNN(Layer): rnn = paddle.nn.RNN(cell) outputs, final_states = rnn(inputs, prev_h) + print(outputs.shape) + print(final_states.shape) + + #[4,23,32] + #[4,32] + """ def __init__(self, cell, is_reverse=False, time_major=False): @@ -778,13 +813,14 @@ def forward(self, initial_states=None, sequence_length=None, **kwargs): - final_outputs, final_states = F.rnn(self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse, - **kwargs) + final_outputs, final_states = paddle.fluid.layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse, + **kwargs) return final_outputs, final_states @@ -795,7 +831,7 @@ class BiRNN(Layer): backward RNN with coresponding cells separately and concats the outputs along the last axis. - Arguments: + Parameters: cell_fw (RNNCellBase): A RNNCellBase instance used for forward RNN. cell_bw (RNNCellBase): A RNNCellBase instance used for backward RNN. time_major (bool): Whether the first dimension of the input means the @@ -841,7 +877,6 @@ class BiRNN(Layer): .. code-block:: python import paddle - paddle.disable_static() cell_fw = paddle.nn.LSTMCell(16, 32) cell_bw = paddle.nn.LSTMCell(16, 32) @@ -850,6 +885,12 @@ class BiRNN(Layer): inputs = paddle.rand((2, 23, 16)) outputs, final_states = rnn(inputs) + print(outputs.shape) + print(final_states[0][0].shape,len(final_states),len(final_states[0])) + + #[4,23,64] + #[2,32] 2 2 + """ def __init__(self, cell_fw, cell_bw, time_major=False): @@ -875,18 +916,194 @@ def forward(self, assert len(initial_states) == 2, \ "length of initial_states should be 2 when it is a list/tuple" - outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs, - initial_states, sequence_length, - self.time_major, **kwargs) + outputs, final_states = paddle.fluid.layers.birnn( + self.cell_fw, self.cell_bw, inputs, initial_states, sequence_length, + self.time_major, **kwargs) return outputs, final_states -class RNNMixin(LayerList): +class RNNBase(LayerList): r""" - A Mixin class for RNN networks. It provides `forward` method for SimpleRNN, - LSTM and GRU. + RNNBase class for RNN networks. It provides `forward`, `flatten_parameters` + and other common methods for SimpleRNN, LSTM and GRU. 
""" + def __init__(self, + mode, + input_size, + hidden_size, + num_layers=1, + direction="forward", + time_major=False, + dropout=0., + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None): + super(RNNBase, self).__init__() + self.mode = mode + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 2 if mode == "LSTM" else 1 + + kwargs = { + "weight_ih_attr": weight_ih_attr, + "weight_hh_attr": weight_hh_attr, + "bias_ih_attr": bias_ih_attr, + "bias_hh_attr": bias_hh_attr + } + + if mode == "LSTM": + rnn_cls = LSTMCell + elif mode == "GRU": + rnn_cls = GRUCell + else: + rnn_cls = SimpleRNNCell + kwargs["activation"] = self.activation + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = rnn_cls(input_size, hidden_size, **kwargs) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = rnn_cls(hidden_size, hidden_size, **kwargs) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = rnn_cls(input_size, hidden_size, **kwargs) + cell_bw = rnn_cls(input_size, hidden_size, **kwargs) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = rnn_cls(2 * hidden_size, hidden_size, **kwargs) + cell_bw = rnn_cls(2 * hidden_size, hidden_size, **kwargs) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.could_use_cudnn = get_device().startswith( + "gpu:") and get_cudnn_version() + self.could_use_cudnn &= direction != "backward" + self.could_use_cudnn &= len(self.parameters()) == num_layers * 4 * ( + 2 if direction == "bidirectional" else 1) + self.could_use_cudnn &= mode == "LSTM" # currently only support LSTM + + # Expose params as RNN's attribute, which can make it compatible when + # replacing small ops composed rnn with cpp rnn kernel. + # Moreover, `jit.to_static` assumes params are added by current layer + # and wouldn't include sublayer's params in current layer, which also + # requires these params are added to current layer for `jit.save`. + param_names = [] + for layer in range(self.num_layers): + for direction in range(self.num_directions): + suffix = '_reverse' if direction == 1 else '' + param_names.extend(['weight_ih_l{}{}', 'weight_hh_l{}{}']) + if bias_ih_attr != False: param_names.append('bias_ih_l{}{}') + if bias_hh_attr != False: param_names.append('bias_hh_l{}{}') + param_names = [x.format(layer, suffix) for x in param_names] + for name, param in zip(param_names, self.parameters()): + setattr(self, name, param) + + self.flatten_parameters() + + def flatten_parameters(self): + """ + Resets parameter data pointer to address in continuous memory block for + cudnn usage. 
+ """ + if self.could_use_cudnn: + # layer.parameters() is depth first and ordered + # for i in layer: for j in direct: w_ih, w_hh, b_ih, b_hh + # need to reorganize to cudnn param layout: + # all bias following all weights + params = self.parameters(include_sublayers=False) + shape = [np.prod(param.shape) for param in params] + self._all_weights = [None] * len(params) + for i, param in enumerate(params): + offset = 0 if i % 4 < 2 else (2 * self.num_layers * + self.num_directions) + layer_idx = i // 4 + self._all_weights[offset + layer_idx * 2 + i % 2] = param + # Wrap using a list to avoid registed into params and saving, maybe + # need a better way to handle this later. Use `create_parameter` to + # add both to main_program and startup_program for static-graph. + # Use Constant initializer to avoid make effect on random generator. + self._flat_weight = [ + self.create_parameter( + shape=[np.sum(shape)], + dtype=params[0].dtype, + default_initializer=I.Constant(0.0)) + ] + # dropout state may also can be hided and avoid saving + # should dropout state be persistable for static-graph + self._dropout_state = self.create_variable( + dtype=fluid.core.VarDesc.VarType.UINT8) + # for static-graph, append coalesce_tensor into startup program + with fluid.program_guard(fluid.default_startup_program(), + fluid.default_startup_program()): + with framework.no_grad(): + self._helper.append_op( + type="coalesce_tensor", + inputs={"Input": self._all_weights}, + outputs={ + "Output": self._all_weights, + "FusedOutput": self._flat_weight + }, + attrs={ + "copy_data": True, + "use_align": False, + "dtype": params[0].dtype + }) + + def _cudnn_impl(self, inputs, initial_states, sequence_length): + if not self.time_major: + inputs = paddle.tensor.transpose(inputs, [1, 0, 2]) + # unify LSTM/GRU/SimpleRNN later, currently only support LSTM + # TODO(guosheng): use `core.ops.cudnn_lstm` in dygraph mode if support + # specify output, since `dropout_state` should be a persistable tensor + # rather than a temporary on. 
+ out = self._helper.create_variable_for_type_inference(inputs.dtype) + last_h = self._helper.create_variable_for_type_inference(inputs.dtype) + last_c = self._helper.create_variable_for_type_inference(inputs.dtype) + reserve = self._helper.create_variable_for_type_inference( + dtype=fluid.core.VarDesc.VarType.UINT8, stop_gradient=True) + + inputs = { + 'Input': inputs, + # 'W': self._flat_weight, # would be unused_var + 'WeightList': self._all_weights, + 'InitH': initial_states[0], + 'InitC': initial_states[1], + 'SequenceLength': sequence_length + } + attrs = { + 'dropout_prob': self.dropout, + 'is_bidirec': self.num_directions == 2, + 'input_size': self.input_size, + 'hidden_size': self.hidden_size, + 'num_layers': self.num_layers, + 'is_test': not self.training + } + + outputs = { + 'Out': out, + 'LastH': last_h, + 'LastC': last_c, + 'Reserve': reserve, + 'StateOut': self._dropout_state, + } + + self._helper.append_op( + type="cudnn_lstm", inputs=inputs, outputs=outputs, attrs=attrs) + out = paddle.tensor.transpose(out, + [1, 0, 2]) if not self.time_major else out + states = (last_h, last_c) + return out, states + def forward(self, inputs, initial_states=None, sequence_length=None): batch_index = 1 if self.time_major else 0 dtype = inputs.dtype @@ -903,6 +1120,10 @@ def forward(self, inputs, initial_states=None, sequence_length=None): for _ in range(self.state_components) ]) + if self.could_use_cudnn: + # Add CPU kernel and dispatch in backend later + return self._cudnn_impl(inputs, initial_states, sequence_length) + states = split_states(initial_states, self.num_directions == 2, self.state_components) final_states = [] @@ -923,7 +1144,7 @@ def forward(self, inputs, initial_states=None, sequence_length=None): return outputs, final_states -class SimpleRNN(RNNMixin): +class SimpleRNN(RNNBase): r""" Multilayer Elman network(SimpleRNN). It takes input sequences and initial states as inputs, and returns the output sequences and the final states. @@ -936,24 +1157,28 @@ class SimpleRNN(RNNMixin): .. math:: - h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + where :math:`act` is for :attr:`activation` , and * is the elemetwise multiplication operator. - Arguments: + Using key word arguments to construct is recommended. + + Parameters: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. num_layers (int, optional): Number of layers. Defaults to 1. - activation (str, optional): The activation in each SimpleRNN cell. It can be - `tanh` or `relu`. Defaults to `tanh`. direction (str, optional): The direction of the network. It can be "forward", - "backward" and "bidirectional". Defaults to "forward". - dropout (float, optional): The droput probability. Dropout is applied to the - input of each layer except for the first layer. Defaults to 0. + "backward" and "bidirectional". When "bidirectional", the way to merge + outputs of forward and backward is concatenating. Defaults to "forward". time_major (bool, optional): Whether the first dimension of the input means the time steps. Defaults to False. + dropout (float, optional): The droput probability. Dropout is applied to the + input of each layer except for the first layer. Defaults to 0. + activation (str, optional): The activation in each SimpleRNN cell. It can be + `tanh` or `relu`. Defaults to `tanh`. 
weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Defaults to None. weight_hh_attr (ParamAttr, optional): The parameter attribute for @@ -970,7 +1195,7 @@ class SimpleRNN(RNNMixin): If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, hidden_size]`. initial_states (Tensor, optional): the initial state. The shape is - `[num_lauers * num_directions, batch_size, hidden_size]`. + `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. @@ -988,16 +1213,26 @@ class SimpleRNN(RNNMixin): Note that `num_directions` is 2 if direction is "bidirectional" else 1. final_states (Tensor): final states. The shape is - `[num_lauers * num_directions, batch_size, hidden_size]`. + `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. + Attributes: + weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, + If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, + the shape is `[hidden_size, num_directions * hidden_size]`. + weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, + with shape `[hidden_size, hidden_size]`. + bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, + with shape `[hidden_size]`. + bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, + with shape `[hidden_size]`. + Examples: .. code-block:: python import paddle - paddle.disable_static() rnn = paddle.nn.SimpleRNN(16, 32, 2) @@ -1005,65 +1240,40 @@ class SimpleRNN(RNNMixin): prev_h = paddle.randn((2, 4, 32)) y, h = rnn(x, prev_h) + print(y.shape) + print(h.shape) + + #[4,23,32] + #[2,4,32] + """ def __init__(self, input_size, hidden_size, num_layers=1, - activation="tanh", direction="forward", - dropout=0., time_major=False, + dropout=0., + activation="tanh", weight_ih_attr=None, weight_hh_attr=None, bias_ih_attr=None, bias_hh_attr=None, name=None): - super(SimpleRNN, self).__init__() - - if direction in ["forward", "backward"]: - is_reverse = direction == "backward" - cell = SimpleRNNCell(input_size, hidden_size, activation, - weight_ih_attr, weight_hh_attr, bias_ih_attr, - bias_hh_attr) - self.append(RNN(cell, is_reverse, time_major)) - for i in range(1, num_layers): - cell = SimpleRNNCell(hidden_size, hidden_size, activation, - weight_ih_attr, weight_hh_attr, - bias_ih_attr, bias_hh_attr) - self.append(RNN(cell, is_reverse, time_major)) - elif direction == "bidirectional": - cell_fw = SimpleRNNCell(input_size, hidden_size, activation, - weight_ih_attr, weight_hh_attr, - bias_ih_attr, bias_hh_attr) - cell_bw = SimpleRNNCell(input_size, hidden_size, activation, - weight_ih_attr, weight_hh_attr, - bias_ih_attr, bias_hh_attr) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - for i in range(1, num_layers): - cell_fw = SimpleRNNCell( - 2 * hidden_size, hidden_size, activation, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - cell_bw = SimpleRNNCell( - 2 * hidden_size, hidden_size, activation, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(BiRNN(cell_fw, cell_bw, time_major)) + if activation == "tanh": + mode = "RNN_TANH" + elif activation == "relu": + mode = "RNN_RELU" else: - raise ValueError( - "direction should be forward, backward or bidirectional, 
" - "received direction = {}".format(direction)) - - self.input_size = input_size - self.hidden_size = hidden_size - self.dropout = dropout - self.num_directions = 2 if direction == "bidirectional" else 1 - self.time_major = time_major - self.num_layers = num_layers - self.state_components = 1 + raise ValueError("Unknown activation '{}'".format(activation)) + self.activation = activation + super(SimpleRNN, self).__init__( + mode, input_size, hidden_size, num_layers, direction, time_major, + dropout, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) -class LSTM(RNNMixin): +class LSTM(RNNBase): r""" Multilayer LSTM. It takes a sequence and an initial state as inputs, and returns the output sequences and the final states. @@ -1077,26 +1287,35 @@ class LSTM(RNNMixin): .. math:: i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi}) + f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf}) + o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho}) - \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) - c_{t} & = f_{t} \* c{t-1} + i{t} \* \\widetile{c}_{t} - h_{t} & = o_{t} \* \\tanh(c_{t}) + + \widetilde{c}_{t} & = \tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) + + c_{t} & = f_{t} * c_{t-1} + i_{t} * \widetilde{c}_{t} + + h_{t} & = o_{t} * \tanh(c_{t}) + y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise multiplication operator. - Arguments: + Using key word arguments to construct is recommended. + + Parameters: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. num_layers (int, optional): Number of layers. Defaults to 1. - direction (str, optional): The direction of the network. It can be - "forward", "backward" and "bidirectional". Defaults to "forward". - dropout (float, optional): The droput probability. Dropout is applied - to the input of each layer except for the first layer. Defaults to 0. + direction (str, optional): The direction of the network. It can be "forward", + "backward" and "bidirectional". When "bidirectional", the way to merge + outputs of forward and backward is concatenating. Defaults to "forward". time_major (bool, optional): Whether the first dimension of the input means the time steps. Defaults to False. + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. Defaults to 0. weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Default: None. weight_hh_attr (ParamAttr, optional): The parameter attribute for @@ -1113,7 +1332,7 @@ class LSTM(RNNMixin): If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, hidden_size]`. initial_states (tuple, optional): the initial state, a tuple of (h, c), - the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. + the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. @@ -1130,18 +1349,28 @@ class LSTM(RNNMixin): `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. 
- final_states (Tensor): the final state, a tuple of two tensors, h and c. + final_states (tuple): the final state, a tuple of two tensors, h and c. The shape of each is - `[num_lauers * num_directions, batch_size, hidden_size]`. + `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. + Attributes: + weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, + If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, + the shape is `[hidden_size, num_directions * hidden_size]`. + weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, + with shape `[hidden_size, hidden_size]`. + bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, + with shape `[hidden_size]`. + bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, + with shape `[hidden_size]`. + Examples: .. code-block:: python import paddle - paddle.disable_static() rnn = paddle.nn.LSTM(16, 32, 2) @@ -1150,6 +1379,14 @@ class LSTM(RNNMixin): prev_c = paddle.randn((2, 4, 32)) y, (h, c) = rnn(x, (prev_h, prev_c)) + print(y.shape) + print(h.shape) + print(c.shape) + + #[4,23,32] + #[2,4,32] + #[2,4,32] + """ def __init__(self, @@ -1157,51 +1394,19 @@ def __init__(self, hidden_size, num_layers=1, direction="forward", - dropout=0., time_major=False, + dropout=0., weight_ih_attr=None, weight_hh_attr=None, bias_ih_attr=None, bias_hh_attr=None, name=None): - super(LSTM, self).__init__() - - if direction in ["forward", "backward"]: - is_reverse = direction == "backward" - cell = LSTMCell(input_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(RNN(cell, is_reverse, time_major)) - for i in range(1, num_layers): - cell = LSTMCell(hidden_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(RNN(cell, is_reverse, time_major)) - elif direction == "bidirectional": - cell_fw = LSTMCell(input_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - cell_bw = LSTMCell(input_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - for i in range(1, num_layers): - cell_fw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - cell_bw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - else: - raise ValueError( - "direction should be forward, backward or bidirectional, " - "received direction = {}".format(direction)) - - self.input_size = input_size - self.hidden_size = hidden_size - self.dropout = dropout - self.num_directions = 2 if direction == "bidirectional" else 1 - self.time_major = time_major - self.num_layers = num_layers - self.state_components = 2 + super(LSTM, self).__init__( + "LSTM", input_size, hidden_size, num_layers, direction, time_major, + dropout, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) -class GRU(RNNMixin): +class GRU(RNNBase): r""" Multilayer GRU. It takes input sequencse and initial states as inputs, and returns the output sequences and the final states. @@ -1215,24 +1420,31 @@ class GRU(RNNMixin): .. 
math:: r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr}) - z_{t} & = \sigma(W_{iz)x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz}) - \\widetilde{h}_{t} & = \\tanh(W_{ic)x_{t} + b_{ic} + r_{t} \* (W_{hc}x_{t} + b{hc})) - h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t} + + z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz}) + + \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}x_{t} + b_{hc})) + + h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} + y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise multiplication operator. - Arguments: + Using key word arguments to construct is recommended. + + Parameters: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. num_layers (int, optional): Number of layers. Defaults to 1. - direction (str, optional): The direction of the network. It can be - "forward", "backward" and "bidirectional". Defaults to "forward". - dropout (float, optional): The droput probability. Dropout is applied - to the input of each layer except for the first layer. Defaults to 0. + direction (str, optional): The direction of the network. It can be "forward", + "backward" and "bidirectional". When "bidirectional", the way to merge + outputs of forward and backward is concatenating. Defaults to "forward". time_major (bool, optional): Whether the first dimension of the input means the time steps. Defaults to False. + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. Defaults to 0. weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Default: None. weight_hh_attr (ParamAttr, optional): The parameter attribute for @@ -1249,7 +1461,7 @@ class GRU(RNNMixin): If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, hidden_size]`. initial_states (Tensor, optional): the initial state. The shape is - `[num_lauers * num_directions, batch_size, hidden_size]`. + `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 @@ -1268,16 +1480,26 @@ class GRU(RNNMixin): Note that `num_directions` is 2 if direction is "bidirectional" else 1. final_states (Tensor): final states. The shape is - `[num_lauers * num_directions, batch_size, hidden_size]`. + `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. + Attributes: + weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, + If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, + the shape is `[hidden_size, num_directions * hidden_size]`. + weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, + with shape `[hidden_size, hidden_size]`. + bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, + with shape `[hidden_size]`. + bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, + with shape `[hidden_size]`. + Examples: .. 
code-block:: python import paddle - paddle.disable_static() rnn = paddle.nn.GRU(16, 32, 2) @@ -1285,6 +1507,12 @@ class GRU(RNNMixin): prev_h = paddle.randn((2, 4, 32)) y, h = rnn(x, prev_h) + print(y.shape) + print(h.shape) + + #[4,23,32] + #[2,4,32] + """ def __init__(self, @@ -1292,45 +1520,13 @@ def __init__(self, hidden_size, num_layers=1, direction="forward", - dropout=0., time_major=False, + dropout=0., weight_ih_attr=None, weight_hh_attr=None, bias_ih_attr=None, bias_hh_attr=None, name=None): - super(GRU, self).__init__() - - if direction in ["forward", "backward"]: - is_reverse = direction == "backward" - cell = GRUCell(input_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(RNN(cell, is_reverse, time_major)) - for i in range(1, num_layers): - cell = GRUCell(hidden_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(RNN(cell, is_reverse, time_major)) - elif direction == "bidirectional": - cell_fw = GRUCell(input_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - cell_bw = GRUCell(input_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - for i in range(1, num_layers): - cell_fw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - cell_bw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr, - weight_hh_attr, bias_ih_attr, bias_hh_attr) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - else: - raise ValueError( - "direction should be forward, backward or bidirectional, " - "received direction = {}".format(direction)) - - self.input_size = input_size - self.hidden_size = hidden_size - self.dropout = dropout - self.num_directions = 2 if direction == "bidirectional" else 1 - self.time_major = time_major - self.num_layers = num_layers - self.state_components = 1 + super(GRU, self).__init__( + "GRU", input_size, hidden_size, num_layers, direction, time_major, + dropout, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 6f485e2e9d62f..756bf35486bf8 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -13,21 +13,12 @@ # limitations under the License. __all__ = [ - 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', - 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd', - 'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'Momentum', 'MomentumOptimizer', - 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR', - 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', - 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', - 'ReduceLROnPlateau', 'CosineAnnealingLR' + 'Optimizer', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'RMSProp', 'Adadelta', + 'SGD', 'Momentum', 'lr' ] - -from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\ - AdagradOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, \ - FtrlOptimizer, AdadeltaOptimizer - from .optimizer import Optimizer +from .adagrad import Adagrad from .adam import Adam from .adamw import AdamW from .adamax import Adamax @@ -35,7 +26,4 @@ from .adadelta import Adadelta from .sgd import SGD from .momentum import Momentum - -from . 
import lr_scheduler -from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \ - LinearLrWarmup, ExponentialLR, MultiStepLR, StepLR, LambdaLR, ReduceLROnPlateau, CosineAnnealingLR +from . import lr diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py new file mode 100644 index 0000000000000..ed55ebd0bf2a3 --- /dev/null +++ b/python/paddle/optimizer/adagrad.py @@ -0,0 +1,136 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable + +__all__ = ["Adagrad"] + + +class Adagrad(Optimizer): + """ + The Adaptive Gradient optimizer (Adagrad for short) use an optimization described + in paper: `Adaptive Subgradient Methods for Online Learning and + Stochastic Optimization `_. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. math:: + + moment\_out &= moment + grad * grad + + param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} + + + The original paper does not have the ``epsilon`` attribute. It is added here + in our implementation as also proposed `Per-parameter adaptive learning rate + methods `_ + for numerical stability to avoid the division by zero error. + + Args: + learning_rate (float|Tensor): The learning rate used to update ``Parameter``. + It can be a float value or a ``Variable`` with a float type. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-06. + parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies, + ClipGradByGlobalNorm, ClipGradByNorm and ClipGradByValue. Default None, + meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + initial_accumulator_value (float, optional): Initial value for moment accumulator. + The default value is 0.0. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + inp = paddle.rand(shape=[10, 10]) + linear = paddle.nn.Linear(10, 10) + out = linear(inp) + loss = paddle.mean(out) + adagrad = paddle.optimizer.Adagrad(learning_rate=0.1, + parameters=linear.parameters()) + out.backward() + adagrad.step() + adagrad.clear_grad() + + """ + _moment_acc_str = "moment" + + def __init__(self, + learning_rate, + epsilon=1.0e-6, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + initial_accumulator_value=0.0): + assert learning_rate is not None + assert epsilon is not None + super(Adagrad, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "adagrad" + self._epsilon = epsilon + self.initial_accumulator_value = initial_accumulator_value + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + self._add_accumulator( + self._moment_acc_str, + p, + fill_value=self.initial_accumulator_value) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment_acc = self._get_accumulator(self._moment_acc_str, + param_and_grad[0]) + # Create the adagrad optimizer op + adagrad_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": moment_acc, + "LearningRate": self._create_param_lr(param_and_grad) + }, + outputs={"ParamOut": param_and_grad[0], + "MomentOut": moment_acc}, + attrs={"epsilon": self._epsilon}, + stop_gradient=True) + + return adagrad_op diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 366d8b953e3d4..79caa1583121d 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -48,8 +48,8 @@ class Adam(Optimizer): Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. - It can be a float value or a _LRScheduler. The default value is 0.001. + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. The default value is 0.001. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index cca120efd4507..e5d1962d12625 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -47,8 +47,8 @@ class Adamax(Optimizer): it is added here for numerical stability to prevent the division by 0 error. Args: - learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. - It can be a float value or a _LRScheduler. The default value is 0.001. + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. The default value is 0.001. beta1 (float, optional): The exponential decay rate for the 1st moment estimates. The default value is 0.9. beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. 
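The new ``adagrad.py`` added above documents the update rule ``moment_out = moment + grad * grad`` and ``param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)``. As an illustrative sketch (not part of the patch), the same arithmetic for a single parameter in plain NumPy:

.. code-block:: python

    import numpy as np

    param = np.array([1.0, 2.0])
    moment = np.zeros_like(param)      # accumulated squared gradients
    lr, epsilon = 0.1, 1e-6

    grad = np.array([0.5, -0.3])
    moment = moment + grad * grad                              # moment_out
    param = param - lr * grad / (np.sqrt(moment) + epsilon)    # param_out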
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 00c197a58b3dd..eaa0509029459 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -42,8 +42,8 @@ class AdamW(Adam): Args: - learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. - It can be a float value or a _LRScheduler. The default value is 0.001. + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. The default value is 0.001. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr.py similarity index 76% rename from python/paddle/optimizer/lr_scheduler.py rename to python/paddle/optimizer/lr.py index 61391704061bd..051d3cf18f9f0 100644 --- a/python/paddle/optimizer/lr_scheduler.py +++ b/python/paddle/optimizer/lr.py @@ -18,18 +18,62 @@ from paddle import Tensor __all__ = [ - 'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', - 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', - 'ReduceLROnPlateau', 'CosineAnnealingLR' + 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', + 'InverseTimeDecay', 'PolynomialDecay', 'LinearWarmup', 'ExponentialDecay', + 'MultiStepDecay', 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau', + 'CosineAnnealingDecay' ] -class _LRScheduler(object): - """LRScheduler Base class. +class LRScheduler(object): + """ + + LRScheduler Base class. Define the common interface of a learning rate scheduler. + + User can import it by ``from paddle.optimizer.lr import LRScheduler`` , + + then overload it for your subclass and have a custom implementation of ``get_lr()`` . + + Otherwise, an ``NotImplementedError`` exception will be thrown. + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + instance to schedule learning rate. + + Examples: + Here is an example of a simple ``StepDecay`` implementation. + + .. code-block:: python + + import paddle + from paddle.optimizer.lr import LRScheduler + + class StepDecay(LRScheduler): + def __init__(self, + learning_rate, + step_size, + gamma=0.1, + last_epoch=-1, + verbose=False): + if not isinstance(step_size, int): + raise TypeError( + "The type of 'step_size' must be 'int', but received %s." % + type(step_size)) + if gamma >= 1.0: + raise ValueError('gamma should be < 1.0.') + + self.step_size = step_size + self.gamma = gamma + super(StepDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + i = self.last_epoch // self.step_size + return self.base_lr * (self.gamma**i) - Define the common interface of an LRScheduler. - User can 'form paddle.optimizer.lr_scheduler import _LRScheduler' - And inherit from it to have a custom implementation of get_lr(). """ def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False): @@ -47,23 +91,22 @@ def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False): def __call__(self): """ - Return last computed learning rate on current epoch. 
+ Return lastest computed learning rate on current epoch. """ return self.last_lr def step(self, epoch=None): """ - 'step' should be called after 'minimize' . It will update the learning rate in optimizer according to 'epoch'. - The new learning rate will take effect on next epoch. + + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + The new learning rate will take effect on next ``optimizer.step`` . Args: epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. Returns: None - - Examples: - Please refer to the example of current _LRScheduler. + """ if epoch is None: self.last_epoch += 1 @@ -81,11 +124,12 @@ def step(self, epoch=None): def state_dict(self): """ + Returns the state of the scheduler as a :class:`dict`. - It is a subset of self.__dict__ . + It is a subset of ``self.__dict__`` . """ - self._state_keys() + self.state_keys() state_dict = {} for key in self.keys: if key not in self.__dict__: @@ -101,19 +145,26 @@ def state_dict(self): return state_dict - # For those subclass who overload _LRScheduler, "last_epoch, last_lr" will be saved by default. + # For those subclass who overload LRScheduler, "last_epoch, last_lr" will be saved by default. # (Note): you can change it for your subclass. - def _state_keys(self): + def state_keys(self): """ - set the keys in self.__dict__ that are needed to be saved. + + For those subclass who overload ``LRScheduler`` (Base Class). Acquiescently, "last_epoch, last_lr" will be saved by ``self.keys = ['last_epoch', 'last_lr']`` . + + ``last_epoch`` is the current epoch num, and ``last_lr`` is the current learning rate. + + If you want to change the default behavior, you should have a custom implementation of ``_state_keys()`` to redefine ``self.keys`` . + """ self.keys = ['last_epoch', 'last_lr'] def set_state_dict(self, state_dict): """ + Loads the schedulers state. """ - self._state_keys() + self.state_keys() for key in self.keys: if key in state_dict: self.__dict__[key] = state_dict[key] @@ -130,14 +181,20 @@ def set_state_dict(self, state_dict): set_dict = set_state_dict def get_lr(self): + """ + + For those subclass who overload ``LRScheduler`` (Base Class), User should have a custom implementation of ``get_lr()`` . + + Otherwise, an ``NotImplementedError`` exception will be thrown. + """ # calculate by python float raise NotImplementedError -class NoamLR(_LRScheduler): +class NoamDecay(LRScheduler): """ - Applies Noam Lear to the initial learning rate. + Applies Noam Decay to the initial learning rate. The algorithm can be described as following. @@ -156,7 +213,7 @@ class NoamLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``NoamLR`` instance to schedule learning rate. + ``NoamDecay`` instance to schedule learning rate. Examples: .. 
code-block:: python @@ -164,23 +221,21 @@ class NoamLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -189,7 +244,7 @@ class NoamLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) + scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -216,7 +271,7 @@ def __init__(self, verbose=False): self.d_model = d_model self.warmup_steps = warmup_steps - super(NoamLR, self).__init__(learning_rate, last_epoch, verbose) + super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): if self.last_epoch == 0: @@ -227,7 +282,7 @@ def get_lr(self): return self.base_lr * (self.d_model**-0.5) * min(a, b) -class PiecewiseLR(_LRScheduler): +class PiecewiseDecay(LRScheduler): """ Piecewise learning rate scheduler. @@ -253,7 +308,7 @@ class PiecewiseLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``PiecewiseLR`` instance to schedule learning rate. + ``PiecewiseDecay`` instance to schedule learning rate. 
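Before the runnable example below, the lookup performed by ``PiecewiseDecay`` can be sketched in a few lines of plain Python (illustrative only; boundaries/values taken from the example, assuming the usual "value i applies while epoch < boundaries[i]" convention):

.. code-block:: python

    def piecewise_lr(epoch, boundaries, values):
        # values[i] is used while epoch < boundaries[i]; afterwards the last value sticks
        for i, b in enumerate(boundaries):
            if epoch < b:
                return values[i]
        return values[-1]

    assert piecewise_lr(2, [3, 6, 9], [0.1, 0.2, 0.3, 0.4]) == 0.1
    assert piecewise_lr(10, [3, 6, 9], [0.1, 0.2, 0.3, 0.4]) == 0.4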
Examples: @@ -262,23 +317,21 @@ class PiecewiseLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -287,7 +340,7 @@ class PiecewiseLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -308,7 +361,7 @@ class PiecewiseLR(_LRScheduler): def __init__(self, boundaries, values, last_epoch=-1, verbose=False): self.boundaries = boundaries self.values = values - super(PiecewiseLR, self).__init__( + super(PiecewiseDecay, self).__init__( last_epoch=last_epoch, verbose=verbose) def get_lr(self): @@ -319,7 +372,7 @@ def get_lr(self): return self.values[len(self.values) - 1] -class NaturalExpLR(_LRScheduler): +class NaturalExpDecay(LRScheduler): """ Applies natural exponential decay to the initial learning rate. @@ -328,7 +381,7 @@ class NaturalExpLR(_LRScheduler): .. math:: - new\_learning\_rate = learning\_rate * e^{- gama * epoch} + new\_learning\_rate = learning\_rate * e^{- gamma * epoch} Args: learning_rate (float): The initial learning rate. It is a python float number. @@ -337,7 +390,7 @@ class NaturalExpLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``NaturalExpLR`` instance to schedule learning rate. + ``NaturalExpDecay`` instance to schedule learning rate. 
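A few values of the ``learning_rate * e^(-gamma * epoch)`` schedule with the settings used in the example below (``learning_rate=0.5, gamma=0.1``), as an illustrative sketch:

.. code-block:: python

    import math

    base_lr, gamma = 0.5, 0.1
    print([round(base_lr * math.exp(-gamma * epoch), 4) for epoch in range(3)])
    # [0.5, 0.4524, 0.4094]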
Examples: @@ -346,23 +399,21 @@ class NaturalExpLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -371,7 +422,7 @@ class NaturalExpLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -391,13 +442,14 @@ class NaturalExpLR(_LRScheduler): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): self.gamma = gamma - super(NaturalExpLR, self).__init__(learning_rate, last_epoch, verbose) + super(NaturalExpDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch) -class InverseTimeLR(_LRScheduler): +class InverseTimeDecay(LRScheduler): """ Applies inverse time decay to the initial learning rate. @@ -416,7 +468,7 @@ class InverseTimeLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``InverseTimeLR`` instance to schedule learning rate. + ``InverseTimeDecay`` instance to schedule learning rate. 
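The inverse-time rule used by ``get_lr`` (``base_lr / (1 + gamma * epoch)``) traced for the example settings ``learning_rate=0.5, gamma=0.1`` (illustrative sketch; the full example follows below):

.. code-block:: python

    base_lr, gamma = 0.5, 0.1
    print([round(base_lr / (1 + gamma * epoch), 4) for epoch in range(4)])
    # [0.5, 0.4545, 0.4167, 0.3846]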
Examples: @@ -425,23 +477,21 @@ class InverseTimeLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -450,7 +500,7 @@ class InverseTimeLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -471,13 +521,14 @@ class InverseTimeLR(_LRScheduler): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): self.gamma = gamma - super(InverseTimeLR, self).__init__(learning_rate, last_epoch, verbose) + super(InverseTimeDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): return self.base_lr / (1 + self.gamma * self.last_epoch) -class PolynomialLR(_LRScheduler): +class PolynomialDecay(LRScheduler): """ Applies polynomial decay to the initial learning rate. @@ -512,7 +563,7 @@ class PolynomialLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``PolynomialLR`` instance to schedule learning rate. + ``PolynomialDecay`` instance to schedule learning rate. 
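Ignoring the ``cycle`` option, the polynomial schedule interpolates from the initial rate down to ``end_lr`` over ``decay_steps`` epochs. A simplified sketch (defaults below are assumptions for illustration, with ``decay_steps=20`` matching the example that follows):

.. code-block:: python

    def polynomial_lr(epoch, base_lr=0.5, decay_steps=20, end_lr=0.0001, power=1.0):
        epoch = min(epoch, decay_steps)
        return (base_lr - end_lr) * (1 - epoch / decay_steps) ** power + end_lr

    print(polynomial_lr(0), polynomial_lr(10), polynomial_lr(20))
    # approximately 0.5, 0.25005, 0.0001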
Examples: @@ -521,23 +572,21 @@ class PolynomialLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -546,7 +595,7 @@ class PolynomialLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) + scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -576,7 +625,8 @@ def __init__(self, self.end_lr = end_lr self.power = power self.cycle = cycle - super(PolynomialLR, self).__init__(learning_rate, last_epoch, verbose) + super(PolynomialDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): tmp_epoch_num = self.last_epoch @@ -596,7 +646,7 @@ def get_lr(self): )**self.power) + self.end_lr -class LinearLrWarmup(_LRScheduler): +class LinearWarmup(LRScheduler): """ Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler. @@ -604,22 +654,22 @@ class LinearLrWarmup(_LRScheduler): When epoch < warmup_steps, learning rate is updated as: - .. code-block:: text + .. math:: - lr = start_lr + (end_lr - start_lr) * (epoch / warmup_steps) + lr = start\_lr + (end\_lr - start\_lr) * \\frac{epoch}{warmup\_steps} where start_lr is the initial learning rate, and end_lr is the final learning rate; When epoch >= warmup_steps, learning rate is updated as: - .. code-block:: text + .. math:: lr = learning_rate - where lr is float or any subclass of ``_LRScheduler`` . + where ``learning_rate`` is float or any subclass of ``LRScheduler`` . Args: - learning_rate (float|_LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``_LRScheduler`` . + learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` . warmup_steps (int): total steps of warm up. start_lr (float): Initial learning rate of warm up. end_lr (float): Final learning rate of warm up. @@ -627,7 +677,7 @@ class LinearLrWarmup(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``LinearLrWarmup`` instance to schedule learning rate. + ``LinearWarmup`` instance to schedule learning rate. 
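The two branches quoted above (linear ramp while ``epoch < warmup_steps``, then hand-off to the wrapped rate) reduce to the sketch below, using the example's ``warmup_steps=20, start_lr=0, end_lr=0.5``; the full runnable example follows:

.. code-block:: python

    def warmup_lr(epoch, learning_rate=0.5, warmup_steps=20, start_lr=0.0, end_lr=0.5):
        if epoch < warmup_steps:
            return start_lr + (end_lr - start_lr) * epoch / warmup_steps
        return learning_rate  # afterwards the wrapped float or LRScheduler takes over

    print(warmup_lr(0), warmup_lr(10), warmup_lr(25))  # 0.0 0.25 0.5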
Examples: @@ -636,24 +686,22 @@ class LinearLrWarmup(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.LinearLrWarmup( + scheduler = paddle.optimizer.lr.LinearWarmup( learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -662,7 +710,7 @@ class LinearLrWarmup(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup( + scheduler = paddle.optimizer.lr.LinearWarmup( learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -678,7 +726,7 @@ class LinearLrWarmup(_LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() """ def __init__(self, @@ -689,10 +737,10 @@ def __init__(self, last_epoch=-1, verbose=False): type_check = isinstance(learning_rate, float) or isinstance( - learning_rate, int) or isinstance(learning_rate, _LRScheduler) + learning_rate, int) or isinstance(learning_rate, LRScheduler) if not type_check: raise TypeError( - "the type of learning_rate should be [int, float or _LRScheduler], the current type is {}". + "the type of learning_rate should be [int, float or LRScheduler], the current type is {}". format(learning_rate)) self.learning_rate = learning_rate self.warmup_steps = warmup_steps @@ -700,24 +748,24 @@ def __init__(self, self.end_lr = end_lr assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format( end_lr, start_lr) - super(LinearLrWarmup, self).__init__(start_lr, last_epoch, verbose) + super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose) def get_lr(self): if self.last_epoch < self.warmup_steps: return (self.end_lr - self.start_lr) * float( self.last_epoch) / float(self.warmup_steps) + self.start_lr else: - if isinstance(self.learning_rate, _LRScheduler): + if isinstance(self.learning_rate, LRScheduler): self.learning_rate.step() return self.learning_rate() return self.learning_rate -class ExponentialLR(_LRScheduler): +class ExponentialDecay(LRScheduler): """ - Update learning rate by 'gamma' each epoch. + Update learning rate by `gamma` each epoch. The algorithm can be described as following. @@ -733,7 +781,7 @@ class ExponentialLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``ExponentialLR`` instance to schedule learning rate. + ``ExponentialDecay`` instance to schedule learning rate. 
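A few values of the exponential schedule used by ``get_lr`` (``base_lr * gamma**epoch``) with the example settings ``learning_rate=0.5, gamma=0.9`` (illustrative sketch):

.. code-block:: python

    base_lr, gamma = 0.5, 0.9
    print([round(base_lr * gamma ** epoch, 4) for epoch in range(4)])
    # [0.5, 0.45, 0.405, 0.3645]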
Examples: @@ -742,23 +790,21 @@ class ExponentialLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -767,7 +813,7 @@ class ExponentialLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) + scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -787,15 +833,16 @@ class ExponentialLR(_LRScheduler): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): self.gamma = gamma - super(ExponentialLR, self).__init__(learning_rate, last_epoch, verbose) + super(ExponentialDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): return self.base_lr * (self.gamma**self.last_epoch) -class MultiStepLR(_LRScheduler): +class MultiStepDecay(LRScheduler): """ - Update the learning rate by ``gama`` once ``epoch`` reaches one of the milestones. + Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones. The algorithm can be described as the code below. @@ -821,7 +868,7 @@ class MultiStepLR(_LRScheduler): Returns: - ``MultiStepLR`` instance to schedule learning rate. + ``MultiStepDecay`` instance to schedule learning rate. 
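The rate is multiplied by ``gamma`` each time the epoch passes a milestone; a sketch with the settings from the example below (``milestones=[2, 4, 6], gamma=0.8``):

.. code-block:: python

    def multistep_lr(epoch, base_lr=0.5, milestones=(2, 4, 6), gamma=0.8):
        passed = sum(1 for m in milestones if epoch >= m)
        return base_lr * gamma ** passed

    print([round(multistep_lr(e), 4) for e in range(7)])
    # [0.5, 0.5, 0.4, 0.4, 0.32, 0.32, 0.256]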
Examples: @@ -830,23 +877,21 @@ class MultiStepLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -855,7 +900,7 @@ class MultiStepLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -894,7 +939,7 @@ def __init__(self, self.milestones = milestones self.gamma = gamma - super(MultiStepLR, self).__init__(learning_rate, last_epoch, verbose) + super(MultiStepDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): for i in range(len(self.milestones)): @@ -903,7 +948,7 @@ def get_lr(self): return self.base_lr * (self.gamma**len(self.milestones)) -class StepLR(_LRScheduler): +class StepDecay(LRScheduler): """ Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch. @@ -929,7 +974,7 @@ class StepLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``StepLR`` instance to schedule learning rate. + ``StepDecay`` instance to schedule learning rate. 
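The step rule used by ``get_lr`` (``base_lr * gamma**(epoch // step_size)``) traced for the example settings ``learning_rate=0.5, step_size=5, gamma=0.8`` (illustrative sketch):

.. code-block:: python

    base_lr, step_size, gamma = 0.5, 5, 0.8
    print([round(base_lr * gamma ** (epoch // step_size), 4) for epoch in (0, 4, 5, 10)])
    # [0.5, 0.5, 0.4, 0.32]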
Examples: @@ -939,23 +984,21 @@ class StepLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -964,7 +1007,7 @@ class StepLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -997,14 +1040,14 @@ def __init__(self, self.step_size = step_size self.gamma = gamma - super(StepLR, self).__init__(learning_rate, last_epoch, verbose) + super(StepDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): i = self.last_epoch // self.step_size return self.base_lr * (self.gamma**i) -class LambdaLR(_LRScheduler): +class LambdaDecay(LRScheduler): """ Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` . @@ -1015,9 +1058,9 @@ class LambdaLR(_LRScheduler): learning_rate = 0.5 # init learning_rate lr_lambda = lambda epoch: 0.95 ** epoch - learning_rate = 0.5 # epoch 0 - learning_rate = 0.475 # epoch 1 - learning_rate = 0.45125 # epoch 2 + learning_rate = 0.5 # epoch 0, 0.5*0.95**0 + learning_rate = 0.475 # epoch 1, 0.5*0.95**1 + learning_rate = 0.45125 # epoch 2, 0.5*0.95**2 Args: learning_rate (float): The initial learning rate. It is a python float number. @@ -1026,7 +1069,7 @@ class LambdaLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``LambdaLR`` instance to schedule learning rate. + ``LambdaDecay`` instance to schedule learning rate. 
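The lambda rule (``base_lr * lr_lambda(epoch)``) reproduces the per-epoch values listed above; as a quick check before the full example below:

.. code-block:: python

    base_lr = 0.5
    lr_lambda = lambda epoch: 0.95 ** epoch
    print([round(base_lr * lr_lambda(epoch), 5) for epoch in range(3)])
    # [0.5, 0.475, 0.45125]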
Examples: @@ -1035,23 +1078,21 @@ class LambdaLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -1060,7 +1101,7 @@ class LambdaLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -1082,17 +1123,17 @@ class LambdaLR(_LRScheduler): def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): if not callable(lr_lambda): raise TypeError( - "The type of 'lr_lambda' in 'LambdaLR' must be 'function', but received %s." + "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s." % type(lr_lambda)) self.lr_lambda = lr_lambda - super(LambdaLR, self).__init__(learning_rate, last_epoch, verbose) + super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): return self.base_lr * self.lr_lambda(self.last_epoch) -class ReduceLROnPlateau(_LRScheduler): +class ReduceOnPlateau(LRScheduler): """ Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate by 2 to 10 times once model performance has no longer improvement. @@ -1126,7 +1167,7 @@ class ReduceLROnPlateau(_LRScheduler): Returns: - ``ReduceLROnPlateau`` instance to schedule learning rate. + ``ReduceOnPlateau`` instance to schedule learning rate. 
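In 'min' mode with no cooldown or threshold (assumptions made only for this sketch), the mechanism amounts to counting epochs without improvement and multiplying the rate by ``factor`` once the count exceeds ``patience``; the full example with the real scheduler follows below:

.. code-block:: python

    lr, factor, patience = 1.0, 0.5, 5
    best, num_bad_epochs = float("inf"), 0
    for loss in [0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8]:
        if loss < best:
            best, num_bad_epochs = loss, 0   # improvement resets the counter
        else:
            num_bad_epochs += 1
        if num_bad_epochs > patience:
            lr, num_bad_epochs = lr * factor, 0
    print(lr)  # 0.5, reduced once after six epochs without improvement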
Examples: @@ -1135,23 +1176,21 @@ class ReduceLROnPlateau(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step(loss) - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -1160,7 +1199,7 @@ class ReduceLROnPlateau(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -1207,7 +1246,7 @@ def __init__(self, self.threshold_mode = threshold_mode if not isinstance(learning_rate, (float, int)): raise TypeError( - "The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float', but received %s." + "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s." % type(learning_rate)) self.verbose = verbose @@ -1230,7 +1269,7 @@ def __init__(self, self._var_name = None # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored. - def _state_keys(self): + def state_keys(self): self.keys = [ 'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch', 'last_lr' @@ -1238,7 +1277,7 @@ def _state_keys(self): def step(self, metrics, epoch=None): """ - step should be called after 'minimize' . It will update the learning rate in optimizer according to ``metrics`` . + step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` . The new learning rate will take effect on next epoch. Args: @@ -1251,14 +1290,14 @@ def step(self, metrics, epoch=None): None Examples: - Please refer to the example of current _LRScheduler. + Please refer to the example of current LRScheduler. """ if epoch is None: self.last_epoch = self.last_epoch + 1 else: self.last_epoch = epoch - # loss must be 1-D Tensor with shape [1] + # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] if isinstance(metrics, (Tensor, numpy.ndarray)): assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ "should be (1L,), but the current metrics.shape is {}. 
Maybe that " \ @@ -1290,7 +1329,6 @@ def step(self, metrics, epoch=None): self.last_lr)) def _is_better(self, current, best): - print("mode", self.mode, 'threshold_mode', self.threshold_mode) if self.mode == 'min' and self.threshold_mode == 'rel': return current < best - best * self.threshold @@ -1304,31 +1342,23 @@ def _is_better(self, current, best): return current > best + self.threshold -class CosineAnnealingLR(_LRScheduler): +class CosineAnnealingDecay(LRScheduler): """ Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in - SGDR: - - \begin{aligned} - \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 - + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), - & T_{cur} \neq (2k+1)T_{max}; \\ - \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) - \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), - & T_{cur} = (2k+1)T_{max}. - \end{aligned} + SGDR. The algorithm can be described as following. .. math:: - \begin{aligned} - \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 - + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), - & T_{cur} \neq (2k+1)T_{max}; \\ - \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) - \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), + + \\begin{aligned} + \eta_t & = \eta_{min} + \\frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\\frac{T_{cur}}{T_{max}}\pi\\right)\\right), + & T_{cur} \\neq (2k+1)T_{max}; \\ + \eta_{t+1} & = \eta_{t} + \\frac{1}{2}(\eta_{max} - \eta_{min}) + \left(1 - \cos\left(\\frac{1}{T_{max}}\pi\\right)\\right), & T_{cur} = (2k+1)T_{max}. \end{aligned} @@ -1343,7 +1373,7 @@ class CosineAnnealingLR(_LRScheduler): verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``CosineAnnealingLR`` instance to schedule learning rate. + ``CosineAnnealingDecay`` instance to schedule learning rate. 
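Tracing the closed-form branch of the formula above over one period, with the example's ``learning_rate=0.5, T_max=10`` and ``eta_min=0`` assumed as the default (illustrative sketch; the runnable example follows):

.. code-block:: python

    import math

    eta_max, eta_min, T_max = 0.5, 0.0, 10
    lrs = [eta_min + 0.5 * (eta_max - eta_min) * (1 + math.cos(math.pi * t / T_max))
           for t in range(T_max + 1)]
    print(round(lrs[0], 4), round(lrs[5], 4), round(lrs[10], 4))  # 0.5 0.25 0.0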
Examples: @@ -1352,23 +1382,21 @@ class CosineAnnealingLR(_LRScheduler): import paddle import numpy as np - # train on default dygraph mode - paddle.disable_static() - x = np.random.uniform(-1, 1, [10, 10]).astype("float32") + # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) - sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): for batch_id in range(2): - x = paddle.to_tensor(x) + x = paddle.uniform([10, 10]) out = linear(x) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) loss.backward() - sgd.minimize(loss) - linear.clear_gradients() + sgd.step() + sgd.clear_gradients() scheduler.step() - # train on static mode + # train on static graph mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() @@ -1377,7 +1405,7 @@ class CosineAnnealingLR(_LRScheduler): y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) @@ -1403,16 +1431,16 @@ def __init__(self, verbose=False): if not isinstance(T_max, int): raise TypeError( - "The type of 'T_max' in 'CosineAnnealingLR' must be 'int', but received %s." + "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s." % type(T_max)) if not isinstance(eta_min, (float, int)): raise TypeError( - "The type of 'eta_min' in 'CosineAnnealingLR' must be 'float, int', but received %s." + "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s." % type(eta_min)) self.T_max = T_max self.eta_min = float(eta_min) - super(CosineAnnealingLR, self).__init__(learning_rate, last_epoch, - verbose) + super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch, + verbose) def get_lr(self): if self.last_epoch == 0: diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 15519cdd300e9..9f857680ca9e1 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -41,7 +41,7 @@ from functools import reduce from ..fluid.wrapped_decorator import signature_safe_contextmanager from .. import compat as cpt -from .lr_scheduler import _LRScheduler +from .lr import LRScheduler __all__ = ['Optimizer'] @@ -54,8 +54,8 @@ class Optimizer(object): but need to use one of it's implementation. Args: - learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. - It can be a float value or any subclass of ``_LRScheduler`` . + learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``. + It can be a float value or any subclass of ``LRScheduler`` . parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. 
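Since the ``learning_rate`` argument just described accepts either a float or any ``LRScheduler`` subclass, both forms wire up the same way; an illustrative sketch using classes introduced in this patch:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)

    # fixed float learning rate
    sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters())

    # scheduled learning rate
    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=5, gamma=0.8)
    sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())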
@@ -82,12 +82,8 @@ class Optimizer(object): #Take the subclass adam as an example import paddle - import numpy as np - - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) + inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) out = linear(inp) loss = paddle.mean(out) adam = paddle.optimizer.Adam(learning_rate=0.1, @@ -121,9 +117,9 @@ def __init__(self, "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" % weight_decay.__str__()) break - if not isinstance(learning_rate, (float, _LRScheduler)): + if not isinstance(learning_rate, (float, LRScheduler)): raise TypeError( - "learning rate should be float or _LRScheduler, got %s here" % + "learning rate should be float or LRScheduler, got %s here" % type(learning_rate)) if grad_clip is not None: if not isinstance(grad_clip, GradientClipBase): @@ -156,7 +152,7 @@ def __init__(self, @framework.dygraph_only def state_dict(self): ''' - Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be include in state dict. + Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be include in state dict. If the optimizer never be called(minimize function), the state_dict is empty. Args: @@ -169,7 +165,6 @@ def state_dict(self): .. code-block:: python import paddle - paddle.disable_static() emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) @@ -181,14 +176,14 @@ def state_dict(self): for para_name, var_tmp in v.items(): state_dict[var_tmp.name] = var_tmp # global step if use lr decay - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): state_dict["LR_Scheduler"] = self._learning_rate.state_dict() return state_dict @framework.dygraph_only def set_state_dict(self, state_dict): ''' - Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If _LRScheduler have been used, global_step will be changed. + Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be changed. Args: state_dict(dict) : Dict contains all the Tensor needed by optimizer @@ -199,26 +194,28 @@ def set_state_dict(self, state_dict): .. 
code-block:: python import paddle - paddle.disable_static() - emb = paddle.nn.Embedding(10, 10) - state_dict = emb.state_dict() - paddle.framework.save(state_dict, "paddle_dy") + emb = paddle.nn.Embedding(10, 10) - adam = paddle.optimizer.Adam(learning_rate=paddle.optimizer.NoamLR( 100, 10000), - parameters=emb.parameters()) - state_dict = adam.state_dict() - paddle.framework.save(state_dict, "paddle_dy") + layer_state_dict = emb.state_dict() + paddle.save(layer_state_dict, "emb.pdparams") - para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=emb.parameters()) + opt_state_dict = adam.state_dict() + paddle.save(opt_state_dict, "adam.pdopt") + opti_state_dict = paddle.load("adam.pdopt") adam.set_state_dict(opti_state_dict) ''' - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): self._learning_rate.set_dict(state_dict["LR_Scheduler"]) - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): self._learning_rate.set_state_dict(state_dict["LR_Scheduler"]) self._accumulators_holder = state_dict @@ -256,7 +253,7 @@ def get_opti_var_name_list(self): return self._opti_name_list def _create_global_learning_rate(self): - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): lr_var = self._global_learning_rate() # only create global lr_var once if not isinstance(lr_var, framework.Variable): @@ -299,7 +296,7 @@ def set_lr(self, value): """ :api_attr: imperative - Set the value of the learning rate manually in the optimizer. If the optimizer use _LRScheduler, + Set the value of the learning rate manually in the optimizer. If the optimizer use LRScheduler, this API cannot be invoked, because it will lead to conflict. Args: @@ -312,7 +309,6 @@ def set_lr(self, value): .. code-block:: python import paddle - paddle.disable_static() linear = paddle.nn.Linear(10, 10) adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) @@ -335,9 +331,9 @@ def set_lr(self, value): raise TypeError( "The type of 'value' in optimizer.set_lr must be float, but received %s." % (type(value))) - if isinstance(self._learning_rate, _LRScheduler): + if isinstance(self._learning_rate, LRScheduler): raise RuntimeError( - "optimizer's learning rate can't be _LRScheduler when invoke this API, because this will lead to conflict." + "optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict." ) self._learning_rate = float(value) current_lr = self._global_learning_rate() @@ -358,7 +354,7 @@ def get_lr(self): """ :api_attr: imperative - Get current step learning rate. The return value is all the same When _LRScheduler is not used, + Get current step learning rate. The return value is all the same When LRScheduler is not used, otherwise return the current step learning rate. 
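A minimal sketch of the ``set_lr`` / ``get_lr`` pair just described, with a float learning rate (``set_lr`` is documented above as unavailable when an ``LRScheduler`` is used); the full examples follow below:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
    print(adam.get_lr())   # 0.1
    adam.set_lr(0.05)
    print(adam.get_lr())   # 0.05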
@@ -370,24 +366,22 @@ def get_lr(self): import numpy as np import paddle - # example1: _LRScheduler is not used, return value is all the same - paddle.disable_static() + # example1: LRScheduler is not used, return value is all the same emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) lr = adam.get_lr() print(lr) # 0.001 - # example2: PiecewiseLR is used, return the step learning rate - paddle.disable_static() + # example2: PiecewiseDecay is used, return the scheduled learning rate inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) out = linear(inp) - loss = paddle.reduce_mean(out) + loss = paddle.mean(out) bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] - scheduler = paddle.optimizer.PiecewiseLR(bd, value, 0) + scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value, 0) adam = paddle.optimizer.Adam(scheduler, parameters=linear.parameters()) @@ -656,7 +650,6 @@ def backward(self, import paddle import numpy as np - paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) @@ -727,7 +720,6 @@ def apply_gradients(self, params_grads): import paddle import numpy as np - paddle.disable_static() inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) @@ -805,7 +797,7 @@ def clear_grad(self): import numpy as np import paddle - paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) @@ -854,13 +846,9 @@ def minimize(self, .. code-block:: python import paddle - import numpy as np - - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) - out = linear(inp) + input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) + out = linear(input) loss = paddle.mean(out) beta1 = paddle.to_tensor([0.9], dtype="float32") @@ -903,7 +891,7 @@ def step(self): import paddle import numpy as np - paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 5e17ca34ff218..a664b01595632 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -69,8 +69,8 @@ class RMSProp(Optimizer): Parameters: - learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. - It can be a float value or a _LRScheduler. + learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. rho(float): rho is :math: `\\rho` in equation, default is 0.95. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. 
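For a sense of how ``rho`` and ``epsilon`` enter the RMSProp update, a simplified momentum-free, uncentered sketch of the accumulator step in NumPy (the authoritative equations are the ones given earlier in the docstring; this is an illustration only):

.. code-block:: python

    import numpy as np

    param = np.array([1.0, 2.0])
    mean_square = np.zeros_like(param)   # running average of squared gradients
    lr, rho, epsilon = 0.01, 0.95, 1e-6

    grad = np.array([0.5, -0.3])
    mean_square = rho * mean_square + (1 - rho) * grad ** 2
    param = param - lr * grad / np.sqrt(mean_square + epsilon)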
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 909a1b6f39503..a6ce4379824f0 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -19,7 +19,7 @@ 'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr', 'default_main_program', 'default_startup_program', 'Program', 'data', 'InputSpec', 'save', 'load', 'save_inference_model', 'load_inference_model', - 'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places' + 'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places', 'Variable' ] from . import nn @@ -41,6 +41,7 @@ from ..fluid.framework import program_guard #DEFINE_ALIAS from ..fluid.framework import cpu_places #DEFINE_ALIAS from ..fluid.framework import cuda_places #DEFINE_ALIAS +from ..fluid.framework import Variable #DEFINE_ALIAS from ..fluid.layers.control_flow import Print #DEFINE_ALIAS from ..fluid.layers.nn import py_func #DEFINE_ALIAS from ..fluid.parallel_executor import ParallelExecutor #DEFINE_ALIAS diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 3ae65e879f723..9161bb7af412c 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -18,6 +18,7 @@ 'embedding', 'bilinear_tensor_product', 'case', + 'cond', 'conv2d', 'conv2d_transpose', 'conv3d', @@ -36,6 +37,7 @@ 'row_conv', 'spectral_norm', 'switch_case', + 'while_loop', ] from .common import fc #DEFINE_ALIAS @@ -44,6 +46,7 @@ from ...fluid.layers import batch_norm #DEFINE_ALIAS from ...fluid.layers import bilinear_tensor_product #DEFINE_ALIAS from ...fluid.layers import case #DEFINE_ALIAS +from ...fluid.layers import cond #DEFINE_ALIAS from ...fluid.layers import conv2d #DEFINE_ALIAS from ...fluid.layers import conv2d_transpose #DEFINE_ALIAS from ...fluid.layers import conv3d #DEFINE_ALIAS @@ -61,5 +64,6 @@ from ...fluid.layers import row_conv #DEFINE_ALIAS from ...fluid.layers import spectral_norm #DEFINE_ALIAS from ...fluid.layers import switch_case #DEFINE_ALIAS +from ...fluid.layers import while_loop #DEFINE_ALIAS from ...fluid.input import embedding #DEFINE_ALIAS diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 93a603f4770a7..44f0a73fa42cd 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -229,7 +229,7 @@ def deform_conv2d(x, float32, float64. offset (Tensor): The input coordinate offset of deformable convolution layer. A Tensor with type float32, float64. - Mask (Tensor, Optional): The input mask of deformable convolution layer. + mask (Tensor, Optional): The input mask of deformable convolution layer. A Tensor with type float32, float64. It should be None when you use deformable convolution v1. num_filters(int): The number of filter. It is as same as the output @@ -237,23 +237,23 @@ def deform_conv2d(x, filter_size (int|tuple): The filter size. If filter_size is a tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise, the filter will be a square. - stride (int|tuple): The stride size. If stride is a tuple, it must + stride (int|tuple, Optional): The stride size. If stride is a tuple, it must contain two integers, (stride_H, stride_W). Otherwise, the stride_H = stride_W = stride. Default: stride = 1. - padding (int|tuple): The padding size. If padding is a tuple, it must + padding (int|tuple, Optional): The padding size. If padding is a tuple, it must contain two integers, (padding_H, padding_W). 
Otherwise, the padding_H = padding_W = padding. Default: padding = 0. - dilation (int|tuple): The dilation size. If dilation is a tuple, it must + dilation (int|tuple, Optional): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: dilation = 1. - groups (int): The groups number of the deformable conv layer. According to + groups (int, Optional): The groups number of the deformable conv layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1. - deformable_groups (int): The number of deformable group partitions. + deformable_groups (int, Optional): The number of deformable group partitions. Default: deformable_groups = 1. - im2col_step (int): Maximum number of images per im2col computation; + im2col_step (int, Optional): Maximum number of images per im2col computation; The total batch size should be devisable by this value or smaller than this value; if you face out of memory problem, you can try to use a smaller value here. diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 940bd1a46742d..773e6ebc7af2e 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -23,10 +23,9 @@ from .attribute import rank #DEFINE_ALIAS from .attribute import shape #DEFINE_ALIAS from .creation import to_tensor #DEFINE_ALIAS -from .creation import crop_tensor #DEFINE_ALIAS from .creation import diag #DEFINE_ALIAS from .creation import eye #DEFINE_ALIAS -from .creation import fill_constant #DEFINE_ALIAS +# from .creation import fill_constant #DEFINE_ALIAS # from .creation import get_tensor_from_selected_rows #DEFINE_ALIAS from .creation import linspace #DEFINE_ALIAS from .creation import ones #DEFINE_ALIAS @@ -67,8 +66,8 @@ from .logic import logical_or #DEFINE_ALIAS from .logic import logical_xor #DEFINE_ALIAS from .logic import not_equal #DEFINE_ALIAS -from .logic import reduce_all #DEFINE_ALIAS -from .logic import reduce_any #DEFINE_ALIAS +# from .logic import reduce_all #DEFINE_ALIAS +# from .logic import reduce_any #DEFINE_ALIAS from .logic import allclose #DEFINE_ALIAS from .logic import equal_all #DEFINE_ALIAS # from .logic import isnan #DEFINE_ALIAS @@ -94,7 +93,6 @@ from .manipulation import strided_slice #DEFINE_ALIAS from .manipulation import transpose #DEFINE_ALIAS from .manipulation import unique #DEFINE_ALIAS -from .manipulation import unique_with_counts #DEFINE_ALIAS from .manipulation import unsqueeze #DEFINE_ALIAS from .manipulation import unstack #DEFINE_ALIAS from .manipulation import flip #DEFINE_ALIAS @@ -109,13 +107,13 @@ from .math import cos #DEFINE_ALIAS from .math import cosh #DEFINE_ALIAS from .math import cumsum #DEFINE_ALIAS -from .math import elementwise_add #DEFINE_ALIAS -from .math import elementwise_div #DEFINE_ALIAS -from .math import elementwise_floordiv #DEFINE_ALIAS -from .math import elementwise_mul #DEFINE_ALIAS -from .math import elementwise_mod #DEFINE_ALIAS -from .math import elementwise_pow #DEFINE_ALIAS -from .math import elementwise_sub #DEFINE_ALIAS +# from .math import elementwise_add #DEFINE_ALIAS +# from .math import elementwise_div #DEFINE_ALIAS +# from .math import elementwise_floordiv #DEFINE_ALIAS +# from .math import elementwise_mul #DEFINE_ALIAS +# from 
.math import elementwise_mod #DEFINE_ALIAS +# from .math import elementwise_pow #DEFINE_ALIAS +# from .math import elementwise_sub #DEFINE_ALIAS from .math import exp #DEFINE_ALIAS from .math import floor #DEFINE_ALIAS from .math import increment #DEFINE_ALIAS @@ -123,10 +121,10 @@ from .math import multiplex #DEFINE_ALIAS from .math import pow #DEFINE_ALIAS from .math import reciprocal #DEFINE_ALIAS -from .math import reduce_max #DEFINE_ALIAS -from .math import reduce_min #DEFINE_ALIAS -from .math import reduce_prod #DEFINE_ALIAS -from .math import reduce_sum #DEFINE_ALIAS +# from .math import reduce_max #DEFINE_ALIAS +# from .math import reduce_min #DEFINE_ALIAS +# from .math import reduce_prod #DEFINE_ALIAS +# from .math import reduce_sum #DEFINE_ALIAS from .math import round #DEFINE_ALIAS from .math import rsqrt #DEFINE_ALIAS from .math import scale #DEFINE_ALIAS @@ -137,9 +135,8 @@ from .math import square #DEFINE_ALIAS from .math import stanh #DEFINE_ALIAS from .math import sum #DEFINE_ALIAS -from .math import sums #DEFINE_ALIAS from .math import tanh #DEFINE_ALIAS -from .math import elementwise_sum #DEFINE_ALIAS +from .math import add_n #DEFINE_ALIAS from .math import max #DEFINE_ALIAS from .math import maximum #DEFINE_ALIAS from .math import min #DEFINE_ALIAS @@ -157,7 +154,7 @@ from .math import inverse #DEFINE_ALIAS from .math import log1p #DEFINE_ALIAS from .math import erf #DEFINE_ALIAS -from .math import addcmul #DEFINE_ALIAS +# from .math import addcmul #DEFINE_ALIAS from .math import addmm #DEFINE_ALIAS from .math import clip #DEFINE_ALIAS from .math import trace #DEFINE_ALIAS @@ -177,8 +174,8 @@ from .search import argmax #DEFINE_ALIAS from .search import argmin #DEFINE_ALIAS from .search import argsort #DEFINE_ALIAS -from .search import has_inf #DEFINE_ALIAS -from .search import has_nan #DEFINE_ALIAS +# from .search import has_inf #DEFINE_ALIAS +# from .search import has_nan #DEFINE_ALIAS # from .search import masked_select #DEFINE_ALIAS from .search import topk #DEFINE_ALIAS from .search import where #DEFINE_ALIAS @@ -188,10 +185,11 @@ from .search import index_sample #DEFINE_ALIAS from .search import masked_select #DEFINE_ALIAS from .stat import mean #DEFINE_ALIAS -from .stat import reduce_mean #DEFINE_ALIAS +# from .stat import reduce_mean #DEFINE_ALIAS from .stat import std #DEFINE_ALIAS from .stat import var #DEFINE_ALIAS from .stat import numel #DEFINE_ALIAS # from .tensor import Tensor #DEFINE_ALIAS # from .tensor import LoDTensor #DEFINE_ALIAS # from .tensor import LoDTensorArray #DEFINE_ALIAS +from .to_string import set_printoptions diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index d783d6329e67d..65a33ade27a22 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -24,20 +24,15 @@ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from ..fluid.framework import convert_np_dtype_to_dtype_, in_dygraph_mode, _varbase_creator, device_guard, OpProtoHolder -from ..fluid.layers import fill_constant from paddle.common_ops_import import * # TODO: define functions to get create a tensor -from ..fluid.layers import crop_tensor #DEFINE_ALIAS -from ..fluid.layers import fill_constant #DEFINE_ALIAS from ..fluid.layers import linspace #DEFINE_ALIAS import paddle __all__ = [ 'to_tensor', - 'crop_tensor', 'diag', - 'fill_constant', # 'get_tensor_from_selected_rows', 'linspace', 'ones', @@ -52,7 +47,8 @@ 'empty_like', 
'triu', 'tril', - 'meshgrid' + 'meshgrid', + 'assign', ] @@ -317,7 +313,7 @@ def ones(shape, dtype=None, name=None): # [1 1]] # shape is a Tensor - shape = paddle.fill_constant(shape=[2], dtype='int32', value=2) + shape = paddle.full(shape=[2], dtype='int32', fill_value=2) data3 = paddle.ones(shape=shape, dtype='int32') # [[1 1] # [1 1]] @@ -398,7 +394,7 @@ def zeros(shape, dtype=None, name=None): # [0. 0.]] # shape is a Tensor - shape = paddle.fill_constant(shape=[2], dtype='int32', value=2) + shape = paddle.full(shape=[2], dtype='int32', fill_value=2) data3 = paddle.zeros(shape=shape, dtype='int32') # [[0 0] # [0 0]] @@ -526,18 +522,18 @@ def full(shape, fill_value, dtype=None, name=None): # [0]] # attr shape is a list which contains Tensor. - positive_2 = paddle.fill_constant([1], "int32", 2) + positive_2 = paddle.full([1], 2, "int32") data3 = paddle.full(shape=[1, positive_2], dtype='float32', fill_value=1.5) # [[1.5 1.5]] # attr shape is a Tensor. - shape = paddle.fill_constant([2], "int32", 2) + shape = paddle.full([2], 2, "int32") data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) # [[True True] # [True True]] # attr fill_value is a Tensor. - val = paddle.fill_constant([1], "float32", 2.0) + val = paddle.full([1], 2.0, "float32") data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') # [[2.0] # [2.0]] @@ -1111,3 +1107,77 @@ def empty_like(x, dtype=None, name=None): stop_gradient=True) out.stop_gradient = True return out + + +def assign(x, output=None): + """ + + + The OP copies the :attr:`x` to the :attr:`output`. + + Parameters: + x (Tensor|numpy.ndarray): A tensor or numpy ndarray, its data type supports + float16, float32, float64, int32 and int64. + output (Tensor, optional): A tensor. If :attr:`output` is None, a new tensor will + be created as :attr:`output`. Default: None. + + Returns: + Tensor: A tensor with the same shape, data type and value as :attr:`x`. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + data = paddle.full(shape=[3, 2], fill_value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + array = np.array([[1, 1], + [3, 4], + [1, 3]]).astype(np.int64) + result1 = paddle.zeros(shape=[3, 3], dtype='float32') + paddle.assign(array, result1) # result1 = [[1, 1], [3 4], [1, 3]] + result2 = paddle.assign(data) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + result3 = paddle.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + """ + helper = LayerHelper('assign', **locals()) + check_type(x, 'x', (Variable, numpy.ndarray), 'assign') + if isinstance(x, Variable): + check_dtype( + x.dtype, 'x', + ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'], + 'assign', '(When the type of input in assign is Variable.)') + if output is None: + output = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='assign', inputs={'X': [x]}, outputs={'Out': [output]}) + elif isinstance(x, numpy.ndarray): + dtype = convert_np_dtype_to_dtype_(x.dtype) + if dtype == VarDesc.VarType.BOOL: + value_name = "bool_values" + values = [bool(v) for v in x.flat] + elif dtype == VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in x.flat] + elif dtype == VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in x.flat] + elif dtype == VarDesc.VarType.INT64: + value_name = "int64_values" + values = [int(v) for v in x.flat] + else: + raise TypeError( + "When the type of 'x' in assign is numpy.ndarray, " + "the data type of 'x' must be bool, float32, int32 or int64, but " + "received %s." % convert_dtype(dtype)) + if x.size > 1024 * 1024: + raise ValueError("The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it") + if output is None: + output = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={'dtype': dtype, + 'shape': list(x.shape), + value_name: values}) + + return output diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 5fd714421c8ed..1fc1c17d2edb2 100644 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . import to_tensor from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.layers.layer_function_generator import templatedoc @@ -26,8 +27,6 @@ from ..fluid.layers import logical_not #DEFINE_ALIAS from ..fluid.layers import logical_or #DEFINE_ALIAS from ..fluid.layers import logical_xor #DEFINE_ALIAS -from ..fluid.layers import reduce_all #DEFINE_ALIAS -from ..fluid.layers import reduce_any #DEFINE_ALIAS __all__ = [ 'equal', @@ -43,8 +42,6 @@ 'logical_or', 'logical_xor', 'not_equal', - 'reduce_all', - 'reduce_any', 'allclose', # 'isnan' ] @@ -99,8 +96,8 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): Args: x(Tensor): ${input_comment}. y(Tensor): ${other_comment}. - rtol(rtoltype, optional): ${rtol_comment}. - atol(atoltype, optional): ${atol_comment}. + rtol(rtoltype, optional): The relative tolerance. Default: :math:`1e-5` . + atol(atoltype, optional): The absolute tolerance. Default: :math:`1e-8` . equal_nan(equalnantype, optional): ${equal_nan_comment}. 
name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default: None. @@ -146,7 +143,9 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): """ if in_dygraph_mode(): - return core.ops.allclose(x, y, 'rtol', rtol, 'atol', atol, 'equal_nan', + rtol_tensor = to_tensor(rtol, dtype='float64') + atol_tensor = to_tensor(atol, dtype='float64') + return core.ops.allclose(x, y, rtol_tensor, atol_tensor, 'equal_nan', equal_nan) check_variable_and_dtype(x, "input", ['float32', 'float64'], 'allclose') @@ -156,11 +155,26 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): check_type(equal_nan, 'equal_nan', bool, 'allclose') helper = LayerHelper("allclose", **locals()) + rtol_var = helper.create_global_variable( + name=fluid.unique_name.generate('rtol'), + persistable=True, + dtype='float64', + shape=[1]) + helper.set_variable_initializer( + rtol_var, initializer=fluid.initializer.ConstantInitializer(rtol)) + atol_var = helper.create_variable( + name=fluid.unique_name.generate('atol'), + persistable=True, + dtype='float64', + shape=[1]) + helper.set_variable_initializer( + atol_var, initializer=fluid.initializer.ConstantInitializer(atol)) + out = helper.create_variable_for_type_inference(dtype='bool') - inputs = {'Input': x, 'Other': y} + inputs = {'Input': x, 'Other': y, 'Rtol': rtol_var, 'Atol': atol_var} outputs = {'Out': out} - attrs = {'rtol': rtol, 'atol': atol, 'equal_nan': equal_nan} + attrs = {'equal_nan': equal_nan} helper.append_op( type='allclose', inputs=inputs, outputs=outputs, attrs=attrs) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 531629c573fb6..19b88e122e4e2 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -30,7 +30,6 @@ from ..fluid.layers import scatter_nd #DEFINE_ALIAS from ..fluid.layers import shard_index #DEFINE_ALIAS -from ..fluid.layers import unique_with_counts #DEFINE_ALIAS from ..fluid import layers import paddle @@ -57,7 +56,6 @@ 'strided_slice', 'transpose', 'unique', - 'unique_with_counts', 'unsqueeze', 'unstack', 'flip', @@ -671,21 +669,24 @@ def unique(x, } out = helper.create_variable_for_type_inference( dtype=x.dtype, stop_gradient=True) + indices = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True) inverse = helper.create_variable_for_type_inference( dtype=attr_dtype, stop_gradient=True) - outputs = {"Out": out, "Index": inverse} + counts = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True) + outputs = { + "Out": out, + "Indices": indices, + "Index": inverse, + "Counts": counts + } outs = [out] if return_index: - indices = helper.create_variable_for_type_inference( - dtype=attr_dtype, stop_gradient=True) - outputs["Indices"] = indices outs.append(indices) if return_inverse: outs.append(inverse) if return_counts: - counts = helper.create_variable_for_type_inference( - dtype=attr_dtype, stop_gradient=True) - outputs["Counts"] = counts outs.append(counts) helper.append_op( @@ -1352,7 +1353,7 @@ def reshape(x, shape, name=None): the corresponding dimension of x. Args: - x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32`` or ``int64``. + x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32``, ``int64`` or ``bool`` shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1. The data type is ``int32`` . 
If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``shape`` is an Tensor, it should be an 1-D Tensor . diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 138841fcf074b..19ba7f1b38ce4 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -35,21 +35,21 @@ from ..fluid.layers import cos #DEFINE_ALIAS from ..fluid.layers import sinh #DEFINE_ALIAS from ..fluid.layers import cosh #DEFINE_ALIAS -from ..fluid.layers import elementwise_add #DEFINE_ALIAS -from ..fluid.layers import elementwise_div #DEFINE_ALIAS -from ..fluid.layers import elementwise_floordiv #DEFINE_ALIAS -from ..fluid.layers import elementwise_mod #DEFINE_ALIAS -from ..fluid.layers import elementwise_mul #DEFINE_ALIAS -from ..fluid.layers import elementwise_pow #DEFINE_ALIAS -from ..fluid.layers import elementwise_sub #DEFINE_ALIAS +# from ..fluid.layers import elementwise_add #DEFINE_ALIAS +# from ..fluid.layers import elementwise_div #DEFINE_ALIAS +# from ..fluid.layers import elementwise_floordiv #DEFINE_ALIAS +# from ..fluid.layers import elementwise_mod #DEFINE_ALIAS +# from ..fluid.layers import elementwise_mul #DEFINE_ALIAS +# from ..fluid.layers import elementwise_pow #DEFINE_ALIAS +# from ..fluid.layers import elementwise_sub #DEFINE_ALIAS from ..fluid.layers import exp #DEFINE_ALIAS from ..fluid.layers import floor #DEFINE_ALIAS from ..fluid.layers import log #DEFINE_ALIAS from ..fluid.layers import reciprocal #DEFINE_ALIAS -from ..fluid.layers import reduce_max #DEFINE_ALIAS -from ..fluid.layers import reduce_min #DEFINE_ALIAS -from ..fluid.layers import reduce_prod #DEFINE_ALIAS -from ..fluid.layers import reduce_sum #DEFINE_ALIAS +# from ..fluid.layers import reduce_max #DEFINE_ALIAS +# from ..fluid.layers import reduce_min #DEFINE_ALIAS +# from ..fluid.layers import reduce_prod #DEFINE_ALIAS +# from ..fluid.layers import reduce_sum #DEFINE_ALIAS from ..fluid.layers import round #DEFINE_ALIAS from ..fluid.layers import rsqrt #DEFINE_ALIAS from ..fluid.layers import scale #DEFINE_ALIAS @@ -60,9 +60,7 @@ from ..fluid.layers import sqrt #DEFINE_ALIAS from ..fluid.layers import sin #DEFINE_ALIAS -from ..fluid.layers import increment #DEFINE_ALIAS from ..fluid.layers import multiplex #DEFINE_ALIAS -from ..fluid.layers import sums #DEFINE_ALIAS from ..fluid import layers @@ -75,12 +73,6 @@ 'cos', 'cosh', 'cumsum', - 'elementwise_add', - 'elementwise_div', - 'elementwise_floordiv', - 'elementwise_mod', - 'elementwise_pow', - 'elementwise_sub', 'exp', 'floor', 'increment', @@ -91,10 +83,6 @@ 'pow', 'prod', 'reciprocal', - 'reduce_max', - 'reduce_min', - 'reduce_prod', - 'reduce_sum', 'round', 'rsqrt', 'scale', @@ -105,9 +93,8 @@ 'square', 'stanh', 'sum', - 'sums', 'tanh', - 'elementwise_sum', + 'add_n', 'max', 'maximum', 'min', @@ -183,7 +170,7 @@ def pow(x, y, name=None): print(res.numpy()) # [1 4 9] # example 2: y is a Tensor - y = paddle.fill_constant(shape=[1], value=2, dtype='float32') + y = paddle.full(shape=[1], fill_value=2, dtype='float32') res = paddle.pow(x, y) print(res.numpy()) # [1 4 9] @@ -728,11 +715,8 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): @templatedoc(op_type="sum") -def elementwise_sum(inputs, name=None): +def add_n(inputs, name=None): """ - :alias_main: paddle.elementwise_sum - :alias: paddle.elementwise_sum,paddle.tensor.elementwise_sum,paddle.tensor.math.elementwise_sum - ${comment} Case 1: @@ -766,53 +750,40 @@ def elementwise_sum(inputs, name=None): [14, 16, 18]] Args: - 
inputs (Variable|list(Variable)): A Varaible list. The shape and data type of the list elementsshould be consistent. - Variable can be multi-dimensional Tensoror LoDTensor, and data types can be: float32, float64, int32, int64. + inputs (Tensor|list(Tensor)): A Tensor list. The shape and data type of the list elements should be consistent. + Input can be multi-dimensional Tensor, and data types can be: float32, float64, int32, int64. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: the sum of input :math:`inputs` . its shape and data types are consistent with :math:`inputs` . + Tensor, the sum of input :math:`inputs` , its shape and data types are consistent with :math:`inputs`. Examples: .. code-block:: python import paddle - import paddle.fluid as fluid - input0 = fluid.layers.fill_constant(shape=[2, 3], dtype='int64', value=5) - input1 = fluid.layers.fill_constant(shape=[2, 3], dtype='int64', value=3) - sum = paddle.elementwise_sum([input0, input1]) - - # You can print out 'sum' via executor. - out = fluid.layers.Print(sum, message="the sum of input0 and input1: ") - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_main_program()) - - # The printed result is: - # 1570701754 the sum of input0 and input1: The place is:CPUPlace - # Tensor[elementwise_sum_0.tmp_0] - # shape: [2,3,] - # dtype: l - # data: 8,8,8,8,8,8, - - # the sum of input0 and input1 is 2-D Tensor with shape [2,3]. - # dtype is the corresponding C++ data type, which may vary in different environments. - # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, - # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, - # and '__int64' on Windows. They both represent 64-bit integer variables. + input0 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype='float32') + input1 = paddle.to_tensor([[7, 8, 9], [10, 11, 12]], dtype='float32') + output = paddle.add_n([input0, input1]) + # [[8., 10., 12.], + # [14., 16., 18.]] """ + if in_dygraph_mode(): + if isinstance(inputs, Variable): + inputs = [inputs] + return core.ops.sum(inputs, 'use_mkldnn', False) - helper = LayerHelper('elementwise_sum', **locals()) - check_type(inputs, 'inputs', (Variable, tuple, list), 'elementwise_sum') + helper = LayerHelper('add_n', **locals()) + check_type(inputs, 'inputs', (Variable, tuple, list), 'add_n') if isinstance(inputs, list) or isinstance(inputs, tuple): if len(inputs) > 0: for input in inputs: check_variable_and_dtype(input, "inputs", \ - ['float32', 'float64', 'int32', 'int64'], 'elementwise_sum') + ['float32', 'float64', 'int32', 'int64'], 'add_n') else: check_variable_and_dtype(inputs, "inputs", \ - ['float32', 'float64', 'int32', 'int64'], 'elementwise_sum') + ['float32', 'float64', 'int32', 'int64'], 'add_n') out = helper.create_variable_for_type_inference( @@ -1308,33 +1279,25 @@ def min(x, axis=None, keepdim=False, name=None): def log1p(x, name=None): """ - :alias_main: paddle.log1p - :alias: paddle.log1p,paddle.tensor.log1p,paddle.tensor.math.log1p - Calculates the natural log of the given input tensor, element-wise. .. math:: Out = \\ln(x+1) + Args: - x (Variable): Input LoDTensor or Tensor. Must be one of the following types: float32, float64. + x (Tensor): Input Tensor. Must be one of the following types: float32, float64. name(str, optional): The default value is None. 
Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: The natural log of the input LoDTensor or Tensor computed element-wise. + Tensor, the natural log of the input Tensor computed element-wise. Examples: .. code-block:: python + import paddle - import paddle.fluid as fluid - import numpy as np - # Graph Organizing - x = fluid.data(name="x", shape=[2,1], dtype="float32") - res = paddle.log1p(x) - # Create an executor using CPU as an example - exe = fluid.Executor(fluid.CPUPlace()) - # Execute - x_i = np.array([[0], [1]]).astype(np.float32) - res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res]) - print(res_val) # [[0.], [0.6931472]] + + data = paddle.to_tensor([[0], [1]], dtype='float32') + res = paddle.log1p(data) + # [[0.], [0.6931472]] """ if in_dygraph_mode(): @@ -1351,32 +1314,34 @@ def log1p(x, name=None): def addcmul(input, tensor1, tensor2, value=1.0, name=None): """ - :alias_main: paddle.addcmul - :alias: paddle.addcmul,paddle.tensor.addcmul,paddle.tensor.math.addcmul Calculate the element-wise multiplication of tensor1 and tensor2, then multiply the result by value, and add it to input. The shape of input, tensor1, tensor2 should be broadcastable. The equation is: .. math:: + out = input + value * tensor1 * tensor2 Args: - input(Variable): The input to be added. A Tensor with type float32, float64, int32, int64. - tensor1(Variable): The tensor to be multiplied. A Tensor with type float32, float64, int32, int64. - tensor2(Variable): The tensor to be multiplied. A Tensor with type float32, float64, int32, int64. + input(Tensor): The input to be added. A Tensor with type float32, float64, int32, int64. + tensor1(Tensor): The tensor to be multiplied. A Tensor with type float32, float64, int32, int64. + tensor2(Tensor): The tensor to be multiplied. A Tensor with type float32, float64, int32, int64. value(int|float): The multiplier for tensor1*tensor2. For float32 and float64 type input, value must be float, otherwise an integer. name(str, Optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - out(Variable): The output result. A Tensor with the same data type as input's. + out(Tensor): The output result. A Tensor with the same data type as input's. Examples: .. code-block:: python + import paddle - import paddle.fluid as fluid - input = fluid.data(name='input', dtype='float32', shape=[3, 4]) - tensor1 = fluid.data(name='tenosr1', dtype='float32', shape=[1, 4]) - tensor2 = fluid.data(name='tensor2', dtype='float32', shape=[3, 4]) - data = paddle.addcmul(input, tensor1, tensor2, value=1.0) + input = paddle.ones([2,2]) + tensor1 = paddle.ones([2,2]) + tensor2 = paddle.ones([2,2]) + out = paddle.tensor.math.addcmul(input, tensor1, tensor2, value=0.5) + print(out.numpy()) + # [[1.5 1.5] + # [1.5 1.5]] """ check_variable_and_dtype(input, 'input', ['float32', 'float64', 'int32', 'int64'], 'addcmul') @@ -1932,3 +1897,39 @@ def tanh(x, name=None): out = helper.create_variable_for_type_inference(x.dtype) helper.append_op(type='tanh', inputs={'X': x}, outputs={'Out': out}) return out + +def increment(x, value=1.0, name=None): + """ + The OP is usually used for control flow to increment the data of :attr:`x` by an amount :attr:`value`. + Notice that the number of elements in :attr:`x` must be equal to 1. 
+ + Args: + x (Tensor): A tensor that must always contain only one element, its data type supports float32, float64, int32 and int64. + value(float, optional): The amount to increment the data of :attr:`x`. Default: 1.0. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, the elementwise-incremented tensor with the same shape and data type as :attr:`x`. + + Examples: + .. code-block:: python + + import paddle + + data = paddle.zeros(shape=[1], dtype='float32') + counter = paddle.increment(data) + # [1.] + + """ + if in_dygraph_mode(): + return core.ops.increment(x, 'step', value) + + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], + 'increment') + helper = LayerHelper("increment", **locals()) + helper.append_op( + type='increment', + inputs={'X': [x]}, + outputs={'Out': [x]}, + attrs={'step': float(value)}) + return x diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index a46946cea868a..eb9750bcc3957 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -57,19 +57,19 @@ def bernoulli(x, name=None): Examples: .. code-block:: python - import paddle - - paddle.disable_static() + import paddle - x = paddle.rand([2, 3]) - print(x.numpy()) - # [[0.11272584 0.3890902 0.7730957 ] - # [0.10351662 0.8510418 0.63806665]] + paddle.manual_seed(100) # on CPU device + x = paddle.rand([2,3]) + print(x.numpy()) + # [[0.5535528 0.20714243 0.01162981] + # [0.51577556 0.36369765 0.2609165 ]] - out = paddle.bernoulli(x) - print(out.numpy()) - # [[0. 0. 1.] - # [0. 0. 1.]] + paddle.manual_seed(200) # on CPU device + out = paddle.bernoulli(x) + print(out.numpy()) + # [[0. 0. 0.] + # [1. 1. 0.]] """ @@ -108,28 +108,29 @@ def multinomial(x, num_samples=1, replacement=False, name=None): Examples: .. code-block:: python - import paddle - - paddle.disable_static() + import paddle - x = paddle.rand([2,4]) - print(x.numpy()) - # [[0.7713825 0.4055941 0.433339 0.70706886] - # [0.9223313 0.8519825 0.04574518 0.16560672]] + paddle.manual_seed(100) # on CPU device + x = paddle.rand([2,4]) + print(x.numpy()) + # [[0.5535528 0.20714243 0.01162981 0.51577556] + # [0.36369765 0.2609165 0.18905126 0.5621971 ]] - out1 = paddle.multinomial(x, num_samples=5, replacement=True) - print(out1.numpy()) - # [[3 3 1 1 0] - # [0 0 0 0 1]] + paddle.manual_seed(200) # on CPU device + out1 = paddle.multinomial(x, num_samples=5, replacement=True) + print(out1.numpy()) + # [[3 3 0 0 0] + # [3 3 3 1 0]] - # out2 = paddle.multinomial(x, num_samples=5) - # OutOfRangeError: When replacement is False, number of samples - # should be less than non-zero categories + # out2 = paddle.multinomial(x, num_samples=5) + # InvalidArgumentError: When replacement is False, number of samples + # should be less than non-zero categories - out3 = paddle.multinomial(x, num_samples=3) - print(out3.numpy()) - # [[0 2 3] - # [0 1 3]] + paddle.manual_seed(300) # on CPU device + out3 = paddle.multinomial(x, num_samples=3) + print(out3.numpy()) + # [[3 0 1] + # [3 1 0]] """ @@ -431,8 +432,8 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): # example 2: # attr shape is a list which contains Tensor. 
- dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) + dim_1 = paddle.full([1], 2, "int64") + dim_2 = paddle.full([1], 3, "int32") result_2 = paddle.tensor.random.uniform(shape=[dim_1, dim_2]) # [[-0.9951253, 0.30757582, 0.9899647 ], # [ 0.5864527, 0.6607096, -0.8886161 ]] diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 19d8fc58b0e7e..7adf1b7cc4bd0 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -18,15 +18,13 @@ from ..fluid import core, layers # TODO: define searching & indexing functions of a tensor -from ..fluid.layers import has_inf #DEFINE_ALIAS -from ..fluid.layers import has_nan #DEFINE_ALIAS +# from ..fluid.layers import has_inf #DEFINE_ALIAS +# from ..fluid.layers import has_nan #DEFINE_ALIAS __all__ = [ 'argmax', 'argmin', 'argsort', - 'has_inf', - 'has_nan', 'masked_select', 'topk', 'where', @@ -570,9 +568,6 @@ def where(condition, x, y, name=None): def index_sample(x, index): """ - :alias_main: paddle.index_sample - :alias: paddle.index_sample,paddle.tensor.index_sample,paddle.tensor.search.index_sample - **IndexSample Layer** IndexSample OP returns the element of the specified location of X, @@ -595,13 +590,13 @@ def index_sample(x, index): [6, 8, 10]] Args: - x (Variable): The source input tensor with 2-D shape. Supported data type is + x (Tensor): The source input tensor with 2-D shape. Supported data type is int32, int64, float32, float64. - index (Variable): The index input tensor with 2-D shape, first dimension should be same with X. + index (Tensor): The index input tensor with 2-D shape, first dimension should be same with X. Data type is int32 or int64. Returns: - output (Variable): The output is a tensor with the same shape as index. + output (Tensor): The output is a tensor with the same shape as index. Examples: @@ -609,7 +604,6 @@ def index_sample(x, index): import paddle - paddle.disable_static() x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]], dtype='float32') @@ -644,8 +638,10 @@ def index_sample(x, index): # [ 800 700] # [1200 1100]] - """ + if in_dygraph_mode(): + return core.ops.index_sample(x, index) + helper = LayerHelper("index_sample", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], 'paddle.tensor.search.index_sample') diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index d56dff5a81018..24f62bfcd8d46 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -13,9 +13,8 @@ # limitations under the License. # TODO: define statistical functions of a tensor -from ..fluid.layers import reduce_mean #DEFINE_ALIAS -__all__ = ['mean', 'reduce_mean', 'std', 'var', 'numel'] +__all__ = ['mean', 'std', 'var', 'numel'] import numpy as np from ..fluid.framework import Variable diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py new file mode 100644 index 0000000000000..0da110146a8e0 --- /dev/null +++ b/python/paddle/tensor/to_string.py @@ -0,0 +1,213 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +from paddle.fluid.layers import core +from paddle.fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype + +__all__ = ['set_printoptions'] + + +class PrintOptions(object): + precision = 8 + threshold = 1000 + edgeitems = 3 + linewidth = 80 + sci_mode = False + + +DEFAULT_PRINT_OPTIONS = PrintOptions() + + +def set_printoptions(precision=None, + threshold=None, + edgeitems=None, + sci_mode=None): + """Set the printing options for Tensor. + NOTE: The function is similar with numpy.set_printoptions() + + Args: + precision (int, optional): Number of digits of the floating number, default 8. + threshold (int, optional): Total number of elements printed, default 1000. + edgeitems (int, optional): Number of elements in summary at the begining and end of each dimension, defalt 3. + sci_mode (bool, optional): Format the floating number with scientific notation or not, default False. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle + + paddle.manual_seed(10) + a = paddle.rand([10, 20]) + paddle.set_printoptions(4, 100, 3) + print(a) + + ''' + Tensor: dygraph_tmp_0 + - place: CPUPlace + - shape: [10, 20] + - layout: NCHW + - dtype: float32 + - data: [[0.2727, 0.5489, 0.8655, ..., 0.2916, 0.8525, 0.9000], + [0.3806, 0.8996, 0.0928, ..., 0.9535, 0.8378, 0.6409], + [0.1484, 0.4038, 0.8294, ..., 0.0148, 0.6520, 0.4250], + ..., + [0.3426, 0.1909, 0.7240, ..., 0.4218, 0.2676, 0.5679], + [0.5561, 0.2081, 0.0676, ..., 0.9778, 0.3302, 0.9559], + [0.2665, 0.8483, 0.5389, ..., 0.4956, 0.6862, 0.9178]] + ''' + """ + kwargs = {} + + if precision is not None: + check_type(precision, 'precision', (int), 'set_printoptions') + DEFAULT_PRINT_OPTIONS.precision = precision + kwargs['precision'] = precision + if threshold is not None: + check_type(threshold, 'threshold', (int), 'set_printoptions') + DEFAULT_PRINT_OPTIONS.threshold = threshold + kwargs['threshold'] = threshold + if edgeitems is not None: + check_type(edgeitems, 'edgeitems', (int), 'set_printoptions') + DEFAULT_PRINT_OPTIONS.edgeitems = edgeitems + kwargs['edgeitems'] = edgeitems + if sci_mode is not None: + check_type(sci_mode, 'sci_mode', (bool), 'set_printoptions') + DEFAULT_PRINT_OPTIONS.sci_mode = sci_mode + kwargs['sci_mode'] = sci_mode + #TODO(zhiqiu): support linewidth + core.set_printoptions(**kwargs) + + +def _to_sumary(var): + edgeitems = DEFAULT_PRINT_OPTIONS.edgeitems + + if len(var.shape) == 0: + return var + elif len(var.shape) == 1: + if var.shape[0] > 2 * edgeitems: + return paddle.concat([var[:edgeitems], var[-edgeitems:]]) + else: + return var + else: + # recursively handle all dimensions + if var.shape[0] > 2 * edgeitems: + begin = [x for x in var[:edgeitems]] + end = [x for x in var[-edgeitems:]] + return paddle.stack([_to_sumary(x) for x in (begin + end)]) + else: + return paddle.stack([_to_sumary(x) for x in var]) + + +def _format_item(np_var, max_width=0): + if np_var.dtype == np.float32 or np_var.dtype == np.float64 or np_var.dtype == np.float16: + if DEFAULT_PRINT_OPTIONS.sci_mode: + item_str = '{{:.{}e}}'.format( + 
DEFAULT_PRINT_OPTIONS.precision).format(np_var) + elif np.ceil(np_var) == np_var: + item_str = '{:.0f}.'.format(np_var) + else: + item_str = '{{:.{}f}}'.format( + DEFAULT_PRINT_OPTIONS.precision).format(np_var) + else: + item_str = '{}'.format(np_var) + + if max_width > len(item_str): + return '{indent}{data}'.format( + indent=(max_width - len(item_str)) * ' ', data=item_str) + else: + return item_str + + +def _get_max_width(var): + max_width = 0 + for item in list(var.numpy().flatten()): + item_str = _format_item(item) + max_width = max(max_width, len(item_str)) + return max_width + + +def _format_tensor(var, sumary, indent=0): + edgeitems = DEFAULT_PRINT_OPTIONS.edgeitems + max_width = _get_max_width(_to_sumary(var)) + + if len(var.shape) == 0: + # currently, shape = [], i.e., scaler tensor is not supported. + # If it is supported, it should be formatted like this. + return _format_item(var.numpy().item(0), max_width) + elif len(var.shape) == 1: + if sumary and var.shape[0] > 2 * edgeitems: + items = [ + _format_item(item, max_width) + for item in list(var.numpy())[:DEFAULT_PRINT_OPTIONS.edgeitems] + ] + ['...'] + [ + _format_item(item, max_width) + for item in list(var.numpy())[-DEFAULT_PRINT_OPTIONS.edgeitems:] + ] + else: + items = [ + _format_item(item, max_width) for item in list(var.numpy()) + ] + + s = ', '.join(items) + return '[' + s + ']' + else: + # recursively handle all dimensions + if sumary and var.shape[0] > 2 * edgeitems: + vars = [ + _format_tensor(x, sumary, indent + 1) for x in var[:edgeitems] + ] + ['...'] + [ + _format_tensor(x, sumary, indent + 1) for x in var[-edgeitems:] + ] + else: + vars = [_format_tensor(x, sumary, indent + 1) for x in var] + + return '[' + (',' + '\n' * (len(var.shape) - 1) + ' ' * + (indent + 1)).join(vars) + ']' + + +def to_string(var, prefix='Tensor'): + indent = len(prefix) + 1 + + _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" + + tensor = var.value().get_tensor() + if not tensor._is_initialized(): + return "Tensor(Not initialized)" + + if len(var.shape) == 0: + size = 0 + else: + size = 1 + for dim in var.shape: + size *= dim + + sumary = False + if size > DEFAULT_PRINT_OPTIONS.threshold: + sumary = True + + data = _format_tensor(var, sumary, indent=indent) + + return _template.format( + prefix=prefix, + shape=var.shape, + dtype=convert_dtype(var.dtype), + place=var._place_str, + stop_gradient=var.stop_gradient, + indent=' ' * indent, + data=data) diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index e1bc65a5d15c2..9f64a6d2b7b67 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -39,3 +39,5 @@ foreach(src ${DIST_TEST_OPS}) message(STATUS ${src}) py_dist_test(${src} SRCS ${src}.py) endforeach() + +set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 600) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 56105b6d7f15a..bcb910a5ada8a 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -25,7 +25,7 @@ import paddle from paddle import fluid from paddle import to_tensor -from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax +from paddle.nn import Conv2d, Linear, ReLU, Sequential, Softmax from paddle import Model from paddle.static import InputSpec @@ -33,7 +33,7 @@ from paddle.metric import Accuracy from paddle.vision.datasets import MNIST from paddle.vision.models import LeNet -from paddle.io 
import DistributedBatchSampler +from paddle.io import DistributedBatchSampler, Dataset from paddle.hapi.model import prepare_distributed_context from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator @@ -47,11 +47,11 @@ def __init__(self, num_classes=10): Conv2d( 1, 6, 3, stride=1, padding=1), ReLU(), - Pool2D(2, 'max', 2), + paddle.fluid.dygraph.Pool2D(2, 'max', 2), Conv2d( 6, 16, 5, stride=1, padding=0), ReLU(), - Pool2D(2, 'max', 2)) + paddle.fluid.dygraph.Pool2D(2, 'max', 2)) if num_classes > 0: self.fc = Sequential( @@ -295,6 +295,15 @@ def forward(self, x): return y +class MyDataset(Dataset): + def __getitem__(self, idx): + return np.random.random(size=(20,)).astype(np.float32), \ + np.random.randint(0, 10, size=(1,)).astype(np.int64) + + def __len__(self): + return 40 + + class TestModelFunction(unittest.TestCase): def set_seed(self, seed=1024): paddle.manual_seed(seed) @@ -599,6 +608,56 @@ def test_dygraph_export_deploy_model_about_inputs(self): shutil.rmtree(save_dir) +class TestModelWithLRScheduler(unittest.TestCase): + def test_fit(self): + def make_optimizer(parameters=None): + base_lr = 1e-3 + momentum = 0.9 + weight_decay = 5e-4 + boundaries = [5, 8] + values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)] + learning_rate = paddle.optimizer.lr.PiecewiseDecay( + boundaries=boundaries, values=values) + learning_rate = paddle.optimizer.lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=4, + start_lr=base_lr / 5., + end_lr=base_lr, + verbose=True) + optimizer = paddle.optimizer.Momentum( + learning_rate=learning_rate, + weight_decay=weight_decay, + momentum=momentum, + parameters=parameters) + return optimizer + + # dynamic test + device = paddle.set_device('cpu') + fluid.enable_dygraph(device) + net = MyModel() + inputs = [InputSpec([None, 20], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] + optim = make_optimizer(net.parameters()) + model = Model(net, inputs, labels) + model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) + + dataset = MyDataset() + model.fit(dataset, dataset, batch_size=4, epochs=10, num_workers=0) + + # static test + paddle.enable_static() + + net = MyModel() + inputs = [InputSpec([None, 20], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] + optim = make_optimizer(net.parameters()) + model = Model(net, inputs, labels) + model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) + + dataset = MyDataset() + model.fit(dataset, dataset, batch_size=4, epochs=10, num_workers=0) + + class TestRaiseError(unittest.TestCase): def test_input_without_name(self): net = MyModel() diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index bf9c2a2ae0611..a36dd75549a9e 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -33,7 +33,7 @@ def infer(self, arch): if not dygraph: paddle.enable_static() - net = models.__dict__[arch]() + net = models.__dict__[arch](pretrained=True) inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] model = paddle.Model(network=net, inputs=inputs) model.prepare() @@ -52,7 +52,7 @@ def infer(self, arch): np.testing.assert_allclose(res['dygraph'], res['static']) def test_models(self): - arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18'] + arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18', 'vgg16'] for arch in arches: self.infer(arch) diff --git 
a/python/paddle/tests/test_text.py b/python/paddle/tests/test_text.py deleted file mode 100644 index fa83b0cc6f340..0000000000000 --- a/python/paddle/tests/test_text.py +++ /dev/null @@ -1,696 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import unittest -import random - -import numpy as np - -import paddle -import paddle.fluid as fluid -from paddle.fluid.dygraph import Embedding, Linear, Layer -from paddle.fluid.layers import BeamSearchDecoder -from paddle import Model, set_device -from paddle.static import InputSpec as Input -from paddle.text import * - -paddle.enable_static() - - -class ModuleApiTest(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls._np_rand_state = np.random.get_state() - cls._py_rand_state = random.getstate() - cls._random_seed = 123 - np.random.seed(cls._random_seed) - random.seed(cls._random_seed) - - cls.model_cls = type(cls.__name__ + "Model", (Layer, ), { - "__init__": cls.model_init_wrapper(cls.model_init), - "forward": cls.model_forward - }) - - @classmethod - def tearDownClass(cls): - np.random.set_state(cls._np_rand_state) - random.setstate(cls._py_rand_state) - - @staticmethod - def model_init_wrapper(func): - def __impl__(self, *args, **kwargs): - Layer.__init__(self) - func(self, *args, **kwargs) - - return __impl__ - - @staticmethod - def model_init(model, *args, **kwargs): - raise NotImplementedError( - "model_init acts as `Model.__init__`, thus must implement it") - - @staticmethod - def model_forward(model, *args, **kwargs): - return model.module(*args, **kwargs) - - def make_inputs(self): - # TODO(guosheng): add default from `self.inputs` - raise NotImplementedError( - "model_inputs makes inputs for model, thus must implement it") - - def setUp(self): - """ - For the model which wraps the module to be tested: - Set input data by `self.inputs` list - Set init argument values by `self.attrs` list/dict - Set model parameter values by `self.param_states` dict - Set expected output data by `self.outputs` list - We can create a model instance and run once with these. 
- """ - self.inputs = [] - self.attrs = {} - self.param_states = {} - self.outputs = [] - - def _calc_output(self, place, mode="test", dygraph=True): - if dygraph: - fluid.enable_dygraph(place) - else: - fluid.disable_dygraph() - gen = paddle.manual_seed(self._random_seed) - gen._is_init_py = False - paddle.framework.random._manual_program_seed(self._random_seed) - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - layer = self.model_cls(**self.attrs) if isinstance( - self.attrs, dict) else self.model_cls(*self.attrs) - model = Model(layer, inputs=self.make_inputs()) - model.prepare() - if self.param_states: - model.load(self.param_states, optim_state=None) - return model.test_batch(self.inputs) - - def check_output_with_place(self, place, mode="test"): - dygraph_output = self._calc_output(place, mode, dygraph=True) - stgraph_output = self._calc_output(place, mode, dygraph=False) - expect_output = getattr(self, "outputs", None) - for actual_t, expect_t in zip(dygraph_output, stgraph_output): - self.assertTrue(np.allclose(actual_t, expect_t, rtol=1e-5, atol=0)) - if expect_output: - for actual_t, expect_t in zip(dygraph_output, expect_output): - self.assertTrue( - np.allclose( - actual_t, expect_t, rtol=1e-5, atol=0)) - - def check_output(self): - devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"] - for device in devices: - place = set_device(device) - self.check_output_with_place(place) - - -class TestBasicLSTM(ModuleApiTest): - def setUp(self): - # TODO(guosheng): Change to big size. Currently bigger hidden size for - # LSTM would fail, the second static graph run might get diff output - # with others. - shape = (2, 4, 16) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 16, "hidden_size": 16} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size): - model.lstm = RNN(BasicLSTMCell( - input_size, - hidden_size, )) - - @staticmethod - def model_forward(model, inputs): - return model.lstm(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestBasicGRU(ModuleApiTest): - def setUp(self): - shape = (2, 4, 128) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 128, "hidden_size": 128} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size): - model.gru = RNN(BasicGRUCell(input_size, hidden_size)) - - @staticmethod - def model_forward(model, inputs): - return model.gru(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestBeamSearch(ModuleApiTest): - def setUp(self): - shape = (8, 32) - self.inputs = [ - np.random.random(shape).astype("float32"), - np.random.random(shape).astype("float32") - ] - self.outputs = None - self.attrs = { - "vocab_size": 100, - "embed_dim": 32, - "hidden_size": 32, - } - self.param_states = {} - - @staticmethod - def model_init(self, - vocab_size, - embed_dim, - hidden_size, - bos_id=0, - eos_id=1, - beam_size=4, - max_step_num=20): - embedder = Embedding(size=[vocab_size, embed_dim]) - output_layer = Linear(hidden_size, vocab_size) - cell = BasicLSTMCell(embed_dim, hidden_size) - decoder = BeamSearchDecoder( - cell, - start_token=bos_id, 
- end_token=eos_id, - beam_size=beam_size, - embedding_fn=embedder, - output_fn=output_layer) - self.beam_search_decoder = DynamicDecode( - decoder, max_step_num=max_step_num, is_test=True) - - @staticmethod - def model_forward(model, init_hidden, init_cell): - return model.beam_search_decoder([init_hidden, init_cell])[0] - - def make_inputs(self): - inputs = [ - Input([None, self.inputs[0].shape[-1]], "float32", "init_hidden"), - Input([None, self.inputs[1].shape[-1]], "float32", "init_cell"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestTransformerEncoder(ModuleApiTest): - def setUp(self): - self.inputs = [ - # encoder input: [batch_size, seq_len, hidden_size] - np.random.random([2, 4, 512]).astype("float32"), - # self attention bias: [batch_size, n_head, seq_len, seq_len] - np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9 - ] - self.outputs = None - self.attrs = { - "n_layer": 2, - "n_head": 8, - "d_key": 64, - "d_value": 64, - "d_model": 512, - "d_inner_hid": 1024 - } - self.param_states = {} - - @staticmethod - def model_init(model, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - model.encoder = TransformerEncoder( - n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd, ffn_fc1_act) - - @staticmethod - def model_forward(model, enc_input, attn_bias): - return model.encoder(enc_input, attn_bias) - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[0].shape[-1]], "float32", - "enc_input"), - Input([None, self.inputs[1].shape[1], None, None], "float32", - "attn_bias"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestTransformerDecoder(TestTransformerEncoder): - def setUp(self): - self.inputs = [ - # decoder input: [batch_size, seq_len, hidden_size] - np.random.random([2, 4, 512]).astype("float32"), - # encoder output: [batch_size, seq_len, hidden_size] - np.random.random([2, 5, 512]).astype("float32"), - # self attention bias: [batch_size, n_head, seq_len, seq_len] - np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9, - # cross attention bias: [batch_size, n_head, seq_len, seq_len] - np.random.randint(0, 1, [2, 8, 4, 5]).astype("float32") * -1e9 - ] - self.outputs = None - self.attrs = { - "n_layer": 2, - "n_head": 8, - "d_key": 64, - "d_value": 64, - "d_model": 512, - "d_inner_hid": 1024 - } - self.param_states = {} - - @staticmethod - def model_init(model, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da"): - model.decoder = TransformerDecoder( - n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd) - - @staticmethod - def model_forward(model, - dec_input, - enc_output, - self_attn_bias, - cross_attn_bias, - caches=None): - return model.decoder(dec_input, enc_output, self_attn_bias, - cross_attn_bias, caches) - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[0].shape[-1]], "float32", - "dec_input"), - Input([None, None, self.inputs[0].shape[-1]], "float32", - "enc_output"), - Input([None, self.inputs[-1].shape[1], None, None], "float32", - 
"self_attn_bias"), - Input([None, self.inputs[-1].shape[1], None, None], "float32", - "cross_attn_bias"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestTransformerBeamSearchDecoder(ModuleApiTest): - def setUp(self): - self.inputs = [ - # encoder output: [batch_size, seq_len, hidden_size] - np.random.random([2, 5, 128]).astype("float32"), - # cross attention bias: [batch_size, n_head, seq_len, seq_len] - np.random.randint(0, 1, [2, 2, 1, 5]).astype("float32") * -1e9 - ] - self.outputs = None - self.attrs = { - "vocab_size": 100, - "n_layer": 2, - "n_head": 2, - "d_key": 64, - "d_value": 64, - "d_model": 128, - "d_inner_hid": 128 - } - self.param_states = {} - - @staticmethod - def model_init(model, - vocab_size, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - bos_id=0, - eos_id=1, - beam_size=4, - max_step_num=20): - model.beam_size = beam_size - - def embeder_init(self, size): - Layer.__init__(self) - self.embedder = Embedding(size) - - Embedder = type("Embedder", (Layer, ), { - "__init__": embeder_init, - "forward": lambda self, word, pos: self.embedder(word) - }) - embedder = Embedder(size=[vocab_size, d_model]) - output_layer = Linear(d_model, vocab_size) - model.decoder = TransformerDecoder( - n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd) - transformer_cell = TransformerCell(model.decoder, embedder, - output_layer) - model.beam_search_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, bos_id, eos_id, beam_size, - var_dim_in_state=2), - max_step_num, - is_test=True) - - @staticmethod - def model_forward(model, enc_output, trg_src_attn_bias): - caches = model.decoder.prepare_incremental_cache(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, model.beam_size) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, model.beam_size) - static_caches = model.decoder.prepare_static_cache(enc_output) - rs, _ = model.beam_search_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - return rs - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[0].shape[-1]], "float32", - "enc_output"), - Input([None, self.inputs[1].shape[1], None, None], "float32", - "trg_src_attn_bias"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestSequenceTagging(ModuleApiTest): - def setUp(self): - self.inputs = [ - np.random.randint(0, 100, (2, 8)).astype("int64"), - np.random.randint(1, 8, (2)).astype("int64"), - np.random.randint(0, 5, (2, 8)).astype("int64") - ] - self.outputs = None - self.attrs = {"vocab_size": 100, "num_labels": 5} - self.param_states = {} - - @staticmethod - def model_init(model, - vocab_size, - num_labels, - word_emb_dim=128, - grnn_hidden_dim=128, - emb_learning_rate=0.1, - crf_learning_rate=0.1, - bigru_num=2, - init_bound=0.1): - model.tagger = SequenceTagging(vocab_size, num_labels, word_emb_dim, - grnn_hidden_dim, emb_learning_rate, - crf_learning_rate, bigru_num, init_bound) - - @staticmethod - def model_forward(model, word, lengths, target=None): - return model.tagger(word, lengths, target) - - def make_inputs(self): - inputs = [ - Input([None, None], "int64", 
"word"), - Input([None], "int64", "lengths"), - Input([None, None], "int64", "target"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestSequenceTaggingInfer(TestSequenceTagging): - def setUp(self): - super(TestSequenceTaggingInfer, self).setUp() - self.inputs = self.inputs[:2] # remove target - - def make_inputs(self): - inputs = super(TestSequenceTaggingInfer, - self).make_inputs()[:2] # remove target - return inputs - - -class TestStackedRNN(ModuleApiTest): - def setUp(self): - shape = (2, 4, 16) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size, num_layers): - cells = [ - BasicLSTMCell(input_size, hidden_size), - BasicLSTMCell(hidden_size, hidden_size) - ] - stacked_cell = StackedRNNCell(cells) - model.lstm = RNN(stacked_cell) - - @staticmethod - def model_forward(self, inputs): - return self.lstm(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestLSTM(ModuleApiTest): - def setUp(self): - shape = (2, 4, 16) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size, num_layers): - model.lstm = LSTM(input_size, hidden_size, num_layers=num_layers) - - @staticmethod - def model_forward(model, inputs): - return model.lstm(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestBiLSTM(ModuleApiTest): - def setUp(self): - shape = (2, 4, 16) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, - input_size, - hidden_size, - num_layers, - merge_mode="concat", - merge_each_layer=False): - model.bilstm = BidirectionalLSTM( - input_size, - hidden_size, - num_layers=num_layers, - merge_mode=merge_mode, - merge_each_layer=merge_each_layer) - - @staticmethod - def model_forward(model, inputs): - return model.bilstm(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output_merge0(self): - self.check_output() - - def test_check_output_merge1(self): - self.attrs["merge_each_layer"] = True - self.check_output() - - -class TestGRU(ModuleApiTest): - def setUp(self): - shape = (2, 4, 64) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size, num_layers): - model.gru = GRU(input_size, hidden_size, num_layers=num_layers) - - @staticmethod - def model_forward(model, inputs): - return model.gru(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestBiGRU(ModuleApiTest): - def 
setUp(self): - shape = (2, 4, 64) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, - input_size, - hidden_size, - num_layers, - merge_mode="concat", - merge_each_layer=False): - model.bigru = BidirectionalGRU( - input_size, - hidden_size, - num_layers=num_layers, - merge_mode=merge_mode, - merge_each_layer=merge_each_layer) - - @staticmethod - def model_forward(model, inputs): - return model.bigru(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output_merge0(self): - self.check_output() - - def test_check_output_merge1(self): - self.attrs["merge_each_layer"] = True - self.check_output() - - -class TestCNNEncoder(ModuleApiTest): - def setUp(self): - shape = (2, 32, 8) # [N, C, H] - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"num_channels": 32, "num_filters": 64, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, num_channels, num_filters, num_layers): - model.cnn_encoder = CNNEncoder( - num_layers=2, - num_channels=num_channels, - num_filters=num_filters, - filter_size=[2, 3], - pool_size=[7, 6]) - - @staticmethod - def model_forward(model, inputs): - return model.cnn_encoder(inputs) - - def make_inputs(self): - inputs = [ - Input([None, self.inputs[-1].shape[1], None], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/text/__init__.py b/python/paddle/text/__init__.py index 083bfbd1d2528..b6f8ea6bcc7e4 100644 --- a/python/paddle/text/__init__.py +++ b/python/paddle/text/__init__.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import text -from .text import * - from . import datasets from .datasets import * -__all__ = text.__all__ \ - + datasets.__all__ +__all__ = datasets.__all__ diff --git a/python/paddle/text/text.py b/python/paddle/text/text.py deleted file mode 100644 index a0fa4791c5b1c..0000000000000 --- a/python/paddle/text/text.py +++ /dev/null @@ -1,3965 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import collections -import six -import sys -from functools import partial, reduce - -import numpy as np - -import paddle -import paddle.fluid as fluid -import paddle.fluid.layers.utils as utils -from paddle.fluid import layers -from paddle.fluid.layers import BeamSearchDecoder -from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as -from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D -from paddle.fluid.data_feeder import convert_dtype - -__all__ = [ - 'RNNCell', - 'BasicLSTMCell', - 'BasicGRUCell', - 'RNN', - 'BidirectionalRNN', - 'StackedRNNCell', - 'StackedLSTMCell', - 'LSTM', - 'BidirectionalLSTM', - 'StackedGRUCell', - 'GRU', - 'BidirectionalGRU', - 'DynamicDecode', - 'BeamSearchDecoder', - 'Conv1dPoolLayer', - 'CNNEncoder', - 'MultiHeadAttention', - 'FFN', - 'TransformerEncoderLayer', - 'TransformerEncoder', - 'TransformerDecoderLayer', - 'TransformerDecoder', - 'TransformerCell', - 'TransformerBeamSearchDecoder', - 'LinearChainCRF', - 'CRFDecoding', - 'SequenceTagging', -] - - -class RNNCell(Layer): - """ - RNNCell is the base class for abstraction representing the calculations - mapping the input and state to the output and new state. It is suitable to - and mostly used in RNN. - """ - - def get_initial_states(self, - batch_ref, - shape=None, - dtype=None, - init_value=0, - batch_dim_idx=0): - """ - Generate initialized states according to provided shape, data type and - value. - - Parameters: - batch_ref: A (possibly nested structure of) tensor variable[s]. - The first dimension of the tensor will be used as batch size to - initialize states. - shape: A (possibly nested structure of) shape[s], where a shape is - represented as a list/tuple of integer). -1(for batch size) will - beautomatically inserted if shape is not started with it. If None, - property `state_shape` will be used. The default value is None. - dtype: A (possibly nested structure of) data type[s]. The structure - must be same as that of `shape`, except when all tensors' in states - has the same data type, a single data type can be used. If None and - property `cell.state_shape` is not available, float32 will be used - as the data type. The default value is None. - init_value: A float value used to initialize states. - batch_dim_idx: An integer indicating which dimension of the tensor in - inputs represents batch size. The default value is 0. - - Returns: - Variable: tensor variable[s] packed in the same structure provided \ - by shape, representing the initialized states. 
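A rough NumPy sketch of what `get_initial_states` is described as producing: zero-filled tensors whose leading dimension is the batch size, arranged in the same nested structure as the given shapes. `zero_states` is a hypothetical helper for illustration only; it mirrors shapes and values, not the fluid graph-mode behaviour.

.. code-block:: python

    import numpy as np

    def zero_states(state_shape, batch_size, dtype="float32", init_value=0.0):
        # state_shape may be nested, e.g. [[64], [64]] for an LSTM (h, c);
        # the batch dimension is prepended, as the docstring above describes.
        if isinstance(state_shape, (list, tuple)) and state_shape and \
                isinstance(state_shape[0], (list, tuple)):
            return [zero_states(s, batch_size, dtype, init_value) for s in state_shape]
        return np.full([batch_size] + list(state_shape), init_value, dtype=dtype)

    h0, c0 = zero_states([[64], [64]], batch_size=2)  # two [2, 64] zero arrays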
- """ - # TODO: use inputs and batch_size - batch_ref = flatten(batch_ref)[0] - - def _is_shape_sequence(seq): - if sys.version_info < (3, ): - integer_types = ( - int, - long, ) - else: - integer_types = (int, ) - """For shape, list/tuple of integer is the finest-grained objection""" - if (isinstance(seq, list) or isinstance(seq, tuple)): - if reduce(lambda flag, x: isinstance(x, integer_types) and flag, - seq, True): - return False - # TODO: Add check for the illegal - if isinstance(seq, dict): - return True - return (isinstance(seq, collections.Sequence) and - not isinstance(seq, six.string_types)) - - class Shape(object): - def __init__(self, shape): - self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) - - # nested structure of shapes - states_shapes = self.state_shape if shape is None else shape - is_sequence_ori = utils.is_sequence - utils.is_sequence = _is_shape_sequence - states_shapes = map_structure(lambda shape: Shape(shape), states_shapes) - utils.is_sequence = is_sequence_ori - - # nested structure of dtypes - try: - states_dtypes = self.state_dtype if dtype is None else dtype - except NotImplementedError: # use fp32 as default - states_dtypes = "float32" - if len(flatten(states_dtypes)) == 1: - dtype = flatten(states_dtypes)[0] - states_dtypes = map_structure(lambda shape: dtype, states_shapes) - - init_states = map_structure( - lambda shape, dtype: fluid.layers.fill_constant_batch_size_like( - input=batch_ref, - shape=shape.shape, - dtype=dtype, - value=init_value, - input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) - return init_states - - @property - def state_shape(self): - """ - Abstract method (property). - Used to initialize states. - A (possiblely nested structure of) shape[s], where a shape is represented - as a list/tuple of integers (-1 for batch size would be automatically - inserted into a shape if shape is not started with it). - Not necessary to be implemented if states are not initialized by - `get_initial_states` or the `shape` argument is provided when using - `get_initial_states`. - """ - raise NotImplementedError( - "Please add implementaion for `state_shape` in the used cell.") - - @property - def state_dtype(self): - """ - Abstract method (property). - Used to initialize states. - A (possiblely nested structure of) data types[s]. The structure must be - same as that of `shape`, except when all tensors' in states has the same - data type, a signle data type can be used. - Not necessary to be implemented if states are not initialized - by `get_initial_states` or the `dtype` argument is provided when using - `get_initial_states`. - """ - raise NotImplementedError( - "Please add implementaion for `state_dtype` in the used cell.") - - -class BasicLSTMCell(RNNCell): - """ - Long-Short Term Memory(LSTM) RNN cell. - - The formula used is as follows: - - .. math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - Please refer to `An Empirical Exploration of Recurrent Network Architectures - `_ for more details. - - Parameters: - input_size (int): The input size in the LSTM cell. - hidden_size (int): The hidden size in the LSTM cell. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - weight matrix. Default: None. 
- bias_attr (ParamAttr, optional): The parameter attribute for the bias - of LSTM. Default: None. - gate_activation (function, optional): The activation function for gates - of LSTM, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - LSTM, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - forget_bias(float, optional): forget bias used when computing forget gate. - Default 1.0 - dtype(string, optional): The data type used in this cell. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import BasicLSTMCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = BasicLSTMCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation=None, - activation=None, - forget_bias=1.0, - dtype='float32'): - super(BasicLSTMCell, self).__init__() - - self._hidden_size = hidden_size - self._param_attr = param_attr - self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh - # TODO(guosheng): find better way to resolve constants in __init__ - self._forget_bias = layers.create_global_var( - shape=[1], dtype=dtype, value=forget_bias, persistable=True) - # TODO(guosheng): refine this if recurrent_op removes gradient require - self._forget_bias.stop_gradient = False - self._dtype = dtype - self._input_size = input_size - - self._weight = self.create_parameter( - attr=self._param_attr, - shape=[ - self._input_size + self._hidden_size, 4 * self._hidden_size - ], - dtype=self._dtype) - - self._bias = self.create_parameter( - attr=self._bias_attr, - shape=[4 * self._hidden_size], - dtype=self._dtype, - is_bias=True) - - def forward(self, inputs, states): - """ - Performs single step LSTM calculations. - - Parameters: - inputs (Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states (Variable): A list of containing two tensors, each shaped - `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` - in the formula. The data type should be float32 or float64. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula; `new_states` is a list containing \ - two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}, c_{t}` in the formula. The data type of these \ - tensors all is same as that of `states`. 
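The gate equations above translate almost line for line into NumPy. The sketch below is illustrative only: the weights are random stand-ins, the fused weight uses the same i/j/f/o split order as the `forward` implementation that follows, and sigmoid/tanh play the roles of :math:`act_g`/:math:`act_c`.

.. code-block:: python

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x, h_prev, c_prev, W, b, forget_bias=1.0):
        # x: [batch, input_size], h_prev/c_prev: [batch, hidden_size]
        # W: [input_size + hidden_size, 4 * hidden_size], b: [4 * hidden_size]
        gates = np.concatenate([x, h_prev], axis=1) @ W + b
        i, j, f, o = np.split(gates, 4, axis=1)           # same split order as forward()
        c = sigmoid(f + forget_bias) * c_prev + sigmoid(i) * np.tanh(j)
        h = np.tanh(c) * sigmoid(o)
        return h, c

    batch, input_size, hidden_size = 2, 32, 64
    rng = np.random.default_rng(0)
    W = rng.standard_normal((input_size + hidden_size, 4 * hidden_size)).astype("float32")
    b = np.zeros(4 * hidden_size, dtype="float32")
    x = rng.standard_normal((batch, input_size)).astype("float32")
    h0 = np.zeros((batch, hidden_size), dtype="float32")
    c0 = np.zeros((batch, hidden_size), dtype="float32")
    h1, c1 = lstm_step(x, h0, c0, W, b)                   # both [2, 64]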
- """ - pre_hidden, pre_cell = states - concat_input_hidden = layers.concat([inputs, pre_hidden], 1) - gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) - gate_input = layers.elementwise_add(gate_input, self._bias) - i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) - new_cell = layers.elementwise_add( - layers.elementwise_mul( - pre_cell, - self._gate_activation( - layers.elementwise_add(f, self._forget_bias))), - layers.elementwise_mul( - self._gate_activation(i), self._activation(j))) - new_hidden = self._activation(new_cell) * self._gate_activation(o) - - return new_hidden, [new_hidden, new_cell] - - @property - def state_shape(self): - """ - The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` - (-1 for batch size would be automatically inserted into shape). These two - shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. - """ - return [[self._hidden_size], [self._hidden_size]] - - -class BasicGRUCell(RNNCell): - """ - Gated Recurrent Unit (GRU) RNN cell. - - The formula for GRU used is as follows: - - .. math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - Please refer to `An Empirical Exploration of Recurrent Network Architectures - `_ for more details. - - Parameters: - input_size (int): The input size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - weight matrix. Default: None. - bias_attr (ParamAttr, optional): The parameter attribute for the bias - of LSTM. Default: None. - gate_activation (function, optional): The activation function for gates - of GRU, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - GRU, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - dtype(string, optional): The data type used in this cell. Default float32. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import BasicGRUCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = BasicGRUCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation=None, - activation=None, - dtype='float32'): - super(BasicGRUCell, self).__init__() - self._input_size = input_size - self._hidden_size = hidden_size - self._param_attr = param_attr - self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh - self._dtype = dtype - - if self._param_attr is not None and self._param_attr.name is not None: - gate_param_attr = copy.deepcopy(self._param_attr) - candidate_param_attr = copy.deepcopy(self._param_attr) - gate_param_attr.name += "_gate" - candidate_param_attr.name += "_candidate" - else: - gate_param_attr = self._param_attr - candidate_param_attr = self._param_attr - - self._gate_weight = self.create_parameter( - attr=gate_param_attr, - shape=[ - self._input_size + self._hidden_size, 2 * self._hidden_size - ], - dtype=self._dtype) - - self._candidate_weight = self.create_parameter( - attr=candidate_param_attr, - shape=[self._input_size + self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if self._bias_attr is not None and self._bias_attr.name is not None: - gate_bias_attr = copy.deepcopy(self._bias_attr) - candidate_bias_attr = copy.deepcopy(self._bias_attr) - gate_bias_attr.name += "_gate" - candidate_bias_attr.name += "_candidate" - else: - gate_bias_attr = self._bias_attr - candidate_bias_attr = self._bias_attr - - self._gate_bias = self.create_parameter( - attr=gate_bias_attr, - shape=[2 * self._hidden_size], - dtype=self._dtype, - is_bias=True) - self._candidate_bias = self.create_parameter( - attr=candidate_bias_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - def forward(self, inputs, states): - """ - Performs single step GRU calculations. - - Parameters: - inputs (Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states (Variable): A tensor with shape `[batch_size, hidden_size]`. - corresponding to :math:`h_{t-1}` in the formula. The data type - should be float32 or float64. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ - `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ - corresponding to :math:`h_t` in the formula. The data type of the \ - tensor is same as that of `states`. - """ - pre_hidden = states - concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1) - - gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight) - - gate_input = layers.elementwise_add(gate_input, self._gate_bias) - - gate_input = self._gate_activation(gate_input) - r, u = layers.split(gate_input, num_or_sections=2, dim=1) - - r_hidden = r * pre_hidden - - candidate = layers.matmul( - layers.concat([inputs, r_hidden], 1), self._candidate_weight) - candidate = layers.elementwise_add(candidate, self._candidate_bias) - - c = self._activation(candidate) - new_hidden = u * pre_hidden + (1 - u) * c - - return new_hidden, new_hidden - - @property - def state_shape(self): - """ - The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch - size would be automatically inserted into shape). 
The shape corresponds - to :math:`h_{t-1}`. - """ - return [self._hidden_size] - - -class RNN(Layer): - """ - RNN creates a recurrent neural network specified by RNNCell `cell`, which - performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. - - Parameters: - cell(RNNCell): An instance of `RNNCell`. - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import StackedLSTMCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = StackedLSTMCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, cell, is_reverse=False, time_major=False): - super(RNN, self).__init__() - self.cell = cell - if not hasattr(self.cell, "call"): - self.cell.call = self.cell.forward - self.is_reverse = is_reverse - self.time_major = time_major - self.batch_index, self.time_step_index = (1, 0) if time_major else (0, - 1) - - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): - """ - Performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. - - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states (Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - - Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. 
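The unrolling performed by `RNN` can be pictured as a plain loop over the time dimension. This is a simplified sketch with a toy cell and batch-major input; `sequence_length` masking and reversal are omitted, and the real forward also dispatches to `fluid.layers.rnn` outside dygraph mode.

.. code-block:: python

    import numpy as np

    def unroll(cell, inputs, initial_state):
        # inputs: [batch, time, feature] (batch-major, i.e. time_major=False)
        outputs, state = [], initial_state
        for t in range(inputs.shape[1]):
            out, state = cell(inputs[:, t, :], state)
            outputs.append(out)
        return np.stack(outputs, axis=1), state   # [batch, time, hidden], final state

    # a toy tanh cell, just to exercise the loop
    hidden = 8
    Wx = np.random.rand(4, hidden).astype("float32")
    Wh = np.random.rand(hidden, hidden).astype("float32")
    cell = lambda x, h: ((np.tanh(x @ Wx + h @ Wh),) * 2)
    x = np.random.rand(2, 5, 4).astype("float32")
    outs, last = unroll(cell, x, np.zeros((2, hidden), dtype="float32"))  # outs: [2, 5, 8]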
- """ - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = fluid.layers.elementwise_mul( - new_state, step_mask, - axis=0) - fluid.layers.elementwise_mul( - state, (step_mask - 1), axis=0) - return new_state - - flat_inputs = flatten(inputs) - batch_size, time_steps = ( - flat_inputs[0].shape[self.batch_index], - flat_inputs[0].shape[self.time_step_index]) - - if initial_states is None: - initial_states = self.cell.get_initial_states( - batch_ref=inputs, batch_dim_idx=self.batch_index) - - if not self.time_major: - inputs = map_structure( - lambda x: fluid.layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), inputs) - - if sequence_length is not None: - mask = fluid.layers.sequence_mask( - sequence_length, - maxlen=time_steps, - dtype=flatten(initial_states)[0].dtype) - mask = fluid.layers.transpose(mask, [1, 0]) - - if self.is_reverse: - inputs = map_structure( - lambda x: fluid.layers.reverse(x, axis=[0]), inputs) - mask = fluid.layers.reverse( - mask, axis=[0]) if sequence_length is not None else None - - states = initial_states - outputs = [] - for i in range(time_steps): - step_inputs = map_structure(lambda x: x[i], inputs) - step_outputs, new_states = self.cell(step_inputs, states, - **kwargs) - if sequence_length is not None: - new_states = map_structure( - partial( - _maybe_copy, step_mask=mask[i]), - states, - new_states) - states = new_states - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if i == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - - final_outputs = map_structure( - lambda x: fluid.layers.stack(x.array, axis=self.time_step_index - ), outputs) - - if self.is_reverse: - final_outputs = map_structure( - lambda x: fluid.layers.reverse(x, axis=self.time_step_index - ), final_outputs) - - final_states = new_states - else: - final_outputs, final_states = fluid.layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse, - **kwargs) - return final_outputs, final_states - - -class StackedRNNCell(RNNCell): - """ - Wrapper allowing a stack of RNN cells to behave as a single cell. It is used - to implement stacked RNNs. - - Parameters: - cells (list|tuple): List of RNN cell instances. - - Examples: - - .. code-block:: python - - from paddle.text import BasicLSTMCell, StackedRNNCell - - cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)] - stack_rnn = StackedRNNCell(cells) - """ - - def __init__(self, cells): - super(StackedRNNCell, self).__init__() - self.cells = [] - for i, cell in enumerate(cells): - self.cells.append(self.add_sublayer("cell_%d" % i, cell)) - - def forward(self, inputs, states, **kwargs): - """ - Performs :code:`cell.forward` for all including cells sequentially. - Each cell's `inputs` is the `outputs` of the previous cell. And each - cell's `states` is the corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. Mostly it is a - float32 or float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - **kwargs: Additional keyword arguments, which passed to `cell.forward` - for all including cells. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ). 
`outputs` is the \ - `outputs` of the last cell. `new_states` is a list composed \ - of all cells' `new_states`, and its structure and data type is \ - same as that of `states` argument. - """ - new_states = [] - for cell, state in zip(self.cells, states): - outputs, new_state = cell(inputs, state, **kwargs) - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @staticmethod - def stack_param_attr(param_attr, n): - """ - If `param_attr` is a list or tuple, convert every element in it to a - ParamAttr instance. Otherwise, repeat `param_attr` `n` times to - construct a list, and rename every one by appending a increasing index - suffix to avoid having same names when `param_attr` contains a name. - - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. - n (int): The times to repeat to construct a list when `param_attr` - is not a list or tuple. - - Returns: - list: A list composed of each including cell's `param_attr`. - """ - if isinstance(param_attr, (list, tuple)): - assert len(param_attr) == n, ( - "length of param_attr should be %d when it is a list/tuple" % n) - param_attrs = [ - fluid.ParamAttr._to_attr(attr) for attr in param_attr - ] - else: - param_attrs = [] - attr = fluid.ParamAttr._to_attr(param_attr) - for i in range(n): - attr_i = copy.deepcopy(attr) - if attr.name: - attr_i.name = attr_i.name + "_" + str(i) - param_attrs.append(attr_i) - return param_attrs - - @property - def state_shape(self): - """ - The `state_shape` of StackedRNNCell is a list composed of each including - cell's `state_shape`. - - Returns: - list: A list composed of each including cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class StackedLSTMCell(RNNCell): - """ - Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used - to implement stacked LSTM. - - The formula for LSTM used here is as follows: - - .. math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - - Parameters: - input_size (int): The input size for the first LSTM cell. - hidden_size (int): The hidden size for every LSTM cell. - gate_activation (function, optional): The activation function for gates - of LSTM, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - LSTM, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - forget_bias (float, optional): forget bias used when computing forget - gate. It also can accept a boolean value `True`, which would set - :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and - :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . - Default 1.0. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - LSTM. It also can be a list or tuple, including dropout probabilities - for the corresponding LSTM. 
Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import StackedLSTMCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = StackedLSTMCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedLSTMCell, self).__init__() - self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", - float) - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - if forget_bias is True: - bias_attrs[ - i].initializer = fluid.initializer.NumpyArrayInitializer( - np.concatenate( - np.zeros(2 * hidden_size), - np.ones(hidden_size), np.zeros(hidden_size)).astype( - dtype)) - forget_bias = 0.0 - self.cells.append( - self.add_sublayer( - "lstm_%d" % i, - BasicLSTMCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - gate_activation=gate_activation, - activation=activation, - forget_bias=forget_bias, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) - - def forward(self, inputs, states): - """ - Performs the stacked LSTM cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - **kwargs: Additional keyword arguments, which passed to `cell.forward` - for all including cells. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ - is a list composed of every LSTM `new_states` which is a pair \ - of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ - and the data type and structure of these tensors all is same \ - as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout[i], - dropout_implementation='upscale_in_train') if self.dropout[ - i] > 0 else outputs - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @property - def state_shape(self): - """ - The `state_shape` of StackedLSTMCell is a list composed of each including - LSTM cell's `state_shape`. 
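A compact sketch of the stacking idea described above: each layer's output becomes the next layer's input, while every layer keeps its own state. The toy cells and the omission of dropout are simplifications for illustration.

.. code-block:: python

    import numpy as np

    def stacked_step(cells, x, states):
        # cells: list of callables (x, state) -> (out, new_state); mirrors the
        # loop in the stacked-cell forward above, with dropout left out.
        new_states = []
        for cell, state in zip(cells, states):
            x, new_state = cell(x, state)        # layer i feeds layer i + 1
            new_states.append(new_state)
        return x, new_states

    hidden = 8
    def make_cell(in_dim):
        W = np.random.rand(in_dim + hidden, hidden).astype("float32")
        return lambda x, h: ((np.tanh(np.concatenate([x, h], axis=1) @ W),) * 2)

    cells = [make_cell(4), make_cell(hidden)]
    states = [np.zeros((2, hidden), dtype="float32")] * 2
    x = np.random.rand(2, 4).astype("float32")
    out, new_states = stacked_step(cells, x, states)   # out: [2, 8], one state per layer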
- - Returns: - list: A list composed of each including LSTM cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class LSTM(Layer): - """ - Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input - sequence. - - The formula for LSTM used here is as follows: - - .. math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - - Parameters: - input_size (int): The input feature size for the first LSTM. - hidden_size (int): The hidden size for every LSTM. - gate_activation (function, optional): The activation function for gates - of LSTM, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - LSTM, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - forget_bias (float, optional): forget bias used when computing forget - gate. It also can accept a boolean value `True`, which would set - :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and - :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . - Default 1.0. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - LSTM. It also can be a list or tuple, including dropout probabilities - for the corresponding LSTM. Default 0.0 - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import LSTM - - inputs = paddle.rand((2, 4, 32)) - lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = lstm(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - is_reverse=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(LSTM, self).__init__() - lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation, - activation, forget_bias, num_layers, - dropout, param_attr, bias_attr, dtype) - self.lstm = RNN(lstm_cell, is_reverse, time_major) - - def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` - is the `inputs` of the subsequent one. - - Parameters: - inputs (Variable): The inputs for the first LSTM. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked LSTM, and the initial states of each LSTM is a pair - of tensors shaped `[batch_size, hidden_size]`. If not provided, - use 0 as initial states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last LSTM and it is a tensor with shape \ - `[batch_size, sequence_length, hidden_size]` and has the same \ - data type as `inputs`, `final_states` is the counterpart of \ - `initial_states` at last time step, thus has the same structure \ - with it and has tensors with same shapes data types. - """ - return self.lstm(inputs, initial_states, sequence_length) - - -class BidirectionalRNN(Layer): - """ - Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform - forward and backward RNN separately, and merge outputs of these two RNN - according to `merge_mode`. - - Parameters: - cell_fw (RNNCell): A RNNCell instance used for forward RNN. - cell_bw (RNNCell): A RNNCell instance used for backward RNN. - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` - - Examples: - - .. 
code-block:: python - - import paddle - from paddle.text import StackedLSTMCell, BidirectionalRNN - - inputs = paddle.rand((2, 4, 32)) - cell_fw = StackedLSTMCell(32, 64) - cell_bw = StackedLSTMCell(32, 64) - bi_rnn = BidirectionalRNN(cell_fw, cell_bw) - outputs, _ = bi_rnn(inputs) # [2, 4, 128] - """ - - def __init__(self, - cell_fw, - cell_bw, - merge_mode='concat', - time_major=False, - cell_cls=None, - **kwargs): - super(BidirectionalRNN, self).__init__() - self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) - self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) - if merge_mode == 'concat': - self.merge_func = lambda x, y: layers.concat([x, y], -1) - elif merge_mode == 'sum': - self.merge_func = lambda x, y: layers.elementwise_add(x, y) - elif merge_mode == 'ave': - self.merge_func = lambda x, y: layers.scale( - layers.elementwise_add(x, y), 0.5) - elif merge_mode == 'mul': - self.merge_func = lambda x, y: layers.elementwise_mul(x, y) - elif merge_mode == 'zip': - self.merge_func = lambda x, y: (x, y) - elif merge_mode is None: - self.merge_func = None - else: - raise ValueError('Unsupported value for `merge_mode`: %s' % - merge_mode) - - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): - """ - Performs forward and backward RNN separately, and merge outputs of these - two RNN according to `merge_mode`. - - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in both forward and backward RNN. - initial_states (Variable|list|tuple): If it is a list or tuple, its - length should be 2 to include initial states of forward and backward - RNN separately. Otherwise it would be used twice for the two RNN. - If None, `cell.get_initial_states` would be used to produce the initial - states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is produced by merge outputs of forward and backward RNN according \ - to `merge_mode`, `final_states` is a pair including `final_states` \ - of forward and backward RNN. - """ - if isinstance(initial_states, (list, tuple)): - assert len( - initial_states - ) == 2, "length of initial_states should be 2 when it is a list/tuple" - else: - initial_states = [initial_states, initial_states] - outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], - sequence_length, **kwargs) - outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], - sequence_length, **kwargs) - outputs = map_structure(self.merge_func, outputs_fw, - outputs_bw) if self.merge_func else (outputs_fw, - outputs_bw) - return outputs, (states_fw, states_bw) - - @staticmethod - def bidirect_param_attr(param_attr): - """ - Converts `param_attr` to a pair of `param_attr` when it is not a list - or tuple with length 2, also rename every one by appending a suffix to - avoid having same names when `param_attr` contains a name. 
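The forward/backward split and the `merge_mode` options can be illustrated in NumPy by running the same unrolled loop on the input and on its time-reversed copy, re-reversing the backward outputs, and then merging. The cells here are toy stand-ins, and `sequence_length` handling is omitted.

.. code-block:: python

    import numpy as np

    def bi_rnn(cell_fw, cell_bw, inputs, h0, merge_mode="concat"):
        # inputs: [batch, time, feature]; each cell: (x, h) -> (out, h)
        def run(cell, xs, h):
            outs = []
            for t in range(xs.shape[1]):
                o, h = cell(xs[:, t, :], h)
                outs.append(o)
            return np.stack(outs, axis=1)
        fw = run(cell_fw, inputs, h0)
        bw = run(cell_bw, inputs[:, ::-1, :], h0)[:, ::-1, :]   # reverse in, reverse out
        merges = {"concat": lambda a, b: np.concatenate([a, b], axis=-1),
                  "sum": np.add, "ave": lambda a, b: (a + b) / 2, "mul": np.multiply}
        return merges[merge_mode](fw, bw)

    hidden = 8
    def make_cell():
        W = np.random.rand(4 + hidden, hidden).astype("float32")
        return lambda x, h: ((np.tanh(np.concatenate([x, h], axis=1) @ W),) * 2)

    x = np.random.rand(2, 5, 4).astype("float32")
    h0 = np.zeros((2, hidden), dtype="float32")
    y = bi_rnn(make_cell(), make_cell(), x, h0)   # [2, 5, 16] with merge_mode="concat"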
- - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. When - it is a list or tuple, its length must be 2. - - Returns: - list: A pair composed of forward and backward RNN cell's `param_attr`. - """ - if isinstance(param_attr, (list, tuple)): - assert len( - param_attr - ) == 2, "length of param_attr should be 2 when it is a list/tuple" - param_attrs = param_attr - else: - param_attrs = [] - attr = fluid.ParamAttr._to_attr(param_attr) - attr_fw = copy.deepcopy(attr) - if attr.name: - attr_fw.name = attr_fw.name + "_fw" - param_attrs.append(attr_fw) - attr_bw = copy.deepcopy(attr) - if attr.name: - attr_bw.name = attr_bw.name + "_bw" - param_attrs.append(attr_bw) - return param_attrs - - -class BidirectionalLSTM(Layer): - """ - Applies a bidirectional multi-layer long short-term memory (LSTM) RNN to an - input sequence. - - Bidirection interaction can happen after each layer or only after the last - layer according to the `merge_each_layer` setting. The way to interact, - that is how to merge outputs of the two direction, is determined by `merge_mode`. - - The formula for LSTM used here is as follows: - - .. math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - - Parameters: - input_size (int): The input feature size for the first LSTM. - hidden_size (int): The hidden size for every LSTM. - gate_activation (function, optional): The activation function for gates - of LSTM, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - LSTM, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - forget_bias (float, optional): forget bias used when computing forget - gate. It also can accept a boolean value `True`, which would set - :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and - :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . - Default 1.0. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - LSTM. It also can be a list or tuple, including dropout probabilities - for the corresponding LSTM. Default 0.0 - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` - merge_each_layer (bool, optional): Indicate whether bidirection interaction - happens after each layer or only after the last layer. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. 
- param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import BidirectionalLSTM - - inputs = paddle.rand((2, 4, 32)) - bi_lstm = BidirectionalLSTM(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = bi_lstm(inputs) # [2, 4, 128] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - merge_mode='concat', - merge_each_layer=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(BidirectionalLSTM, self).__init__() - self.num_layers = num_layers - self.merge_mode = merge_mode - self.merge_each_layer = merge_each_layer - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - if not merge_each_layer: - cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation, - activation, forget_bias, num_layers, - dropout, param_attrs[0], bias_attrs[0], - dtype) - cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation, - activation, forget_bias, num_layers, - dropout, param_attrs[1], bias_attrs[1], - dtype) - self.lstm = BidirectionalRNN( - cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) - else: - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) - - # maybe design cell including both forward and backward later - self.lstm = [] - for i in range(num_layers): - cell_fw = StackedLSTMCell( - input_size - if i == 0 else (hidden_size * 2 - if merge_mode == 'concat' else hidden_size), - hidden_size, gate_activation, activation, forget_bias, 1, - dropout, fw_param_attrs[i], fw_bias_attrs[i], dtype) - cell_bw = StackedLSTMCell( - input_size - if i == 0 else (hidden_size * 2 - if merge_mode == 'concat' else hidden_size), - hidden_size, gate_activation, activation, forget_bias, 1, - dropout, bw_param_attrs[i], bw_bias_attrs[i], dtype) - self.lstm.append( - self.add_sublayer( - "lstm_%d" % i, - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major))) - - def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs bidirectional multi-layer LSTM layer by layer. Each LSTM's `outputs` - is the `inputs` of the subsequent one, or when `merge_each_layer` is True, - merged outputs would be the `inputs` of the subsequent one. - - Parameters: - inputs (Variable): The inputs for the first LSTM. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. 
- initial_states (list|None, optional): A list containing initial states - of all stacked LSTM. If `merge_each_layer` is True, the length of - list should be `num_layers` and a single value would be reused for - `num_layers`; Otherwise, the length should be 2 and a single value - would be reused twice. If not provided, use 0 as initial states. - Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last bidirectional LSTM; `final_states` is a \ - pair including `final_states` of forward and backward LSTM when \ - `merge_each_layer` is False or a list including `final_states` \ - of all stacked bidirectional LSTM, and it has tensors with same \ - shapes data types as `initial_states`. - """ - if not self.merge_each_layer: - return self.lstm(inputs, initial_states, sequence_length) - else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list/tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - outputs, states = self.lstm[i](inputs, initial_states[i], - sequence_length) - inputs = outputs - stacked_states.append(states) - return outputs, stacked_states - - -class StackedGRUCell(RNNCell): - """ - Wrapper allowing a stack of GRU cells to behave as a single cell. It is used - to implement stacked GRU. - - The formula for GRU used here is as follows: - - .. math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - - Parameters: - input_size (int): The input size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - gate_activation (function, optional): The activation function for gates - of GRU, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - GRU, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - GRU. It also can be a list or tuple, including dropout probabilities - for the corresponding GRU. Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. 
It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import StackedGRUCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = StackedGRUCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedGRUCell, self).__init__() - self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", - float) - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "gru_%d" % i, - BasicGRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - gate_activation=gate_activation, - activation=activation, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) - - def forward(self, inputs, states): - """ - Performs the stacked GRU cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - **kwargs: Additional keyword arguments, which passed to `cell.forward` - for all including cells. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last GRU; `new_states` \ - is a list composed of every GRU `new_states` which is also \ - :math:`h_{t}` in the formula, and the data type and structure \ - of these tensors all is same as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout[i], - dropout_implementation='upscale_in_train') if self.dropout[ - i] > 0 else outputs - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @property - def state_shape(self): - """ - The `state_shape` of StackedGRUCell is a list composed of each including - GRU cell's `state_shape`. - - Returns: - list: A list composed of each including GRU cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class GRU(Layer): - """ - Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input - sequence. - - The formula for GRU used here is as follows: - - .. math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - - Parameters: - input_size (int): The input feature size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - gate_activation (function, optional): The activation function for gates - of GRU, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - GRU, that is :math:`act_c` in the formula. 
Default: None, - representing for 'fluid.layers.tanh'. - num_layers(int, optional): The number of GRU to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - GRU. It also can be a list or tuple, including dropout probabilities - for the corresponding GRU. Default 0.0 - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import GRU - - inputs = paddle.rand((2, 4, 32)) - gru = GRU(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = gru(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - num_layers=1, - dropout=0.0, - is_reverse=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(GRU, self).__init__() - gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, param_attr, - bias_attr, dtype) - self.gru = RNN(gru_cell, is_reverse, time_major) - - def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs the stacked multi-layer GRU layer by layer. Each GRU's `outputs` - is the `inputs` of the subsequent one. - - Parameters: - inputs (Variable): The inputs for the first GRU. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked GRU, and the initial states of each GRU is a tensor - shaped `[batch_size, hidden_size]`. If not provided, use 0 as initial - states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last GRU and it is a tensor with shape \ - `[batch_size, sequence_length, hidden_size]` and has the same \ - data type as `inputs`, `final_states` is the counterpart of \ - `initial_states` at last time step, thus has the same structure \ - with it and has tensors with same shapes data types. 
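To make the formulas above concrete, here is a single GRU step written in plain numpy (toy random weights; a sketch of the math only, not the BasicGRUCell implementation):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_step(x_t, h_prev, W_ux, W_uh, b_u, W_rx, W_rh, b_r, W_cx, W_ch, b_c):
        u_t = sigmoid(x_t @ W_ux + h_prev @ W_uh + b_u)           # update gate
        r_t = sigmoid(x_t @ W_rx + h_prev @ W_rh + b_r)           # reset gate
        c_t = np.tanh(x_t @ W_cx + (r_t * h_prev) @ W_ch + b_c)   # candidate state
        return u_t * h_prev + (1.0 - u_t) * c_t                   # h_t

    rng = np.random.default_rng(0)
    input_size, hidden_size = 3, 4
    W = lambda m, n: rng.standard_normal((m, n)) * 0.1
    x_t, h_prev = rng.standard_normal((2, input_size)), np.zeros((2, hidden_size))
    h_t = gru_step(x_t, h_prev,
                   W(input_size, hidden_size), W(hidden_size, hidden_size), np.zeros(hidden_size),
                   W(input_size, hidden_size), W(hidden_size, hidden_size), np.zeros(hidden_size),
                   W(input_size, hidden_size), W(hidden_size, hidden_size), np.zeros(hidden_size))
    print(h_t.shape)  # (2, 4)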
- """ - return self.gru(inputs, initial_states, sequence_length) - - -class BidirectionalGRU(Layer): - """ - Applies a bidirectional multi-layer gated recurrent unit (GRU) RNN to an input - sequence. - - Bidirection interaction can happen after each layer or only after the last - layer according to the `merge_each_layer` setting. The way to interact, - that is how to merge outputs of the two direction, is determined by `merge_mode`. - - The formula for GRU used here is as follows: - - .. math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - - Parameters: - input_size (int): The input feature size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - gate_activation (function, optional): The activation function for gates - of GRU, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - GRU, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - num_layers(int, optional): The number of GRU to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - GRU. It also can be a list or tuple, including dropout probabilities - for the corresponding GRU. Default 0.0 - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` - merge_each_layer (bool, optional): Indicate whether bidirection interaction - happens after each layer or only after the last layer. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import BidirectionalGRU - - inputs = paddle.rand((2, 4, 32)) - bi_gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = bi_gru(inputs) # [2, 4, 128] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - merge_mode='concat', - merge_each_layer=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(BidirectionalGRU, self).__init__() - self.num_layers = num_layers - self.merge_mode = merge_mode - self.merge_each_layer = merge_each_layer - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - if not merge_each_layer: - cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, - param_attrs[0], bias_attrs[0], dtype) - cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, - param_attrs[1], bias_attrs[1], dtype) - self.gru = BidirectionalRNN( - cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) - else: - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) - - # maybe design cell including both forward and backward later - self.gru = [] - for i in range(num_layers): - cell_fw = StackedGRUCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, gate_activation, activation, 1, - dropout, fw_param_attrs[i], - fw_bias_attrs[i], dtype) - cell_bw = StackedGRUCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, gate_activation, activation, 1, - dropout, bw_param_attrs[i], - bw_bias_attrs[i], dtype) - self.gru.append( - self.add_sublayer( - "gru_%d" % i, - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major))) - - def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs bidirectional multi-layer GRU layer by layer. Each GRU's `outputs` - is the `inputs` of the subsequent one, or when `merge_each_layer` is True, - merged outputs would be the `inputs` of the subsequent one. - - Parameters: - inputs (Variable): The inputs for the first GRU. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked GRU. If `merge_each_layer` is True, the length of - list should be `num_layers` and a single value would be reused for - `num_layers`; Otherwise, the length should be 2 and a single value - would be reused twice. If not provided, use 0 as initial states. - Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. 
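As a side note on `sequence_length`: "extract the last valid state" simply means reading the output at index `length - 1` for each batch entry instead of the padded final step. A plain numpy sketch with hypothetical toy data:

    import numpy as np

    outputs = np.random.rand(2, 4, 3)    # [batch_size, time_steps, hidden_size]
    sequence_length = np.array([4, 2])   # real length of each instance

    # Pick the output at each instance's last valid time step.
    last_valid = outputs[np.arange(outputs.shape[0]), sequence_length - 1]
    print(last_valid.shape)              # (2, 3)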
- - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last bidirectional GRU; `final_states` is a \ - pair including `final_states` of forward and backward GRU when \ - `merge_each_layer` is False or a list including `final_states` \ - of all stacked bidirectional GRU, and it has tensors with same \ - shapes data types as `initial_states`. - """ - if not self.merge_each_layer: - return self.gru(inputs, initial_states, sequence_length) - else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list/tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - outputs, states = self.gru[i](inputs, initial_states[i], - sequence_length) - inputs = outputs - stacked_states.append(states) - return outputs, stacked_states - - -class DynamicDecode(Layer): - """ - DynamicDecode integrates an Decoder instance to perform dynamic decoding. - - It performs :code:`decoder.step()` repeatedly until the returned Tensor - indicating finished status contains all True values or the number of - decoding step reaches to :attr:`max_step_num`. - - :code:`decoder.initialize()` would be called once before the decoding loop. - If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` - would be called once after the decoding loop. - - Parameters: - decoder (Decoder): An instance of `Decoder`. - max_step_num (int, optional): The maximum number of steps. If not provided, - decode until the decoder is fully done, or in other words, the returned - Tensor by :code:`decoder.step()` indicating finished status contains - all True. Default `None`. - output_time_major (bool, optional): Indicate the data layout of Tensor included - in the final outputs(the first returned value of this method). If - attr:`False`, the data layout would be batch major with shape - `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would - be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. - impute_finished (bool, optional): If `True`, then states get copied through - for batch entries which are marked as finished, which differs with the - unfinished using the new states returned by :code:`decoder.step()` and - ensures that the final states have the correct values. Otherwise, states - wouldn't be copied through when finished. If the returned `final_states` - is needed, it should be set as True, which causes some slowdown. - Default `False`. - is_test (bool, optional): A flag indicating whether to use test mode. In - test mode, it is more memory saving. Default `False`. - return_length (bool, optional): A flag indicating whether to return an - extra Tensor variable in the output tuple, which stores the actual - lengths of all decoded sequences. Default `False`. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.layers import BeamSearchDecoder - from paddle.text import StackedLSTMCell, DynamicDecode - - paddle.disable_static() - - vocab_size, d_model, = 100, 32 - encoder_output = paddle.rand((2, 4, d_model)) - trg_embeder = fluid.dygraph.Embedding(size=[vocab_size, d_model]) - output_layer = fluid.dygraph.Linear(d_model, vocab_size) - cell = StackedLSTMCell(input_size=d_model, hidden_size=d_model) - decoder = BeamSearchDecoder(cell, - start_token=0, - end_token=1, - beam_size=4, - embedding_fn=trg_embeder, - output_fn=output_layer) - dynamic_decoder = DynamicDecode(decoder, max_step_num=10) - outputs = dynamic_decoder(cell.get_initial_states(encoder_output)) - """ - - def __init__(self, - decoder, - max_step_num=None, - output_time_major=False, - impute_finished=False, - is_test=False, - return_length=False): - super(DynamicDecode, self).__init__() - self.decoder = decoder - self.max_step_num = max_step_num - self.output_time_major = output_time_major - self.impute_finished = impute_finished - self.is_test = is_test - self.return_length = return_length - - def forward(self, inits=None, **kwargs): - """ - Performs :code:`decoder.step()` repeatedly until the returned Tensor - indicating finished status contains all True values or the number of - decoding step reaches to :attr:`max_step_num`. - - :code:`decoder.initialize()` would be called once before the decoding loop. - If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` - would be called once after the decoding loop. - - Parameters: - inits (object, optional): Argument passed to `decoder.initialize`. - Default `None`. - **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. - - Returns: - tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ - when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \ - The final outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as the :code:`outputs` \ - returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \ - is the stacked of all decoding steps' outputs, which might be revised \ - by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \ - `final_states` is the counterpart at last time step of initial states \ - returned by :code:`decoder.initialize()` , thus has the same structure \ - with it and has tensors with same shapes and data types. `sequence_lengths` \ - is an `int64` tensor with the same shape as `finished` returned \ - by :code:`decoder.initialize()` , and it stores the actual lengths of \ - all decoded sequences. - """ - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def __getitem__(self, item): - return self.array.__getitem__(item) - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - state_dtype = state.dtype - if convert_dtype(state_dtype) in ["bool"]: - state = layers.cast(state, dtype="float32") - new_state = layers.cast(new_state, dtype="float32") - if step_mask.dtype != state.dtype: - step_mask = layers.cast(step_mask, dtype=state.dtype) - # otherwise, renamed bool gradients of would be summed up leading - # to sum(bool) error. 
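The helper being defined here keeps the states of already-finished batch entries unchanged by using an arithmetic mask instead of a select op. A toy numpy sketch of the same trick (hypothetical data; `state * mask - new_state * (mask - 1)` behaves like an elementwise where):

    import numpy as np

    state = np.array([[1.0, 1.0], [2.0, 2.0]])
    new_state = np.array([[9.0, 9.0], [8.0, 8.0]])
    finished = np.array([1.0, 0.0])[:, None]    # 1 where the entry already finished

    mixed = state * finished - new_state * (finished - 1.0)
    print(mixed)                                # [[1. 1.] [8. 8.]]
    assert np.allclose(mixed, np.where(finished > 0, state, new_state))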
- step_mask.stop_gradient = True - new_state = layers.elementwise_mul( - state, step_mask, axis=0) - layers.elementwise_mul( - new_state, (step_mask - 1), axis=0) - if convert_dtype(state_dtype) in ["bool"]: - new_state = layers.cast(new_state, dtype=state_dtype) - return new_state - - initial_inputs, initial_states, initial_finished = self.decoder.initialize( - inits) - inputs, states, finished = (initial_inputs, initial_states, - initial_finished) - cond = layers.logical_not((layers.reduce_all(initial_finished))) - sequence_lengths = layers.cast( - layers.zeros_like(initial_finished), "int64") - outputs = None - - step_idx = 0 - step_idx_tensor = layers.fill_constant( - shape=[1], dtype="int64", value=step_idx) - while cond.numpy(): - (step_outputs, next_states, next_inputs, - next_finished) = self.decoder.step(step_idx_tensor, inputs, - states, **kwargs) - if not self.decoder.tracks_own_finished: - # BeamSearchDecoder would track it own finished, since - # beams would be reordered and the finished status of each - # entry might change. Otherwise, perform logical OR which - # would not change the already finished. - next_finished = layers.logical_or(next_finished, finished) - # To confirm states.finished/finished be consistent with - # next_finished. - layers.assign(next_finished, finished) - next_sequence_lengths = layers.elementwise_add( - sequence_lengths, - layers.cast( - layers.logical_not(finished), sequence_lengths.dtype)) - - if self.impute_finished: # rectify the states for the finished. - next_states = map_structure( - lambda x, y: _maybe_copy(x, y, finished), states, - next_states) - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if step_idx == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - inputs, states, finished, sequence_lengths = ( - next_inputs, next_states, next_finished, - next_sequence_lengths) - - layers.increment(x=step_idx_tensor, value=1.0, in_place=True) - step_idx += 1 - - layers.logical_not(layers.reduce_all(finished), cond) - if self.max_step_num is not None and step_idx > self.max_step_num: - break - - final_outputs = map_structure( - lambda x: fluid.layers.stack(x.array, axis=0), outputs) - final_states = states - - try: - final_outputs, final_states = self.decoder.finalize( - final_outputs, final_states, sequence_lengths) - except NotImplementedError: - pass - - if not self.output_time_major: - final_outputs = map_structure( - lambda x: layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), final_outputs) - - return (final_outputs, final_states, - sequence_lengths) if self.return_length else (final_outputs, - final_states) - else: - return fluid.layers.dynamic_decode( - self.decoder, - inits, - max_step_num=self.max_step_num, - output_time_major=self.output_time_major, - impute_finished=self.impute_finished, - is_test=self.is_test, - return_length=self.return_length, - **kwargs) - - -class Conv1dPoolLayer(Layer): - """ - This interface is used to construct a callable object of the ``Conv1DPoolLayer`` - class. The ``Conv1DPoolLayer`` class does a ``Conv1D`` and a ``Pool1D`` . - For more details, refer to code examples.The ``Conv1DPoolLayer`` layer calculates - the output based on the input, filter and strides, paddings, dilations, groups, - global_pooling, pool_type, ceil_mode, exclusive parameters. - - Parameters: - num_channels (int): The number of channels in the input data. - num_filters(int): The number of filters. It is the same as the output channels. 
- filter_size (int): The filter size of Conv1DPoolLayer. - pool_size (int): The pooling size of Conv1DPoolLayer. - conv_stride (int): The stride size of the conv Layer in Conv1DPoolLayer. - Default: 1 - pool_stride (int): The stride size of the pool layer in Conv1DPoolLayer. - Default: 1 - conv_padding (int): The padding size of the conv Layer in Conv1DPoolLayer. - Default: 0 - pool_padding (int): The padding of pool layer in Conv1DPoolLayer. - Default: 0 - act (str): Activation type for conv layer, if it is set to None, activation - is not appended. Default: None. - pool_type (str): Pooling type can be `max` for max-pooling or `avg` for - average-pooling. Default: `max` - dilation (int): The dilation size of the conv Layer. Default: 1. - groups (int): The groups number of the conv Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the - first half of the filters is only connected to the first half of the - input channels, while the second half of the filters is only connected - to the second half of the input channels. Default: 1. - global_pooling (bool): Whether to use the global pooling. If it is true, - `pool_size` and `pool_padding` would be ignored. Default: False - ceil_mode (bool, optional): Whether to use the ceil function to calculate output - height and width.False is the default. If it is set to False, the floor function - will be used. Default: False. - exclusive (bool, optional): Whether to exclude padding points in average pooling mode. - Default: True. - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: False - param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights - of conv2d. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - - Example: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import Conv1dPoolLayer - - # input: [batch_size, num_channels, sequence_length] - input = paddle.rand((2, 32, 4)) - cov2d = Conv1dPoolLayer(num_channels=32, - num_filters=64, - filter_size=2, - pool_size=2) - output = cov2d(input) - """ - - def __init__(self, - num_channels, - num_filters, - filter_size, - pool_size, - conv_stride=1, - pool_stride=1, - conv_padding=0, - pool_padding=0, - act=None, - pool_type='max', - global_pooling=False, - dilation=1, - groups=None, - ceil_mode=False, - exclusive=True, - use_cudnn=False, - param_attr=None, - bias_attr=None): - super(Conv1dPoolLayer, self).__init__() - self._conv2d = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=[filter_size, 1], - stride=[conv_stride, 1], - padding=[conv_padding, 0], - dilation=[dilation, 1], - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act) - self._pool2d = Pool2D( - pool_size=[pool_size, 1], - pool_type=pool_type, - pool_stride=[pool_stride, 1], - pool_padding=[pool_padding, 0], - global_pooling=global_pooling, - use_cudnn=use_cudnn, - ceil_mode=ceil_mode, - exclusive=exclusive) - - def forward(self, input): - """ - Performs conv1d and pool1d on the input. - - Parameters: - input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H - representing `batch_size`, `num_channels` and `sequence_length` - separately. data type can be float32 or float64 - - Returns: - Variable: The 3-D output tensor after conv and pool. It has the same \ - data type as input. - """ - x = fluid.layers.unsqueeze(input, axes=[-1]) - x = self._conv2d(x) - x = self._pool2d(x) - x = fluid.layers.squeeze(x, axes=[-1]) - return x - - -class CNNEncoder(Layer): - """ - This interface is used to construct a callable object of the ``CNNEncoder`` - class. The ``CNNEncoder`` is composed of multiple ``Conv1dPoolLayer`` . - ``CNNEncoder`` can define every Conv1dPoolLayer with different or same parameters. - The ``Conv1dPoolLayer`` in ``CNNEncoder`` is parallel. The results of every - ``Conv1dPoolLayer`` will concat at the channel dimension as the final output. - - Parameters: - num_channels(int|list|tuple): The number of channels in the input data. If - `num_channels` is a list or tuple, the length of `num_channels` must - equal to `num_layers`. If `num_channels` is a int, all conv1dpoollayer's - `num_channels` are the value of `num_channels`. - num_filters(int|list|tuple): The number of filters. It is the same as the - output channels. If `num_filters` is a list or tuple, the length of - `num_filters` must equal `num_layers`. If `num_filters` is a int, - all conv1dpoollayer's `num_filters` are the value of `num_filters`. - filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder. - If `filter_size` is a list or tuple, the length of `filter_size` must - equal `num_layers`. If `filter_size` is a int, all conv1dpoollayer's - `filter_size` are the value of `filter_size`. - pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder. - If `pool_size` is a list or tuple, the length of `pool_size` must equal - `num_layers`. If `pool_size` is a int, all conv1dpoollayer's `pool_size` - are the value of `pool_size`. - num_layers(int): The number of conv1dpoolLayer used in CNNEncoder. - conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer. 
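Because the layer above emulates 1-D convolution and pooling with `Conv2D`/`Pool2D` by appending a trailing size-1 dimension, a quick shape sketch may help. The length rule used below, floor((L + 2p - k) / s) + 1, is the usual convolution/pooling formula and is stated here as an assumption, not quoted from the source:

    import numpy as np

    def out_len(L, k, s=1, p=0):
        # Standard output-length rule for convolution / pooling (floor mode).
        return (L + 2 * p - k) // s + 1

    x = np.random.rand(2, 32, 4)    # [batch, channels, length]
    x4d = x[..., None]              # the unsqueeze trick -> (2, 32, 4, 1)
    print(x4d.shape)

    conv_len = out_len(4, k=2)            # filter_size=2 -> 3
    pool_len = out_len(conv_len, k=2)     # pool_size=2   -> 2
    print(conv_len, pool_len)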
- If `conv_stride` is a list or tuple, the length of `conv_stride` must - equal `num_layers`. If conv_stride is a int, all conv1dpoollayer's `conv_stride` - are the value of `conv_stride`. Default: 1 - pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer. - If `pool_stride` is a list or tuple, the length of `pool_stride` must - equal `num_layers`. If `pool_stride` is a int, all conv1dpoollayer's `pool_stride` - are the value of `pool_stride`. Default: 1 - conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer. - If `conv_padding` is a list or tuple, the length of `conv_padding` must - equal `num_layers`. If `conv_padding` is a int, all conv1dpoollayer's `conv_padding` - are the value of `conv_padding`. Default: 0 - pool_padding(int|list|tuple): The padding size of pool layer in Conv1DPoolLayer. - If `pool_padding` is a list or tuple, the length of `pool_padding` must - equal `num_layers`.If `pool_padding` is a int, all conv1dpoollayer's `pool_padding` - are the value of `pool_padding`. Default: 0 - act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None, - activation is not appended. Default: None. - pool_type (str): Pooling type can be `max` for max-pooling or `avg` for - average-pooling. Default: `max` - global_pooling (bool): Whether to use the global pooling. If it is true, - `pool_size` and `pool_padding` would be ignored. Default: False - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: False - - Example: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import CNNEncoder - - # input: [batch_size, num_channels, sequence_length] - input = paddle.rand((2, 32, 8)) - cov_encoder = CNNEncoder(num_layers=2, - num_channels=32, - num_filters=64, - filter_size=[2, 3], - pool_size=[7, 6]) - output = cov_encoder(input) # [2, 128, 1] - """ - - def __init__(self, - num_channels, - num_filters, - filter_size, - pool_size, - num_layers=1, - conv_stride=1, - pool_stride=1, - conv_padding=0, - pool_padding=0, - act=None, - pool_type='max', - global_pooling=False, - use_cudnn=False): - super(CNNEncoder, self).__init__() - self.num_layers = num_layers - self.num_channels = num_channels - self.num_filters = num_filters - self.filter_size = filter_size - self.pool_size = pool_size - self.conv_stride = conv_stride - self.pool_stride = pool_stride - self.conv_padding = conv_padding - self.pool_padding = pool_padding - self.use_cudnn = use_cudnn - self.act = act - self.pool_type = pool_type - self.global_pooling = global_pooling - self.conv1d_pool_layers = fluid.dygraph.LayerList([ - Conv1dPoolLayer( - num_channels=self.num_channels - if isinstance(self.num_channels, int) else self.num_channels[i], - num_filters=self.num_filters - if isinstance(self.num_channels, int) else self.num_filters[i], - filter_size=self.filter_size - if isinstance(self.filter_size, int) else self.filter_size[i], - pool_size=self.pool_size - if isinstance(self.pool_size, int) else self.pool_size[i], - conv_stride=self.conv_stride - if isinstance(self.conv_stride, int) else self.conv_stride[i], - pool_stride=self.pool_stride - if isinstance(self.pool_stride, int) else self.pool_stride[i], - conv_padding=self.conv_padding - if isinstance(self.conv_padding, int) else self.conv_padding[i], - pool_padding=self.pool_padding - if isinstance(self.pool_padding, int) else self.pool_padding[i], - act=self.act[i] - if isinstance(self.act, (list, tuple)) 
else self.act, - pool_type=self.pool_type, - global_pooling=self.global_pooling, - use_cudnn=self.use_cudnn) for i in range(num_layers) - ]) - - def forward(self, input): - """ - Performs multiple parallel conv1d and pool1d, and concat the results of - them at the channel dimension to produce the final output. - - Parameters: - input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H - representing `batch_size`, `num_channels` and `sequence_length` - separately. data type can be float32 or float64 - - Returns: - Variable: The 3-D output tensor produced by concatenating results of \ - all Conv1dPoolLayer. It has the same data type as input. - """ - res = [ - conv1d_pool_layer(input) - for conv1d_pool_layer in self.conv1d_pool_layers - ] - out = fluid.layers.concat(input=res, axis=1) - return out - - -class TransformerCell(RNNCell): - """ - TransformerCell wraps a Transformer decoder producing logits from `inputs` - composed by ids and position. - - Parameters: - decoder(callable): A TransformerDecoder instance. Or a wrapper of it that - includes a embedding layer accepting ids and positions instead of embeddings - and includes a output layer transforming decoder output features to logits. - embedding_fn(function, optional): A callable that accepts ids and position - as arguments and return embeddings as input of `decoder`. It can be - None if `decoder` includes a embedding layer. Default None. - output_fn(callable, optional): A callable applid on `decoder` output to - transform decoder output features to get logits. Mostly it is a Linear - layer with vocabulary size. It can be None if `decoder` includes a - output layer. Default None. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.dygraph import Embedding, Linear - from paddle.text import TransformerDecoder - from paddle.text import TransformerCell - from paddle.text import TransformerBeamSearchDecoder - from paddle.text import DynamicDecode - - paddle.disable_static() - - class Embedder(fluid.dygraph.Layer): - def __init__(self): - super(Embedder, self).__init__() - self.word_embedder = Embedding(size=[1000, 128]) - self.pos_embedder = Embedding(size=[500, 128]) - - def forward(self, word, position): - return self.word_embedder(word) + self.pos_embedder(position) - - embedder = Embedder() - output_layer = Linear(128, 1000) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - transformer_cell = TransformerCell(decoder, embedder, output_layer) - dynamic_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, - start_token=0, - end_token=1, - beam_size=4, - var_dim_in_state=2), - max_step_num=10, - is_test=True) - - enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] - trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) - # inputs for beam search on Transformer - caches = transformer_cell.get_initial_states(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, beam_size=4) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, beam_size=4) - static_caches = decoder.prepare_static_cache(enc_output) - outputs = dynamic_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - """ - - def __init__(self, decoder, embedding_fn=None, output_fn=None): - super(TransformerCell, self).__init__() - self.decoder = decoder - self.embedding_fn = embedding_fn - 
self.output_fn = output_fn - - def forward(self, - inputs, - states=None, - enc_output=None, - trg_slf_attn_bias=None, - trg_src_attn_bias=None, - static_caches=[]): - """ - Produces logits from `inputs` composed by ids and positions. - - Parameters: - inputs(tuple): A tuple includes target ids and positions. The two - tensors both have int64 data type and with 2D shape - `[batch_size, sequence_length]` where `sequence_length` is 1 - for inference. - states(list): It caches the multi-head attention intermediate results - of history decoding steps. It is a list of dict where the length - of list is decoder layer number, and each dict has `k` and `v` as - keys and values are cached results. Default None - enc_output(Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data type - should be float32 or float64. - trg_slf_attn_bias(Variable, optional): A tensor used in decoder self - attention to mask out attention on unwanted target positions. It - is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. It can be None when nothing wanted or needed to - be masked out. It can be None for inference. The data type should - be float32 or float64. Default None - trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder - cross attention to mask out unwanted attention on source (encoder output). - It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. It can be None when nothing wanted or needed to - be masked out. The data type should be float32 or float64. Default None - static_caches(list): It stores projected results of encoder output - to be used as keys and values in decoder-encoder cross attention - It is a list of dict where the length of list is decoder layer - number, and each dict has `static_k` and `static_v` as keys and - values are stored results. Default empty list - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ - is a float32 or float64 3D tensor representing logits shaped \ - `[batch_size, sequence_length, vocab_size]`. `new_states has \ - the same structure and data type with `states` while the length \ - is one larger since the intermediate results of current step are \ - concatenated into it. - """ - trg_word, trg_pos = inputs - if states and static_caches: - for cache, static_cache in zip(states, static_caches): - cache.update(static_cache) - if self.embedding_fn is not None: - dec_input = self.embedding_fn(trg_word, trg_pos) - outputs = self.decoder(dec_input, enc_output, None, - trg_src_attn_bias, states) - else: - outputs = self.decoder(trg_word, trg_pos, enc_output, None, - trg_src_attn_bias, states) - if self.output_fn is not None: - outputs = self.output_fn(outputs) - - new_states = [{ - "k": cache["k"], - "v": cache["v"] - } for cache in states] if states else states - return outputs, new_states - - @property - def state_shape(self): - """ - States of TransformerCell cache the multi-head attention intermediate - results of history decoding steps, and have a increasing length as - decoding continued. - - `state_shape` of TransformerCell is used to initialize states. It is a - list of dict where the length of list is decoder layer, and each dict - has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]` - separately. 
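To make the growing cache concrete, here is a toy numpy sketch (hypothetical sizes) of one layer's `k` buffer: it starts with a length-0 time axis and gains one step per decoding iteration:

    import numpy as np

    batch, n_head, d_key = 4, 2, 8
    k_cache = np.zeros((batch, n_head, 0, d_key))        # empty time axis at step 0

    for step in range(3):
        new_k = np.random.rand(batch, n_head, 1, d_key)  # projection of the current token
        k_cache = np.concatenate([k_cache, new_k], axis=2)

    print(k_cache.shape)                                  # (4, 2, 3, 8)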
(-1 for batch size would be automatically inserted into shape). - - Returns: - list: It is a list of dict where the length of list is decoder layer \ - number, and each dict has `k` and `v` as keys and values are cached \ - results. - """ - return [{ - "k": [self.decoder.n_head, 0, self.decoder.d_key], - "v": [self.decoder.n_head, 0, self.decoder.d_value], - } for i in range(self.decoder.n_layer)] - - -class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): - """ - Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`, - Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]` - and includes extra position data. And its `states` (caches) has increasing - length. These are not consistent with `BeamSearchDecoder`, thus subclass - `BeamSearchDecoder` to make beam search adapt to Transformer decoder. - - Parameters: - cell(TransformerCell): An instance of `TransformerCell`. - start_token(int): The start token id. - end_token(int): The end token id. - beam_size(int): The beam width used in beam search. - var_dim_in_state(int): Indicate which dimension of states is variant. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.dygraph import Embedding, Linear - from paddle.text import TransformerDecoder - from paddle.text import TransformerCell - from paddle.text import TransformerBeamSearchDecoder - from paddle.text import DynamicDecode - - paddle.disable_static() - - class Embedder(fluid.dygraph.Layer): - def __init__(self): - super(Embedder, self).__init__() - self.word_embedder = Embedding(size=[1000, 128]) - self.pos_embedder = Embedding(size=[500, 128]) - - def forward(self, word, position): - return self.word_embedder(word) + self.pos_embedder(position) - - embedder = Embedder() - output_layer = Linear(128, 1000) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - transformer_cell = TransformerCell(decoder, embedder, output_layer) - dynamic_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, - start_token=0, - end_token=1, - beam_size=4, - var_dim_in_state=2), - max_step_num=10, - is_test=True) - - enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] - trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) - # inputs for beam search on Transformer - caches = transformer_cell.get_initial_states(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, beam_size=4) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, beam_size=4) - static_caches = decoder.prepare_static_cache(enc_output) - outputs = dynamic_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - """ - - def __init__(self, cell, start_token, end_token, beam_size, - var_dim_in_state): - super(TransformerBeamSearchDecoder, - self).__init__(cell, start_token, end_token, beam_size) - self.cell = cell - self.var_dim_in_state = var_dim_in_state - - def _merge_batch_beams_with_var_dim(self, x): - """ - Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new - tensor with shape `[batch_size * beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ - data type is same as `x`. 
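A bare-bones numpy analogue (toy shapes) of folding the beam dimension into the batch dimension and unfolding it again. This only shows the plain case; the helpers above additionally transpose so that the variable-length cache axis survives the reshape:

    import numpy as np

    batch_size, beam_size, d_model = 2, 4, 8
    x = np.random.rand(batch_size, beam_size, d_model)

    merged = x.reshape(batch_size * beam_size, d_model)      # [batch * beam, ...]
    split = merged.reshape(batch_size, beam_size, d_model)   # back to [batch, beam, ...]

    print(merged.shape, split.shape)   # (8, 8) (2, 4, 8)
    assert np.array_equal(x, split)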
- """ - # init length of cache is 0, and it increases with decoding carrying on, - # thus need to reshape elaborately - var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim - x = layers.transpose(x, - list(range(var_dim_in_state, len(x.shape))) + - list(range(0, var_dim_in_state))) - x = layers.reshape( - x, [0] * (len(x.shape) - var_dim_in_state - ) + [self.batch_size * self.beam_size] + - [int(size) for size in x.shape[-var_dim_in_state + 2:]]) - x = layers.transpose( - x, - list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + - list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) - return x - - def _split_batch_beams_with_var_dim(self, x): - """ - Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new - tensor with shape `[batch_size, beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ - data type is same as `x`. - """ - var_dim_size = layers.shape(x)[self.var_dim_in_state] - x = layers.reshape( - x, [-1, self.beam_size] + - [int(size) - for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + - [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) - return x - - def step(self, time, inputs, states, **kwargs): - """ - Perform a beam search decoding step, which uses `cell` to get probabilities, - and follows a beam search step to calculate scores and select candidate - token ids. - - Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped - `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined - position data as inputs to `cell`. - - Parameters: - time(Variable): An `int64` tensor with shape `[1]` provided by the caller, - representing the current time step number of decoding. - inputs(Variable): A tensor variable. It is same as `initial_inputs` - returned by `initialize()` for the first decoding step and - `next_inputs` returned by `step()` for the others. It is a int64 - id tensor with shape `[batch_size * beam_size]` - states(Variable): A structure of tensor variables. - It is same as the `initial_states` returned by `initialize()` for - the first decoding step and `beam_search_state` returned by - `step()` for the others. - **kwargs: Additional keyword arguments, provided by the caller. - - Returns: - tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ - `beam_search_state` and `next_inputs` have the same structure, \ - shape and data type as the input arguments `states` and `inputs` separately. \ - `beam_search_output` is a namedtuple(including scores, predicted_ids, \ - parent_ids as fields) of tensor variables, where \ - `scores, predicted_ids, parent_ids` all has a tensor value shaped \ - `[batch_size, beam_size]` with data type `float32, int64, int64`. \ - `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. 
- """ - # compared to RNN, Transformer has 3D data at every decoding step - inputs = layers.reshape(inputs, [-1, 1]) # token - pos = layers.ones_like(inputs) * time # pos - cell_states = map_structure(self._merge_batch_beams_with_var_dim, - states.cell_states) - - cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, - **kwargs) - - # squeeze to adapt to BeamSearchDecoder which use 2D logits - cell_outputs = map_structure( - lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x, - cell_outputs) - cell_outputs = map_structure(self._split_batch_beams, cell_outputs) - next_cell_states = map_structure(self._split_batch_beams_with_var_dim, - next_cell_states) - - beam_search_output, beam_search_state = self._beam_search_step( - time=time, - logits=cell_outputs, - next_cell_states=next_cell_states, - beam_state=states) - next_inputs, finished = (beam_search_output.predicted_ids, - beam_search_state.finished) - - return (beam_search_output, beam_search_state, next_inputs, finished) - - -### Transformer Modules ### -class PrePostProcessLayer(Layer): - """ - PrePostProcessLayer is used before/after each multi-head attention(MHA) and - feed-forward network(FFN) sub-layer to perform some specific process on - inputs/outputs. - - Parameters: - process_cmd (str): The process applied before/after each MHA and - FFN sub-layer. It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. - d_model (int): The expected feature size in the input and output. - dropout_rate (float): The dropout probability if the process includes - dropout. Default 0.1 - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import PrePostProcessLayer - - # input: [batch_size, sequence_length, d_model] - x = paddle.rand((2, 4, 32)) - process = PrePostProcessLayer('n', 32) - out = process(x) # [2, 4, 32] - """ - - def __init__(self, process_cmd, d_model, dropout_rate=0.1): - super(PrePostProcessLayer, self).__init__() - self.process_cmd = process_cmd - self.functors = [] - for cmd in self.process_cmd: - if cmd == "a": # add residual connection - self.functors.append(lambda x, y: x + y if y is not None else x) - elif cmd == "n": # add layer normalization - layer_norm = LayerNorm( - normalized_shape=d_model, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.))) - - self.functors.append( - self.add_sublayer( - "layer_norm_%d" % len( - self.sublayers(include_sublayers=False)), - layer_norm)) - elif cmd == "d": # add dropout - self.functors.append(lambda x: layers.dropout( - x, dropout_prob=dropout_rate, is_test=False) - if dropout_rate else x) - - def forward(self, x, residual=None): - """ - Applies `process_cmd` specified process on `x`. - - Parameters: - x (Variable): The tensor to be processed. The data type should be float32 - or float64. The shape is `[batch_size, sequence_length, d_model]`. - - residual (Variable, optional): Only used if the process includes - residual connection. It has the same shape and data type as `x`. - Default None - - Returns: - Variable: The processed tensor. It has the same shape and data type \ - as `x`. 
- """ - for i, cmd in enumerate(self.process_cmd): - if cmd == "a": - x = self.functors[i](x, residual) - else: - x = self.functors[i](x) - return x - - -class MultiHeadAttention(Layer): - """ - MultiHead Attention mapps queries and a set of key-value pairs to outputs - by jointly attending to information from different representation subspaces, - as what multi-head indicates it performs multiple attention in parallel. - - Please refer to `Attention Is All You Need `_ - for more details. - - Parameters: - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - n_head (int): The number of heads in multi-head attention(MHA). - dropout_rate (float, optional): The dropout probability used in MHA to - drop some attention target. Default 0.1 - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import MultiHeadAttention - - # encoder input: [batch_size, sequence_length, d_model] - query = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2) - output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] - """ - - def __init__(self, d_key, d_value, d_model, n_head, dropout_rate=0.1): - - super(MultiHeadAttention, self).__init__() - self.n_head = n_head - self.d_key = d_key - self.d_value = d_value - self.d_model = d_model - self.dropout_rate = dropout_rate - - self.q_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - self.k_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - self.v_fc = Linear( - input_dim=d_model, output_dim=d_value * n_head, bias_attr=False) - self.proj_fc = Linear( - input_dim=d_value * n_head, output_dim=d_model, bias_attr=False) - - def _prepare_qkv(self, queries, keys, values, cache=None): - """ - Prapares linear projected queries, keys and values for usage of subsequnt - multiple attention in parallel. If `cache` is not None, using cached - results to reduce redundant calculations. - - Parameters: - queries (Variable): The queries for multi-head attention. It is a - tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - keys (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - values (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, d_model]`. - The data type should be float32 or float64. - cache(dict, optional): It is a dict with `k` and `v` as keys, and - values cache the multi-head attention intermediate results of - history decoding steps for decoder self attention; Or a dict - with `static_k` and `statkc_v` as keys, and values stores intermediate - results of encoder output for decoder-encoder cross attention. - If it is for decoder self attention, values for `k` and `v` would - be updated by new tensors concatanating raw tensors with intermediate - results of current step. It is only used for inference and should - be None for training. 
Default None - - Returns: - tuple: A tuple including linear projected keys and values. These two \ - tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ - and `[batch_size, n_head, sequence_length, d_value]` separately, \ - and their data types are same as inputs. - """ - if keys is None: # self-attention - keys, values = queries, queries - static_kv = False - else: # cross-attention - static_kv = True - - q = self.q_fc(queries) - q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) - q = layers.transpose(x=q, perm=[0, 2, 1, 3]) - - if cache is not None and static_kv and "static_k" in cache: - # for encoder-decoder attention in inference and has cached - k = cache["static_k"] - v = cache["static_v"] - else: - k = self.k_fc(keys) - v = self.v_fc(values) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - - if cache is not None: - if static_kv and not "static_k" in cache: - # for encoder-decoder attention in inference and has not cached - cache["static_k"], cache["static_v"] = k, v - elif not static_kv: - # for decoder self-attention in inference - cache_k, cache_v = cache["k"], cache["v"] - k = layers.concat([cache_k, k], axis=2) - v = layers.concat([cache_v, v], axis=2) - cache["k"], cache["v"] = k, v - - return q, k, v - - def forward(self, - queries, - keys=None, - values=None, - attn_bias=None, - cache=None): - """ - Applies multi-head attention to map queries and a set of key-value pairs - to outputs. - - Parameters: - queries (Variable): The queries for multi-head attention. It is a - tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - keys (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - values (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, d_model]`. - The data type should be float32 or float64. - attn_bias (Variable, optional): A tensor used in multi-head attention - to mask out attention on unwanted positions, usually the - paddings or the subsequent positions. It is a tensor with shape - `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - cache(dict, optional): It is a dict with `k` and `v` as keys, and - values cache the multi-head attention intermediate results of - history decoding steps for decoder self attention; Or a dict - with `static_k` and `statkc_v` as keys, and values stores intermediate - results of encoder output for decoder-encoder cross attention. - If it is for decoder self attention, values for `k` and `v` would - be updated by new tensors concatanating raw tensors with intermediate - results of current step. It is only used for inference and should - be None for training. Default None - - Returns: - Variable: The output of multi-head attention. It is a tensor \ - that has the same shape and data type as `queries`. 
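The core scaled dot-product attention computed in `forward`, restated as a small numpy sketch (no attention bias or dropout, hypothetical shapes):

    import numpy as np

    def softmax(x):
        e = np.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    batch, n_head, seq_len, d_key = 2, 2, 4, 64
    q = np.random.rand(batch, n_head, seq_len, d_key)
    k = np.random.rand(batch, n_head, seq_len, d_key)
    v = np.random.rand(batch, n_head, seq_len, d_key)

    product = (q @ k.transpose(0, 1, 3, 2)) * d_key ** -0.5   # scaled scores
    weights = softmax(product)                                # attention weights
    out = weights @ v                                         # [batch, n_head, seq_len, d_key]

    # combine heads back to [batch, seq_len, n_head * d_key]
    out = out.transpose(0, 2, 1, 3).reshape(batch, seq_len, n_head * d_key)
    print(out.shape)    # (2, 4, 128)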
- """ - # compute q ,k ,v - q, k, v = self._prepare_qkv(queries, keys, values, cache) - - # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.d_key**-0.5) - if attn_bias is not None: - product += attn_bias - weights = layers.softmax(product) - if self.dropout_rate: - weights = layers.dropout( - weights, dropout_prob=self.dropout_rate, is_test=False) - - out = layers.matmul(weights, v) - - # combine heads - out = layers.transpose(out, perm=[0, 2, 1, 3]) - out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.proj_fc(out) - return out - - def cal_kv(self, keys, values): - """ - Applies linear projection on input keys and values, then splits heads - (reshape and transpose) to get keys and values from different representation - subspaces for usage of subsequnt multiple attention in parallel. - - Parameters: - keys (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - values (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, d_model]`. - The data type should be float32 or float64. - - Returns: - tuple: A tuple including linear projected keys and values. These two \ - tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ - and `[batch_size, n_head, sequence_length, d_value]` separately, \ - and their data types are same as inputs. - """ - k = self.k_fc(keys) - v = self.v_fc(values) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - return k, v - - -class FFN(Layer): - """ - A fully connected feed-forward network applied to each position separately - and identically. This consists of two linear transformations with a activation - and dropout in between. - - Parameters: - d_inner_hid (int): The hidden size in the feedforward network(FFN). - d_model (int): The expected feature size in the input and output. - dropout_rate (float, optional): The dropout probability used after - activition. Default 0.1 - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import FFN - - # input: [batch_size, sequence_length, d_model] - x = paddle.rand((2, 4, 32)) - ffn = FFN(128, 32) - out = ffn(x) # [2, 4, 32] - """ - - def __init__(self, d_inner_hid, d_model, dropout_rate=0.1, fc1_act="relu"): - super(FFN, self).__init__() - self.dropout_rate = dropout_rate - self.fc1 = Linear( - input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) - self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) - - def forward(self, x): - """ - Applies a fully connected feed-forward network on each position of the - input sequences separately and identically. - - Parameters: - x (Variable): The input of feed-forward network. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data - type should be float32 or float64. - - Returns: - Variable: The output of feed-forward network. It is a tensor that has \ - the same shape and data type as `enc_input`. 
- """ - hidden = self.fc1(x) - if self.dropout_rate: - hidden = layers.dropout( - hidden, dropout_prob=self.dropout_rate, is_test=False) - out = self.fc2(hidden) - return out - - -class TransformerEncoderLayer(Layer): - """ - TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) - attention and feedforward network. Before and after each sub-layer, pre-process - and post-precess would be applied on the input and output. - - Parameters: - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import TransformerEncoderLayer - - # encoder input: [batch_size, src_len, d_model] - enc_input = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) - enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128] - """ - - def __init__(self, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - - super(TransformerEncoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, - attention_dropout) - self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) - self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, enc_input, attn_bias=None): - """ - Applies a Transformer encoder layer on the input. - - Parameters: - enc_input (Variable): The input of Transformer encoder layer. It is - a tensor with shape `[batch_size, sequence_length, d_model]`. 
- The data type should be float32 or float64. - attn_bias(Variable, optional): A tensor used in encoder self attention - to mask out attention on unwanted positions, usually the paddings. It - is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - - Returns: - Variable: The output of Transformer encoder layer. It is a tensor that \ - has the same shape and data type as `enc_input`. - """ - attn_output = self.self_attn( - self.preprocesser1(enc_input), None, None, attn_bias) - attn_output = self.postprocesser1(attn_output, enc_input) - - ffn_output = self.ffn(self.preprocesser2(attn_output)) - ffn_output = self.postprocesser2(ffn_output, attn_output) - return ffn_output - - -class TransformerEncoder(Layer): - """ - TransformerEncoder is a stack of N encoder layers. - - Parameters: - n_layer (int): The number of encoder layers to be stacked. - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import TransformerEncoder - - # encoder input: [batch_size, src_len, d_model] - enc_input = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - encoder = TransformerEncoder(2, 2, 64, 64, 128, 512) - enc_output = encoder(enc_input, attn_bias) # [2, 4, 128] - """ - - def __init__(self, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - - super(TransformerEncoder, self).__init__() - - self.encoder_layers = list() - for i in range(n_layer): - self.encoder_layers.append( - self.add_sublayer( - "layer_%d" % i, - TransformerEncoderLayer( - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - ffn_fc1_act=ffn_fc1_act))) - self.processer = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, enc_input, attn_bias=None): - """ - Applies a stack of N Transformer encoder layers on input sequences. - - Parameters: - enc_input (Variable): The input of Transformer encoder. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data - type should be float32 or float64. - attn_bias(Variable, optional): A tensor used in encoder self attention - to mask out attention on unwanted positions, usually the paddings. It - is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - - Returns: - Variable: The output of Transformer encoder. It is a tensor that has \ - the same shape and data type as `enc_input`. - """ - for encoder_layer in self.encoder_layers: - enc_output = encoder_layer(enc_input, attn_bias) - enc_input = enc_output - - return self.processer(enc_output) - - -class TransformerDecoderLayer(Layer): - """ - TransformerDecoderLayer is composed of three sub-layers which are decoder - self (multi-head) attention, decoder-encoder cross attention and feedforward - network. Before and after each sub-layer, pre-process and post-precess would - be applied on the input and output. - - Parameters: - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. 
It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import TransformerDecoderLayer - - # decoder input: [batch_size, trg_len, d_model] - dec_input = paddle.rand((2, 4, 128)) - # encoder output: [batch_size, src_len, d_model] - enc_output = paddle.rand((2, 6, 128)) - # self attention bias: [batch_size, n_head, trg_len, trg_len] - self_attn_bias = paddle.rand((2, 2, 4, 4)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] - cross_attn_bias = paddle.rand((2, 2, 4, 6)) - decoder_layer = TransformerDecoderLayer(2, 64, 64, 128, 512) - output = decoder_layer(dec_input, - enc_output, - self_attn_bias, - cross_attn_bias) # [2, 4, 128] - """ - - def __init__(self, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - super(TransformerDecoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, - attention_dropout) - self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, - attention_dropout) - self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) - self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, - dec_input, - enc_output, - self_attn_bias=None, - cross_attn_bias=None, - cache=None): - """ - Applies a Transformer decoder layer on the input. - - Parameters: - dec_input (Variable): The input of Transformer decoder. It is a tensor - with shape `[batch_size, target_length, d_model]`. The data type - should be float32 or float64. - enc_output (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. - self_attn_bias (Variable, optional): A tensor used in decoder self attention - to mask out attention on unwanted positions, usually the subsequent positions. - It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross - attention to mask out attention on unwanted positions, usually the paddings. 
- It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - caches(dict, optional): It caches the multi-head attention intermediate - results of history decoding steps and encoder output. It is a dict - has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached - results. It is only used for inference and should be None for - training. Default None - - Returns: - Variable: The output of Transformer decoder layer. It is a tensor \ - that has the same shape and data type as `dec_input`. - """ - self_attn_output = self.self_attn( - self.preprocesser1(dec_input), None, None, self_attn_bias, cache) - self_attn_output = self.postprocesser1(self_attn_output, dec_input) - - cross_attn_output = self.cross_attn( - self.preprocesser2(self_attn_output), enc_output, enc_output, - cross_attn_bias, cache) - cross_attn_output = self.postprocesser2(cross_attn_output, - self_attn_output) - - ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) - ffn_output = self.postprocesser3(ffn_output, cross_attn_output) - - return ffn_output - - -class TransformerDecoder(Layer): - """ - TransformerDecoder is a stack of N decoder layers. - - Parameters: - n_layer (int): The number of encoder layers to be stacked. - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import TransformerDecoder - - # decoder input: [batch_size, trg_len, d_model] - dec_input = paddle.rand((2, 4, 128)) - # encoder output: [batch_size, src_len, d_model] - enc_output = paddle.rand((2, 6, 128)) - # self attention bias: [batch_size, n_head, trg_len, trg_len] - self_attn_bias = paddle.rand((2, 2, 4, 4)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] - cross_attn_bias = paddle.rand((2, 2, 4, 6)) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - dec_output = decoder(dec_input, - enc_output, - self_attn_bias, - cross_attn_bias) # [2, 4, 128] - """ - - def __init__(self, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - super(TransformerDecoder, self).__init__() - - self.n_layer = n_layer - self.n_head = n_head - self.d_key = d_key - self.d_value = d_value - - self.decoder_layers = list() - for i in range(n_layer): - self.decoder_layers.append( - self.add_sublayer( - "layer_%d" % i, - TransformerDecoderLayer(n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, - attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd))) - self.processer = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, - dec_input, - enc_output, - self_attn_bias=None, - cross_attn_bias=None, - caches=None): - """ - Applies a stack of N Transformer decoder layers on inputs. - - Parameters: - dec_input (Variable): The input of Transformer decoder. It is a tensor - with shape `[batch_size, target_length, d_model]`. The data type - should be float32 or float64. - enc_output (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. - self_attn_bias (Variable, optional): A tensor used in decoder self attention - to mask out attention on unwanted positions, usually the subsequent positions. - It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross - attention to mask out attention on unwanted positions, usually the paddings. - It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - caches(list, optional): It caches the multi-head attention intermediate results - of history decoding steps and encoder output. It is a list of dict - where the length of list is decoder layer number, and each dict - has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached - results. It is only used for inference and should be None for - training. Default None - - Returns: - Variable: The output of Transformer decoder. It is a tensor that has \ - the same shape and data type as `dec_input`. 
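As a rough illustration of the `caches` argument described above, the following sketch builds the per-layer list of dicts by hand for a 2-layer decoder; the zero-length time axis of `k`/`v` grows by one position at every decoding step, while `static_k`/`static_v` stay fixed. The shapes are assumptions for the example only:

.. code-block:: python

    import numpy as np

    n_layer, batch_size, n_head, d_key, d_value, src_len = 2, 2, 2, 64, 64, 6
    # One dict per decoder layer: `k`/`v` hold the self-attention history
    # (initially empty along axis 2), `static_k`/`static_v` hold the projected
    # encoder output used by decoder-encoder cross attention.
    caches = [{
        "k": np.zeros((batch_size, n_head, 0, d_key), dtype="float32"),
        "v": np.zeros((batch_size, n_head, 0, d_value), dtype="float32"),
        "static_k": np.random.rand(batch_size, n_head, src_len, d_key).astype("float32"),
        "static_v": np.random.rand(batch_size, n_head, src_len, d_value).astype("float32"),
    } for _ in range(n_layer)]

    # `decoder(dec_input, enc_output, self_attn_bias, cross_attn_bias, caches)`
    # would then be called once per decoding step, and each layer appends the
    # current step's projected key/value to its own `k`/`v` entries.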
- """ - for i, decoder_layer in enumerate(self.decoder_layers): - dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, - cross_attn_bias, caches[i] - if caches else None) - dec_input = dec_output - - return self.processer(dec_output) - - def prepare_static_cache(self, enc_output): - """ - Generate a list of dict where the length of list is decoder layer number. - Each dict has `static_k`, `statkc_v` as keys, and values are projected - results of encoder output to be used as keys and values in decoder-encoder - cross (multi-head) attention. Used in inference. - - Parameters: - enc_output (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. - - Returns: - list: A list of dict. Each dict has `static_k`, `statkc_v` as keys, \ - and values are projected results of encoder output to be used as \ - keys and values in decoder-encoder cross (multi-head) attention. - """ - return [ - dict( - zip(("static_k", "static_v"), - decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) - for decoder_layer in self.decoder_layers - ] - - def prepare_incremental_cache(self, enc_output): - """ - Generate a list of dict where the length of list is decoder layer number. - Each dict has `k`, `v` as keys, and values are empty tensors with shape - `[batch_size, n_head, 0, d_key]` and `[batch_size, n_head, 0, d_value]`, - representing the decoder self (multi-head) attention intermediate results, - and 0 is the initial length which would increase as inference decoding - continued. Used in inference. - - Parameters: - enc_output (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. Actually, it is used to provide batch - size for Transformer initial states(caches), thus any tensor has - wanted batch size can be used here. - - Returns: - list: A list of dict. Each dict has `k`, `v` as keys, and values are \ - empty tensors representing intermediate results of history decoding \ - steps in decoder self (multi-head) attention at time step 0. - """ - return [{ - "k": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_key], - dtype=enc_output.dtype, - value=0), - "v": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_value], - dtype=enc_output.dtype, - value=0), - } for i in range(self.n_layer)] - - -class LinearChainCRF(Layer): - """ - Computes the negtive log-likelihood of tag sequences in a linear chain CRF. - Using terminologies of undirected probabilistic graph model, it calculates - probability using unary potentials (for emission) and binary potentials - (for transition). - - This layer creates a learnable parameter shaped `[size + 2, size]` (`size` - is for the number of tags), where: - - 1. the first row is for starting weights, denoted as $a$ here - - 2. the second row is for ending weights, denoted as $b$ here. - - 3. the remaining rows is a matrix for transition weights. - - Denote input tensor of unary potentials(emission) as $x$ , then the probability - of a tag sequence $s$ of length $L$ is defined as: - - $$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} - + \sum_{l=1}^L x_{s_l} - + \sum_{l=2}^L w_{s_{l-1},s_l})$$ - - where $Z$ is a normalization value so that the sum of $P(s)$ over - all possible sequences is 1, and $x$ is the emission feature weight - to the linear chain CRF. 
- - This operator implements the Forward-Backward algorithm for the linear chain - CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and - http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. - - NOTE: - - 1. The feature function for a CRF is made up of the emission features and the - transition features. The emission feature weights are NOT computed in - this operator. They MUST be computed first before this operator is called. - - 2. Because this operator performs global normalization over all possible - sequences internally, it expects UNSCALED emission feature weights. - Please do not call this op with the emission feature being output of any - nonlinear activation. - - 3. The 2nd dimension of input(emission) MUST be equal to the tag number. - - Parameters: - size (int): The number of tags. - param_attr (ParamAttr, optional): The attribute of the learnable parameter for - transition. Default: None - dtype (str, optional): Data type, it can be 'float32' or 'float64'. - Default: `float32` - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - from paddle.text import LinearChainCRF - - # emission: [batch_size, sequence_length, num_tags] - emission = paddle.rand((2, 8, 5)) - # label: [batch_size, sequence_length, num_tags] - # dummy label just for example usage - label = paddle.ones((2, 8), dtype='int64') - length = fluid.layers.assign(np.array([6, 8]).astype('int64')) - crf = LinearChainCRF(size=5) - cost = crf(emission, label, length) # [2, 1] - """ - - def __init__(self, size, param_attr=None, dtype='float32'): - super(LinearChainCRF, self).__init__() - self._param_attr = param_attr - self._dtype = dtype - self._size = size - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) - - @property - def weight(self): - """ - getter for transition matrix parameter - - Returns: - Parameter: The learnable transition parameter shaped `[size + 2, size]` \ - (`size` is for the number of tags). The data type should be float32 \ - or float64. - """ - return self._transition - - @weight.setter - def weight(self, value): - """ - setter for transition matrix parameter - - Parameters: - value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \ - (`size` is for the number of tags). The data type should be float32 \ - or float64. - """ - self._transition = value - - def forward(self, input, label, length): - """ - Computes the log-likelihood of tag sequences in a linear chain CRF. - - Parameters: - input (Variable): The input of unary potentials(emission). It is a - tensor with shape `[batch_size, sequence_length, num_tags]`. - The data type should be float32 or float64. - label (Variable): The golden sequence tags. It is a tensor - with shape `[batch_size, sequence_length]`. The data type - should be int64. - length (Variable): A tensor with shape `[batch_size]`. It stores real - length of each sequence for correctness. - - Returns: - Variable: The negtive log-likelihood of tag sequences. It is a tensor \ - with shape `[batch_size, 1]` and has float32 or float64 data type. 
- """ - alpha = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - emission_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - transition_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - log_likelihood = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": [label] - } - if length is not None: - this_inputs['Length'] = [length] - self._helper.append_op( - type='linear_chain_crf', - inputs=this_inputs, - outputs={ - "Alpha": [alpha], - "EmissionExps": [emission_exps], - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood - }) - return log_likelihood - - -class CRFDecoding(Layer): - """ - CRFDecoding reads the emission feature weights and the transition - feature weights learned by the `LinearChainCRF` and performs decoding. - It implements the Viterbi algorithm which is a dynamic programming algorithm - for finding the most likely sequence of hidden states, called the Viterbi path, - that results in a sequence of observed tags. - - The output of this layer changes according to whether `label` is given: - - 1. `label` is given: - - This happens in training. This operator is used to co-work with the chunk_eval - operator. When `label` is given, it returns tensor with the same shape as - `label` whose values are fixed to be 0, indicating an incorrect prediction, - or 1 indicating a tag is correctly predicted. Such an output is the input to - chunk_eval operator. - - 2. `label` is not given: - - This is the standard decoding process and get the highest scoring sequence - of tags. - - Parameters: - size (int): The number of tags. - param_attr (ParamAttr, optional): The attribute of the learnable parameter for - transition. Default: None - dtype (str, optional): Data type, it can be 'float32' or 'float64'. - Default: `float32` - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - from paddle.text import CRFDecoding - - # emission: [batch_size, sequence_length, num_tags] - emission = paddle.rand((2, 8, 5)) - length = fluid.layers.assign(np.array([6, 8]).astype('int64')) - crf_decoding = CRFDecoding(size=5) - cost = crf_decoding(emission, length) # [2, 8] - """ - - def __init__(self, size, param_attr=None, dtype='float32'): - super(CRFDecoding, self).__init__() - self._dtype = dtype - self._size = size - self._param_attr = param_attr - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) - - @property - def weight(self): - """ - getter for transition matrix parameter - - Returns: - Parameter: The learnable transition parameter shaped `[size + 2, size]` \ - (`size` is for the number of tags). The data type should be float32 \ - or float64. - """ - return self._transition - - @weight.setter - def weight(self, value): - """ - setter for transition matrix parameter - - Parameters: - value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \ - (`size` is for the number of tags). The data type should be float32 \ - or float64. - """ - self._transition = value - - def forward(self, input, length, label=None): - """ - Performs sequence tagging prediction. - - Parameters: - input (Variable): The input of unary potentials(emission). It is a - tensor with shape `[batch_size, sequence_length, num_tags]`. - The data type should be float32 or float64. 
- length (Variable): A tensor with shape `[batch_size]`. - It stores real length of each sequence for correctness. - label (Variable, optional): The golden sequence tags. It is a tensor - with shape `[batch_size, sequence_length]`. The data type - should be int64. Default None. - - Returns: - Variable: A tensor with shape `[batch_size, sequence_length]` and \ - int64 data type. If `label` is None, the tensor has binary values \ - indicating a correct or incorrect prediction. Otherwise its values \ - range from 0 to maximum tag number - 1, each element indicates \ - an index of a predicted tag. - """ - - viterbi_path = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": label - } - if length is not None: - this_inputs['Length'] = [length] - self._helper.append_op( - type='crf_decoding', - inputs=this_inputs, - outputs={"ViterbiPath": [viterbi_path]}) - return viterbi_path - - -class _GRUEncoder(Layer): - """ - A multi-layer bidirectional GRU encoder used by SequenceTagging. - """ - - def __init__(self, - input_dim, - grnn_hidden_dim, - init_bound, - num_layers=1, - is_bidirection=False): - super(_GRUEncoder, self).__init__() - self.num_layers = num_layers - self.is_bidirection = is_bidirection - self.gru_list = [] - self.gru_r_list = [] - for i in range(num_layers): - self.basic_gru_cell = BasicGRUCell( - input_size=input_dim if i == 0 else input_dim * 2, - hidden_size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - self.gru_list.append( - self.add_sublayer( - "gru_%d" % i, - RNN(self.basic_gru_cell, is_reverse=False, - time_major=False))) - if self.is_bidirection: - for i in range(num_layers): - self.basic_gru_cell_r = BasicGRUCell( - input_size=input_dim if i == 0 else input_dim * 2, - hidden_size=grnn_hidden_dim, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-init_bound, high=init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - self.gru_r_list.append( - self.add_sublayer( - "gru_r_%d" % i, - RNN(self.basic_gru_cell_r, - is_reverse=True, - time_major=False))) - - def forward(self, input_feature, h0=None): - for i in range(self.num_layers): - pre_gru, pre_state = self.gru_list[i](input_feature) - if self.is_bidirection: - gru_r, r_state = self.gru_r_list[i](input_feature) - out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) - else: - out = pre_gru - input_feature = out - return out - - -class SequenceTagging(Layer): - """ - Sequence tagging model using multi-layer bidirectional GRU as backbone and - linear chain CRF as output layer. - - Parameters: - vocab_size (int): The size of vocabulary. - num_labels (int): The number of labels. - word_emb_dim (int, optional): The embedding size. Defalut 128 - grnn_hidden_dim (int, optional): The hidden size of GRU. Defalut 128 - emb_learning_rate (int, optional): The partial learning rate for embedding. - The actual learning rate for embedding would multiply it with the global - learning rate. Default 0.1 - crf_learning_rate (int, optional): The partial learning rate for crf. The - actual learning rate for embedding would multiply it with the global - learning rate. Default 0.1 - bigru_num (int, optional): The number of bidirectional GRU layers. 
- Default 2 - init_bound (float, optional): The range for uniform initializer would - be `(-init_bound, init_bound)`. It would be used for all parameters - except CRF transition matrix. Default 0.1 - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - from paddle.text import SequenceTagging - - # word: [batch_size, sequence_length] - # dummy input just for example - word = paddle.ones((2, 8), dtype='int64') - length = fluid.layers.assign(np.array([6, 8]).astype('int64')) - seq_tagger = SequenceTagging(vocab_size=100, num_labels=5) - outputs = seq_tagger(word, length) - """ - - def __init__(self, - vocab_size, - num_labels, - word_emb_dim=128, - grnn_hidden_dim=128, - emb_learning_rate=0.1, - crf_learning_rate=0.1, - bigru_num=2, - init_bound=0.1): - super(SequenceTagging, self).__init__() - self.word_emb_dim = word_emb_dim - self.vocab_size = vocab_size - self.num_labels = num_labels - self.grnn_hidden_dim = grnn_hidden_dim - self.emb_lr = emb_learning_rate - self.crf_lr = crf_learning_rate - self.bigru_num = bigru_num - self.init_bound = 0.1 - - self.word_embedding = Embedding( - size=[self.vocab_size, self.word_emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr( - learning_rate=self.emb_lr, - name="word_emb", - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound))) - - self.gru_encoder = _GRUEncoder( - input_dim=self.grnn_hidden_dim, - grnn_hidden_dim=self.grnn_hidden_dim, - init_bound=self.init_bound, - num_layers=self.bigru_num, - is_bidirection=True) - - self.fc = Linear( - input_dim=self.grnn_hidden_dim * 2, - output_dim=self.num_labels, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=-self.init_bound, high=self.init_bound), - regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) - - self.linear_chain_crf = LinearChainCRF( - param_attr=fluid.ParamAttr( - name='linear_chain_crfw', learning_rate=self.crf_lr), - size=self.num_labels) - - self.crf_decoding = CRFDecoding( - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=self.crf_lr), - size=self.num_labels) - - def forward(self, word, lengths, target=None): - """ - Performs sequence tagging. If `target` is None, it is for training and - loss would be returned, otherwise it is for inference and returns the - predicted tags. - - Parameters: - word (Variable): The input sequences to be labeled. It is a tensor - with shape `[batch_size, sequence_length]`. The data type should - be int64. - lengths (Variable): A tensor with shape `[batch_size]`. It stores real - length of each sequence. - target (Variable, optional): The golden sequence tags. It is a tensor - with shape `[batch_size, sequence_length]`. The data type - should be int64. It could be None for inference. Default None. - - Returns: - tuple: A tuple( :code:`(crf_decode, avg_cost, lengths)` ) If input \ - argument `target` is provided, including the most likely sequence \ - tags, the averaged CRF cost and the sequence lengths, the shapes \ - are `[batch_size, sequence_length]`, `[1]` and `[batch_size]`, \ - and the data types are int64, float32 and int64. Otherwise A \ - tuple( :code:`(crf_decode, lengths)` ) for inference. 
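The two return signatures described above can be exercised as in the following sketch, written in the same style as the docstring example (shapes and tag values are dummies):

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.fluid as fluid
    from paddle.text import SequenceTagging

    word = paddle.ones((2, 8), dtype='int64')
    length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
    target = paddle.zeros((2, 8), dtype='int64')      # dummy golden tags
    seq_tagger = SequenceTagging(vocab_size=100, num_labels=5)

    # Training mode: golden tags provided, the averaged CRF cost is returned too.
    crf_decode, avg_cost, lengths = seq_tagger(word, length, target)
    # Inference mode: no target, only decoded tags and sequence lengths.
    crf_decode, lengths = seq_tagger(word, length)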
- """ - word_embed = self.word_embedding(word) - input_feature = word_embed - - bigru_output = self.gru_encoder(input_feature) - emission = self.fc(bigru_output) - - if target is not None: - crf_cost = self.linear_chain_crf( - input=emission, label=target, length=lengths) - avg_cost = fluid.layers.mean(x=crf_cost) - self.crf_decoding.weight = self.linear_chain_crf.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, avg_cost, lengths - else: - self.linear_chain_crf.weight = self.crf_decoding.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, lengths diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 77f5ef7e9661e..9d7a05131ffa1 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -17,13 +17,14 @@ from .profiler import get_profiler from .deprecated import deprecated from .lazy_import import try_import +from .install_check import run_check from ..fluid.framework import unique_name from ..fluid.framework import load_op_library from ..fluid.framework import require_version from . import download -__all__ = ['dump_config', 'deprecated', 'download'] +__all__ = ['dump_config', 'deprecated', 'download', 'run_check'] #TODO: define new api under this directory __all__ += ['unique_name', 'load_op_library', 'require_version'] diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py new file mode 100644 index 0000000000000..3b98680c89f25 --- /dev/null +++ b/python/paddle/utils/install_check.py @@ -0,0 +1,186 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +import os +import logging +import numpy as np + +import paddle + +__all__ = ['run_check'] + + +def _simple_network(): + """ + Define a simple network composed by a single linear layer. + """ + input = paddle.static.data( + name="input", shape=[None, 2, 2], dtype="float32") + weight = paddle.create_parameter( + shape=[2, 3], + dtype="float32", + attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.1))) + bias = paddle.create_parameter(shape=[3], dtype="float32") + linear_out = paddle.nn.functional.linear(x=input, weight=weight, bias=bias) + out = paddle.tensor.sum(linear_out) + return input, out, weight + + +def _prepare_data(device_count): + """ + Prepare feeding data for simple network. The shape is [device_count, 2, 2]. + + Args: + device_count (int): The number of devices. + """ + # Prepare the feeding data. + np_input_single = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + if device_count == 1: + return np_input_single.reshape(device_count, 2, 2) + else: + input_list = [] + for i in range(device_count): + input_list.append(np_input_single) + np_input_muti = np.array(input_list) + np_input_muti = np_input_muti.reshape(device_count, 2, 2) + return np_input_muti + + +def _is_cuda_available(): + """ + Check whether CUDA is avaiable. 
+ """ + try: + assert len(paddle.static.cuda_places()) > 0 + return True + except Exception as e: + logging.warning( + "You are using GPU version PaddlePaddle, but there is no GPU " + "detected on your machine. Maybe CUDA devices is not set properly." + "\n Original Error is {}".format(e)) + return False + + +def _run_static_single(use_cuda): + """ + Testing the simple network with executor running directly, using one CPU/GPU. + + Args: + use_cuda (bool): Whether running with CUDA. + """ + paddle.enable_static() + with paddle.static.scope_guard(paddle.static.Scope()): + train_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + startup_prog.random_seed = 1 + with paddle.static.program_guard(train_prog, startup_prog): + input, out, weight = _simple_network() + param_grads = paddle.static.append_backward( + out, parameter_list=[weight.name])[0] + + exe = paddle.static.Executor( + paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + exe.run(startup_prog) + exe.run(train_prog, + feed={input.name: _prepare_data(1)}, + fetch_list=[out.name, param_grads[1].name]) + paddle.disable_static() + + +def _run_static_parallel(use_cuda, device_list): + """ + Testing the simple network in data parallel mode, using multiple CPU/GPU. + + Args: + use_cuda (bool): Whether running with CUDA. + device_list (int): The specified devices. + """ + paddle.enable_static() + with paddle.static.scope_guard(paddle.static.Scope()): + train_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(train_prog, startup_prog): + input, out, _ = _simple_network() + loss = paddle.tensor.mean(out) + loss.persistable = True + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + compiled_prog = paddle.static.CompiledProgram( + train_prog).with_data_parallel( + loss_name=loss.name, places=device_list) + + exe = paddle.static.Executor( + paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + exe.run(startup_prog) + exe.run(compiled_prog, + feed={input.name: _prepare_data(len(device_list))}, + fetch_list=[loss.name]) + paddle.disable_static() + + +def run_check(): + """ + Check whether PaddlePaddle is installed correctly and running successfully + on your system. + + Examples: + .. code-block:: python + + import paddle + + paddle.utils.run_check() + # Running verify PaddlePaddle program ... + # W1010 07:21:14.972093 8321 device_context.cc:338] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 11.0, Runtime API Version: 10.1 + # W1010 07:21:14.979770 8321 device_context.cc:346] device: 0, cuDNN Version: 7.6. + # PaddlePaddle works well on 1 GPU. + # PaddlePaddle works well on 8 GPUs. + # PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now. + """ + + print("Running verify PaddlePaddle program ... ") + + use_cuda = _is_cuda_available() + if use_cuda: + device_str = "GPU" + device_list = paddle.static.cuda_places() + else: + device_str = "CPU" + device_list = paddle.static.cpu_places(device_count=2) + device_count = len(device_list) + + _run_static_single(use_cuda) + print("PaddlePaddle works well on 1 {}.".format(device_str)) + + try: + _run_static_parallel(use_cuda, device_list) + print("PaddlePaddle works well on {} {}s.".format(device_count, + device_str)) + print( + "PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now." + ) + except Exception as e: + logging.warning( + "PaddlePaddle meets some problem with {} {}s. This may be caused by:" + "\n 1. 
There is not enough GPUs visible on your system" + "\n 2. Some GPUs are occupied by other process now" + "\n 3. NVIDIA-NCCL2 is not installed correctly on your system. Please follow instruction on https://github.com/NVIDIA/nccl-tests " + "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html". + format(device_count, device_str)) + + logging.warning("\n Original Error is: {}".format(e)) + print("PaddlePaddle is installed successfully ONLY for single {}! " + "Let's start deep learning with PaddlePaddle now.".format( + device_str)) diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py index 39654122e3b33..4e6030bd14bf9 100644 --- a/python/paddle/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -240,9 +240,8 @@ def _mobilenet(arch, pretrained=False, **kwargs): arch) weight_path = get_weights_path_from_url(model_urls[arch][0], model_urls[arch][1]) - assert weight_path.endswith( - '.pdparams'), "suffix of weight must be .pdparams" - param, _ = paddle.load(weight_path) + + param = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index bab8b7b2b1b93..0f4dc22f679df 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -194,9 +194,8 @@ def _mobilenet(arch, pretrained=False, **kwargs): arch) weight_path = get_weights_path_from_url(model_urls[arch][0], model_urls[arch][1]) - assert weight_path.endswith( - '.pdparams'), "suffix of weight must be .pdparams" - param, _ = paddle.load(weight_path) + + param = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index f9e00aefd6bb2..3ae01b6fd7d76 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -262,9 +262,8 @@ def _resnet(arch, Block, depth, pretrained, **kwargs): arch) weight_path = get_weights_path_from_url(model_urls[arch][0], model_urls[arch][1]) - assert weight_path.endswith( - '.pdparams'), "suffix of weight must be .pdparams" - param, _ = paddle.load(weight_path) + + param = paddle.load(weight_path) model.set_dict(param) return model diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index d11845b661626..2d62e1d22d430 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -117,9 +117,8 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs): arch) weight_path = get_weights_path_from_url(model_urls[arch][0], model_urls[arch][1]) - assert weight_path.endswith( - '.pdparams'), "suffix of weight must be .pdparams" - param, _ = paddle.load(weight_path) + + param = paddle.load(weight_path) model.load_dict(param) return model diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 16e61d7c77a4e..8b0be9d8a6a66 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + if [ -z ${BRANCH} ]; then BRANCH="develop" fi @@ -19,8 +33,8 @@ API_FILES=("CMakeLists.txt" "paddle/fluid/framework/ir/node.h" "paddle/fluid/framework/ir/graph.h" "paddle/fluid/framework/framework.proto" - "python/paddle/distributed/__init" - "python/paddle/distributed/fleet/__init__.py" + "python/paddle/distributed/__init" + "python/paddle/distributed/fleet/__init__.py" "python/requirements.txt" "python/paddle/fluid/__init__.py" "python/paddle/fluid/compiler.py" @@ -74,7 +88,7 @@ for API_FILE in ${API_FILES[*]}; do if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # You can use http://caius.github.io/github_id/ to find Github user id. - # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676. Dong Daxiang 35550832. + # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676, Dong Daxiang 35550832, phlrain 43953930. 
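`check_approval` itself is defined elsewhere in this script; conceptually it fetches the PR's reviews and requires a minimum number of approvals from the listed user ids. A rough Python sketch of that idea follows (the endpoint usage, helper name and arguments are assumptions for illustration, not the script's actual implementation):

.. code-block:: python

    import requests

    def has_required_approvals(pr_id, required_count, allowed_user_ids,
                               repo="PaddlePaddle/Paddle"):
        """Return True if at least `required_count` of the listed users approved the PR."""
        url = "https://api.github.com/repos/{}/pulls/{}/reviews?per_page=10000".format(
            repo, pr_id)
        reviews = requests.get(url).json()
        approvers = {r["user"]["id"] for r in reviews if r.get("state") == "APPROVED"}
        return len(approvers & set(allowed_user_ids)) >= required_count

    # e.g. at least one RD approval (luotao1 or XiaoguangHu01) for CMakeLists.txt:
    # has_required_approvals(12345, 1, [6836917, 46782768])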
if [ "${API_FILE}" == "CMakeLists.txt" ];then echo_line="You must have one RD (luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n" check_approval 1 6836917 46782768 @@ -82,8 +96,8 @@ for API_FILE in ${API_FILES[*]}; do echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n" check_approval 1 6836917 47554610 elif [ "${API_FILE}" == "python/requirements.txt" ];then - echo_line="You must have one RD (kolinwei (Recommend) or luotao1) approval for python/requirements.txt, which manages the third-party python package.\n" - check_approval 1 22165420 6836917 + echo_line="You must have one RD (phlrain) and one TPM (swtkiwi) and one QA (kolinwei) approval for python/requirements.txt, which manages the third-party python package.\n" + check_approval 3 43953930 27208573 22165420 elif [ "${API_FILE}" == "paddle/fluid/operators/distributed/send_recv.proto.in" ];then echo_line="You must have one RD (gongweibao or seiriosPlus) approval for the paddle/fluid/operators/distributed/send_recv.proto.in, which manages the environment variables.\n" check_approval 1 10721757 5442383 @@ -129,9 +143,12 @@ for API_FILE in ${API_FILES[*]}; do elif [ "${API_FILE}" == "paddle/scripts/paddle_build.bat" ]; then echo_line="You must have one RD (zhouwei25 (Recommend), luotao1) approval for ${API_FILE} changes, which manages all Paddle CI task on Windows.\n" check_approval 1 52485244 6836917 + elif [ "${API_FILE}" == "python/paddle/fluid/parallel_executor.py" ]; then + echo_line="You must have one RD (Xreki,luotao1,zhhsplendid) approval for ${API_FILE}, which manages the underlying code for PaddlePaddle.\n" + check_approval 1 12538138 6836917 7913861 else echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n" - check_approval 1 3048612 46782768 12538138 6836917 + check_approval 1 46782768 12538138 6836917 fi fi done @@ -140,7 +157,7 @@ FILTER=`git diff --name-only upstream/develop | grep -v "tools/"` HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH $FILTER |grep -o -m 1 "const_cast" || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for the usage (either add or delete) of const_cast.\n" - check_approval 1 3048612 46782768 12538138 6836917 + check_approval 1 46782768 12538138 6836917 fi HAS_BOOST_GET=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "boost::get" || true` @@ -179,7 +196,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6836917 47554610 22561442 fi -ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` +ALL_ADDED_LINES=`git diff -U0 upstream/$BRANCH |grep "^+" || true` +ALL_PADDLE_CHECK=`echo $ALL_ADDED_LINES |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true` INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true` if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then @@ -247,8 +265,8 @@ if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then 
CHECK_WHOLE=$CHECK_OUTPUT$CHECK_OUTPUT_WITH_PLACE$CHECK_GRAD$CHECK_GRAD_CHECK if [ "${CHECK_WHOLE}" != "" ] ; then CHECK_OP=${CHECK_WHOLE//+/'\n+'} - echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n" - check_approval 1 6836917 47554610 12538138 43953930 + echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), fuyinno4 (Recommend for kunlun), luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n" + check_approval 1 6836917 47554610 12538138 43953930 35824027 fi fi diff --git a/tools/codestyle/copyright.hook b/tools/codestyle/copyright.hook index 86b16ebdc4604..d25ac074d8c92 100644 --- a/tools/codestyle/copyright.hook +++ b/tools/codestyle/copyright.hook @@ -1,15 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals import argparse -import io, re -import sys, os -import subprocess -import platform +import io +import re +import sys +import os +import datetime -COPYRIGHT = ''' -Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +COPYRIGHT = '''Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,74 +35,80 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License. 
-''' +limitations under the License.''' -LANG_COMMENT_MARK = None +def _generate_copyright(comment_mark): + copyright=COPYRIGHT.split(os.linesep) + header = copyright[0].rstrip() -NEW_LINE_MARK = None + p = re.search('(\d{4})', header).group(0) + now = datetime.datetime.now() -COPYRIGHT_HEADER = None + header = header.replace(p,str(now.year)) -if platform.system() == "Windows": - NEW_LINE_MARK = "\r\n" -else: - NEW_LINE_MARK = '\n' - COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1] - p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0) - process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE) - date, err = process.communicate() - date = date.decode("utf-8").rstrip("\n") - COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date) + ans=[comment_mark + " " + header + os.linesep] + for idx, line in enumerate(copyright[1:]): + ans.append(comment_mark + " " + line.rstrip() + os.linesep) + return ans -def generate_copyright(template, lang='C'): - if lang == 'Python': - LANG_COMMENT_MARK = '#' - else: - LANG_COMMENT_MARK = "//" - - lines = template.split(NEW_LINE_MARK) - BLANK = " " - ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK - for lino, line in enumerate(lines): - if lino == 0 or lino == 1 or lino == len(lines) - 1: continue - if len(line) == 0: - BLANK = "" - else: - BLANK = " " - ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK - - return ans + "\n" - - -def lang_type(filename): - if filename.endswith(".py"): - return "Python" - elif filename.endswith(".h"): - return "C" - elif filename.endswith(".c"): - return "C" - elif filename.endswith(".hpp"): - return "C" - elif filename.endswith(".cc"): - return "C" - elif filename.endswith(".cpp"): - return "C" - elif filename.endswith(".cu"): - return "C" - elif filename.endswith(".cuh"): - return "C" - elif filename.endswith(".go"): - return "C" - elif filename.endswith(".proto"): - return "C" +def _get_comment_mark(path): + lang_type=re.compile(r"\.(py|sh)$") + if lang_type.search(path) is not None: + return "#" + + lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$") + if lang_type.search(path) is not None: + return "//" + + return None + + +RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE) +RE_COPYRIGHT = re.compile(r".*Copyright \(c\) \d{4}", re.IGNORECASE) +RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!") + +def _check_copyright(path): + head=[] + try: + with open(path) as f: + head = [next(f) for x in range(4)] + except StopIteration: + pass + + for idx, line in enumerate(head): + if RE_COPYRIGHT.search(line) is not None: + return True + + return False + +def generate_copyright(path, comment_mark): + original_contents = io.open(path, encoding="utf-8").readlines() + head = original_contents[0:4] + + insert_line_no=0 + for i, line in enumerate(head): + if RE_ENCODE.search(line) or RE_SHEBANG.search(line): + insert_line_no=i+1 + + copyright = _generate_copyright(comment_mark) + if insert_line_no == 0: + new_contents = copyright + if len(original_contents) > 0 and len(original_contents[0].strip()) != 0: + new_contents.append(os.linesep) + new_contents.extend(original_contents) else: - print("Unsupported filetype %s", filename) - exit(0) + new_contents=original_contents[0:insert_line_no] + new_contents.append(os.linesep) + new_contents.extend(copyright) + if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0: + new_contents.append(os.linesep) + new_contents.extend(original_contents[insert_line_no:]) + new_contents="".join(new_contents) + with 
+        output_file.write(new_contents)

-PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")

 def main(argv=None):
@@ -98,23 +118,16 @@ def main(argv=None):
     args = parser.parse_args(argv)

     retv = 0
-    for filename in args.filenames:
-        fd = io.open(filename, encoding="utf-8")
-        first_line = fd.readline()
-        second_line = fd.readline()
-        if "COPYRIGHT (C)" in first_line.upper(): continue
-        if first_line.startswith("#!") or PYTHON_ENCODE.match(
-            second_line) != None or PYTHON_ENCODE.match(first_line) != None:
+    for path in args.filenames:
+        comment_mark = _get_comment_mark(path)
+        if comment_mark is None:
+            print("warning:Unsupported file", path, file=sys.stderr)
             continue
-        original_contents = io.open(filename, encoding="utf-8").read()
-        new_contents = generate_copyright(
-            COPYRIGHT, lang_type(filename)) + original_contents
-        print('Auto Insert Copyright Header {}'.format(filename))
-        retv = 1
-        with io.open(filename, 'w') as output_file:
-            output_file.write(new_contents)
-
-    return retv
+
+        if _check_copyright(path):
+            continue
+
+        generate_copyright(path, comment_mark)

 if __name__ == '__main__':
diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu
index 9b5602d4943ad..b6fe78eef3f71 100644
--- a/tools/dockerfile/Dockerfile.ubuntu
+++ b/tools/dockerfile/Dockerfile.ubuntu
@@ -29,7 +29,7 @@ RUN apt-get update && \
     python-matplotlib \
     automake locales clang-format swig \
     liblapack-dev liblapacke-dev \
-    net-tools libtool module-init-tools && \
+    net-tools libtool module-init-tools shellcheck && \
     apt-get clean -y

 # Downgrade gcc&&g++
diff --git a/tools/get_quick_disable_lt.py b/tools/get_quick_disable_lt.py
new file mode 100644
index 0000000000000..9b41f5e78085e
--- /dev/null
+++ b/tools/get_quick_disable_lt.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import ssl
+import requests
+
+
+def download_file():
+    """Get disabled unit tests"""
+    ssl._create_default_https_context = ssl._create_unverified_context
+    url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut')
+    f = requests.get(url)
+    data = f.text
+    status_code = f.status_code
+    if len(data.strip()) == 0 or status_code != 200:
+        sys.exit(1)
+    else:
+        lt = data.strip().split('\n')
+        lt = '^' + '$|^'.join(lt) + '$'
+        print(lt)
+        sys.exit(0)
+
+
+if __name__ == '__main__':
+    try:
+        download_file()
+    except Exception as e:
+        print(e)
+        sys.exit(1)
diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
index 55c30579fb91e..424a6f3886821 100644
--- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
+++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
@@ -26,7 +26,7 @@ RUN rm -rf /temp_gcc82 && rm -rf /gcc-8.2.0.tar.xz && rm -rf /gcc-8.2.0
 RUN apt-get update && \
     apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
     libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
-    xz-utils tk-dev libffi-dev liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev
+    xz-utils tk-dev libffi-dev liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev shellcheck

 # gcc8.2
 RUN wget -q https://paddle-docker-tar.bj.bcebos.com/home/users/tianshuo/bce-python-sdk-0.8.27/gcc-8.2.0.tar.xz && \
diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat
index 16665ac4aafdd..889ea005259cc 100644
--- a/tools/windows/build_compile_environment.bat
+++ b/tools/windows/build_compile_environment.bat
@@ -24,9 +24,9 @@
 :: 2. Git 2.28.0
 :: 3. Python 3.7.8
 :: 4. Visual Studio 2015 with update 3
-:: 5. CUDA 10 [miss cudnn]
-:: 6. java jre [not complete]
-:: 7. xly agent [not complete]
+:: 5. CUDA 10
+:: 6. java jre
+:: 7. xly agent

 :: Echo command is not required.
 @echo off
@@ -138,8 +138,8 @@ goto :cuda10

 :install_visual_studio
 echo There is not Visual Studio in this PC, will install VS2015.
-echo Download package from "https://download.my.visualstudio.com/pr/en_visual_studio_professional_2015_with_update_3_x86_x64_web_installer_8922978.exe"
-wget -O vs_installer.exe "https://download.my.visualstudio.com/pr/en_visual_studio_professional_2015_with_update_3_x86_x64_web_installer_8922978.exe?t=9ee7a96d-ca80-4b84-af2c-7dd86996a0aa&e=1600103404&h=3cdea1e81c04aa4e846f5314972c46eb&su=1"
+echo Download package from "https://paddle-ci.gz.bcebos.com/window_requirement/en_visual_studio_enterprise_2015_with_update_3_x86_x64_web_installer_8922986.exe"
+wget -O vs_installer.exe "https://paddle-ci.gz.bcebos.com/window_requirement/en_visual_studio_enterprise_2015_with_update_3_x86_x64_web_installer_8922986.exe"
 echo Install Visual Studio 2015 ...
 :: /passive [silent install]
 :: /norestart [no restart]
@@ -157,34 +157,60 @@ goto :eof

 :: ===== start step 5: CUDA 10 =====
 :cuda10
-echo ">>>>>>>> step [5/7]: CUDA 10.0"
-nvcc --version > nul 2> nul || call :install_cuda
+echo ">>>>>>>> step [5/7]: CUDA 10.2"
+nvcc --version | findstr /C:"10.2" > nul 2> nul || call :install_cuda
 goto java-jre

 :install_cuda
-echo There is not CUDA in this PC, will install CUDA-10.0.
-echo Download package from "https://developer.download.nvidia.cn/compute/cuda/10.0/secure/Prod/network_installers/cuda_10.0.130_win10_network.exe"
-wget -O cuda_installer.exe "https://developer.download.nvidia.cn/compute/cuda/10.0/secure/Prod/network_installers/cuda_10.0.130_win10_network.exe?hG7oBtA2CnxZG7d39onmBdtzrIa2cOukrmW8I0qk3h36vb2Sj0yYGjMElJlxlNhjx8Xu5RlbmdBhCWvP2QcEqMjCoKCXe5lOgr5uIIso_7LqrotgQHbZRZSVBYRT4bIAHPVSPrr4_4KczKvI9Nf3mbO9RJ2Vj6ECD5QphRMJBus0KKNVxO1gsplVL5qaCnE"
-echo Install CUDA-10.0 ...
+echo There is not CUDA in this PC, will install CUDA-10.2.
+echo Download package from "https://paddle-ci.gz.bcebos.com/window_requirement/cuda_10.2.89_441.22_win10.exe"
+wget -O cuda_installer.exe "https://paddle-ci.gz.bcebos.com/window_requirement/cuda_10.2.89_441.22_win10.exe"
+echo Install CUDA-10.2 ...
 :: -s [silent install]
 start /wait cuda_installer.exe -s
 if %errorlevel% == 0 (
-    echo Install CUDA-10.0 success!
+    echo Install CUDA-10.2 success!
 ) else (
-    echo Error***** Install CUDA-10.0 failed, please re-install it manually.
+    echo Error***** Install CUDA-10.2 failed, please re-install it manually.
+    goto :eof
 )
 del cuda_installer.exe
+echo Download cudnn from "https://paddle-ci.gz.bcebos.com/window_requirement/cudnn-10.2-windows10-x64-v7.6.5.32.zip"
+wget -O cudnn-10.2-windows10-x64-v7.6.5.32.zip "https://paddle-ci.gz.bcebos.com/window_requirement/cudnn-10.2-windows10-x64-v7.6.5.32.zip"
+tar xf cudnn-10.2-windows10-x64-v7.6.5.32.zip
+xcopy "cuda\bin\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin"
+xcopy "cuda\include\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\include"
+xcopy "cuda\lib\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\lib"
+rd /s /q cuda
+del cudnn-10.2-windows10-x64-v7.6.5.32.zip
 goto :eof
 :: ===== end step 5: CUDA 10 =====

 :: ===== start step 6: java jre =====
 :java-jre
 echo ">>>>>>>> step [6/7]: java jre"
+java > nul 2> nul || call :install_java
 goto xly-agent
+
+:install_java
+echo There is not java-jre in this PC, will install java-jre.
+echo Download package from "https://paddle-ci.gz.bcebos.com/window_requirement/jre-8u261-windows-i586.exe"
+wget -O jre-8u261-windows-x64.exe "https://paddle-ci.gz.bcebos.com/window_requirement/jre-8u261-windows-i586.exe"
+echo Install java-jre ...
+:: -s [silent install]
+start /wait jre-8u261-windows-x64.exe /s
+if %errorlevel% == 0 (
+    echo Install java success!
+) else (
+    echo Error***** Install java failed, please re-install it manually.
+)
+del jre-8u261-windows-x64.exe
+goto :eof
 :: ===== end step 6: java jre =====

 :: ===== start step 7: xly agent =====
 :xly-agent
 echo ">>>>>>>> step [7/7]: xly agent"
+wget -O agent.jar "https://paddle-ci.gz.bcebos.com/window_requirement/agent.jar"
 goto :eof
 :: ===== end step 8: xly agent =====
\ No newline at end of file
diff --git a/tools/wlist.json b/tools/wlist.json
index 22bab658464cb..9844fa486cc04 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -24,6 +24,8 @@
         }
     ],
     "wlist_temp_api":[
+        "LRScheduler",
+        "ReduceOnPlateau",
         "append_LARS",
         "BuildStrategy.debug_graphviz_path",
         "BuildStrategy.enable_sequential_execution",